/* $NetBSD: fpu.c,v 1.87 2023/07/18 12:34:25 riastradh Exp $ */
/*
* Copyright (c) 2008, 2019 The NetBSD Foundation, Inc. All
* rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran and Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1991 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)npx.c 7.2 (Berkeley) 5/12/91
*/
/*
* Copyright (c) 1994, 1995, 1998 Charles M. Hannum. All rights reserved.
* Copyright (c) 1990 William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)npx.c 7.2 (Berkeley) 5/12/91
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.87 2023/07/18 12:34:25 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_multiprocessor.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/xcall.h>
#include <machine/cpu.h>
#include <machine/cpuvar.h>
#include <machine/cputypes.h>
#include <machine/intr.h>
#include <machine/cpufunc.h>
#include <machine/pcb.h>
#include <machine/trap.h>
#include <machine/specialreg.h>
#include <x86/cpu.h>
#include <x86/fpu.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
#ifdef XENPV
#define clts() HYPERVISOR_fpu_taskswitch(0)
#define stts() HYPERVISOR_fpu_taskswitch(1)
#endif
void fpu_handle_deferred(void);
void fpu_switch(struct lwp *, struct lwp *);
uint32_t x86_fpu_mxcsr_mask __read_mostly = 0;
static inline union savefpu *
fpu_lwp_area(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
union savefpu *area = &pcb->pcb_savefpu;
	KASSERT((l->l_flag & LW_SYSTEM) == 0);

	if (l == curlwp) {
		fpu_save();
}
KASSERT(!(l->l_md.md_flags & MDL_FPU_IN_CPU));
return area;
}
static inline void
fpu_save_lwp(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
union savefpu *area = &pcb->pcb_savefpu;
int s;
s = splvm();
	if (l->l_md.md_flags & MDL_FPU_IN_CPU) {
		KASSERT((l->l_flag & LW_SYSTEM) == 0);
		fpu_area_save(area, x86_xsave_features,
		    !(l->l_proc->p_flag & PK_32));
l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
splx(s);
}
/*
* Bring curlwp's FPU state in memory. It will get installed back in the CPU
* when returning to userland.
*/
void
fpu_save(void)
{
	fpu_save_lwp(curlwp);
}
void
fpuinit(struct cpu_info *ci)
{
/*
* This might not be strictly necessary since it will be initialized
* for each process. However it does no harm.
*/
clts();
fninit();
stts();
}
void
fpuinit_mxcsr_mask(void)
{
#ifndef XENPV
union savefpu fpusave __aligned(64);
u_long psl;
memset(&fpusave, 0, sizeof(fpusave));
/* Disable interrupts, and enable FPU */
psl = x86_read_psl();
x86_disable_intr();
clts();
/* Fill in the FPU area */
fxsave(&fpusave);
/* Restore previous state */
stts();
x86_write_psl(psl);
if (fpusave.sv_xmm.fx_mxcsr_mask == 0) {
x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
} else {
x86_fpu_mxcsr_mask = fpusave.sv_xmm.fx_mxcsr_mask;
}
#else
/*
* XXX XXX XXX: On Xen the FXSAVE above faults. That's because
* &fpusave is not 16-byte aligned. Stack alignment problem
* somewhere, it seems.
*/
x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
#endif
}
static inline void
fpu_errata_amd(void)
{
uint16_t sw;
/*
* AMD FPUs do not restore FIP, FDP, and FOP on fxrstor and xrstor
* when FSW.ES=0, leaking other threads' execution history.
*
* Clear them manually by loading a zero (fldummy). We do this
* unconditionally, regardless of FSW.ES.
*
* Before that, clear the ES bit in the x87 status word if it is
* currently set, in order to avoid causing a fault in the
* upcoming load.
*
* Newer generations of AMD CPUs have CPUID_Fn80000008_EBX[2],
* which indicates that FIP/FDP/FOP are restored (same behavior
* as Intel). We're not using it though.
*/
fnstsw(&sw);
if (sw & 0x80) fnclex();
fldummy();
}
#ifdef __x86_64__
#define XS64(x) (is_64bit ? x##64 : x)
#else
#define XS64(x) x
#endif
void
fpu_area_save(void *area, uint64_t xsave_features, bool is_64bit)
{
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
fnsave(area);
break;
case FPU_SAVE_FXSAVE:
XS64(fxsave)(area);
break;
case FPU_SAVE_XSAVE:
XS64(xsave)(area, xsave_features);
break;
case FPU_SAVE_XSAVEOPT:
XS64(xsaveopt)(area, xsave_features);
break;
}
stts();
}
void
fpu_area_restore(const void *area, uint64_t xsave_features, bool is_64bit)
{
clts();
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
frstor(area);
break;
case FPU_SAVE_FXSAVE:
if (cpu_vendor == CPUVENDOR_AMD) fpu_errata_amd();
XS64(fxrstor)(area);
break;
case FPU_SAVE_XSAVE:
case FPU_SAVE_XSAVEOPT:
if (cpu_vendor == CPUVENDOR_AMD) fpu_errata_amd();
XS64(xrstor)(area, xsave_features);
break;
}
}
void
fpu_handle_deferred(void)
{
struct pcb *pcb = lwp_getpcb(curlwp);
fpu_area_restore(&pcb->pcb_savefpu, x86_xsave_features,
!(curlwp->l_proc->p_flag & PK_32));
}
void
fpu_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
struct cpu_info *ci __diagused = curcpu();
struct pcb *pcb;
KASSERTMSG(ci->ci_ilevel >= IPL_SCHED, "cpu%d ilevel=%d",
cpu_index(ci), ci->ci_ilevel);
	if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) {
		KASSERT(!(oldlwp->l_flag & LW_SYSTEM));
pcb = lwp_getpcb(oldlwp);
fpu_area_save(&pcb->pcb_savefpu, x86_xsave_features,
!(oldlwp->l_proc->p_flag & PK_32));
oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
KASSERT(!(newlwp->l_md.md_flags & MDL_FPU_IN_CPU));
}
void
fpu_lwp_fork(struct lwp *l1, struct lwp *l2)
{
struct pcb *pcb2 = lwp_getpcb(l2);
union savefpu *fpu_save;
/* Kernel threads have no FPU. */
if (__predict_false(l2->l_flag & LW_SYSTEM)) {
return;
}
/* For init(8). */
if (__predict_false(l1->l_flag & LW_SYSTEM)) {
memset(&pcb2->pcb_savefpu, 0, x86_fpu_save_size);
return;
}
fpu_save = fpu_lwp_area(l1);
memcpy(&pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
l2->l_md.md_flags &= ~MDL_FPU_IN_CPU;
}
void
fpu_lwp_abandon(struct lwp *l)
{
int s;
KASSERT(l == curlwp);
s = splvm();
l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
stts();
splx(s);
}
/* -------------------------------------------------------------------------- */
/*
* fpu_kern_enter()
*
* Begin using the FPU. Raises to splvm, disabling most
* interrupts and rendering the thread non-preemptible; caller
* should not use this for long periods of time, and must call
* fpu_kern_leave() afterward. Non-recursive -- you cannot call
* fpu_kern_enter() again without calling fpu_kern_leave() first.
*
* Must be used only at IPL_VM or below -- never in IPL_SCHED or
* IPL_HIGH interrupt handlers.
*/
void
fpu_kern_enter(void)
{
static const union savefpu safe_fpu __aligned(64) = {
.sv_xmm = {
.fx_mxcsr = __SAFE_MXCSR__,
},
};
struct lwp *l = curlwp;
struct cpu_info *ci;
int s;
s = splvm();
ci = curcpu();
#if 0
/*
* Can't assert this because if the caller holds a spin lock at
* IPL_VM, and previously held and released a spin lock at
* higher IPL, the IPL remains raised above IPL_VM.
*/
KASSERTMSG(ci->ci_ilevel <= IPL_VM || cold, "ilevel=%d",
ci->ci_ilevel);
#endif
KASSERT(ci->ci_kfpu_spl == -1);
ci->ci_kfpu_spl = s;
/*
* If we are in a softint and have a pinned lwp, the fpu state is that
* of the pinned lwp, so save it there.
*/
while ((l->l_pflag & LP_INTR) && (l->l_switchto != NULL))
l = l->l_switchto;
fpu_save_lwp(l);
/*
* Clear CR0_TS, which fpu_save_lwp set if it saved anything --
* otherwise the CPU will trap if we try to use the FPU under
* the false impression that there has been a task switch since
* the last FPU usage requiring that we save the FPU state.
*/
clts();
/*
* Zero the FPU registers and install safe control words.
*/
fpu_area_restore(&safe_fpu, x86_xsave_features, /*is_64bit*/false);
}
/*
* fpu_kern_leave()
*
* End using the FPU after fpu_kern_enter().
*/
void
fpu_kern_leave(void)
{
static const union savefpu zero_fpu __aligned(64);
struct cpu_info *ci = curcpu();
int s;
#if 0
/*
* Can't assert this because if the caller holds a spin lock at
* IPL_VM, and previously held and released a spin lock at
* higher IPL, the IPL remains raised above IPL_VM.
*/
KASSERT(ci->ci_ilevel == IPL_VM || cold);
#endif
KASSERT(ci->ci_kfpu_spl != -1);
/*
* Zero the fpu registers; otherwise we might leak secrets
* through Spectre-class attacks to userland, even if there are
* no bugs in fpu state management.
*/
fpu_area_restore(&zero_fpu, x86_xsave_features, /*is_64bit*/false);
/*
* Set CR0_TS again so that the kernel can't accidentally use
* the FPU.
*/
stts();
s = ci->ci_kfpu_spl;
ci->ci_kfpu_spl = -1;
splx(s);
}
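/*
 * Illustrative only: a minimal sketch of the fpu_kern_enter() /
 * fpu_kern_leave() protocol documented above.  example_fpu_op() is a
 * hypothetical routine standing in for any code that touches FPU/SIMD
 * registers from kernel context.  Kept under #if 0, like the disabled
 * assertions above, so it is not compiled.
 */
#if 0
static void
example_kernel_fpu_user(const uint8_t *in, uint8_t *out, size_t len)
{

	/*
	 * Raises to splvm and makes the thread non-preemptible, so
	 * keep the FPU-using section short.
	 */
	fpu_kern_enter();

	example_fpu_op(in, out, len);	/* hypothetical FPU-using code */

	/* Not recursive: every fpu_kern_enter() pairs with this. */
	fpu_kern_leave();
}
#endif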
/* -------------------------------------------------------------------------- */
/*
* The following table is used to ensure that the FPE_... value
* that is passed as a trapcode to the signal handler of the user
* process does not have more than one bit set.
*
 * Multiple bits may be set if SSE SIMD instructions generate errors
 * on more than one value, or if the user process modifies the control
 * word while a status word bit is already set (which is a sign of bad
 * coding).
 * We have no choice but to narrow them down to one bit, since we must
 * not send a trapcode that is not exactly one of the FPE_ macros.
*
* The mechanism has a static table with 127 entries. Each combination
* of the 7 FPU status word exception bits directly translates to a
* position in this table, where a single FPE_... value is stored.
* This FPE_... value stored there is considered the "most important"
* of the exception bits and will be sent as the signal code. The
* precedence of the bits is based upon Intel Document "Numerical
* Applications", Chapter "Special Computational Situations".
*
* The code to choose one of these values does these steps:
* 1) Throw away status word bits that cannot be masked.
* 2) Throw away the bits currently masked in the control word,
* assuming the user isn't interested in them anymore.
* 3) Reinsert status word bit 7 (stack fault) if it is set, which
* cannot be masked but must be preserved.
* 'Stack fault' is a sub-class of 'invalid operation'.
* 4) Use the remaining bits to point into the trapcode table.
*
* The 6 maskable bits in order of their preference, as stated in the
* above referenced Intel manual:
* 1 Invalid operation (FP_X_INV)
* 1a Stack underflow
* 1b Stack overflow
* 1c Operand of unsupported format
* 1d SNaN operand.
* 2 QNaN operand (not an exception, irrelevant here)
* 3 Any other invalid-operation not mentioned above or zero divide
* (FP_X_INV, FP_X_DZ)
* 4 Denormal operand (FP_X_DNML)
* 5 Numeric over/underflow (FP_X_OFL, FP_X_UFL)
* 6 Inexact result (FP_X_IMP)
*
 * NB: the above seems to mix up the mxcsr error bits and the x87 ones.
 * They are in the same order, but there is no EN_SW_STACK_FAULT in the
 * mxcsr status.
*
* The table is nearly, but not quite, in bit order (ZERODIV and DENORM
* are swapped).
*
* This table assumes that any stack fault is cleared - so that an INVOP
* fault will only be reported as FLTSUB once.
* This might not happen if the mask is being changed.
*/
#define FPE_xxx1(f) (f & EN_SW_INVOP \
? (f & EN_SW_STACK_FAULT ? FPE_FLTSUB : FPE_FLTINV) \
: f & EN_SW_ZERODIV ? FPE_FLTDIV \
: f & EN_SW_DENORM ? FPE_FLTUND \
: f & EN_SW_OVERFLOW ? FPE_FLTOVF \
: f & EN_SW_UNDERFLOW ? FPE_FLTUND \
: f & EN_SW_PRECLOSS ? FPE_FLTRES \
: f & EN_SW_STACK_FAULT ? FPE_FLTSUB : 0)
#define FPE_xxx2(f) FPE_xxx1(f), FPE_xxx1((f + 1))
#define FPE_xxx4(f) FPE_xxx2(f), FPE_xxx2((f + 2))
#define FPE_xxx8(f) FPE_xxx4(f), FPE_xxx4((f + 4))
#define FPE_xxx16(f) FPE_xxx8(f), FPE_xxx8((f + 8))
#define FPE_xxx32(f) FPE_xxx16(f), FPE_xxx16((f + 16))
static const uint8_t fpetable[128] = {
FPE_xxx32(0), FPE_xxx32(32), FPE_xxx32(64), FPE_xxx32(96)
};
#undef FPE_xxx1
#undef FPE_xxx2
#undef FPE_xxx4
#undef FPE_xxx8
#undef FPE_xxx16
#undef FPE_xxx32
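/*
 * Illustrative only: how the table above is consulted.  Given x87
 * status and control words, the exceptions masked in the control word
 * are discarded and the remaining bits (the six maskable exceptions
 * plus the unmaskable stack fault) index fpetable[] to yield a single
 * FPE_* code -- the same computation fputrap() performs below for the
 * non-XMM case.
 */
#if 0
static int
example_x87_sigcode(uint16_t sw, uint16_t cw)
{
	uint32_t statbits;

	/* Drop exception bits that are masked in the control word. */
	statbits = sw & ~(cw & 0x3f);

	/* 7 remaining bits -> exactly one FPE_* value. */
	return fpetable[statbits & 0x7f];
}
#endif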
/*
* This is a synchronous trap on either an x87 instruction (due to an unmasked
* error on the previous x87 instruction) or on an SSE/SSE2/etc instruction due
* to an error on the instruction itself.
*
 * If the trap actually generates a signal, then the FPU state is saved,
 * copied onto the lwp's user stack, and recovered from there when the
 * signal returns.
*
* All this code needs to do is save the reason for the trap. For x87 traps the
* status word bits need clearing to stop the trap re-occurring. For SSE traps
* the mxcsr bits are 'sticky' and need clearing to not confuse a later trap.
*
* We come here with interrupts disabled.
*/
void
fputrap(struct trapframe *frame)
{
uint32_t statbits;
ksiginfo_t ksi;
if (__predict_false(!USERMODE(frame->tf_cs))) {
register_t ip = X86_TF_RIP(frame);
char where[128];
#ifdef DDB
db_symstr(where, sizeof(where), (db_expr_t)ip, DB_STGY_PROC);
#else
snprintf(where, sizeof(where), "%p", (void *)ip);
#endif
panic("fpu trap from kernel at %s, trapframe %p\n", where,
frame);
}
KASSERT(curlwp->l_md.md_flags & MDL_FPU_IN_CPU);
if (frame->tf_trapno == T_XMM) {
uint32_t mxcsr;
x86_stmxcsr(&mxcsr);
statbits = mxcsr;
/* Clear the sticky status bits */
mxcsr &= ~0x3f;
x86_ldmxcsr(&mxcsr);
/* Remove masked interrupts and non-status bits */
statbits &= ~(statbits >> 7) & 0x3f;
/* Mark this is an XMM status */
statbits |= 0x10000;
} else {
uint16_t cw, sw;
/* Get current control and status words */
fnstcw(&cw);
fnstsw(&sw);
/* Clear any pending exceptions from status word */
fnclex();
/* Remove masked interrupts */
statbits = sw & ~(cw & 0x3f);
}
/* Doesn't matter now if we get pre-empted */
x86_enable_intr();
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGFPE;
ksi.ksi_addr = (void *)X86_TF_RIP(frame);
ksi.ksi_code = fpetable[statbits & 0x7f];
ksi.ksi_trap = statbits;
(*curlwp->l_proc->p_emul->e_trapsignal)(curlwp, &ksi);
}
void
fpudna(struct trapframe *frame)
{
panic("fpudna from %s, ip %p, trapframe %p",
USERMODE(frame->tf_cs) ? "userland" : "kernel",
(void *)X86_TF_RIP(frame), frame);
}
/* -------------------------------------------------------------------------- */
static inline void
fpu_xstate_reload(union savefpu *fpu_save, uint64_t xstate)
{
/*
* Force a reload of the given xstate during the next XRSTOR.
*/
if (x86_fpu_save >= FPU_SAVE_XSAVE) {
fpu_save->sv_xsave_hdr.xsh_xstate_bv |= xstate;
}
}
void
fpu_set_default_cw(struct lwp *l, unsigned int x87_cw)
{
union savefpu *fpu_save = fpu_lwp_area(l);
struct pcb *pcb = lwp_getpcb(l);
if (i386_use_fxsave) {
fpu_save->sv_xmm.fx_cw = x87_cw;
if (x87_cw != __INITIAL_NPXCW__) {
fpu_xstate_reload(fpu_save, XCR0_X87);
}
} else {
fpu_save->sv_87.s87_cw = x87_cw;
}
pcb->pcb_fpu_dflt_cw = x87_cw;
}
void
fpu_clear(struct lwp *l, unsigned int x87_cw)
{
union savefpu *fpu_save;
struct pcb *pcb;
KASSERT(l == curlwp);
fpu_save = fpu_lwp_area(l);
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
memset(&fpu_save->sv_87, 0, x86_fpu_save_size);
fpu_save->sv_87.s87_tw = 0xffff;
fpu_save->sv_87.s87_cw = x87_cw;
break;
case FPU_SAVE_FXSAVE:
memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_cw = x87_cw;
break;
case FPU_SAVE_XSAVE:
case FPU_SAVE_XSAVEOPT:
memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_cw = x87_cw;
if (__predict_false(x87_cw != __INITIAL_NPXCW__)) {
fpu_xstate_reload(fpu_save, XCR0_X87);
}
break;
}
pcb = lwp_getpcb(l);
pcb->pcb_fpu_dflt_cw = x87_cw;
}
void
fpu_sigreset(struct lwp *l)
{
	union savefpu *fpu_save = fpu_lwp_area(l);
struct pcb *pcb = lwp_getpcb(l);
/*
* For signal handlers the register values don't matter. Just reset
* a few fields.
*/
if (i386_use_fxsave) {
fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_tw = 0;
fpu_save->sv_xmm.fx_cw = pcb->pcb_fpu_dflt_cw;
} else {
fpu_save->sv_87.s87_tw = 0xffff;
fpu_save->sv_87.s87_cw = pcb->pcb_fpu_dflt_cw;
}
}
void
process_write_fpregs_xmm(struct lwp *l, const struct fxsave *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memcpy(&fpu_save->sv_xmm, fpregs, sizeof(fpu_save->sv_xmm));
/*
* Invalid bits in mxcsr or mxcsr_mask will cause faults.
*/
fpu_save->sv_xmm.fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_mxcsr &= fpu_save->sv_xmm.fx_mxcsr_mask;
fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
} else {
process_xmm_to_s87(fpregs, &fpu_save->sv_87);
}
}
void
process_write_fpregs_s87(struct lwp *l, const struct save87 *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
process_s87_to_xmm(fpregs, &fpu_save->sv_xmm);
fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
} else {
memcpy(&fpu_save->sv_87, fpregs, sizeof(fpu_save->sv_87));
}
}
void
process_read_fpregs_xmm(struct lwp *l, struct fxsave *fpregs)
{
	union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memcpy(fpregs, &fpu_save->sv_xmm, sizeof(fpu_save->sv_xmm));
} else {
memset(fpregs, 0, sizeof(*fpregs));
process_s87_to_xmm(&fpu_save->sv_87, fpregs);
}
}
void
process_read_fpregs_s87(struct lwp *l, struct save87 *fpregs)
{
union savefpu *fpu_save = fpu_lwp_area(l);
if (i386_use_fxsave) {
memset(fpregs, 0, sizeof(*fpregs));
process_xmm_to_s87(&fpu_save->sv_xmm, fpregs);
} else {
memcpy(fpregs, &fpu_save->sv_87, sizeof(fpu_save->sv_87));
}
}
int
process_read_xstate(struct lwp *l, struct xstate *xstate)
{
	union savefpu *fpu_save = fpu_lwp_area(l);
if (x86_fpu_save == FPU_SAVE_FSAVE) {
/* Convert from legacy FSAVE format. */
memset(&xstate->xs_fxsave, 0, sizeof(xstate->xs_fxsave));
process_s87_to_xmm(&fpu_save->sv_87, &xstate->xs_fxsave);
/* We only got x87 data. */
xstate->xs_rfbm = XCR0_X87;
xstate->xs_xstate_bv = XCR0_X87;
return 0;
}
/* Copy the legacy area. */
memcpy(&xstate->xs_fxsave, fpu_save->sv_xsave_hdr.xsh_fxsave,
sizeof(xstate->xs_fxsave));
if (x86_fpu_save == FPU_SAVE_FXSAVE) {
/* FXSAVE means we've got x87 + SSE data. */
xstate->xs_rfbm = XCR0_X87 | XCR0_SSE;
xstate->xs_xstate_bv = XCR0_X87 | XCR0_SSE;
return 0;
}
/* Copy the bitmap indicating which states are available. */
xstate->xs_rfbm = x86_xsave_features & XCR0_FPU;
xstate->xs_xstate_bv = fpu_save->sv_xsave_hdr.xsh_xstate_bv;
KASSERT(!(xstate->xs_xstate_bv & ~xstate->xs_rfbm));
#define COPY_COMPONENT(xcr0_val, xsave_val, field) \
if (xstate->xs_xstate_bv & xcr0_val) { \
KASSERT(x86_xsave_offsets[xsave_val] \
>= sizeof(struct xsave_header)); \
KASSERT(x86_xsave_sizes[xsave_val] \
>= sizeof(xstate->field)); \
memcpy(&xstate->field, \
(char*)fpu_save + x86_xsave_offsets[xsave_val], \
sizeof(xstate->field)); \
}
	COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
	COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
	COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
	COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
#undef COPY_COMPONENT
return 0;
}
int
process_verify_xstate(const struct xstate *xstate)
{
/* xstate_bv must be a subset of RFBM */
if (xstate->xs_xstate_bv & ~xstate->xs_rfbm)
return EINVAL;
switch (x86_fpu_save) {
case FPU_SAVE_FSAVE:
if ((xstate->xs_rfbm & ~XCR0_X87))
return EINVAL;
break;
case FPU_SAVE_FXSAVE:
if ((xstate->xs_rfbm & ~(XCR0_X87 | XCR0_SSE)))
return EINVAL;
break;
default:
/* Verify whether no unsupported features are enabled */
if ((xstate->xs_rfbm & ~(x86_xsave_features & XCR0_FPU)) != 0)
return EINVAL;
}
return 0;
}
int
process_write_xstate(struct lwp *l, const struct xstate *xstate)
{
	union savefpu *fpu_save = fpu_lwp_area(l);
/* Convert data into legacy FSAVE format. */
if (x86_fpu_save == FPU_SAVE_FSAVE) {
		if (xstate->xs_xstate_bv & XCR0_X87)
			process_xmm_to_s87(&xstate->xs_fxsave,
			    &fpu_save->sv_87);
return 0;
}
/* If XSAVE is supported, make sure that xstate_bv is set correctly. */
if (x86_fpu_save >= FPU_SAVE_XSAVE) {
/*
* Bit-wise "xstate->xs_rfbm ? xstate->xs_xstate_bv :
* fpu_save->sv_xsave_hdr.xsh_xstate_bv"
*/
fpu_save->sv_xsave_hdr.xsh_xstate_bv =
(fpu_save->sv_xsave_hdr.xsh_xstate_bv & ~xstate->xs_rfbm) |
xstate->xs_xstate_bv;
}
if (xstate->xs_xstate_bv & XCR0_X87) {
/*
* X87 state is split into two areas, interspersed with SSE
* data.
*/
memcpy(&fpu_save->sv_xmm, &xstate->xs_fxsave, 24);
memcpy(fpu_save->sv_xmm.fx_87_ac, xstate->xs_fxsave.fx_87_ac,
sizeof(xstate->xs_fxsave.fx_87_ac));
}
/*
* Copy MXCSR if either SSE or AVX state is requested, to match the
* XSAVE behavior for those flags.
*/
if (xstate->xs_xstate_bv & (XCR0_SSE|XCR0_YMM_Hi128)) {
/*
* Invalid bits in mxcsr or mxcsr_mask will cause faults.
*/
fpu_save->sv_xmm.fx_mxcsr_mask = xstate->xs_fxsave.fx_mxcsr_mask
& x86_fpu_mxcsr_mask;
fpu_save->sv_xmm.fx_mxcsr = xstate->xs_fxsave.fx_mxcsr &
fpu_save->sv_xmm.fx_mxcsr_mask;
}
	if (xstate->xs_xstate_bv & XCR0_SSE) {
		memcpy(&fpu_save->sv_xsave_hdr.xsh_fxsave[160],
xstate->xs_fxsave.fx_xmm, sizeof(xstate->xs_fxsave.fx_xmm));
}
#define COPY_COMPONENT(xcr0_val, xsave_val, field) \
if (xstate->xs_xstate_bv & xcr0_val) { \
KASSERT(x86_xsave_offsets[xsave_val] \
>= sizeof(struct xsave_header)); \
KASSERT(x86_xsave_sizes[xsave_val] \
>= sizeof(xstate->field)); \
memcpy((char *)fpu_save + x86_xsave_offsets[xsave_val], \
&xstate->field, sizeof(xstate->field)); \
}
	COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
	COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
	COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
	COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
#undef COPY_COMPONENT
return 0;
}
/* $NetBSD: uvm_readahead.c,v 1.16 2023/09/23 18:21:12 ad Exp $ */
/*-
* Copyright (c)2003, 2005, 2009 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* uvm_object read-ahead
*
* TODO:
* - tune.
* - handle multiple streams.
* - find a better way to deal with PGO_LOCKED pager requests.
* (currently just ignored)
* - consider the amount of memory in the system.
* - consider the speed of the underlying device.
* - consider filesystem block size / block layout.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_readahead.c,v 1.16 2023/09/23 18:21:12 ad Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#if defined(READAHEAD_DEBUG)
#define DPRINTF(a) printf a
#else /* defined(READAHEAD_DEBUG) */
#define DPRINTF(a) /* nothing */
#endif /* defined(READAHEAD_DEBUG) */
/*
* uvm_ractx: read-ahead context.
*/
struct uvm_ractx {
int ra_flags;
#define RA_VALID 1
off_t ra_winstart; /* window start offset */
size_t ra_winsize; /* window size */
off_t ra_next; /* next offset to read-ahead */
};
#if defined(sun2) || defined(sun3)
/* XXX: on sun2 and sun3 MAXPHYS is 0xe000 */
#undef MAXPHYS
#define MAXPHYS 0x8000 /* XXX */
#endif
#define RA_WINSIZE_INIT MAXPHYS /* initial window size */
#define RA_WINSIZE_MAX (MAXPHYS * 16) /* max window size */
#define RA_WINSIZE_SEQENTIAL RA_WINSIZE_MAX /* fixed window size used for
SEQUENTIAL hint */
#define RA_MINSIZE (MAXPHYS * 2) /* min size to start i/o */
#define RA_IOCHUNK MAXPHYS /* read-ahead i/o chunk size */
static off_t ra_startio(struct uvm_object *, off_t, size_t);
static struct uvm_ractx *ra_allocctx(void);
static void ra_freectx(struct uvm_ractx *);
/*
* uvm_ra_init: initialize readahead module.
*/
void
uvm_ra_init(void)
{
}
static struct uvm_ractx *
ra_allocctx(void)
{
return kmem_alloc(sizeof(struct uvm_ractx), KM_NOSLEEP);
}
static void
ra_freectx(struct uvm_ractx *ra)
{
kmem_free(ra, sizeof(struct uvm_ractx));
}
/*
* ra_startio: start i/o for read-ahead.
*
* => start i/o for each RA_IOCHUNK sized chunk.
* => return offset to which we started i/o.
*/
static off_t
ra_startio(struct uvm_object *uobj, off_t off, size_t sz)
{
const off_t endoff = off + sz;
DPRINTF(("%s: uobj=%p, off=%" PRIu64 ", endoff=%" PRIu64 "\n",
__func__, uobj, off, endoff));
KASSERT(rw_write_held(uobj->vmobjlock));
/*
	 * Don't issue read-ahead if the last page of the range is
	 * already cached.  The assumption is that since the access is
	 * sequential, the intermediate pages have similar LRU stats and
	 * are therefore likely to still be cached too.  This speeds up
	 * cached I/O, since it avoids the lookups and temporary
	 * allocations done by a full pgo_get.
*/
struct vm_page *pg = uvm_pagelookup(uobj, trunc_page(endoff - 1));
if (pg != NULL) {
DPRINTF(("%s: off=%" PRIu64 ", sz=%zu already cached\n",
__func__, off, sz));
return endoff;
}
off = trunc_page(off);
while (off < endoff) {
const size_t chunksize = RA_IOCHUNK;
int error;
size_t donebytes;
int npages;
int orignpages;
size_t bytelen;
KASSERT((chunksize & (chunksize - 1)) == 0);
KASSERT((off & PAGE_MASK) == 0);
bytelen = ((off + chunksize) & -(off_t)chunksize) - off;
KASSERT((bytelen & PAGE_MASK) == 0);
npages = orignpages = bytelen >> PAGE_SHIFT;
KASSERT(npages != 0);
/*
* use UVM_ADV_RANDOM to avoid recursion.
*/
error = (*uobj->pgops->pgo_get)(uobj, off, NULL,
&npages, 0, VM_PROT_READ, UVM_ADV_RANDOM, PGO_NOTIMESTAMP);
rw_enter(uobj->vmobjlock, RW_WRITER);
DPRINTF(("%s: off=%" PRIu64 ", bytelen=%zu -> %d\n",
__func__, off, bytelen, error));
if (error != 0 && error != EBUSY) {
if (error != EINVAL) { /* maybe past EOF */
DPRINTF(("%s: error=%d\n", __func__, error));
}
break;
}
KASSERT(orignpages == npages);
donebytes = orignpages << PAGE_SHIFT;
off += donebytes;
}
return off;
}
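/*
 * Illustrative only: the chunking arithmetic used by ra_startio()
 * above.  I/O is issued RA_IOCHUNK bytes at a time, and the first
 * chunk is trimmed so that later chunks start on RA_IOCHUNK-aligned
 * offsets.  For example, with a 0x10000-byte chunk and off = 0x15000,
 * the first pgo_get covers 0xb000 bytes (up to 0x20000) and every
 * following chunk covers 0x10000.
 */
#if 0
static size_t
example_ra_chunklen(off_t off, size_t chunksize)
{

	/* chunksize must be a power of two, as asserted in ra_startio(). */
	return ((off + chunksize) & -(off_t)chunksize) - off;
}
#endif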
/* ------------------------------------------------------------ */
/*
* uvm_ra_allocctx: allocate a context.
*/
struct uvm_ractx *
uvm_ra_allocctx(void)
{
struct uvm_ractx *ra;
ra = ra_allocctx();
	if (ra != NULL) {
		ra->ra_flags = 0;
}
return ra;
}
/*
* uvm_ra_freectx: free a context.
*/
void
uvm_ra_freectx(struct uvm_ractx *ra)
{
KASSERT(ra != NULL);
ra_freectx(ra);
}
/*
* uvm_ra_request: update a read-ahead context and start i/o if appropriate.
*
* => called when [reqoff, reqoff+reqsize) is requested.
* => object must be locked by caller, will return locked.
*/
void
uvm_ra_request(struct uvm_ractx *ra, int advice, struct uvm_object *uobj,
off_t reqoff, size_t reqsize)
{

	KASSERT(rw_write_held(uobj->vmobjlock));

	if (ra == NULL || advice == UVM_ADV_RANDOM) {
return;
}
if (advice == UVM_ADV_SEQUENTIAL) {
/*
* always do read-ahead with a large window.
*/
if ((ra->ra_flags & RA_VALID) == 0) {
ra->ra_winstart = ra->ra_next = 0;
ra->ra_flags |= RA_VALID;
}
		if (reqoff < ra->ra_winstart) {
			ra->ra_next = reqoff;
}
ra->ra_winsize = RA_WINSIZE_SEQENTIAL;
goto do_readahead;
}
/*
	 * a request with the UVM_ADV_NORMAL hint (i.e. no hint).
*
* we keep a sliding window in order to determine:
* - if the previous read-ahead was successful or not.
* - how many bytes to read-ahead.
*/
/*
* if it's the first request for this context,
* initialize context and return.
*/
if ((ra->ra_flags & RA_VALID) == 0) {
initialize:
ra->ra_winstart = ra->ra_next = reqoff + reqsize;
ra->ra_winsize = RA_WINSIZE_INIT;
ra->ra_flags |= RA_VALID;
goto done;
}
/*
* if it isn't in our window,
* initialize context and return.
* (read-ahead miss)
*/
if (reqoff < ra->ra_winstart ||
ra->ra_winstart + ra->ra_winsize < reqoff) {
/*
* ... unless we seem to be reading the same chunk repeatedly.
*
* XXX should have some margin?
*/
if (reqoff + reqsize == ra->ra_winstart) {
DPRINTF(("%s: %p: same block: off=%" PRIu64
", size=%zd, winstart=%" PRIu64 "\n",
__func__, ra, reqoff, reqsize, ra->ra_winstart));
goto done;
}
goto initialize;
}
/*
* it's in our window. (read-ahead hit)
* - start read-ahead i/o if appropriate.
* - advance and enlarge window.
*/
do_readahead:
/*
* don't bother to read-ahead behind current request.
*/
	if (reqoff > ra->ra_next) {
		ra->ra_next = reqoff;
}
/*
* try to make [reqoff, reqoff+ra_winsize) in-core.
* note that [reqoff, ra_next) is considered already done.
*/
if (reqoff + ra->ra_winsize > ra->ra_next) {
off_t raoff = MAX(reqoff, ra->ra_next);
size_t rasize = reqoff + ra->ra_winsize - ra->ra_next;
#if defined(DIAGNOSTIC)
if (rasize > RA_WINSIZE_MAX) {
printf("%s: corrupted context", __func__);
rasize = RA_WINSIZE_MAX;
}
#endif /* defined(DIAGNOSTIC) */
/*
* issue read-ahead only if we can start big enough i/o.
* otherwise we end up with a stream of small i/o.
*/
if (rasize >= RA_MINSIZE) {
off_t next;
next = ra_startio(uobj, raoff, rasize);
ra->ra_next = next;
}
}
/*
* update window.
*
* enlarge window by reqsize, so that it grows in a predictable manner
* regardless of the size of each read(2).
*/
ra->ra_winstart = reqoff + reqsize;
ra->ra_winsize = MIN(RA_WINSIZE_MAX, ra->ra_winsize + reqsize);
done:;
}
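/*
 * Illustrative only: how the UVM_ADV_NORMAL window above behaves for a
 * strictly sequential reader.  The first request only initializes the
 * context (winsize = RA_WINSIZE_INIT, no i/o); each later in-window
 * request advances ra_winstart to the end of the request, grows
 * ra_winsize by reqsize up to RA_WINSIZE_MAX, and starts i/o once the
 * not-yet-read-ahead part of the window reaches RA_MINSIZE.  The loop
 * below is a hypothetical caller; real callers are the pagers, which
 * already hold the object lock.
 */
#if 0
static void
example_sequential_reader(struct uvm_ractx *ra, struct uvm_object *uobj,
    size_t bufsize)
{
	off_t off;

	for (off = 0; off < 8 * (off_t)RA_WINSIZE_MAX; off += bufsize) {
		rw_enter(uobj->vmobjlock, RW_WRITER);
		uvm_ra_request(ra, UVM_ADV_NORMAL, uobj, off, bufsize);
		/* ... fetch and copy out [off, off + bufsize) ... */
		rw_exit(uobj->vmobjlock);
	}
}
#endif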
int
uvm_readahead(struct uvm_object *uobj, off_t off, off_t size)
{
/*
* don't allow too much read-ahead.
*/
if (size > RA_WINSIZE_MAX) {
size = RA_WINSIZE_MAX;
}
rw_enter(uobj->vmobjlock, RW_WRITER);
ra_startio(uobj, off, size);
rw_exit(uobj->vmobjlock);
return 0;
}
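/*
 * Illustrative only: uvm_readahead() above is the one-shot entry point
 * that needs no uvm_ractx, e.g. for an explicit advisory hint in the
 * style of POSIX_FADV_WILLNEED (an assumption about the caller, not
 * something this file establishes).  The object is passed unlocked and
 * the amount is clamped to RA_WINSIZE_MAX internally.
 */
#if 0
static void
example_willneed_hint(struct uvm_object *uobj, off_t off, off_t len)
{

	(void)uvm_readahead(uobj, off, len);
}
#endif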
/* $NetBSD: statvfs.h,v 1.5 2024/01/19 18:39:15 christos Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _COMPAT_SYS_STATVFS_H_
#define _COMPAT_SYS_STATVFS_H_
#include <sys/statvfs.h>
struct statvfs90 {
unsigned long f_flag; /* copy of mount exported flags */
unsigned long f_bsize; /* file system block size */
unsigned long f_frsize; /* fundamental file system block size */
unsigned long f_iosize; /* optimal file system block size */
/* The following are in units of f_frsize */
fsblkcnt_t f_blocks; /* number of blocks in file system, */
fsblkcnt_t f_bfree; /* free blocks avail in file system */
fsblkcnt_t f_bavail; /* free blocks avail to non-root */
fsblkcnt_t f_bresvd; /* blocks reserved for root */
fsfilcnt_t f_files; /* total file nodes in file system */
fsfilcnt_t f_ffree; /* free file nodes in file system */
fsfilcnt_t f_favail; /* free file nodes avail to non-root */
fsfilcnt_t f_fresvd; /* file nodes reserved for root */
uint64_t f_syncreads; /* count of sync reads since mount */
uint64_t f_syncwrites; /* count of sync writes since mount */
uint64_t f_asyncreads; /* count of async reads since mount */
uint64_t f_asyncwrites; /* count of async writes since mount */
fsid_t f_fsidx; /* NetBSD compatible fsid */
unsigned long f_fsid; /* Posix compatible fsid */
unsigned long f_namemax; /* maximum filename length */
uid_t f_owner; /* user that mounted the file system */
uint32_t f_spare[4]; /* spare space */
char f_fstypename[_VFS_NAMELEN]; /* fs type name */
char f_mntonname[_VFS_MNAMELEN]; /* directory on which mounted */
char f_mntfromname[_VFS_MNAMELEN]; /* mounted file system */
};
__BEGIN_DECLS
#ifndef _KERNEL
#include <string.h>
#endif
static __inline void
statvfs_to_statvfs90(const struct statvfs *s, struct statvfs90 *s90)
{
memset(s90, 0, sizeof(*s90));
s90->f_flag = s->f_flag;
s90->f_bsize = s->f_bsize;
s90->f_frsize = s->f_frsize;
s90->f_iosize = s->f_iosize;
s90->f_blocks = s->f_blocks;
s90->f_bfree = s->f_bfree;
s90->f_bavail = s->f_bavail;
s90->f_bresvd = s->f_bresvd;
s90->f_files = s->f_files;
s90->f_ffree = s->f_ffree;
s90->f_favail = s->f_favail;
s90->f_fresvd = s->f_fresvd;
s90->f_syncreads = s->f_syncreads;
s90->f_syncwrites = s->f_syncwrites;
s90->f_asyncreads = s->f_asyncreads;
s90->f_asyncwrites = s->f_asyncwrites;
s90->f_fsidx = s->f_fsidx;
s90->f_fsid = s->f_fsid;
s90->f_namemax = s->f_namemax;
s90->f_owner = s->f_owner;
memcpy(s90->f_fstypename, s->f_fstypename, sizeof(s90->f_fstypename));
memcpy(s90->f_mntonname, s->f_mntonname, sizeof(s90->f_mntonname));
memcpy(s90->f_mntfromname, s->f_mntfromname, sizeof(s90->f_mntfromname));
}
#ifdef _KERNEL
static __inline int
statvfs_to_statvfs90_copy(const void *vs, void *vs90, size_t l)
{
struct statvfs90 *s90 = kmem_zalloc(sizeof(*s90), KM_SLEEP);
int error;
statvfs_to_statvfs90(vs, s90);
error = copyout(s90, vs90, sizeof(*s90));
kmem_free(s90, sizeof(*s90));
return error;
}
#else
#ifdef __LIBC12_SOURCE__
int __compat_statvfs(const char *__restrict, struct statvfs90 *__restrict);
int __compat_statvfs1(const char *__restrict, struct statvfs90 *__restrict,
int);
int __compat_fstatvfs(int, struct statvfs90 *);
int __compat_fstatvfs1(int, struct statvfs90 *, int);
int __compat___getmntinfo13(struct statvfs90 **, int);
int __compat___fhstatvfs40(const void *, size_t, struct statvfs90 *);
int __compat___fhstatvfs140(const void *, size_t, struct statvfs90 *, int);
int __compat_getvfsstat(struct statvfs90 *, size_t, int);
int __statvfs90(const char *__restrict, struct statvfs *__restrict);
int __statvfs190(const char *__restrict, struct statvfs *__restrict, int);
int __fstatvfs90(int, struct statvfs *);
int __fstatvfs190(int, struct statvfs *, int);
int __fhstatvfs90(const void *, size_t, struct statvfs *);
int __fhstatvfs190(const void *, size_t, struct statvfs *, int);
int __getvfsstat90(struct statvfs *, size_t, int);
int __getmntinfo90(struct statvfs **, int);
struct compat_30_fhandle;
int fhstatvfs(const struct compat_30_fhandle *, struct statvfs90 *);
int fhstatvfs1(const struct compat_30_fhandle *, struct statvfs90 *, int);
#endif /* __LIBC12_SOURCE__ */
#endif /* _KERNEL */
__END_DECLS
#endif /* !_COMPAT_SYS_STATVFS_H_ */
/*-
* Copyright (c) 2009-2013 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This material is based upon work partially supported by The
* NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* NPF main: dynamic load/initialisation and unload routines.
*/
#ifdef _KERNEL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf.c,v 1.44 2020/08/27 18:50:25 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/kmem.h>
#include <sys/percpu.h>
#include <sys/xcall.h>
#endif
#include "npf_impl.h"
#include "npf_conn.h"
static __read_mostly npf_t * npf_kernel_ctx = NULL;
__dso_public int
npfk_sysinit(unsigned nworkers)
{
npf_bpf_sysinit();
npf_tableset_sysinit();
npf_nat_sysinit();
npf_portmap_sysinit();
return npf_worker_sysinit(nworkers);
}
__dso_public void
npfk_sysfini(void)
{
npf_worker_sysfini();
npf_portmap_sysfini();
npf_nat_sysfini();
npf_tableset_sysfini();
npf_bpf_sysfini();
}
__dso_public npf_t *
npfk_create(int flags, const npf_mbufops_t *mbufops,
const npf_ifops_t *ifops, void *arg)
{
npf_t *npf;
npf = kmem_zalloc(sizeof(npf_t), KM_SLEEP);
npf->ebr = npf_ebr_create();
npf->stats_percpu = percpu_alloc(NPF_STATS_SIZE);
npf->mbufops = mbufops;
npf->arg = arg;
npf_param_init(npf);
npf_state_sysinit(npf);
npf_ifmap_init(npf, ifops);
npf_conn_init(npf);
npf_portmap_init(npf);
npf_alg_init(npf);
npf_ext_init(npf);
/* Load an empty configuration. */
npf_config_init(npf);
if ((flags & NPF_NO_GC) == 0) {
npf_worker_enlist(npf);
}
return npf;
}
__dso_public void
npfk_destroy(npf_t *npf)
{
npf_worker_discharge(npf);
/*
* Destroy the current configuration. Note: at this point all
* handlers must be deactivated; we will drain any processing.
*/
npf_config_fini(npf);
/* Finally, safe to destroy the subsystems. */
npf_ext_fini(npf);
npf_alg_fini(npf);
npf_portmap_fini(npf);
npf_conn_fini(npf);
npf_ifmap_fini(npf);
npf_state_sysfini(npf);
npf_param_fini(npf);
npf_ebr_destroy(npf->ebr);
percpu_free(npf->stats_percpu, NPF_STATS_SIZE);
kmem_free(npf, sizeof(npf_t));
}
/*
* npfk_load: (re)load the configuration.
*
* => Will not modify the configuration reference.
*/
__dso_public int
npfk_load(npf_t *npf, const void *config_ref, npf_error_t *err)
{
const nvlist_t *req = (const nvlist_t *)config_ref;
nvlist_t *resp;
int error;
resp = nvlist_create(0);
error = npfctl_run_op(npf, IOC_NPF_LOAD, req, resp);
nvlist_destroy(resp);
return error;
}
__dso_public void
npfk_gc(npf_t *npf)
{
npf_conn_worker(npf);
}
__dso_public void
npfk_thread_register(npf_t *npf)
{
npf_ebr_register(npf->ebr);
}
__dso_public void
npfk_thread_unregister(npf_t *npf)
{
npf_ebr_full_sync(npf->ebr);
npf_ebr_unregister(npf->ebr);
}
__dso_public void *
npfk_getarg(npf_t *npf)
{
return npf->arg;
}
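/*
 * Illustrative only: the embedding lifecycle of the npfk_*() interface
 * above, as a standalone user of the npfkern library might drive it.
 * The mbuf and interface operation tables and the binary config
 * reference are assumed to be supplied by the embedder; only functions
 * defined in this file are called.
 */
#if 0
static int
example_npfk_lifecycle(const npf_mbufops_t *mbufops,
    const npf_ifops_t *ifops, const void *config_ref)
{
	npf_error_t errinfo;
	npf_t *npf;
	int error;

	error = npfk_sysinit(1);		/* one worker (GC) thread */
	if (error)
		return error;

	npf = npfk_create(0, mbufops, ifops, NULL);
	error = npfk_load(npf, config_ref, &errinfo);
	if (error == 0) {
		/* ... feed packets to NPF's packet handler ... */
		npfk_gc(npf);			/* or leave it to the worker */
	}

	npfk_destroy(npf);
	npfk_sysfini();
	return error;
}
#endif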
void
npf_setkernctx(npf_t *npf)
{
npf_kernel_ctx = npf;
}
npf_t *
npf_getkernctx(void)
{
return npf_kernel_ctx;
}
/*
* NPF statistics interface.
*/
void
npf_stats_inc(npf_t *npf, npf_stats_t st)
{
uint64_t *stats = percpu_getref(npf->stats_percpu);
stats[st]++;
percpu_putref(npf->stats_percpu);
}
void
npf_stats_dec(npf_t *npf, npf_stats_t st)
{
uint64_t *stats = percpu_getref(npf->stats_percpu);
stats[st]--;
percpu_putref(npf->stats_percpu);
}
static void
npf_stats_collect(void *mem, void *arg, struct cpu_info *ci)
{
uint64_t *percpu_stats = mem, *full_stats = arg;
for (unsigned i = 0; i < NPF_STATS_COUNT; i++) {
full_stats[i] += percpu_stats[i];
}
}
static void
npf_stats_clear_cb(void *mem, void *arg, struct cpu_info *ci)
{
uint64_t *percpu_stats = mem;
for (unsigned i = 0; i < NPF_STATS_COUNT; i++) {
percpu_stats[i] = 0;
}
}
/*
* npf_stats: export collected statistics.
*/
__dso_public void
npfk_stats(npf_t *npf, uint64_t *buf)
{
memset(buf, 0, NPF_STATS_SIZE);
percpu_foreach_xcall(npf->stats_percpu, XC_HIGHPRI_IPL(IPL_SOFTNET),
npf_stats_collect, buf);
}
__dso_public void
npfk_stats_clear(npf_t *npf)
{
percpu_foreach_xcall(npf->stats_percpu, XC_HIGHPRI_IPL(IPL_SOFTNET),
npf_stats_clear_cb, NULL);
}
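/*
 * Illustrative only: consuming the statistics interface above.  This
 * assumes NPF_STATS_SIZE is NPF_STATS_COUNT 64-bit counters, which is
 * how npfk_stats() and npf_stats_collect() index the buffer.
 */
#if 0
static uint64_t
example_npf_stats_total(npf_t *npf)
{
	uint64_t stats[NPF_STATS_COUNT];
	uint64_t total = 0;
	unsigned i;

	npfk_stats(npf, stats);
	for (i = 0; i < NPF_STATS_COUNT; i++)
		total += stats[i];
	return total;
}
#endif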
/* $NetBSD: ffs_alloc.c,v 1.172 2023/01/07 19:41:30 chs Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_alloc.c 8.19 (Berkeley) 7/13/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_alloc.c,v 1.172 2023/01/07 19:41:30 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_uvm_page_trkown.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/cprng.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <sys/cprng.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#ifdef UVM_PAGE_TRKOWN
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#endif
static daddr_t ffs_alloccg(struct inode *, u_int, daddr_t, int, int, int);
static daddr_t ffs_alloccgblk(struct inode *, struct buf *, daddr_t, int, int);
static ino_t ffs_dirpref(struct inode *);
static daddr_t ffs_fragextend(struct inode *, u_int, daddr_t, int, int);
static void ffs_fserr(struct fs *, kauth_cred_t, const char *);
static daddr_t ffs_hashalloc(struct inode *, u_int, daddr_t, int, int, int,
daddr_t (*)(struct inode *, u_int, daddr_t, int, int, int));
static daddr_t ffs_nodealloccg(struct inode *, u_int, daddr_t, int, int, int);
static int32_t ffs_mapsearch(struct fs *, struct cg *,
daddr_t, int);
static void ffs_blkfree_common(struct ufsmount *, struct fs *, dev_t, struct buf *,
daddr_t, long, bool);
static void ffs_freefile_common(struct ufsmount *, struct fs *, dev_t, struct buf *, ino_t,
int, bool);
/* if 1, changes in optimization strategy are logged */
int ffs_log_changeopt = 0;
/* in ffs_tables.c */
extern const int inside[], around[];
extern const u_char * const fragtbl[];
/* Basic consistency check for block allocations */
static int
ffs_check_bad_allocation(const char *func, struct fs *fs, daddr_t bno,
long size, dev_t dev, ino_t inum)
{
if ((u_int)size > fs->fs_bsize || ffs_fragoff(fs, size) != 0 ||
ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) > fs->fs_frag) {
panic("%s: bad size: dev = 0x%llx, bno = %" PRId64
" bsize = %d, size = %ld, fs = %s", func,
(long long)dev, bno, fs->fs_bsize, size, fs->fs_fsmnt);
}
if (bno >= fs->fs_size) {
printf("%s: bad block %" PRId64 ", ino %llu\n", func, bno,
(unsigned long long)inum);
ffs_fserr(fs, NOCRED, "bad block");
return EINVAL;
}
return 0;
}
/*
* Allocate a block in the file system.
*
* The size of the requested block is given, which must be some
* multiple of fs_fsize and <= fs_bsize.
* A preference may be optionally specified. If a preference is given
* the following hierarchy is used to allocate a block:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate a block in the same cylinder group.
* 4) quadradically rehash into other cylinder groups, until an
* available block is located.
* If no block preference is given the following hierarchy is used
* to allocate a block:
* 1) allocate a block in the cylinder group that contains the
* inode for the file.
* 2) quadradically rehash into other cylinder groups, until an
* available block is located.
*
* => called with um_lock held
* => releases um_lock before returning
*/
int
ffs_alloc(struct inode *ip, daddr_t lbn, daddr_t bpref, int size,
int flags, kauth_cred_t cred, daddr_t *bnp)
{
struct ufsmount *ump;
struct fs *fs;
daddr_t bno;
u_int cg;
#if defined(QUOTA) || defined(QUOTA2)
int error;
#endif
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
#ifdef UVM_PAGE_TRKOWN
/*
* Sanity-check that allocations within the file size
* do not allow other threads to read the stale contents
* of newly allocated blocks.
* Usually pages will exist to cover the new allocation.
* There is an optimization in ffs_write() where we skip
* creating pages if several conditions are met:
* - the file must not be mapped (in any user address space).
* - the write must cover whole pages and whole blocks.
* If those conditions are not met then pages must exist and
* be locked by the current thread.
*/
struct vnode *vp = ITOV(ip);
if (vp->v_type == VREG && (flags & IO_EXT) == 0 &&
ffs_lblktosize(fs, (voff_t)lbn) < round_page(vp->v_size) &&
((vp->v_vflag & VV_MAPPED) != 0 || (size & PAGE_MASK) != 0 ||
ffs_blkoff(fs, size) != 0)) {
struct vm_page *pg __diagused;
struct uvm_object *uobj = &vp->v_uobj;
voff_t off = trunc_page(ffs_lblktosize(fs, lbn));
voff_t endoff = round_page(ffs_lblktosize(fs, lbn) + size);
rw_enter(uobj->vmobjlock, RW_WRITER);
while (off < endoff) {
pg = uvm_pagelookup(uobj, off);
KASSERT((pg != NULL && pg->owner_tag != NULL &&
pg->owner == curproc->p_pid &&
pg->lowner == curlwp->l_lid));
off += PAGE_SIZE;
}
rw_exit(uobj->vmobjlock);
}
#endif
*bnp = 0;
KASSERTMSG((cred != NOCRED), "missing credential"); KASSERTMSG(((u_int)size <= fs->fs_bsize),
"bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
(unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, size) == 0),
"bad size: dev = 0x%llx, bsize = %d, size = %d, fs = %s",
(unsigned long long)ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
goto nospace;
if (freespace(fs, fs->fs_minfree) <= 0 &&
kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
NULL, NULL) != 0)
goto nospace;
#if defined(QUOTA) || defined(QUOTA2)
mutex_exit(&ump->um_lock);
if ((error = chkdq(ip, btodb(size), cred, 0)) != 0)
return (error);
mutex_enter(&ump->um_lock);
#endif
if (bpref >= fs->fs_size)
bpref = 0;
if (bpref == 0)
cg = ino_to_cg(fs, ip->i_number);
else
		cg = dtog(fs, bpref);
	bno = ffs_hashalloc(ip, cg, bpref, size, 0, flags, ffs_alloccg);
if (bno > 0) {
DIP_ADD(ip, blocks, btodb(size));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
*bnp = bno;
return (0);
}
#if defined(QUOTA) || defined(QUOTA2)
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(size), cred, FORCE);
#endif
if (flags & B_CONTIG) {
/*
* XXX ump->um_lock handling is "suspect" at best.
* For the case where ffs_hashalloc() fails early
* in the B_CONTIG case we reach here with um_lock
* already unlocked, so we can't release it again
* like in the normal error path. See kern/39206.
		 *
* Fail silently - it's up to our caller to report
* errors.
*/
return (ENOSPC);
}
nospace:
mutex_exit(&ump->um_lock);
ffs_fserr(fs, cred, "file system full");
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
return (ENOSPC);
}
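/*
 * Illustrative only: the cylinder-group preference policy described in
 * the comment above ffs_alloc(), reduced to the starting-group choice
 * made before ffs_hashalloc() begins its quadratic rehash: with no
 * usable preference, the inode's own cylinder group is used; otherwise
 * the group containing the preferred block.  ino_to_cg() and dtog()
 * are the standard FFS macros already used above.
 */
#if 0
static u_int
example_ffs_start_cg(struct fs *fs, struct inode *ip, daddr_t bpref)
{

	if (bpref >= fs->fs_size)
		bpref = 0;
	if (bpref == 0)
		return ino_to_cg(fs, ip->i_number);
	return dtog(fs, bpref);
}
#endif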
/*
* Reallocate a fragment to a bigger size
*
* The number and size of the old block is given, and a preference
* and new size is also specified. The allocator attempts to extend
* the original block. Failing that, the regular block allocator is
* invoked to get an appropriate block.
*
* => called with um_lock held
* => return with um_lock released
*/
int
ffs_realloccg(struct inode *ip, daddr_t lbprev, daddr_t bprev, daddr_t bpref,
int osize, int nsize, int flags, kauth_cred_t cred, struct buf **bpp,
daddr_t *blknop)
{
struct ufsmount *ump;
struct fs *fs;
struct buf *bp;
u_int cg, request;
int error;
daddr_t bno;
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
#ifdef UVM_PAGE_TRKOWN
/*
* Sanity-check that allocations within the file size
* do not allow other threads to read the stale contents
* of newly allocated blocks.
* Unlike in ffs_alloc(), here pages must always exist
* for such allocations, because only the last block of a file
* can be a fragment and ffs_write() will reallocate the
* fragment to the new size using ufs_balloc_range(),
* which always creates pages to cover blocks it allocates.
*/
if (ITOV(ip)->v_type == VREG) {
struct vm_page *pg __diagused;
struct uvm_object *uobj = &ITOV(ip)->v_uobj;
voff_t off = trunc_page(ffs_lblktosize(fs, lbprev));
voff_t endoff = round_page(ffs_lblktosize(fs, lbprev) + osize);
rw_enter(uobj->vmobjlock, RW_WRITER);
while (off < endoff) {
pg = uvm_pagelookup(uobj, off);
KASSERT(pg->owner == curproc->p_pid &&
pg->lowner == curlwp->l_lid);
off += PAGE_SIZE;
}
rw_exit(uobj->vmobjlock);
}
#endif
KASSERTMSG((cred != NOCRED), "missing credential"); KASSERTMSG(((u_int)osize <= fs->fs_bsize),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, osize) == 0),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG(((u_int)nsize <= fs->fs_bsize),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
KASSERTMSG((ffs_fragoff(fs, nsize) == 0),
"bad size: dev=0x%llx, bsize=%d, osize=%d, nsize=%d, fs=%s",
(unsigned long long)ip->i_dev, fs->fs_bsize, osize, nsize,
fs->fs_fsmnt);
if (freespace(fs, fs->fs_minfree) <= 0 &&
kauth_authorize_system(cred, KAUTH_SYSTEM_FS_RESERVEDSPACE, 0, NULL,
NULL, NULL) != 0) {
mutex_exit(&ump->um_lock);
goto nospace;
}
if (bprev == 0) {
panic("%s: bad bprev: dev = 0x%llx, bsize = %d, bprev = %"
PRId64 ", fs = %s", __func__,
(unsigned long long)ip->i_dev, fs->fs_bsize, bprev,
fs->fs_fsmnt);
}
mutex_exit(&ump->um_lock);
/*
* Allocate the extra space in the buffer.
*/
if (bpp != NULL &&
(error = bread(ITOV(ip), lbprev, osize, 0, &bp)) != 0) {
return (error);
}
#if defined(QUOTA) || defined(QUOTA2)
if ((error = chkdq(ip, btodb(nsize - osize), cred, 0)) != 0) {
if (bpp != NULL) {
brelse(bp, 0);
}
return (error);
}
#endif
/*
* Check for extension in the existing location.
*/
cg = dtog(fs, bprev);
mutex_enter(&ump->um_lock);
if ((bno = ffs_fragextend(ip, cg, bprev, osize, nsize)) != 0) {
DIP_ADD(ip, blocks, btodb(nsize - osize));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
if (bp->b_blkno != FFS_FSBTODB(fs, bno)) {
panic("%s: bad blockno %#llx != %#llx",
__func__, (unsigned long long) bp->b_blkno,
(unsigned long long)FFS_FSBTODB(fs, bno));
}
allocbuf(bp, nsize, 1);
memset((char *)bp->b_data + osize, 0, nsize - osize);
mutex_enter(bp->b_objlock);
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_oflags |= BO_DONE;
mutex_exit(bp->b_objlock);
*bpp = bp;
}
if (blknop != NULL) {
*blknop = bno;
}
return (0);
}
/*
* Allocate a new disk location.
*/
if (bpref >= fs->fs_size)
bpref = 0;
switch ((int)fs->fs_optim) {
case FS_OPTSPACE:
/*
* Allocate an exact sized fragment. Although this makes
* best use of space, we will waste time relocating it if
* the file continues to grow. If the fragmentation is
* less than half of the minimum free reserve, we choose
* to begin optimizing for time.
*/
request = nsize;
if (fs->fs_minfree < 5 ||
fs->fs_cstotal.cs_nffree >
fs->fs_dsize * fs->fs_minfree / (2 * 100))
break;
if (ffs_log_changeopt) {
log(LOG_NOTICE,
"%s: optimization changed from SPACE to TIME\n",
fs->fs_fsmnt);
}
fs->fs_optim = FS_OPTTIME;
break;
case FS_OPTTIME:
/*
* At this point we have discovered a file that is trying to
* grow a small fragment to a larger fragment. To save time,
* we allocate a full sized block, then free the unused portion.
* If the file continues to grow, the `ffs_fragextend' call
* above will be able to grow it in place without further
* copying. If aberrant programs cause disk fragmentation to
* grow within 2% of the free reserve, we choose to begin
* optimizing for space.
*/
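/*
* Illustrative numbers (hypothetical, not taken from any real fs):
* with fs_minfree = 5% and fs_dsize = 1,000,000 fragments, the
* SPACE->TIME switch in the case above triggers once cs_nffree falls
* to fs_dsize * 5 / 200 = 25,000 free fragments or fewer, while the
* TIME->SPACE switch below triggers once cs_nffree climbs back to
* fs_dsize * (5 - 2) / 100 = 30,000 or more.
*/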
request = fs->fs_bsize;
if (fs->fs_cstotal.cs_nffree <
fs->fs_dsize * (fs->fs_minfree - 2) / 100)
break;
if (ffs_log_changeopt) {
log(LOG_NOTICE,
"%s: optimization changed from TIME to SPACE\n",
fs->fs_fsmnt);
}
fs->fs_optim = FS_OPTSPACE;
break;
default:
panic("%s: bad optim: dev = 0x%llx, optim = %d, fs = %s",
__func__, (unsigned long long)ip->i_dev, fs->fs_optim,
fs->fs_fsmnt);
/* NOTREACHED */
}
bno = ffs_hashalloc(ip, cg, bpref, request, nsize, 0, ffs_alloccg);
if (bno > 0) {
/*
* Use forced deallocation registration; we can't handle
* failure here. This is safe, as this path is hit at most
* once per write operation, when a fragment is extended to
* a longer fragment or to a full block.
*/
if ((ip->i_ump->um_mountp->mnt_wapbl) &&
(ITOV(ip)->v_type != VREG)) {
/* this should never fail */
error = UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(
ip->i_ump->um_mountp, FFS_FSBTODB(fs, bprev),
osize);
if (error)
panic("ffs_realloccg: dealloc registration failed");
} else {
ffs_blkfree(fs, ip->i_devvp, bprev, (long)osize,
ip->i_number);
}
DIP_ADD(ip, blocks, btodb(nsize - osize));
if (flags & IO_EXT)
ip->i_flag |= IN_CHANGE;
else
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (bpp != NULL) {
bp->b_blkno = FFS_FSBTODB(fs, bno);
allocbuf(bp, nsize, 1);
memset((char *)bp->b_data + osize, 0, (u_int)nsize - osize);
mutex_enter(bp->b_objlock);
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_oflags |= BO_DONE;
mutex_exit(bp->b_objlock);
*bpp = bp;
}
if (blknop != NULL) {
*blknop = bno;
}
return (0);
}
mutex_exit(&ump->um_lock);
#if defined(QUOTA) || defined(QUOTA2)
/*
* Restore user's disk quota because allocation failed.
*/
(void) chkdq(ip, -btodb(nsize - osize), cred, FORCE);
#endif
if (bpp != NULL) {
brelse(bp, 0);
}
nospace:
/*
* no space available
*/
ffs_fserr(fs, cred, "file system full");
uprintf("\n%s: write failed, file system is full\n", fs->fs_fsmnt);
return (ENOSPC);
}
/*
* Allocate an inode in the file system.
*
* If allocating a directory, use ffs_dirpref to select the inode.
* If allocating in a directory, the following hierarchy is followed:
* 1) allocate the preferred inode.
* 2) allocate an inode in the same cylinder group.
* 3) quadratically rehash into other cylinder groups, until an
* available inode is located.
* If no inode preference is given the following hierarchy is used
* to allocate an inode:
* 1) allocate an inode in cylinder group 0.
* 2) quadratically rehash into other cylinder groups, until an
* available inode is located.
*
* => um_lock not held upon entry or return
*/
int
ffs_valloc(struct vnode *pvp, int mode, kauth_cred_t cred, ino_t *inop)
{
struct ufsmount *ump;
struct inode *pip;
struct fs *fs;
ino_t ino, ipref;
u_int cg;
int error;
UFS_WAPBL_JUNLOCK_ASSERT(pvp->v_mount);
pip = VTOI(pvp);
fs = pip->i_fs;
ump = pip->i_ump;
error = UFS_WAPBL_BEGIN(pvp->v_mount);
if (error) {
return error;
}
mutex_enter(&ump->um_lock);
if (fs->fs_cstotal.cs_nifree == 0)
goto noinodes;
if ((mode & IFMT) == IFDIR)
ipref = ffs_dirpref(pip);
else
ipref = pip->i_number;
if (ipref >= fs->fs_ncg * fs->fs_ipg)
ipref = 0;
cg = ino_to_cg(fs, ipref);
/*
* Track the number of dirs created one after another
* in the same cg without intervening file creations.
*/
if ((mode & IFMT) == IFDIR) {
if (fs->fs_contigdirs[cg] < 255)
fs->fs_contigdirs[cg]++;
} else {
if (fs->fs_contigdirs[cg] > 0)
fs->fs_contigdirs[cg]--;
}
ino = (ino_t)ffs_hashalloc(pip, cg, ipref, mode, 0, 0, ffs_nodealloccg);
if (ino == 0)
goto noinodes;
UFS_WAPBL_END(pvp->v_mount);
*inop = ino;
return 0;
noinodes:
mutex_exit(&ump->um_lock);
UFS_WAPBL_END(pvp->v_mount);
ffs_fserr(fs, cred, "out of inodes");
uprintf("\n%s: create/symlink failed, no inodes free\n", fs->fs_fsmnt);
return ENOSPC;
}
/*
* Find a cylinder group in which to place a directory.
*
* The policy implemented by this algorithm is to allocate a
* directory inode in the same cylinder group as its parent
* directory, but also to reserve space for its files' inodes
* and data. Restrict the number of directories which may be
* allocated one after another in the same cylinder group
* without intervening allocation of files.
*
* If we allocate a first level directory then force allocation
* in another cylinder group.
*/
static ino_t
ffs_dirpref(struct inode *pip)
{
register struct fs *fs;
u_int cg, prefcg;
uint64_t dirsize, cgsize, curdsz;
u_int avgifree, avgbfree, avgndir;
u_int minifree, minbfree, maxndir;
u_int mincg, minndir;
u_int maxcontigdirs;
KASSERT(mutex_owned(&pip->i_ump->um_lock));
fs = pip->i_fs;
avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
avgndir = fs->fs_cstotal.cs_ndir / fs->fs_ncg;
/*
* Force allocation in another cg if creating a first level dir.
*/
if (ITOV(pip)->v_vflag & VV_ROOT) {
prefcg = cprng_fast32() % fs->fs_ncg;
mincg = prefcg;
minndir = fs->fs_ipg;
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < minndir &&
fs->fs_cs(fs, cg).cs_nifree >= avgifree &&
fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
mincg = cg;
minndir = fs->fs_cs(fs, cg).cs_ndir;
}
return ((ino_t)(fs->fs_ipg * mincg));
}
/*
* Compute various limits which are used for
* optimal allocation of a directory inode.
* Try cylinder groups with >75% avgifree and avgbfree.
* Avoid cylinder groups with no free blocks or inodes as that
* triggers an I/O-expensive cylinder group scan.
*/
maxndir = uimin(avgndir + fs->fs_ipg / 16, fs->fs_ipg);
minifree = avgifree - avgifree / 4;
if (minifree < 1)
minifree = 1;
minbfree = avgbfree - avgbfree / 4;
if (minbfree < 1)
minbfree = 1;
cgsize = (int64_t)fs->fs_fsize * fs->fs_fpg;
dirsize = (int64_t)fs->fs_avgfilesize * fs->fs_avgfpdir;
if (avgndir != 0) {
curdsz = (cgsize - (int64_t)avgbfree * fs->fs_bsize) / avgndir;
if (dirsize < curdsz)
dirsize = curdsz;
}
if (cgsize < dirsize * 255)
maxcontigdirs = (avgbfree * fs->fs_bsize) / dirsize;
else
maxcontigdirs = 255;
if (fs->fs_avgfpdir > 0)
maxcontigdirs = uimin(maxcontigdirs,
fs->fs_ipg / fs->fs_avgfpdir);
if (maxcontigdirs == 0)
maxcontigdirs = 1;
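/*
* Worked example for the computation above (all numbers are
* hypothetical): with fs_avgfilesize = 16 KB and fs_avgfpdir = 64,
* the expected directory footprint is dirsize = 16 KB * 64 = 1 MB;
* with avgbfree = 1024 free blocks of fs_bsize = 16 KB per cg
* (16 MB free) and cgsize < 255 * dirsize, maxcontigdirs works out
* to 16 MB / 1 MB = 16 directories per cg before moving on, further
* capped by fs_ipg / fs_avgfpdir.
*/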
/*
* Limit number of dirs in one cg and reserve space for
* regular files, but only if we have no deficit in
* inodes or space.
*/
prefcg = ino_to_cg(fs, pip->i_number);
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
return ((ino_t)(fs->fs_ipg * cg));
}
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
fs->fs_cs(fs, cg).cs_nifree >= minifree &&
fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
if (fs->fs_contigdirs[cg] < maxcontigdirs)
return ((ino_t)(fs->fs_ipg * cg));
}
/*
* This is a backstop when we are deficient in space.
*/
for (cg = prefcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
return ((ino_t)(fs->fs_ipg * cg));
for (cg = 0; cg < prefcg; cg++)
if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
break;
return ((ino_t)(fs->fs_ipg * cg));
}
/*
* Select the desired position for the next block in a file. The file is
* logically divided into sections. The first section is composed of the
* direct blocks. Each additional section contains fs_maxbpg blocks.
*
* If no blocks have been allocated in the first section, the policy is to
* request a block in the same cylinder group as the inode that describes
* the file. If no blocks have been allocated in any other section, the
* policy is to place the section in a cylinder group with a greater than
* average number of free blocks. An appropriate cylinder group is found
* by using a rotor that sweeps the cylinder groups. When a new group of
* blocks is needed, the sweep begins in the cylinder group following the
* cylinder group from which the previous allocation was made. The sweep
* continues until a cylinder group with greater than the average number
* of free blocks is found. If the allocation is for the first block in an
* indirect block, the information on the previous allocation is unavailable;
* here a best guess is made based upon the logical block number being
* allocated.
*
* If a section is already partially allocated, the policy is to
* contiguously allocate fs_maxcontig blocks. The end of one of these
* contiguous blocks and the beginning of the next is laid out
* contiguously if possible.
*
* => um_lock held on entry and exit
*/
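/*
* Rough illustration (hypothetical numbers): with fs_maxbpg = 2048,
* every time indx below crosses a multiple of 2048 (or the previous
* map entry is zero) the code treats it as the start of a new section
* and hunts for a cylinder group with an above-average number of free
* blocks; otherwise it simply continues from the previous block.
*/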
daddr_t
ffs_blkpref_ufs1(struct inode *ip, daddr_t lbn, int indx, int flags,
int32_t *bap /* XXX ondisk32 */)
{
struct fs *fs;
u_int cg;
u_int avgbfree, startcg;
KASSERT(mutex_owned(&ip->i_ump->um_lock));
fs = ip->i_fs;
/*
* If allocating a contiguous file with B_CONTIG, use the hints
* in the inode extensions to return the desired block.
*
* For metadata (indirect blocks) return the address of where
* the first indirect block resides - we'll scan for the next
* available slot if we need to allocate more than one indirect
* block. For data, return the address of the actual block
* relative to the address of the first data block.
*/
if (flags & B_CONTIG) {
KASSERT(ip->i_ffs_first_data_blk != 0);
KASSERT(ip->i_ffs_first_indir_blk != 0);
if (flags & B_METAONLY)
return ip->i_ffs_first_indir_blk;
else
return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
}
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
cg = ino_to_cg(fs, ip->i_number);
return (cgbase(fs, cg) + fs->fs_frag);
}
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg =
ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs,
ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
for (cg = 0; cg < startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
return (0);
}
/*
* We just always try to lay things out contiguously.
*/
return ufs_rw32(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}
daddr_t
ffs_blkpref_ufs2(struct inode *ip, daddr_t lbn, int indx, int flags,
int64_t *bap)
{
struct fs *fs;
u_int cg;
u_int avgbfree, startcg;
KASSERT(mutex_owned(&ip->i_ump->um_lock));
fs = ip->i_fs;
/*
* If allocating a contiguous file with B_CONTIG, use the hints
* in the inode extensions to return the desired block.
*
* For metadata (indirect blocks) return the address of where
* the first indirect block resides - we'll scan for the next
* available slot if we need to allocate more than one indirect
* block. For data, return the address of the actual block
* relative to the address of the first data block.
*/
if (flags & B_CONTIG) {
KASSERT(ip->i_ffs_first_data_blk != 0);
KASSERT(ip->i_ffs_first_indir_blk != 0);
if (flags & B_METAONLY)
return ip->i_ffs_first_indir_blk;
else
return ip->i_ffs_first_data_blk + ffs_blkstofrags(fs, lbn);
}
if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
if (lbn < UFS_NDADDR + FFS_NINDIR(fs)) {
cg = ino_to_cg(fs, ip->i_number);
return (cgbase(fs, cg) + fs->fs_frag);
}
/*
* Find a cylinder with greater than average number of
* unused data blocks.
*/
if (indx == 0 || bap[indx - 1] == 0)
startcg =
ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
else
startcg = dtog(fs,
ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + 1);
startcg %= fs->fs_ncg;
avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
for (cg = startcg; cg < fs->fs_ncg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
for (cg = 0; cg < startcg; cg++)
if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
return (cgbase(fs, cg) + fs->fs_frag);
}
return (0);
}
/*
* We just always try to lay things out contiguously.
*/
return ufs_rw64(bap[indx - 1], UFS_FSNEEDSWAP(fs)) + fs->fs_frag;
}
/*
* Implement the cylinder overflow algorithm.
*
* The policy implemented by this algorithm is:
* 1) allocate the block in its requested cylinder group.
* 2) quadratically rehash on the cylinder group number.
* 3) brute force search for a free block.
*
* => called with um_lock held
* => returns with um_lock released on success, held on failure
* (*allocator releases lock on success, retains lock on failure)
*/
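/*
* Probe-order sketch: starting from cylinder group icg, step 2 below
* tries icg+1, icg+3, icg+7, icg+15, ... (mod fs_ncg, with i doubling
* each time), and step 3 then walks icg+2, icg+3, ... sequentially;
* offsets 0 and 1 from icg are already covered by steps 1 and 2,
* hence the brute-force search starts at icg+2.
*/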
/*VARARGS5*/
static daddr_t
ffs_hashalloc(struct inode *ip, u_int cg, daddr_t pref,
int size /* size for data blocks, mode for inodes */,
int realsize,
int flags,
daddr_t (*allocator)(struct inode *, u_int, daddr_t, int, int, int))
{
struct fs *fs;
daddr_t result;
u_int i, icg = cg;
fs = ip->i_fs;
/*
* 1: preferred cylinder group
*/
result = (*allocator)(ip, cg, pref, size, realsize, flags);
if (result)
return (result);
if (flags & B_CONTIG)
return (result);
/*
* 2: quadratic rehash
*/
for (i = 1; i < fs->fs_ncg; i *= 2) {
cg += i;
if (cg >= fs->fs_ncg)
cg -= fs->fs_ncg;
result = (*allocator)(ip, cg, 0, size, realsize, flags);
if (result)
return (result);
}
/*
* 3: brute force search
* Note that we start at i == 2, since 0 was checked initially,
* and 1 is always checked in the quadratic rehash.
*/
cg = (icg + 2) % fs->fs_ncg;
for (i = 2; i < fs->fs_ncg; i++) {
result = (*allocator)(ip, cg, 0, size, realsize, flags);
if (result)
return (result);
cg++;
if (cg == fs->fs_ncg)
cg = 0;
}
return (0);
}
/*
* Determine whether a fragment can be extended.
*
* Check to see if the necessary fragments are available, and
* if they are, allocate them.
*
* => called with um_lock held
* => returns with um_lock released on success, held on failure
*/
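/*
* Example (hypothetical sizes): extending a 2-fragment piece to
* 5 fragments only succeeds if the 3 additional fragments directly
* following bprev are free and still lie within the same file system
* block; otherwise 0 is returned and the caller falls back to a
* fresh allocation.
*/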
static daddr_t
ffs_fragextend(struct inode *ip, u_int cg, daddr_t bprev, int osize, int nsize)
{
struct ufsmount *ump;
struct fs *fs;
struct cg *cgp;
struct buf *bp;
daddr_t bno;
int frags, bbase;
int i, error;
u_int8_t *blksfree;
fs = ip->i_fs;
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
if (fs->fs_cs(fs, cg).cs_nffree < ffs_numfrags(fs, nsize - osize))
return (0);
frags = ffs_numfrags(fs, nsize);
bbase = ffs_fragnum(fs, bprev);
if (bbase > ffs_fragnum(fs, (bprev + frags - 1))) {
/* cannot extend across a block boundary */
return (0);
}
mutex_exit(&ump->um_lock);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs)))
goto fail;
cgp->cg_old_time = ufs_rw32(time_second, UFS_FSNEEDSWAP(fs));
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, UFS_FSNEEDSWAP(fs));
bno = dtogd(fs, bprev);
blksfree = cg_blksfree(cgp, UFS_FSNEEDSWAP(fs));
for (i = ffs_numfrags(fs, osize); i < frags; i++)
if (isclr(blksfree, bno + i))
goto fail;
/*
* the current fragment can be extended
* deduct the count on fragment being extended into
* increase the count on the remaining fragment (if any)
* allocate the extended piece
*/
for (i = frags; i < fs->fs_frag - bbase; i++)
if (isclr(blksfree, bno + i))
break;
ufs_add32(cgp->cg_frsum[i - ffs_numfrags(fs, osize)], -1, UFS_FSNEEDSWAP(fs));
if (i != frags)
ufs_add32(cgp->cg_frsum[i - frags], 1, UFS_FSNEEDSWAP(fs));
mutex_enter(&ump->um_lock);
for (i = ffs_numfrags(fs, osize); i < frags; i++) {
clrbit(blksfree, bno + i);
ufs_add32(cgp->cg_cs.cs_nffree, -1, UFS_FSNEEDSWAP(fs));
fs->fs_cstotal.cs_nffree--;
fs->fs_cs(fs, cg).cs_nffree--;
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return (bprev);
fail:
if (bp != NULL)
brelse(bp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Determine whether a block can be allocated.
*
* Check to see if a block of the appropriate size is available,
* and if it is, allocate it.
*/
static daddr_t
ffs_alloccg(struct inode *ip, u_int cg, daddr_t bpref, int size, int realsize,
int flags)
{
struct ufsmount *ump;
struct fs *fs = ip->i_fs;
struct cg *cgp;
struct buf *bp;
int32_t bno;
daddr_t blkno;
int error, frags, allocsiz, i;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
ump = ip->i_ump;
KASSERT(mutex_owned(&ump->um_lock));
if (fs->fs_cs(fs, cg).cs_nbfree == 0 && size == fs->fs_bsize)
return (0);
mutex_exit(&ump->um_lock);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap) || (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize))
goto fail;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, needswap);
if (size == fs->fs_bsize) {
mutex_enter(&ump->um_lock);
blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
/*
* If actually needed size is lower, free the extra blocks now.
* This is safe to call here, there is no outside reference
* to this block yet. It is not necessary to keep um_lock
* locked.
*/
if (realsize != 0 && realsize < size) {
ffs_blkfree_common(ip->i_ump, ip->i_fs,
ip->i_devvp->v_rdev,
bp, blkno + ffs_numfrags(fs, realsize),
(long)(size - realsize), false);
}
bdwrite(bp);
return (blkno);
}
/*
* check to see if any fragments are already available
* allocsiz is the size which will be allocated, hacking
* it down to a smaller size if necessary
*/
blksfree = cg_blksfree(cgp, needswap);
frags = ffs_numfrags(fs, size);
for (allocsiz = frags; allocsiz < fs->fs_frag; allocsiz++)
if (cgp->cg_frsum[allocsiz] != 0)
break;
if (allocsiz == fs->fs_frag) {
/*
* no fragments were available, so a block will be
* allocated, and hacked up
*/
if (cgp->cg_cs.cs_nbfree == 0)
goto fail;
mutex_enter(&ump->um_lock);
blkno = ffs_alloccgblk(ip, bp, bpref, realsize, flags);
bno = dtogd(fs, blkno);
for (i = frags; i < fs->fs_frag; i++)
setbit(blksfree, bno + i);
i = fs->fs_frag - frags;
ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
fs->fs_fmod = 1;
ufs_add32(cgp->cg_frsum[i], 1, needswap);
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return (blkno);
}
bno = ffs_mapsearch(fs, cgp, bpref, allocsiz);
#if 0
/*
* XXX fvdl mapsearch will panic, and never return -1
* also: returning NULL as daddr_t ?
*/
if (bno < 0)
goto fail;
#endif
for (i = 0; i < frags; i++)
clrbit(blksfree, bno + i);
mutex_enter(&ump->um_lock);
ufs_add32(cgp->cg_cs.cs_nffree, -frags, needswap);
fs->fs_cstotal.cs_nffree -= frags;
fs->fs_cs(fs, cg).cs_nffree -= frags;
fs->fs_fmod = 1;
ufs_add32(cgp->cg_frsum[allocsiz], -1, needswap);
if (frags != allocsiz)
ufs_add32(cgp->cg_frsum[allocsiz - frags], 1, needswap);
blkno = cgbase(fs, cg) + bno;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return blkno;
fail:
if (bp != NULL)
brelse(bp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Allocate a block in a cylinder group.
*
* This algorithm implements the following policy:
* 1) allocate the requested block.
* 2) allocate a rotationally optimal block in the same cylinder.
* 3) allocate the next available block on the block rotor for the
* specified cylinder group.
* Note that this routine only allocates fs_bsize blocks; these
* blocks may be fragmented by the routine that allocates them.
*/
static daddr_t
ffs_alloccgblk(struct inode *ip, struct buf *bp, daddr_t bpref, int realsize,
int flags)
{
struct fs *fs = ip->i_fs;
struct cg *cgp;
int cg;
daddr_t blkno;
int32_t bno;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(mutex_owned(&ip->i_ump->um_lock));
cgp = (struct cg *)bp->b_data;
blksfree = cg_blksfree(cgp, needswap);
if (bpref == 0 || dtog(fs, bpref) != ufs_rw32(cgp->cg_cgx, needswap)) {
bpref = ufs_rw32(cgp->cg_rotor, needswap);
} else {
bpref = ffs_blknum(fs, bpref);
bno = dtogd(fs, bpref);
/*
* if the requested block is available, use it
*/
if (ffs_isblock(fs, blksfree, ffs_fragstoblks(fs, bno)))
goto gotit;
/*
* if the requested data block isn't available and we are
* trying to allocate a contiguous file, return an error.
*/
if ((flags & (B_CONTIG | B_METAONLY)) == B_CONTIG)
return (0);
}
/*
* Take the next available block in this cylinder group.
*/
bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
#if 0
/*
* XXX jdolecek ffs_mapsearch() succeeds or panics
*/
if (bno < 0)
return (0);
#endif
cgp->cg_rotor = ufs_rw32(bno, needswap);
gotit:
blkno = ffs_fragstoblks(fs, bno);
ffs_clrblock(fs, blksfree, blkno);
ffs_clusteracct(fs, cgp, blkno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, ufs_rw32(cgp->cg_cgx, needswap)).cs_nbfree--;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
int cylno;
cylno = old_cbtocylno(fs, bno);
KASSERT(cylno >= 0);
KASSERT(cylno < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, bno) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
old_cbtorpos(fs, bno) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, cylno, needswap)[old_cbtorpos(fs, bno)], -1,
needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[cylno], -1, needswap);
}
fs->fs_fmod = 1;
cg = ufs_rw32(cgp->cg_cgx, needswap);
blkno = cgbase(fs, cg) + bno;
return (blkno);
}
/*
* Determine whether an inode can be allocated.
*
* Check to see if an inode is available, and if it is,
* allocate it using the following policy:
* 1) allocate the requested inode.
* 2) allocate the next available inode after the requested
* inode in the specified cylinder group.
*/
static daddr_t
ffs_nodealloccg(struct inode *ip, u_int cg, daddr_t ipref, int mode, int realsize,
int flags)
{
struct ufsmount *ump = ip->i_ump;
struct fs *fs = ip->i_fs;
struct cg *cgp;
struct buf *bp, *ibp;
u_int8_t *inosused;
int error, start, len, loc, map, i;
int32_t initediblk, maxiblk, irotor;
daddr_t nalloc;
struct ufs2_dinode *dp2;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(mutex_owned(&ump->um_lock));
UFS_WAPBL_JLOCK_ASSERT(ip->i_ump->um_mountp);
if (fs->fs_cs(fs, cg).cs_nifree == 0)
return (0);
mutex_exit(&ump->um_lock);
ibp = NULL;
if (fs->fs_magic == FS_UFS2_MAGIC) {
initediblk = -1;
} else {
initediblk = fs->fs_ipg;
}
maxiblk = initediblk;
retry:
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error)
goto fail;
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap) || cgp->cg_cs.cs_nifree == 0)
goto fail;
if (ibp != NULL && initediblk != ufs_rw32(cgp->cg_initediblk, needswap)) {
/* Another thread allocated more inodes so we retry the test. */
brelse(ibp, 0);
ibp = NULL;
}
/*
* Check to see if we need to initialize more inodes.
*/
if (fs->fs_magic == FS_UFS2_MAGIC && ibp == NULL) {
initediblk = ufs_rw32(cgp->cg_initediblk, needswap);
maxiblk = initediblk;
nalloc = fs->fs_ipg - ufs_rw32(cgp->cg_cs.cs_nifree, needswap);
if (nalloc + FFS_INOPB(fs) > initediblk && initediblk < ufs_rw32(cgp->cg_niblk, needswap)) {
/*
* We have to release the cg buffer here to prevent
* a deadlock when reading the inode block will
* run a copy-on-write that might use this cg.
*/
brelse(bp, 0);
bp = NULL;
error = ffs_getblk(ip->i_devvp, FFS_FSBTODB(fs,
ino_to_fsba(fs, cg * fs->fs_ipg + initediblk)),
FFS_NOBLK, fs->fs_bsize, false, &ibp);
if (error)
goto fail;
maxiblk += FFS_INOPB(fs);
goto retry;
}
}
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, needswap);
inosused = cg_inosused(cgp, needswap);
if (ipref) {
ipref %= fs->fs_ipg;
/* safeguard to stay in (to be) allocated range */
if (ipref < maxiblk && isclr(inosused, ipref))
goto gotit;
}
irotor = ufs_rw32(cgp->cg_irotor, needswap);
KASSERTMSG(irotor < initediblk, "%s: allocation botch: cg=%d, irotor %d"
" out of bounds, initediblk=%d",
__func__, cg, irotor, initediblk);
start = irotor / NBBY;
len = howmany(maxiblk - irotor, NBBY);
loc = skpc(0xff, len, &inosused[start]);
if (loc == 0) {
len = start + 1;
start = 0;
loc = skpc(0xff, len, &inosused[0]);
if (loc == 0) {
panic("%s: map corrupted: cg=%d, irotor=%d, fs=%s",
__func__, cg, ufs_rw32(cgp->cg_irotor, needswap),
fs->fs_fsmnt);
/* NOTREACHED */
}
}
i = start + len - loc;
map = inosused[i] ^ 0xff;
if (map == 0) {
panic("%s: block not in map: fs=%s", __func__, fs->fs_fsmnt);
}
ipref = i * NBBY + ffs(map) - 1;
cgp->cg_irotor = ufs_rw32(ipref, needswap);
gotit:
KASSERTMSG(ipref < maxiblk, "%s: allocation botch: cg=%d attempt to "
"allocate inode index %d beyond max allocated index %d"
" of %d inodes/cg",
__func__, cg, (int)ipref, maxiblk, cgp->cg_niblk);
UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp, cg * fs->fs_ipg + ipref,
mode);
/*
* Check to see if we need to initialize more inodes.
*/
if (ibp != NULL) {
KASSERT(initediblk == ufs_rw32(cgp->cg_initediblk, needswap));
memset(ibp->b_data, 0, fs->fs_bsize);
dp2 = (struct ufs2_dinode *)(ibp->b_data);
for (i = 0; i < FFS_INOPB(fs); i++) {
/*
* Don't bother to swap, it's supposed to be
* random, after all.
*/
dp2->di_gen = (cprng_fast32() & INT32_MAX) / 2 + 1;
dp2++;
}
initediblk += FFS_INOPB(fs);
cgp->cg_initediblk = ufs_rw32(initediblk, needswap);
}
mutex_enter(&ump->um_lock);
ACTIVECG_CLR(fs, cg);
setbit(inosused, ipref);
ufs_add32(cgp->cg_cs.cs_nifree, -1, needswap);
fs->fs_cstotal.cs_nifree--;
fs->fs_cs(fs, cg).cs_nifree--;
fs->fs_fmod = 1;
if ((mode & IFMT) == IFDIR) {
ufs_add32(cgp->cg_cs.cs_ndir, 1, needswap);
fs->fs_cstotal.cs_ndir++;
fs->fs_cs(fs, cg).cs_ndir++;
}
mutex_exit(&ump->um_lock);
if (ibp != NULL) {
bwrite(ibp);
bwrite(bp);
} else
bdwrite(bp);
return ((ino_t)(cg * fs->fs_ipg + ipref));
fail:
if (bp != NULL)
brelse(bp, 0);
if (ibp != NULL)
brelse(ibp, 0);
mutex_enter(&ump->um_lock);
return (0);
}
/*
* Allocate a block or fragment.
*
* The specified block or fragment is removed from the
* free map, possibly fragmenting a block in the process.
*
* This implementation should mirror fs_blkfree
*
* => um_lock not held on entry or exit
*/
int
ffs_blkalloc(struct inode *ip, daddr_t bno, long size)
{
int error;
error = ffs_check_bad_allocation(__func__, ip->i_fs, bno, size,
ip->i_dev, ip->i_uid);
if (error)
return error;
return ffs_blkalloc_ump(ip->i_ump, bno, size);
}
int
ffs_blkalloc_ump(struct ufsmount *ump, daddr_t bno, long size)
{
struct fs *fs = ump->um_fs;
struct cg *cgp;
struct buf *bp;
int32_t fragno, cgbno;
int i, error, blk, frags, bbase;
u_int cg;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT((u_int)size <= fs->fs_bsize && ffs_fragoff(fs, size) == 0 &&
ffs_fragnum(fs, bno) + ffs_numfrags(fs, size) <= fs->fs_frag);
KASSERT(bno < fs->fs_size);
cg = dtog(fs, bno);
error = bread(ump->um_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, B_MODIFY, &bp);
if (error) {
return error;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return EIO;
}
cgp->cg_old_time = ufs_rw32(time_second, needswap);
cgp->cg_time = ufs_rw64(time_second, needswap);
cgbno = dtogd(fs, bno);
blksfree = cg_blksfree(cgp, needswap);
mutex_enter(&ump->um_lock);
if (size == fs->fs_bsize) {
fragno = ffs_fragstoblks(fs, cgbno);
if (!ffs_isblock(fs, blksfree, fragno)) {
mutex_exit(&ump->um_lock);
brelse(bp, 0);
return EBUSY;
}
ffs_clrblock(fs, blksfree, fragno);
ffs_clusteracct(fs, cgp, fragno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cg).cs_nbfree--;
} else {
bbase = cgbno - ffs_fragnum(fs, cgbno);
frags = ffs_numfrags(fs, size);
for (i = 0; i < frags; i++) {
if (isclr(blksfree, cgbno + i)) {
mutex_exit(&ump->um_lock);
brelse(bp, 0);
return EBUSY;
}
}
/*
* if a complete block is being split, account for it
*/
fragno = ffs_fragstoblks(fs, bbase);
if (ffs_isblock(fs, blksfree, fragno)) {
ufs_add32(cgp->cg_cs.cs_nffree, fs->fs_frag, needswap);
fs->fs_cstotal.cs_nffree += fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree += fs->fs_frag;
ffs_clusteracct(fs, cgp, fragno, -1);
ufs_add32(cgp->cg_cs.cs_nbfree, -1, needswap);
fs->fs_cstotal.cs_nbfree--;
fs->fs_cs(fs, cg).cs_nbfree--;
}
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
/*
* allocate the fragment
*/
for (i = 0; i < frags; i++) {
clrbit(blksfree, cgbno + i);
}
ufs_add32(cgp->cg_cs.cs_nffree, -i, needswap);
fs->fs_cstotal.cs_nffree -= i;
fs->fs_cs(fs, cg).cs_nffree -= i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
bdwrite(bp);
return 0;
}
/*
* Free a block or fragment.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*
* => um_lock not held on entry or exit
*/
static void
ffs_blkfree_cg(struct fs *fs, struct vnode *devvp, daddr_t bno, long size)
{
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
daddr_t cgblkno;
int error;
u_int cg;
dev_t dev;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(!devvp_is_snapshot);
cg = dtog(fs, bno);
dev = devvp->v_rdev;
ump = VFSTOUFS(spec_node_getmountedfs(devvp));
KASSERT(fs == ump->um_fs);
cgblkno = FFS_FSBTODB(fs, cgtod(fs, cg));
error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return;
}
ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
bdwrite(bp);
}
struct discardopdata {
struct work wk; /* must be first */
struct vnode *devvp;
daddr_t bno;
long size;
};
struct discarddata {
struct fs *fs;
struct discardopdata *entry;
long maxsize;
kmutex_t entrylk;
struct workqueue *wq;
int wqcnt, wqdraining;
kmutex_t wqlk;
kcondvar_t wqcv;
/* timer for flush? */
};
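/*
* Note on the single pending entry: ffs_blkfree() below parks the
* most recent free in ts->entry and, because ffs deallocates a file
* backwards, tries to prepend each new free to it; once the pending
* range reaches maxsize (or cannot be merged) it is pushed onto the
* workqueue, where ffs_discardcb() issues the actual FDISCARD and
* only then returns the blocks to the free map via ffs_blkfree_td().
*/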
static void
ffs_blkfree_td(struct fs *fs, struct discardopdata *td)
{
struct mount *mp = spec_node_getmountedfs(td->devvp);
long todo;
int error;
while (td->size) {
todo = uimin(td->size,
ffs_lfragtosize(fs, (fs->fs_frag - ffs_fragnum(fs, td->bno))));
error = UFS_WAPBL_BEGIN(mp);
if (error) {
printf("ffs: failed to begin wapbl transaction"
" for discard: %d\n", error);
break;
}
ffs_blkfree_cg(fs, td->devvp, td->bno, todo);
UFS_WAPBL_END(mp);
td->bno += ffs_numfrags(fs, todo);
td->size -= todo;
}
}
static void
ffs_discardcb(struct work *wk, void *arg)
{
struct discardopdata *td = (void *)wk;
struct discarddata *ts = arg;
struct fs *fs = ts->fs;
off_t start, len;
#ifdef TRIMDEBUG
int error;
#endif
/* like FSBTODB but emits bytes; XXX move to fs.h */
#ifndef FFS_FSBTOBYTES
#define FFS_FSBTOBYTES(fs, b) ((b) << (fs)->fs_fshift)
#endif
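/*
* Illustrative numbers: with fs_fshift = 11 (2 KB fragments), frag
* number 100 maps to byte offset 100 << 11 = 204800.
*/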
start = FFS_FSBTOBYTES(fs, td->bno);
len = td->size;
vn_lock(td->devvp, LK_EXCLUSIVE | LK_RETRY);
#ifdef TRIMDEBUG
error =
#endif
VOP_FDISCARD(td->devvp, start, len);
VOP_UNLOCK(td->devvp);
#ifdef TRIMDEBUG
printf("trim(%" PRId64 ",%ld):%d\n", td->bno, td->size, error);
#endif
ffs_blkfree_td(fs, td);
kmem_free(td, sizeof(*td));
mutex_enter(&ts->wqlk);
ts->wqcnt--;
if (ts->wqdraining && !ts->wqcnt)
cv_signal(&ts->wqcv);
mutex_exit(&ts->wqlk);
}
void *
ffs_discard_init(struct vnode *devvp, struct fs *fs)
{
struct discarddata *ts;
int error;
ts = kmem_zalloc(sizeof (*ts), KM_SLEEP);
error = workqueue_create(&ts->wq, "trimwq", ffs_discardcb, ts,
PRI_USER, IPL_NONE, 0);
if (error) {
kmem_free(ts, sizeof (*ts));
return NULL;
}
mutex_init(&ts->entrylk, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&ts->wqlk, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ts->wqcv, "trimwqcv");
ts->maxsize = 100*1024; /* XXX */
ts->fs = fs;
return ts;
}
void
ffs_discard_finish(void *vts, int flags)
{
struct discarddata *ts = vts;
struct discardopdata *td = NULL;
/* wait for workqueue to drain */
mutex_enter(&ts->wqlk);
if (ts->wqcnt) {
ts->wqdraining = 1;
cv_wait(&ts->wqcv, &ts->wqlk);
}
mutex_exit(&ts->wqlk);
mutex_enter(&ts->entrylk);
if (ts->entry) {
td = ts->entry;
ts->entry = NULL;
}
mutex_exit(&ts->entrylk);
if (td) {
/* XXX don't tell the disk, it's optional */
ffs_blkfree_td(ts->fs, td);
#ifdef TRIMDEBUG
printf("finish(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
kmem_free(td, sizeof(*td));
}
cv_destroy(&ts->wqcv);
mutex_destroy(&ts->entrylk);
mutex_destroy(&ts->wqlk);
workqueue_destroy(ts->wq);
kmem_free(ts, sizeof(*ts));
}
void
ffs_blkfree(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
ino_t inum)
{
struct ufsmount *ump;
int error;
dev_t dev;
struct discarddata *ts;
struct discardopdata *td;
dev = devvp->v_rdev;
ump = VFSTOUFS(spec_node_getmountedfs(devvp));
if (ffs_snapblkfree(fs, devvp, bno, size, inum))
return;
error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
if (error)
return;
if (!ump->um_discarddata) {
ffs_blkfree_cg(fs, devvp, bno, size);
return;
}
#ifdef TRIMDEBUG
printf("blkfree(%" PRId64 ",%ld)\n", bno, size);
#endif
ts = ump->um_discarddata;
td = NULL;
mutex_enter(&ts->entrylk);
if (ts->entry) {
td = ts->entry;
/* ffs deallocs backwards, check for prepend only */
if (td->bno == bno + ffs_numfrags(fs, size) && td->size + size <= ts->maxsize) {
td->bno = bno;
td->size += size;
if (td->size < ts->maxsize) {
#ifdef TRIMDEBUG
printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_exit(&ts->entrylk);
return;
}
size = 0; /* mark done */
}
ts->entry = NULL;
}
mutex_exit(&ts->entrylk);
if (td) {
#ifdef TRIMDEBUG
printf("enq old(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_enter(&ts->wqlk);
ts->wqcnt++;
mutex_exit(&ts->wqlk);
workqueue_enqueue(ts->wq, &td->wk, NULL);
}
if (!size)
return;
td = kmem_alloc(sizeof(*td), KM_SLEEP);
td->devvp = devvp;
td->bno = bno;
td->size = size;
if (td->size < ts->maxsize) { /* XXX always the case */
mutex_enter(&ts->entrylk);
if (!ts->entry) { /* possible race? */
#ifdef TRIMDEBUG
printf("defer(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
ts->entry = td;
td = NULL;
}
mutex_exit(&ts->entrylk);
}
if (td) {
#ifdef TRIMDEBUG
printf("enq new(%" PRId64 ",%ld)\n", td->bno, td->size);
#endif
mutex_enter(&ts->wqlk);
ts->wqcnt++;
mutex_exit(&ts->wqlk);
workqueue_enqueue(ts->wq, &td->wk, NULL);
}
}
/*
* Free a block or fragment from a snapshot cg copy.
*
* The specified block or fragment is placed back in the
* free map. If a fragment is deallocated, a possible
* block reassembly is checked.
*
* => um_lock not held on entry or exit
*/
void
ffs_blkfree_snap(struct fs *fs, struct vnode *devvp, daddr_t bno, long size,
ino_t inum)
{
struct cg *cgp;
struct buf *bp;
struct ufsmount *ump;
daddr_t cgblkno;
int error, cg;
dev_t dev;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(devvp_is_snapshot);
cg = dtog(fs, bno);
dev = VTOI(devvp)->i_devvp->v_rdev;
ump = VFSTOUFS(devvp->v_mount);
cgblkno = ffs_fragstoblks(fs, cgtod(fs, cg));
error = ffs_check_bad_allocation(__func__, fs, bno, size, dev, inum);
if (error)
return;
error = bread(devvp, cgblkno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return;
}
ffs_blkfree_common(ump, fs, dev, bp, bno, size, devvp_is_snapshot);
bdwrite(bp);
}
static void
ffs_blkfree_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
struct buf *bp, daddr_t bno, long size, bool devvp_is_snapshot)
{
struct cg *cgp;
int32_t fragno, cgbno;
int i, blk, frags, bbase;
u_int cg;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
cg = dtog(fs, bno);
cgp = (struct cg *)bp->b_data;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, needswap);
cgbno = dtogd(fs, bno);
blksfree = cg_blksfree(cgp, needswap);
mutex_enter(&ump->um_lock);
if (size == fs->fs_bsize) {
fragno = ffs_fragstoblks(fs, cgbno);
if (!ffs_isfreeblock(fs, blksfree, fragno)) {
if (devvp_is_snapshot) {
mutex_exit(&ump->um_lock);
return;
}
panic("%s: freeing free block: dev = 0x%llx, block = %"
PRId64 ", fs = %s", __func__,
(unsigned long long)dev, bno, fs->fs_fsmnt);
}
ffs_setblock(fs, blksfree, fragno);
ffs_clusteracct(fs, cgp, fragno, 1);
ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
i = old_cbtocylno(fs, cgbno);
KASSERT(i >= 0);
KASSERT(i < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, cgbno) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
old_cbtorpos(fs, cgbno) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs, cgbno)], 1,
needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
}
} else {
bbase = cgbno - ffs_fragnum(fs, cgbno);
/*
* decrement the counts associated with the old frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, -1, needswap);
/*
* deallocate the fragment
*/
frags = ffs_numfrags(fs, size);
for (i = 0; i < frags; i++) {
if (isset(blksfree, cgbno + i)) {
panic("%s: freeing free frag: "
"dev = 0x%llx, block = %" PRId64
", fs = %s", __func__,
(unsigned long long)dev, bno + i,
fs->fs_fsmnt);
}
setbit(blksfree, cgbno + i);
}
ufs_add32(cgp->cg_cs.cs_nffree, i, needswap);
fs->fs_cstotal.cs_nffree += i;
fs->fs_cs(fs, cg).cs_nffree += i;
/*
* add back in counts associated with the new frags
*/
blk = blkmap(fs, blksfree, bbase);
ffs_fragacct(fs, blk, cgp->cg_frsum, 1, needswap);
/*
* if a complete block has been reassembled, account for it
*/
fragno = ffs_fragstoblks(fs, bbase);
if (ffs_isblock(fs, blksfree, fragno)) {
ufs_add32(cgp->cg_cs.cs_nffree, -fs->fs_frag, needswap);
fs->fs_cstotal.cs_nffree -= fs->fs_frag;
fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
ffs_clusteracct(fs, cgp, fragno, 1);
ufs_add32(cgp->cg_cs.cs_nbfree, 1, needswap);
fs->fs_cstotal.cs_nbfree++;
fs->fs_cs(fs, cg).cs_nbfree++;
if ((fs->fs_magic == FS_UFS1_MAGIC) &&
((fs->fs_old_flags & FS_FLAGS_UPDATED) == 0)) {
i = old_cbtocylno(fs, bbase);
KASSERT(i >= 0);
KASSERT(i < fs->fs_old_ncyl);
KASSERT(old_cbtorpos(fs, bbase) >= 0);
KASSERT(fs->fs_old_nrpos == 0 ||
old_cbtorpos(fs, bbase) < fs->fs_old_nrpos);
ufs_add16(old_cg_blks(fs, cgp, i, needswap)[old_cbtorpos(fs,
bbase)], 1, needswap);
ufs_add32(old_cg_blktot(cgp, needswap)[i], 1, needswap);
}
}
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
}
/*
* Free an inode.
*/
int
ffs_vfree(struct vnode *vp, ino_t ino, int mode)
{
return ffs_freefile(vp->v_mount, ino, mode);
}
/*
* Do the actual free operation.
* The specified inode is placed back in the free map.
*
* => um_lock not held on entry or exit
*/
int
ffs_freefile(struct mount *mp, ino_t ino, int mode)
{
struct ufsmount *ump = VFSTOUFS(mp);
struct fs *fs = ump->um_fs;
struct vnode *devvp;
struct cg *cgp;
struct buf *bp;
int error;
u_int cg;
daddr_t cgbno;
dev_t dev;
const int needswap = UFS_FSNEEDSWAP(fs);
cg = ino_to_cg(fs, ino);
devvp = ump->um_devvp;
dev = devvp->v_rdev;
cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
(long long)dev, (unsigned long long)ino, fs->fs_fsmnt);
error = bread(devvp, cgbno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return (0);
}
ffs_freefile_common(ump, fs, dev, bp, ino, mode, false);
bdwrite(bp);
return 0;
}
int
ffs_freefile_snap(struct fs *fs, struct vnode *devvp, ino_t ino, int mode)
{
struct ufsmount *ump;
struct cg *cgp;
struct buf *bp;
int error, cg;
daddr_t cgbno;
dev_t dev;
const int needswap = UFS_FSNEEDSWAP(fs);
KASSERT(devvp->v_type != VBLK);
cg = ino_to_cg(fs, ino);
dev = VTOI(devvp)->i_devvp->v_rdev;
ump = VFSTOUFS(devvp->v_mount);
cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
panic("%s: range: dev = 0x%llx, ino = %llu, fs = %s", __func__,
(unsigned long long)dev, (unsigned long long)ino,
fs->fs_fsmnt);
error = bread(devvp, cgbno, (int)fs->fs_cgsize,
B_MODIFY, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, needswap)) {
brelse(bp, 0);
return (0);
}
ffs_freefile_common(ump, fs, dev, bp, ino, mode, true);
bdwrite(bp);
return 0;
}
static void
ffs_freefile_common(struct ufsmount *ump, struct fs *fs, dev_t dev,
struct buf *bp, ino_t ino, int mode, bool devvp_is_snapshot)
{
u_int cg;
struct cg *cgp;
u_int8_t *inosused;
const int needswap = UFS_FSNEEDSWAP(fs);
ino_t cgino;
cg = ino_to_cg(fs, ino);
cgp = (struct cg *)bp->b_data;
cgp->cg_old_time = ufs_rw32(time_second, needswap);
if ((fs->fs_magic != FS_UFS1_MAGIC) ||
(fs->fs_old_flags & FS_FLAGS_UPDATED))
cgp->cg_time = ufs_rw64(time_second, needswap);
inosused = cg_inosused(cgp, needswap);
cgino = ino % fs->fs_ipg;
if (isclr(inosused, cgino)) {
printf("ifree: dev = 0x%llx, ino = %llu, fs = %s\n",
(unsigned long long)dev, (unsigned long long)ino,
fs->fs_fsmnt);
if (fs->fs_ronly == 0)
panic("%s: freeing free inode", __func__);
}
clrbit(inosused, cgino);
if (!devvp_is_snapshot)
UFS_WAPBL_UNREGISTER_INODE(ump->um_mountp, ino, mode);
if (cgino < ufs_rw32(cgp->cg_irotor, needswap))
cgp->cg_irotor = ufs_rw32(cgino, needswap);
ufs_add32(cgp->cg_cs.cs_nifree, 1, needswap);
mutex_enter(&ump->um_lock);
fs->fs_cstotal.cs_nifree++;
fs->fs_cs(fs, cg).cs_nifree++;
if ((mode & IFMT) == IFDIR) {
ufs_add32(cgp->cg_cs.cs_ndir, -1, needswap);
fs->fs_cstotal.cs_ndir--;
fs->fs_cs(fs, cg).cs_ndir--;
}
fs->fs_fmod = 1;
ACTIVECG_CLR(fs, cg);
mutex_exit(&ump->um_lock);
}
/*
* Check to see if a file is free.
*/
int
ffs_checkfreefile(struct fs *fs, struct vnode *devvp, ino_t ino)
{
struct cg *cgp;
struct buf *bp;
daddr_t cgbno;
int ret;
u_int cg;
u_int8_t *inosused;
const bool devvp_is_snapshot = (devvp->v_type != VBLK);
KASSERT(devvp_is_snapshot);
cg = ino_to_cg(fs, ino);
if (devvp_is_snapshot)
cgbno = ffs_fragstoblks(fs, cgtod(fs, cg));
else
cgbno = FFS_FSBTODB(fs, cgtod(fs, cg));
if (ino >= fs->fs_ipg * fs->fs_ncg)
return 1;
if (bread(devvp, cgbno, (int)fs->fs_cgsize, 0, &bp)) {
return 1;
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, UFS_FSNEEDSWAP(fs))) {
brelse(bp, 0);
return 1;
}
inosused = cg_inosused(cgp, UFS_FSNEEDSWAP(fs));
ino %= fs->fs_ipg;
ret = isclr(inosused, ino);
brelse(bp, 0);
return ret;
}
/*
* Find a block of the specified size in the specified cylinder group.
*
* It is a panic if a request is made to find a block when none are
* available.
*/
static int32_t
ffs_mapsearch(struct fs *fs, struct cg *cgp, daddr_t bpref, int allocsiz)
{
int32_t bno;
int start, len, loc, i;
int blk, field, subfield, pos;
int ostart, olen;
u_int8_t *blksfree;
const int needswap = UFS_FSNEEDSWAP(fs);
/* KASSERT(mutex_owned(&ump->um_lock)); */
/*
* find the fragment by searching through the free block
* map for an appropriate bit pattern
*/
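/*
* Roughly speaking, the scanc() calls below use fragtbl[fs_frag] to
* summarize each map byte: the table records which free-run sizes
* occur in that byte, so the scan can stop at the first byte that
* may hold a free run of at least allocsiz fragments.
*/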
if (bpref)
start = dtogd(fs, bpref) / NBBY;
else
start = ufs_rw32(cgp->cg_frotor, needswap) / NBBY;
blksfree = cg_blksfree(cgp, needswap);
len = howmany(fs->fs_fpg, NBBY) - start;
ostart = start;
olen = len;
loc = scanc((u_int)len,
(const u_char *)&blksfree[start],
(const u_char *)fragtbl[fs->fs_frag],
(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
if (loc == 0) {
len = start + 1;
start = 0;
loc = scanc((u_int)len,
(const u_char *)&blksfree[0],
(const u_char *)fragtbl[fs->fs_frag],
(1 << (allocsiz - 1 + (fs->fs_frag & (NBBY - 1)))));
if (loc == 0) {
panic("%s: map corrupted: start=%d, len=%d, "
"fs = %s, offset=%d/%ld, cg %d", __func__,
ostart, olen, fs->fs_fsmnt,
ufs_rw32(cgp->cg_freeoff, needswap),
(long)blksfree - (long)cgp, cgp->cg_cgx);
/* NOTREACHED */
}
}
bno = (start + len - loc) * NBBY;
cgp->cg_frotor = ufs_rw32(bno, needswap);
/*
* found the byte in the map
* sift through the bits to find the selected frag
*/
for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
blk = blkmap(fs, blksfree, bno);
blk <<= 1;
field = around[allocsiz];
subfield = inside[allocsiz];
for (pos = 0; pos <= fs->fs_frag - allocsiz; pos++) {
if ((blk & field) == subfield)
return (bno + pos);
field <<= 1;
subfield <<= 1;
}
}
panic("%s: block not in map: bno=%d, fs=%s", __func__,
bno, fs->fs_fsmnt);
/* return (-1); */
}
/*
* Fserr prints the name of a file system with an error diagnostic.
*
* The form of the error message is:
* fs: error message
*/
static void
ffs_fserr(struct fs *fs, kauth_cred_t cred, const char *cp)
{
KASSERT(cred != NULL);
if (cred == NOCRED || cred == FSCRED) {
log(LOG_ERR, "pid %d, command %s, on %s: %s\n",
curproc->p_pid, curproc->p_comm,
fs->fs_fsmnt, cp);
} else {
log(LOG_ERR, "uid %d, pid %d, command %s, on %s: %s\n",
kauth_cred_getuid(cred), curproc->p_pid, curproc->p_comm,
fs->fs_fsmnt, cp);
}
}
/* $NetBSD: ipi.c,v 1.30 2019/12/01 15:34:46 ad Exp $ */
/*-
* Copyright (c) 2000, 2008, 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by RedBack Networks Inc.
*
* Author: Bill Sommerfeld
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ipi.c,v 1.30 2019/12/01 15:34:46 ad Exp $");
#include "opt_mtrr.h"
#include <sys/param.h>
#include <sys/device.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/intr.h>
#include <sys/ipi.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#ifdef MULTIPROCESSOR
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/i82093var.h>
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#include <machine/mtrr.h>
#include <machine/gdt.h>
#include "acpica.h"
#include <x86/fpu.h>
static void x86_ipi_ast(struct cpu_info *);
static void x86_ipi_halt(struct cpu_info *);
static void x86_ipi_kpreempt(struct cpu_info *);
static void x86_ipi_xcall(struct cpu_info *);
static void x86_ipi_generic(struct cpu_info *);
#ifdef MTRR
static void x86_ipi_reload_mtrr(struct cpu_info *);
#else
#define x86_ipi_reload_mtrr NULL
#endif
#if NACPICA > 0
void acpi_cpu_sleep(struct cpu_info *);
#else
#define acpi_cpu_sleep NULL
#endif
static void x86_ipi_synch_fpu(struct cpu_info *);
void (* const ipifunc[X86_NIPI])(struct cpu_info *) =
{
x86_ipi_halt, /* X86_IPI_HALT */
x86_ipi_ast, /* X86_IPI_AST */
x86_ipi_generic, /* X86_IPI_GENERIC */
x86_ipi_synch_fpu, /* X86_IPI_SYNCH_FPU */
x86_ipi_reload_mtrr, /* X86_IPI_MTRR */
NULL, /* X86_IPI_GDT */
x86_ipi_xcall, /* X86_IPI_XCALL */
acpi_cpu_sleep, /* X86_IPI_ACPI_CPU_SLEEP */
x86_ipi_kpreempt /* X86_IPI_KPREEMPT */
};
/*
* x86 IPI interface.
*/
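/*
* ci_ipis acts as a per-CPU pending bitmask: x86_send_ipi() below
* ORs the request in with a CAS loop and only raises the actual
* LAPIC vector when the mask transitions from empty to non-empty,
* so back-to-back requests coalesce into a single interrupt which
* x86_ipi_handler() then fans out bit by bit.
*/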
int
x86_send_ipi(struct cpu_info *ci, int ipimask)
{
uint32_t o, n;
int ret = 0;
/* Don't send IPI to CPU which isn't (yet) running. */
if (__predict_false((ci->ci_flags & CPUF_RUNNING) == 0))
return ENOENT;
/* Set in new IPI bit, and capture previous state. */
for (o = 0;; o = n) {
n = atomic_cas_32(&ci->ci_ipis, o, o | ipimask);
if (__predict_true(o == n)) {
break;
}
}
/* If no IPI already pending, send one. */
if (o == 0) {
ret = x86_ipi(LAPIC_IPI_VECTOR, ci->ci_cpuid, LAPIC_DLMODE_FIXED);
if (ret != 0) {
printf("ipi of %x from %s to %s failed\n",
ipimask,
device_xname(curcpu()->ci_dev),
device_xname(ci->ci_dev));
}
}
return ret;
}
void
x86_broadcast_ipi(int ipimask)
{
struct cpu_info *ci, *self = curcpu();
int count = 0;
CPU_INFO_ITERATOR cii;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci == self)
continue;
if ((ci->ci_flags & CPUF_RUNNING) == 0)
continue;
atomic_or_32(&ci->ci_ipis, ipimask);
count++;
}
if (!count)
return;
x86_ipi(LAPIC_IPI_VECTOR, LAPIC_DEST_ALLEXCL, LAPIC_DLMODE_FIXED);
}
void
x86_ipi_handler(void)
{
struct cpu_info *ci = curcpu();
uint32_t pending;
int bit;
pending = atomic_swap_32(&ci->ci_ipis, 0);
KDASSERT((pending >> X86_NIPI) == 0);
while ((bit = ffs(pending)) != 0) {
bit--;
pending &= ~(1 << bit);
ci->ci_ipi_events[bit].ev_count++;
(*ipifunc[bit])(ci);
}
}
/*
* Common x86 IPI handlers.
*/
static void
x86_ipi_halt(struct cpu_info *ci)
{
x86_disable_intr();
atomic_and_32(&ci->ci_flags, ~CPUF_RUNNING);
for (;;) {
x86_hlt();
}
}
static void
x86_ipi_synch_fpu(struct cpu_info *ci)
{
panic("%s: impossible", __func__);
}
#ifdef MTRR
static void
x86_ipi_reload_mtrr(struct cpu_info *ci)
{
if (mtrr_funcs != NULL) {
/*
* mtrr_reload_cpu() is a macro in mtrr.h which picks
* the appropriate function to use.
*/
mtrr_reload_cpu(ci);
}
}
#endif
static void
x86_ipi_kpreempt(struct cpu_info *ci)
{
softint_trigger(1 << SIR_PREEMPT);
}
static void
x86_ipi_ast(struct cpu_info *ci)
{
aston(ci->ci_onproc);
}
/*
* MD support for xcall(9) interface.
*/
static void
x86_ipi_xcall(struct cpu_info *ci)
{
xc_ipi_handler();
}
static void
x86_ipi_generic(struct cpu_info *ci)
{
ipi_cpu_handler();
}
void
xc_send_ipi(struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
if (ci) {
/* Unicast: remote CPU. */
x86_send_ipi(ci, X86_IPI_XCALL);
} else {
/* Broadcast: all, but local CPU (caller will handle it). */
x86_broadcast_ipi(X86_IPI_XCALL);
}
}
void
cpu_ipi(struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
if (ci) {
/* Unicast: remote CPU. */
x86_send_ipi(ci, X86_IPI_GENERIC);
} else {
/* Broadcast: all, but local CPU (caller will handle it). */
x86_broadcast_ipi(X86_IPI_GENERIC);
}
}
#else
int
x86_send_ipi(struct cpu_info *ci, int ipimask)
{
return 0;
}
void
x86_broadcast_ipi(int ipimask)
{
}
void
cpu_ipi(struct cpu_info *ci)
{
}
#endif
/* $NetBSD: at_control.c,v 1.44 2023/03/30 15:58:10 riastradh Exp $ */
/*
* Copyright (c) 1990,1994 Regents of The University of Michigan.
* All Rights Reserved.
*
* Permission to use, copy, modify, and distribute this software and
* its documentation for any purpose and without fee is hereby granted,
* provided that the above copyright notice appears in all copies and
* that both that copyright notice and this permission notice appear
* in supporting documentation, and that the name of The University
* of Michigan not be used in advertising or publicity pertaining to
* distribution of the software without specific, written prior
* permission. This software is supplied as is without expressed or
* implied warranties of any kind.
*
* This product includes software developed by the University of
* California, Berkeley and its contributors.
*
* Research Systems Unix Group
* The University of Michigan
* c/o Wesley Craig
* 535 W. William Street
* Ann Arbor, Michigan
* +1-313-764-2278
* netatalk@umich.edu
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: at_control.c,v 1.44 2023/03/30 15:58:10 riastradh Exp $");
#include "opt_atalk.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_ether.h>
#include <netinet/in.h>
#undef s_net
#include <netatalk/at.h>
#include <netatalk/at_var.h>
#include <netatalk/aarp.h>
#include <netatalk/phase2.h>
#include <netatalk/at_extern.h>
static int aa_dorangeroute(struct ifaddr * ifa,
u_int first, u_int last, int cmd);
static int aa_addsingleroute(struct ifaddr * ifa,
struct at_addr * addr, struct at_addr * mask);
static int aa_delsingleroute(struct ifaddr * ifa,
struct at_addr * addr, struct at_addr * mask);
static int aa_dosingleroute(struct ifaddr * ifa, struct at_addr * addr,
struct at_addr * mask, int cmd, int flags);
static int at_scrub(struct ifnet * ifp, struct at_ifaddr * aa);
static int at_ifinit(struct ifnet *, struct at_ifaddr *,
const struct sockaddr_at *);
#if 0
static void aa_clean(void);
#endif
#define sateqaddr(a,b) ((a)->sat_len == (b)->sat_len && \
(a)->sat_family == (b)->sat_family && \
(a)->sat_addr.s_net == (b)->sat_addr.s_net && \
(a)->sat_addr.s_node == (b)->sat_addr.s_node )
int
at_control(u_long cmd, void *data, struct ifnet *ifp)
{
struct ifreq *ifr = (struct ifreq *) data;
const struct sockaddr_at *csat;
struct netrange *nr;
const struct netrange *cnr;
struct at_aliasreq *ifra = (struct at_aliasreq *) data;
struct at_ifaddr *aa0;
struct at_ifaddr *aa = 0;
/*
* If we have an ifp, then find the matching at_ifaddr if it exists
*/
if (ifp)
TAILQ_FOREACH(aa, &at_ifaddr, aa_list)
if (aa->aa_ifp == ifp)
break;
/*
* In this first switch table we are basically getting ready for
* the second one, by getting the atalk-specific things set up
* so that they start to look more similar to other protocols etc.
*/
switch (cmd) {
case SIOCAIFADDR:
case SIOCDIFADDR:
/*
* If we have an appletalk sockaddr, scan forward of where
* we are now on the at_ifaddr list to find one with a matching
* address on this interface.
* This may leave aa pointing to the first address on the
* NEXT interface!
*/
if (ifra->ifra_addr.sat_family == AF_APPLETALK) {
for (; aa; aa = TAILQ_NEXT(aa, aa_list))
if (aa->aa_ifp == ifp &&
sateqaddr(&aa->aa_addr, &ifra->ifra_addr))
break;
}
/*
* If we are trying to delete an address but didn't find it,
* then return an error.
*/
if (cmd == SIOCDIFADDR && aa == 0)
return (EADDRNOTAVAIL);
/* FALLTHROUGH */
case SIOCSIFADDR:
/*
* If we are not superuser, then we don't get to do these
* ops.
*/
if (kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, (void *)cmd,
NULL) != 0)
return (EPERM);
csat = satocsat(ifreq_getaddr(cmd, ifr));
cnr = (const struct netrange *)csat->sat_zero;
if (cnr->nr_phase == 1) {
/*
* Look for a phase 1 address on this interface.
* This may leave aa pointing to the first address on
* the NEXT interface!
*/
for (; aa; aa = TAILQ_NEXT(aa, aa_list)) {
if (aa->aa_ifp == ifp &&
(aa->aa_flags & AFA_PHASE2) == 0)
break;
}
} else { /* default to phase 2 */
/*
* Look for a phase 2 address on this interface.
* This may leave aa pointing to the first address on
* the NEXT interface!
*/
for (; aa; aa = TAILQ_NEXT(aa, aa_list)) {
if (aa->aa_ifp == ifp &&
(aa->aa_flags & AFA_PHASE2))
break;
}
}
if (ifp == 0)
panic("at_control");
/*
* If we failed to find an existing at_ifaddr entry, then we
* allocate a fresh one.
* XXX change this to use malloc
*/
if (aa == (struct at_ifaddr *) 0) {
aa = (struct at_ifaddr *)
malloc(sizeof(struct at_ifaddr), M_IFADDR,
M_WAITOK|M_ZERO);
if (aa == NULL)
return (ENOBUFS);
callout_init(&aa->aa_probe_ch, 0);
if ((aa0 = TAILQ_FIRST(&at_ifaddr)) != NULL) {
/*
* Don't let the loopback be first, since the
* first address is the machine's default
* address for binding.
* If it is, stick ourself in front, otherwise
* go to the back of the list.
*/
if (aa0->aa_ifp->if_flags & IFF_LOOPBACK) {
TAILQ_INSERT_HEAD(&at_ifaddr, aa,
aa_list);
} else {
TAILQ_INSERT_TAIL(&at_ifaddr, aa,
aa_list);
}
} else {
TAILQ_INSERT_TAIL(&at_ifaddr, aa, aa_list);
}
ifaref(&aa->aa_ifa);
ifa_psref_init(&aa->aa_ifa);
/*
* Find the end of the interface's addresses
* and link our new one on the end
*/
ifa_insert(ifp, &aa->aa_ifa);
/*
* As the at_ifaddr contains the actual sockaddrs,
* and the ifaddr itself, link them all together
* correctly.
*/
aa->aa_ifa.ifa_addr =
(struct sockaddr *) &aa->aa_addr;
aa->aa_ifa.ifa_dstaddr =
(struct sockaddr *) &aa->aa_addr;
aa->aa_ifa.ifa_netmask =
(struct sockaddr *) &aa->aa_netmask;
/*
* Set/clear the phase 2 bit.
*/
if (cnr->nr_phase == 1)
aa->aa_flags &= ~AFA_PHASE2;
else
aa->aa_flags |= AFA_PHASE2;
/*
* and link it all together
*/
aa->aa_ifp = ifp;
} else {
/*
* If we DID find one then we clobber any routes
* dependent on it..
*/
at_scrub(ifp, aa);
}
break;
case SIOCGIFADDR:
csat = satocsat(ifreq_getaddr(cmd, ifr));
cnr = (const struct netrange *)csat->sat_zero;
if (cnr->nr_phase == 1) {
/*
* If the request is specifying phase 1, then
* only look at a phase one address
*/
for (; aa; aa = TAILQ_NEXT(aa, aa_list)) {
if (aa->aa_ifp == ifp &&
(aa->aa_flags & AFA_PHASE2) == 0)
break;
}
} else if (cnr->nr_phase == 2) {
/*
* If the request is specifying phase 2, then
* only look at a phase two address
*/
for (; aa; aa = TAILQ_NEXT(aa, aa_list)) {
if (aa->aa_ifp == ifp &&
(aa->aa_flags & AFA_PHASE2))
break;
}
} else {
/*
* default to everything
*/
for (; aa; aa = TAILQ_NEXT(aa, aa_list)) {
if (aa->aa_ifp == ifp)
break;
}
}
if (aa == (struct at_ifaddr *) 0)
return (EADDRNOTAVAIL);
break;
}
/*
* By the time this switch is run we should be able to assume that
* the "aa" pointer is valid when needed.
*/
switch (cmd) {
case SIOCGIFADDR: {
union {
struct sockaddr sa;
struct sockaddr_at sat;
} u;
/*
* copy the contents of the sockaddr blindly.
*/
sockaddr_copy(&u.sa, sizeof(u),
(const struct sockaddr *)&aa->aa_addr);
/*
* and do some cleanups
*/
nr = (struct netrange *)&u.sat.sat_zero;
nr->nr_phase = (aa->aa_flags & AFA_PHASE2) ? 2 : 1;
nr->nr_firstnet = aa->aa_firstnet;
nr->nr_lastnet = aa->aa_lastnet;
ifreq_setaddr(cmd, ifr, &u.sa);
break;
}
case SIOCSIFADDR:
return at_ifinit(ifp, aa,
(const struct sockaddr_at *)ifreq_getaddr(cmd, ifr));
case SIOCAIFADDR:
if (sateqaddr(&ifra->ifra_addr, &aa->aa_addr))
return 0;
return at_ifinit(ifp, aa,
(const struct sockaddr_at *)ifreq_getaddr(cmd, ifr));
case SIOCDIFADDR:
at_purgeaddr(&aa->aa_ifa);
break;
default:
return ENOTTY;
}
return (0);
}
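/*
 * Added usage sketch (not from the original source): roughly how a userland
 * configurator might reach the SIOCAIFADDR path handled above.  The interface
 * name and address values are invented for illustration.
 *
 *	struct at_aliasreq ifra;
 *	struct netrange *nr;
 *	memset(&ifra, 0, sizeof(ifra));
 *	strlcpy(ifra.ifra_name, "le0", sizeof(ifra.ifra_name));
 *	ifra.ifra_addr.sat_len = sizeof(struct sockaddr_at);
 *	ifra.ifra_addr.sat_family = AF_APPLETALK;
 *	ifra.ifra_addr.sat_addr.s_net = 0;	(ATADDR_ANYNET: kernel picks)
 *	ifra.ifra_addr.sat_addr.s_node = 0;	(ATADDR_ANYNODE: kernel picks)
 *	nr = (struct netrange *)ifra.ifra_addr.sat_zero;
 *	nr->nr_phase = 2;
 *	nr->nr_firstnet = htons(1);
 *	nr->nr_lastnet = htons(10);
 *	ioctl(s, SIOCAIFADDR, &ifra);		(s: an AF_APPLETALK socket)
 */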
void
at_purgeaddr(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct at_ifaddr *aa = (void *) ifa;
/*
* scrub all routes.. didn't we just DO this? XXX yes, del it
* XXX above XXX not necessarily true anymore
*/
at_scrub(ifp, aa);
/*
* remove the ifaddr from the interface
*/
ifa_remove(ifp, &aa->aa_ifa);
TAILQ_REMOVE(&at_ifaddr, aa, aa_list);
ifafree(&aa->aa_ifa);
}
void
at_purgeif(struct ifnet *ifp)
{
if_purgeaddrs(ifp, AF_APPLETALK, at_purgeaddr);
}
/*
* Given an interface and an at_ifaddr (supposedly on that interface) remove
* any routes that depend on this. Why ifp is needed I'm not sure, as
* aa->at_ifaddr.ifa_ifp should be the same.
*/
static int
at_scrub(struct ifnet *ifp, struct at_ifaddr *aa)
{
int error = 0;
if (aa->aa_flags & AFA_ROUTE) {
if (ifp->if_flags & IFF_LOOPBACK)
error = aa_delsingleroute(&aa->aa_ifa,
&aa->aa_addr.sat_addr, &aa->aa_netmask.sat_addr);
else if (ifp->if_flags & IFF_POINTOPOINT)
error = rtinit(&aa->aa_ifa, RTM_DELETE, RTF_HOST);
else if (ifp->if_flags & IFF_BROADCAST)
error = aa_dorangeroute(&aa->aa_ifa,
ntohs(aa->aa_firstnet), ntohs(aa->aa_lastnet),
RTM_DELETE);
aa->aa_ifa.ifa_flags &= ~IFA_ROUTE;
aa->aa_flags &= ~AFA_ROUTE;
}
return error;
}
/*
* given an at_ifaddr, a sockaddr_at and an ifp,
* bang them all together at high speed and see what happens
*/
static int
at_ifinit(struct ifnet *ifp, struct at_ifaddr *aa, const struct sockaddr_at *sat)
{
struct netrange nr, onr;
struct sockaddr_at oldaddr;
int s = splnet(), error = 0, i, j;
int netinc, nodeinc, nnets;
u_short net;
/*
* save the old addresses in the at_ifaddr just in case we need them.
*/
oldaddr = aa->aa_addr;
onr.nr_firstnet = aa->aa_firstnet;
onr.nr_lastnet = aa->aa_lastnet;
/*
* take the address supplied as an argument, and add it to the
* at_ifaddr (also given), remembering to update
* those parts of the at_ifaddr that need special processing
*/
memset(AA_SAT(aa), 0, sizeof(struct sockaddr_at));
memcpy(&nr, sat->sat_zero, sizeof(struct netrange));
memcpy(AA_SAT(aa)->sat_zero, sat->sat_zero, sizeof(struct netrange));
nnets = ntohs(nr.nr_lastnet) - ntohs(nr.nr_firstnet) + 1;
aa->aa_firstnet = nr.nr_firstnet;
aa->aa_lastnet = nr.nr_lastnet;
#ifdef NETATALKDEBUG
printf("at_ifinit: %s: %u.%u range %u-%u phase %d\n",
ifp->if_xname,
ntohs(sat->sat_addr.s_net), sat->sat_addr.s_node,
ntohs(aa->aa_firstnet), ntohs(aa->aa_lastnet),
(aa->aa_flags & AFA_PHASE2) ? 2 : 1);
#endif
/*
* We could eliminate the need for a second phase 1 probe (post
* autoconf) if we check whether we're resetting the node. Note
* that phase 1 probes use only nodes, not net.node pairs. Under
* phase 2, both the net and node must be the same.
*/
AA_SAT(aa)->sat_len = sizeof(struct sockaddr_at);
AA_SAT(aa)->sat_family = AF_APPLETALK;
if (ifp->if_flags & IFF_LOOPBACK) {
AA_SAT(aa)->sat_addr.s_net = sat->sat_addr.s_net;
AA_SAT(aa)->sat_addr.s_node = sat->sat_addr.s_node;
#if 0
} else if (ifp->if_flags & IFF_POINTOPOINT) {
/* unimplemented */
/*
* we'd have to copy the dstaddr field over from the sat
* but it's not clear that it would contain the right info..
*/
#endif
} else {
/*
* We are a normal (probably ethernet) interface.
* apply the new address to the interface structures etc.
* We will probe this address on the net first, before
* applying it to ensure that it is free.. If it is not, then
* we will try a number of other randomly generated addresses
* in this net and then increment the net. etc.etc. until
* we find an unused address.
*/
aa->aa_flags |= AFA_PROBING; /* if not loopback we must probe */
if (aa->aa_flags & AFA_PHASE2) {
if (sat->sat_addr.s_net == ATADDR_ANYNET) {
/*
* If we are phase 2, and the net was not
* specified, then we select a random net
* within the supplied netrange.
* XXX use /dev/random?
*/
if (nnets != 1) {
net = ntohs(nr.nr_firstnet) +
time_second % (nnets - 1);
} else {
net = ntohs(nr.nr_firstnet);
}
} else {
/*
* if a net was supplied, then check that it
* is within the netrange. If it is not then
* replace the old values and return an error
*/
if (ntohs(sat->sat_addr.s_net) < ntohs(nr.nr_firstnet) ||
ntohs(sat->sat_addr.s_net) >
ntohs(nr.nr_lastnet)) {
aa->aa_addr = oldaddr;
aa->aa_firstnet = onr.nr_firstnet;
aa->aa_lastnet = onr.nr_lastnet;
splx(s);
return (EINVAL);
}
/*
* otherwise just use the new net number..
*/
net = ntohs(sat->sat_addr.s_net);
}
} else {
/*
* we must be phase one, so just use whatever we were
* given. I guess it really isn't going to be used...
* RIGHT?
*/
net = ntohs(sat->sat_addr.s_net);
}
/*
* set the node part of the address into the ifaddr. If it's
* not specified, be random about it... XXX use /dev/random?
*/
if (sat->sat_addr.s_node == ATADDR_ANYNODE) {
AA_SAT(aa)->sat_addr.s_node = time_second;
} else {
AA_SAT(aa)->sat_addr.s_node = sat->sat_addr.s_node;
}
/*
* step through the nets in the range starting at the
* (possibly random) start point.
*/
for (i = nnets, netinc = 1; i > 0; net = ntohs(nr.nr_firstnet) +
((net - ntohs(nr.nr_firstnet) + netinc) % nnets), i--) {
AA_SAT(aa)->sat_addr.s_net = htons(net);
/*
* using a rather strange stepping method,
* stagger through the possible node addresses
* Once again, starting at the (possibly random)
* initial node address.
*/
for (j = 0, nodeinc = time_second | 1; j < 256;
j++, AA_SAT(aa)->sat_addr.s_node += nodeinc) {
if (AA_SAT(aa)->sat_addr.s_node > 253 ||
AA_SAT(aa)->sat_addr.s_node < 1) {
continue;
}
aa->aa_probcnt = 10;
/*
* Start off the probes as an asynchronous
* activity, though why wait 200 ms?
*/
callout_reset(&aa->aa_probe_ch, hz / 5,
aarpprobe, ifp);
if (tsleep(aa, PPAUSE | PCATCH, "at_ifinit",
0)) {
/*
* Theoretically we shouldn't time out here,
* so a non-zero return means we were
* interrupted (e.g. by a signal).
*/
printf("at_ifinit: timeout?!\n");
aa->aa_addr = oldaddr;
aa->aa_firstnet = onr.nr_firstnet;
aa->aa_lastnet = onr.nr_lastnet;
splx(s);
return (EINTR);
}
/*
* The async activity should have woken us
* up. We need to see if it was successful in
* finding a free spot, or if we need to
* iterate to the next address to try.
*/
if ((aa->aa_flags & AFA_PROBING) == 0)
break;
}
/*
* of course we need to break out through two loops...
*/
if ((aa->aa_flags & AFA_PROBING) == 0)
break;
/* reset node for next network */
AA_SAT(aa)->sat_addr.s_node = time_second;
}
/*
* if we are still trying to probe, then we have finished all
* the possible addresses, so we need to give up
*/
if (aa->aa_flags & AFA_PROBING) {
aa->aa_addr = oldaddr;
aa->aa_firstnet = onr.nr_firstnet;
aa->aa_lastnet = onr.nr_lastnet;
splx(s);
return (EADDRINUSE);
}
}
/*
* Now that we have selected an address, we need to tell the
* interface about it, just in case it needs to adjust something.
*/
if ((error = if_addr_init(ifp, &aa->aa_ifa, true)) != 0) {
/*
* of course this could mean that it objects violently
* so if it does, we back out again..
*/
aa->aa_addr = oldaddr;
aa->aa_firstnet = onr.nr_firstnet;
aa->aa_lastnet = onr.nr_lastnet;
splx(s);
return (error);
}
/*
* set up the netmask part of the at_ifaddr and point the appropriate
* pointer in the ifaddr to it. probably pointless, but what the
* heck.. XXX
*/
memset(&aa->aa_netmask, 0, sizeof(aa->aa_netmask));
aa->aa_netmask.sat_len = sizeof(struct sockaddr_at);
aa->aa_netmask.sat_family = AF_APPLETALK;
aa->aa_netmask.sat_addr.s_net = 0xffff;
aa->aa_netmask.sat_addr.s_node = 0;
#if 0
aa->aa_ifa.ifa_netmask = (struct sockaddr *) &(aa->aa_netmask);/* XXX */
#endif
/*
* Initialize broadcast (or remote p2p) address
*/
memset(&aa->aa_broadaddr, 0, sizeof(aa->aa_broadaddr));
aa->aa_broadaddr.sat_len = sizeof(struct sockaddr_at);
aa->aa_broadaddr.sat_family = AF_APPLETALK;
aa->aa_ifa.ifa_metric = ifp->if_metric;
if (ifp->if_flags & IFF_BROADCAST) {
aa->aa_broadaddr.sat_addr.s_net = htons(ATADDR_ANYNET);
aa->aa_broadaddr.sat_addr.s_node = ATADDR_BCAST;
aa->aa_ifa.ifa_broadaddr =
(struct sockaddr *) &aa->aa_broadaddr;
/* add the range of routes needed */
error = aa_dorangeroute(&aa->aa_ifa,
ntohs(aa->aa_firstnet), ntohs(aa->aa_lastnet), RTM_ADD);
} else if (ifp->if_flags & IFF_POINTOPOINT) {
struct at_addr rtaddr, rtmask;
memset(&rtaddr, 0, sizeof(rtaddr));
memset(&rtmask, 0, sizeof(rtmask));
/* fill in the far end if we know it here XXX */
aa->aa_ifa.ifa_dstaddr = (struct sockaddr *) & aa->aa_dstaddr;
error = aa_addsingleroute(&aa->aa_ifa, &rtaddr, &rtmask);
} else if (ifp->if_flags & IFF_LOOPBACK) {
struct at_addr rtaddr, rtmask;
memset(&rtaddr, 0, sizeof(rtaddr));
memset(&rtmask, 0, sizeof(rtmask));
rtaddr.s_net = AA_SAT(aa)->sat_addr.s_net;
rtaddr.s_node = AA_SAT(aa)->sat_addr.s_node;
rtmask.s_net = 0xffff;
rtmask.s_node = 0x0;
error = aa_addsingleroute(&aa->aa_ifa, &rtaddr, &rtmask);
}
/*
* of course if we can't add these routes we back out, but it's getting
* risky by now XXX
*/
if (error) {
at_scrub(ifp, aa);
aa->aa_addr = oldaddr;
aa->aa_firstnet = onr.nr_firstnet;
aa->aa_lastnet = onr.nr_lastnet;
splx(s);
return (error);
}
/*
* note that the address has a route associated with it....
*/
aa->aa_ifa.ifa_flags |= IFA_ROUTE;
aa->aa_flags |= AFA_ROUTE;
splx(s);
return (0);
}
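/*
 * Added illustration: given a request with netrange 1-3, ATADDR_ANYNET and
 * ATADDR_ANYNODE on an ethernet-style interface, at_ifinit() above picks a
 * pseudo-random starting net inside the range (say 2), steps through node
 * candidates 1..253 in a staggered order, AARP-probes each one, and wraps
 * around to nets 3 and then 1 before giving up with EADDRINUSE if every
 * candidate is defended by another host on the wire.
 */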
/*
* check whether a given address is a broadcast address for us..
*/
int
at_broadcast(const struct sockaddr_at *sat)
{
struct at_ifaddr *aa;
/*
* If the node is not right, it can't be a broadcast
*/
if (sat->sat_addr.s_node != ATADDR_BCAST)
return 0;
/*
* If the node was right then if the net is right, it's a broadcast
*/
if (sat->sat_addr.s_net == ATADDR_ANYNET)
return 1;
/*
* failing that, if the net is one we have, it's a broadcast as well.
*/
TAILQ_FOREACH(aa, &at_ifaddr, aa_list) {
if ((aa->aa_ifp->if_flags & IFF_BROADCAST)
&& (ntohs(sat->sat_addr.s_net) >= ntohs(aa->aa_firstnet)
&& ntohs(sat->sat_addr.s_net) <= ntohs(aa->aa_lastnet)))
return 1;
}
return 0;
}
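/*
 * Worked example (added for illustration): on a system with one interface
 * configured for nets 10-20, at_broadcast() returns 1 for 0.255 (ATADDR_ANYNET
 * with the broadcast node) and for 15.255 (net within the configured range),
 * but 0 for 15.7 (node is not ATADDR_BCAST) and for 25.255 (net outside any
 * configured range).
 */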
/*
* aa_dorangeroute()
*
* Add a route for a range of networks from bot to top, inclusive.
* Algorithm:
*
* Split the range into two subranges such that the middle
* of the two ranges is the point where the highest bit of difference
* between the two addresses makes its transition.
* Each of the upper and lower ranges might not exist, or might be
* representable by 1 or more netmasks. In addition, if both
* ranges can be represented by the same netmask, then they can be merged
* by using the next higher netmask.
*/
static int
aa_dorangeroute(struct ifaddr *ifa, u_int bot, u_int top, int cmd)
{
u_int mask1;
struct at_addr addr;
struct at_addr mask;
int error;
/*
* slight sanity check
*/
if (bot > top)
return (EINVAL);
addr.s_node = 0;
mask.s_node = 0;
/*
* just start out with the lowest boundary
* and keep extending the mask till it's too big.
*/
while (bot <= top) {
mask1 = 1;
while (((bot & ~mask1) >= bot)
&& ((bot | mask1) <= top)) {
mask1 <<= 1;
mask1 |= 1;
}
mask1 >>= 1;
mask.s_net = htons(~mask1);
addr.s_net = htons(bot);
if (cmd == RTM_ADD) {
error = aa_addsingleroute(ifa, &addr, &mask);
if (error) {
/* XXX clean up? */
return (error);
}
} else {
error = aa_delsingleroute(ifa, &addr, &mask);
}
bot = (bot | mask1) + 1;
}
return 0;
}
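/*
 * Worked example (added for illustration): aa_dorangeroute(ifa, 3, 10,
 * RTM_ADD) greedily carves the range into the largest aligned blocks a
 * netmask can express, installing four routes:
 *
 *	net  3, mask 0xffff	(net 3 only)
 *	net  4, mask 0xfffc	(nets 4-7)
 *	net  8, mask 0xfffe	(nets 8-9)
 *	net 10, mask 0xffff	(net 10 only)
 */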
static int
aa_addsingleroute(struct ifaddr *ifa, struct at_addr *addr, struct at_addr *mask)
{
int error;
#ifdef NETATALKDEBUG
printf("aa_addsingleroute: %x.%x mask %x.%x ...",
ntohs(addr->s_net), addr->s_node,
ntohs(mask->s_net), mask->s_node);
#endif
error = aa_dosingleroute(ifa, addr, mask, RTM_ADD, RTF_UP);
#ifdef NETATALKDEBUG
if (error)
printf("aa_addsingleroute: error %d\n", error);
#endif
return (error);
}
static int
aa_delsingleroute(struct ifaddr *ifa, struct at_addr *addr, struct at_addr *mask)
{
int error;
#ifdef NETATALKDEBUG
printf("aa_delsingleroute: %x.%x mask %x.%x ...",
ntohs(addr->s_net), addr->s_node,
ntohs(mask->s_net), mask->s_node);
#endif
error = aa_dosingleroute(ifa, addr, mask, RTM_DELETE, 0);
#ifdef NETATALKDEBUG
if (error)
printf("aa_delsingleroute: error %d\n", error);
#endif
return (error);
}
static int
aa_dosingleroute(struct ifaddr *ifa, struct at_addr *at_addr, struct at_addr *at_mask, int cmd, int flags)
{
struct sockaddr_at addr, mask, *gate;
memset(&addr, 0, sizeof(addr));
memset(&mask, 0, sizeof(mask));
addr.sat_family = AF_APPLETALK;
addr.sat_len = sizeof(struct sockaddr_at);
addr.sat_addr.s_net = at_addr->s_net;
addr.sat_addr.s_node = at_addr->s_node;
mask.sat_family = AF_APPLETALK;
mask.sat_len = sizeof(struct sockaddr_at);
mask.sat_addr.s_net = at_mask->s_net;
mask.sat_addr.s_node = at_mask->s_node;
if (at_mask->s_node) {
gate = satosat(ifa->ifa_dstaddr);
flags |= RTF_HOST;
} else {
gate = satosat(ifa->ifa_addr);
}
#ifdef NETATALKDEBUG
printf("on %s %x.%x\n", (flags & RTF_HOST) ? "host" : "net",
ntohs(gate->sat_addr.s_net), gate->sat_addr.s_node);
#endif
return (rtrequest(cmd, (struct sockaddr *) &addr,
(struct sockaddr *) gate, (struct sockaddr *) &mask, flags, NULL));
}
#if 0
static void
aa_clean(void)
{
struct at_ifaddr *aa;
struct ifaddr *ifa;
struct ifnet *ifp;
while ((aa = TAILQ_FIRST(&at_ifaddr)) != NULL) {
TAILQ_REMOVE(&at_ifaddr, aa, aa_list);
ifp = aa->aa_ifp;
at_scrub(ifp, aa);
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa == &aa->aa_ifa)
break;
}
if (ifa == NULL)
panic("aa not present");
ifa_remove(ifp, ifa);
}
}
#endif
/* $NetBSD: trap.c,v 1.129 2023/10/05 19:41:03 ad Exp $ */
/*
* Copyright (c) 1998, 2000, 2017 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum, and by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the University of Utah, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)trap.c 7.4 (Berkeley) 5/13/91
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: trap.c,v 1.129 2023/10/05 19:41:03 ad Exp $");
#include "opt_ddb.h"
#include "opt_kgdb.h"
#include "opt_xen.h"
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/acct.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/ras.h>
#include <sys/signal.h>
#include <sys/syscall.h>
#include <sys/cpu.h>
#include <sys/ucontext.h>
#include <sys/module_hook.h>
#include <sys/compat_stub.h>
#include <uvm/uvm_extern.h>
#include <machine/cpufunc.h>
#include <x86/fpu.h>
#include <x86/dbregs.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/trap.h>
#include <machine/userret.h>
#include <machine/db_machdep.h>
#include <x86/nmi.h>
#ifndef XENPV
#include "isa.h"
#endif
#include <sys/kgdb.h>
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
/*
* This is a hook which is initialized by the dtrace module to handle traps
* which might occur during DTrace probe execution.
*/
dtrace_trap_func_t dtrace_trap_func = NULL;
dtrace_doubletrap_func_t dtrace_doubletrap_func = NULL;
#endif
/*
* Module hook for amd64_oosyscall
*/
struct amd64_oosyscall_hook_t amd64_oosyscall_hook;
void nmitrap(struct trapframe *);
void doubletrap(struct trapframe *);
void trap(struct trapframe *);
const char * const trap_type[] = {
"privileged instruction fault", /* 0 T_PRIVINFLT */
"breakpoint trap", /* 1 T_BPTFLT */
"arithmetic trap", /* 2 T_ARITHTRAP */
"asynchronous system trap", /* 3 T_ASTFLT */
"protection fault", /* 4 T_PROTFLT */
"trace trap", /* 5 T_TRCTRAP */
"page fault", /* 6 T_PAGEFLT */
"alignment fault", /* 7 T_ALIGNFLT */
"integer divide fault", /* 8 T_DIVIDE */
"non-maskable interrupt", /* 9 T_NMI */
"overflow trap", /* 10 T_OFLOW */
"bounds check fault", /* 11 T_BOUND */
"FPU not available fault", /* 12 T_DNA */
"double fault", /* 13 T_DOUBLEFLT */
"FPU operand fetch fault", /* 14 T_FPOPFLT */
"invalid TSS fault", /* 15 T_TSSFLT */
"segment not present fault", /* 16 T_SEGNPFLT */
"stack fault", /* 17 T_STKFLT */
"machine check fault", /* 18 T_MCA */
"SSE FP exception", /* 19 T_XMM */
"reserved trap", /* 20 T_RESERVED */
};
int trap_types = __arraycount(trap_type);
#ifdef TRAP_SIGDEBUG
static void sigdebug(const struct trapframe *, const ksiginfo_t *, int);
#define SIGDEBUG(a, b, c) sigdebug(a, b, c)
#else
#define SIGDEBUG(a, b, c)
#endif
static void
onfault_restore(struct trapframe *frame, void *onfault, int error)
{
frame->tf_rip = (uintptr_t)onfault;
frame->tf_rax = error;
}
static void *
onfault_handler(const struct pcb *pcb, const struct trapframe *tf)
{
struct onfault_table {
uintptr_t start;
uintptr_t end;
void *handler;
};
extern const struct onfault_table onfault_table[];
const struct onfault_table *p;
uintptr_t pc;
if (pcb->pcb_onfault != NULL) {
return pcb->pcb_onfault;
}
pc = tf->tf_rip;
for (p = onfault_table; p->start; p++) {
if (p->start <= pc && pc < p->end) {
return p->handler;
}
}
return NULL;
}
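/*
 * Added note: onfault_table is supplied by the copy routines elsewhere in
 * the tree; each entry brackets a range of instruction addresses and names
 * the recovery label to jump to when a fault hits inside that range.  A
 * purely hypothetical entry (names invented for illustration) would look
 * like:
 *
 *	{ (uintptr_t)example_copy_start, (uintptr_t)example_copy_end,
 *	  example_copy_fault },
 *
 * onfault_handler() prefers pcb_onfault when it is set and otherwise falls
 * back to this table lookup keyed on the faulting instruction pointer.
 */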
static void
trap_print(const struct trapframe *frame, const lwp_t *l)
{
const int type = frame->tf_trapno;
if (frame->tf_trapno < trap_types) {
printf("fatal %s", trap_type[type]);
} else {
printf("unknown trap %d", type);
}
printf(" in %s mode\n", (type & T_USER) ? "user" : "supervisor");
printf("trap type %d code %#lx rip %#lx cs %#lx rflags %#lx cr2 %#lx "
"ilevel %#x rsp %#lx\n",
type, frame->tf_err, (u_long)frame->tf_rip, frame->tf_cs,
frame->tf_rflags, rcr2(), curcpu()->ci_ilevel, frame->tf_rsp);
printf("curlwp %p pid %d.%d lowest kstack %p\n",
l, l->l_proc->p_pid, l->l_lid, KSTACK_LOWEST_ADDR(l));
}
void
nmitrap(struct trapframe *frame)
{
const int type = T_NMI;
if (nmi_dispatch(frame))
return;
/* NMI can be hooked up to a pushbutton for debugging */
if (kgdb_trap(type, frame))
return;
if (kdb_trap(type, 0, frame))
return;
/* machine/parity/power fail/"kitchen sink" faults */
x86_nmi();
}
void
doubletrap(struct trapframe *frame)
{
const int type = T_DOUBLEFLT;
struct lwp *l = curlwp;
trap_print(frame, l);
if (kdb_trap(type, 0, frame))
return;
if (kgdb_trap(type, frame))
return;
panic("double fault");
}
/*
* trap(frame): exception, fault, and trap interface to BSD kernel.
*
* This common code is called from assembly language IDT gate entry routines
* that prepare a suitable stack frame, and restore this frame after the
* exception has been processed. Note that the effect is as if the arguments
* were passed call by reference.
*
* Note that the fpu traps (07 T_DNA, 10 T_ARITHTRAP and 13 T_XMM)
* jump directly into the code in x86/fpu.c so they get processed
* without interrupts being enabled.
*/
void
trap(struct trapframe *frame)
{
struct lwp *l = curlwp;
struct proc *p;
struct pcb *pcb;
extern char kcopy_fault[];
ksiginfo_t ksi;
void *onfault;
int type, error;
uint64_t cr2;
bool pfail;
if (__predict_true(l != NULL)) {
pcb = lwp_getpcb(l);
p = l->l_proc;
} else {
/*
* This can happen, e.g., on breakpoints early in boot.
*/
pcb = NULL;
p = NULL;
}
type = frame->tf_trapno;
if (!KERNELMODE(frame->tf_cs)) {
type |= T_USER;
l->l_md.md_regs = frame;
}
#ifdef KDTRACE_HOOKS
/*
* A trap can occur while DTrace executes a probe. Before
* executing the probe, DTrace blocks re-scheduling and sets
* a flag in its per-cpu flags to indicate that it doesn't
* want to fault. On returning from the probe, the no-fault
* flag is cleared and finally re-scheduling is enabled.
*
* If the DTrace kernel module has registered a trap handler,
* call it and if it returns non-zero, assume that it has
* handled the trap and modified the trap frame so that this
* function can return normally.
*/
if ((type == T_PROTFLT || type == T_PAGEFLT) &&
dtrace_trap_func != NULL) {
if ((*dtrace_trap_func)(frame, type)) {
return;
}
}
#endif
switch (type) {
default:
we_re_toast:
trap_print(frame, l);
if (kdb_trap(type, 0, frame))
return;
if (kgdb_trap(type, frame))
return;
/*
* If this is a breakpoint, don't panic if we're not connected.
*/
if (type == T_BPTFLT && kgdb_disconnected()) {
printf("kgdb: ignored %s\n", trap_type[type]);
return;
}
panic("trap");
/*NOTREACHED*/
case T_PROTFLT:
case T_SEGNPFLT:
case T_ALIGNFLT:
case T_STKFLT:
case T_TSSFLT:
if (p == NULL)
goto we_re_toast;
/* Check for copyin/copyout fault. */
onfault = onfault_handler(pcb, frame);
if (onfault != NULL) {
onfault_restore(frame, onfault, EFAULT);
return;
}
goto we_re_toast;
case T_PROTFLT|T_USER: /* protection fault */
{
int hook_ret;
MODULE_HOOK_CALL(amd64_oosyscall_hook, (p, frame),
ENOSYS, hook_ret);
if (hook_ret == 0) {
/* Do the syscall */
p->p_md.md_syscall(frame);
goto out;
}
}
/* FALLTHROUGH */
case T_TSSFLT|T_USER:
case T_SEGNPFLT|T_USER:
case T_STKFLT|T_USER:
case T_ALIGNFLT|T_USER:
KSI_INIT_TRAP(&ksi);
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)frame->tf_rip;
switch (type) {
case T_SEGNPFLT|T_USER:
case T_STKFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRERR;
break;
case T_TSSFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_OBJERR;
break;
case T_ALIGNFLT|T_USER:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRALN;
break;
case T_PROTFLT|T_USER:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_ACCERR;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_PRIVINFLT|T_USER: /* privileged instruction fault */
case T_FPOPFLT|T_USER: /* coprocessor operand fault */
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGILL;
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *) frame->tf_rip;
switch (type) {
case T_PRIVINFLT|T_USER:
ksi.ksi_code = ILL_PRVOPC;
break;
case T_FPOPFLT|T_USER:
ksi.ksi_code = ILL_COPROC;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_ASTFLT|T_USER:
/* Allow process switch. */
//curcpu()->ci_data.cpu_nast++;
if (l->l_pflag & LP_OWEUPC) {
l->l_pflag &= ~LP_OWEUPC;
ADDUPROF(l);
}
goto out;
case T_BOUND|T_USER:
case T_OFLOW|T_USER:
case T_DIVIDE|T_USER:
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGFPE;
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)frame->tf_rip;
switch (type) {
case T_BOUND|T_USER:
ksi.ksi_code = FPE_FLTSUB;
break;
case T_OFLOW|T_USER:
ksi.ksi_code = FPE_INTOVF;
break;
case T_DIVIDE|T_USER:
ksi.ksi_code = FPE_INTDIV;
break;
default:
KASSERT(0);
break;
}
goto trapsignal;
case T_PAGEFLT:
/* Allow page faults in kernel mode. */
if (__predict_false(l == NULL))
goto we_re_toast;
onfault = pcb->pcb_onfault;
if (cpu_intr_p() || (l->l_pflag & LP_INTR) != 0) {
goto we_re_toast;
}
cr2 = rcr2();
if (frame->tf_err & PGEX_I) {
/* SMEP might have brought us here */
if (cr2 < VM_MAXUSER_ADDRESS) {
printf("prevented execution of %p (SMEP)\n",
(void *)cr2);
goto we_re_toast;
}
}
if ((frame->tf_err & PGEX_P) &&
cr2 < VM_MAXUSER_ADDRESS) {
/* SMAP might have brought us here */
if (onfault_handler(pcb, frame) == NULL) {
printf("prevented access to %p (SMAP)\n",
(void *)cr2);
goto we_re_toast;
}
}
goto pagefltcommon;
case T_PAGEFLT|T_USER: {
register vaddr_t va;
register struct vmspace *vm;
register struct vm_map *map;
vm_prot_t ftype;
extern struct vm_map *kernel_map;
cr2 = rcr2();
if (p->p_emul->e_usertrap != NULL &&
(*p->p_emul->e_usertrap)(l, cr2, frame) != 0)
return;
pagefltcommon:
vm = p->p_vmspace;
if (__predict_false(vm == NULL)) {
goto we_re_toast;
}
pcb->pcb_cr2 = cr2;
va = trunc_page((vaddr_t)cr2);
/*
* It is only a kernel address space fault iff:
* 1. (type & T_USER) == 0 and
* 2. pcb_onfault not set or
* 3. pcb_onfault set but supervisor space fault
* The last can occur during an exec() copyin where the
* argument space is lazy-allocated.
*/
if (type == T_PAGEFLT && va >= VM_MIN_KERNEL_ADDRESS)
map = kernel_map;
else
map = &vm->vm_map;
if (frame->tf_err & PGEX_W)
ftype = VM_PROT_WRITE;
else if (frame->tf_err & PGEX_I)
ftype = VM_PROT_EXECUTE;
else
ftype = VM_PROT_READ;
#ifdef DIAGNOSTIC
if (map == kernel_map && va == 0) {
printf("trap: bad kernel access at %lx\n", va);
goto we_re_toast;
}
#endif
/* Fault the original page in. */
onfault = pcb->pcb_onfault;
pcb->pcb_onfault = NULL;
error = uvm_fault(map, va, ftype);
pcb->pcb_onfault = onfault;
if (error == 0) {
if (map != kernel_map && (void *)va >= vm->vm_maxsaddr)
uvm_grow(p, va);
pfail = false;
while (type == T_PAGEFLT) {
/*
* we need to switch pmap now if we're in
* the middle of copyin/out.
*
* but we don't need to do so for kcopy as
* it never touches userspace.
*/
kpreempt_disable();
if (curcpu()->ci_want_pmapload) {
onfault = onfault_handler(pcb, frame);
if (onfault != kcopy_fault) {
pmap_load();
}
}
/*
* We need to keep the pmap loaded and
* so avoid being preempted until back
* into the copy functions. Disable
* interrupts at the hardware level before
* re-enabling preemption. Interrupts
* will be re-enabled by 'iret' when
* returning back out of the trap stub.
* They'll only be re-enabled when the
* program counter is once again in
* the copy functions, and so visible
* to cpu_kpreempt_exit().
*/
#ifndef XENPV
x86_disable_intr();
#endif
l->l_nopreempt--;
if (l->l_nopreempt > 0 || !l->l_dopreempt ||
pfail) {
return;
}
#ifndef XENPV
x86_enable_intr();
#endif
/*
* If preemption fails for some reason,
* don't retry it. The conditions won't
* change under our nose.
*/
pfail = kpreempt(0);
}
goto out;
}
if (type == T_PAGEFLT) {
onfault = onfault_handler(pcb, frame);
if (onfault != NULL) {
onfault_restore(frame, onfault, error);
return;
}
printf("uvm_fault(%p, 0x%lx, %d) -> %x\n",
map, va, ftype, error);
goto we_re_toast;
}
KSI_INIT_TRAP(&ksi);
ksi.ksi_trap = type & ~T_USER;
ksi.ksi_addr = (void *)cr2;
switch (error) {
case EINVAL:
ksi.ksi_signo = SIGBUS;
ksi.ksi_code = BUS_ADRERR;
break;
case EACCES:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_ACCERR;
error = EFAULT;
break;
case ENOMEM:
ksi.ksi_signo = SIGKILL;
printf("UVM: pid %d.%d (%s), uid %d killed: "
"out of swap\n", p->p_pid, l->l_lid, p->p_comm,
l->l_cred ? kauth_cred_geteuid(l->l_cred) : -1);
break;
default:
ksi.ksi_signo = SIGSEGV;
ksi.ksi_code = SEGV_MAPERR;
break;
}
SIGDEBUG(frame, &ksi, error);
(*p->p_emul->e_trapsignal)(l, &ksi);
break;
}
case T_TRCTRAP:
/*
* Ignore debug register trace traps due to
* accesses in the user's address space, which
* can happen under several conditions such as
* if a user sets a watchpoint on a buffer and
* then passes that buffer to a system call.
* We still want to get TRCTRAPS for addresses
* in kernel space because that is useful when
* debugging the kernel.
*/
if (x86_dbregs_user_trap())
break;
goto we_re_toast;
case T_BPTFLT|T_USER: /* bpt instruction fault */
case T_TRCTRAP|T_USER: /* trace trap */
/*
* Don't go single-stepping into a RAS.
*/
if (p->p_raslist == NULL ||
(ras_lookup(p, (void *)frame->tf_rip) == (void *)-1)) {
KSI_INIT_TRAP(&ksi);
ksi.ksi_signo = SIGTRAP;
ksi.ksi_trap = type & ~T_USER;
if (x86_dbregs_user_trap()) {
x86_dbregs_store_dr6(l);
ksi.ksi_code = TRAP_DBREG;
} else if (type == (T_BPTFLT|T_USER))
ksi.ksi_code = TRAP_BRKPT;
else
ksi.ksi_code = TRAP_TRACE;
(*p->p_emul->e_trapsignal)(l, &ksi);
}
break;
}
if ((type & T_USER) == 0)
return;
out:
userret(l);
return;
trapsignal:
SIGDEBUG(frame, &ksi, 0);
(*p->p_emul->e_trapsignal)(l, &ksi);
userret(l);
}
/*
* startlwp: start of a new LWP.
*/
void
startlwp(void *arg)
{
ucontext_t *uc = arg;
lwp_t *l = curlwp;
int error __diagused;
error = cpu_setmcontext(l, &uc->uc_mcontext, uc->uc_flags);
KASSERT(error == 0);
kmem_free(uc, sizeof(ucontext_t));
userret(l);
}
#ifdef TRAP_SIGDEBUG
static void
frame_dump(const struct trapframe *tf, struct pcb *pcb)
{
printf("trapframe %p\n", tf);
printf("rip %#018lx rsp %#018lx rfl %#018lx\n",
tf->tf_rip, tf->tf_rsp, tf->tf_rflags);
printf("rdi %#018lx rsi %#018lx rdx %#018lx\n",
tf->tf_rdi, tf->tf_rsi, tf->tf_rdx);
printf("rcx %#018lx r8 %#018lx r9 %#018lx\n",
tf->tf_rcx, tf->tf_r8, tf->tf_r9);
printf("r10 %#018lx r11 %#018lx r12 %#018lx\n",
tf->tf_r10, tf->tf_r11, tf->tf_r12);
printf("r13 %#018lx r14 %#018lx r15 %#018lx\n",
tf->tf_r13, tf->tf_r14, tf->tf_r15);
printf("rbp %#018lx rbx %#018lx rax %#018lx\n",
tf->tf_rbp, tf->tf_rbx, tf->tf_rax);
printf("cs %#04lx ds %#04lx es %#04lx "
"fs %#04lx gs %#04lx ss %#04lx\n",
tf->tf_cs & 0xffff, tf->tf_ds & 0xffff, tf->tf_es & 0xffff,
tf->tf_fs & 0xffff, tf->tf_gs & 0xffff, tf->tf_ss & 0xffff);
printf("fsbase %#018lx gsbase %#018lx\n", pcb->pcb_fs, pcb->pcb_gs);
printf("\n");
hexdump(printf, "Stack dump", tf, 256);
}
static void
sigdebug(const struct trapframe *tf, const ksiginfo_t *ksi, int e)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
printf("pid %d.%d (%s): signal %d code=%d (trap %#lx) "
"@rip %#lx addr %#lx error=%d\n",
p->p_pid, l->l_lid, p->p_comm, ksi->ksi_signo, ksi->ksi_code,
tf->tf_trapno, tf->tf_rip, rcr2(), e);
frame_dump(tf, lwp_getpcb(l));
}
#endif
/* $NetBSD: subr_cpu.c,v 1.22 2024/03/05 20:59:41 thorpej Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c)2007 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* CPU related routines shared with rump.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_cpu.c,v 1.22 2024/03/05 20:59:41 thorpej Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
static void cpu_topology_fake1(struct cpu_info *);
kmutex_t cpu_lock __cacheline_aligned;
int ncpu __read_mostly;
int ncpuonline __read_mostly;
bool mp_online __read_mostly;
static bool cpu_topology_present __read_mostly;
static bool cpu_topology_haveslow __read_mostly;
int64_t cpu_counts[CPU_COUNT_MAX];
/* An array of CPUs. There are ncpu entries. */
struct cpu_info **cpu_infos __read_mostly;
/* Note: set on mi_cpu_attach() and idle_loop(). */
kcpuset_t * kcpuset_attached __read_mostly = NULL;
kcpuset_t * kcpuset_running __read_mostly = NULL;
static char cpu_model[128];
/*
* mi_cpu_init: early initialisation of MI CPU related structures.
*
* Note: may not block and memory allocator is not yet available.
*/
void
mi_cpu_init(void)
{
struct cpu_info *ci;
mutex_init(&cpu_lock, MUTEX_DEFAULT, IPL_NONE);
kcpuset_create(&kcpuset_attached, true);
kcpuset_create(&kcpuset_running, true);
kcpuset_set(kcpuset_running, 0);
ci = curcpu();
cpu_topology_fake1(ci);
}
int
cpu_setmodel(const char *fmt, ...)
{
int len;
va_list ap;
va_start(ap, fmt);
len = vsnprintf(cpu_model, sizeof(cpu_model), fmt, ap);
va_end(ap);
return len;
}
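/*
 * Added usage sketch: MD attach code typically formats the model string once,
 * e.g. cpu_setmodel("%s %s", vendor_string, brand_string) with whatever the
 * platform probed (argument names invented here); callers later read it back
 * with cpu_getmodel().
 */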
const char *
cpu_getmodel(void)
{
return cpu_model;
}
bool
cpu_softintr_p(void)
{
return (curlwp->l_pflag & LP_INTR) != 0;
}
bool
curcpu_stable(void)
{
struct lwp *const l = curlwp;
const int pflag = l->l_pflag;
const int nopreempt = l->l_nopreempt;
/*
* - Softints (LP_INTR) never migrate between CPUs.
* - Bound lwps (LP_BOUND), either kthreads created bound to
* a CPU or any lwps bound with curlwp_bind, never migrate.
* - If kpreemption is disabled, the lwp can't migrate.
* - If we're in interrupt context, preemption is blocked.
*
* We combine the LP_INTR, LP_BOUND, and l_nopreempt test into
* a single predicted-true branch so this is cheap to assert in
* most contexts where it will be used, then fall back to
* calling the full kpreempt_disabled() and cpu_intr_p() as
* subroutines.
*
* XXX Is cpu_intr_p redundant with kpreempt_disabled?
*/
return __predict_true(((pflag & (LP_INTR|LP_BOUND)) | nopreempt)
!= 0) ||
kpreempt_disabled() ||
cpu_intr_p();
}
/*
* Collect CPU topology information as each CPU is attached. This can be
* called early during boot, so we need to be careful what we do.
*/
void
cpu_topology_set(struct cpu_info *ci, u_int package_id, u_int core_id,
u_int smt_id, u_int numa_id)
{
enum cpu_rel rel;
cpu_topology_present = true;
ci->ci_package_id = package_id;
ci->ci_core_id = core_id;
ci->ci_smt_id = smt_id;
ci->ci_numa_id = numa_id;
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
ci->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = 1;
}
}
/*
* Collect CPU relative speed
*/
void
cpu_topology_setspeed(struct cpu_info *ci, bool slow)
{
cpu_topology_haveslow |= slow;
ci->ci_is_slow = slow;
}
/*
* Link a CPU into the given circular list.
*/
static void
cpu_topology_link(struct cpu_info *ci, struct cpu_info *ci2, enum cpu_rel rel)
{
struct cpu_info *ci3;
/* Walk to the end of the existing circular list and append. */
for (ci3 = ci2;; ci3 = ci3->ci_sibling[rel]) {
ci3->ci_nsibling[rel]++;
if (ci3->ci_sibling[rel] == ci2) {
break;
}
}
ci->ci_sibling[rel] = ci2;
ci3->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = ci3->ci_nsibling[rel];
}
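/*
 * Added illustration: if cpu1 is currently alone in its CPUREL_CORE list
 * (cpu1 -> cpu1, nsibling 1) and cpu_topology_link(cpu3, cpu1, CPUREL_CORE)
 * is called, the walk above bumps each member's count and splices cpu3 in,
 * leaving the circular list cpu1 -> cpu3 -> cpu1 with nsibling 2 on both.
 */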
/*
* Print out the topology lists.
*/
static void
cpu_topology_dump(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci, *ci2;
const char *names[] = { "core", "pkg", "1st" };
enum cpu_rel rel;
int i;
CTASSERT(__arraycount(names) >= __arraycount(ci->ci_sibling));
if (ncpu == 1) {
return;
}
for (CPU_INFO_FOREACH(cii, ci)) {
if (cpu_topology_haveslow)
aprint_debug("%s ", ci->ci_is_slow ? "slow" : "fast");
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
aprint_debug("%s has %d %s siblings:", cpu_name(ci),
ci->ci_nsibling[rel], names[rel]);
ci2 = ci->ci_sibling[rel];
i = 0;
do {
aprint_debug(" %s", cpu_name(ci2));
ci2 = ci2->ci_sibling[rel];
} while (++i < 64 && ci2 != ci->ci_sibling[rel]);
if (i == 64) {
aprint_debug(" GAVE UP");
}
aprint_debug("\n");
}
aprint_debug("%s first in package: %s\n", cpu_name(ci),
cpu_name(ci->ci_package1st));
}
}
/*
* Fake up topology info if we have none, or if what we got was bogus.
* Used early in boot, and by cpu_topology_fake().
*/
static void
cpu_topology_fake1(struct cpu_info *ci)
{
enum cpu_rel rel;
for (rel = 0; rel < __arraycount(ci->ci_sibling); rel++) {
ci->ci_sibling[rel] = ci;
ci->ci_nsibling[rel] = 1;
}
if (!cpu_topology_present) {
ci->ci_package_id = cpu_index(ci);
}
ci->ci_schedstate.spc_flags |=
(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
ci->ci_package1st = ci;
if (!cpu_topology_haveslow) {
ci->ci_is_slow = false;
}
}
/*
* Fake up topology info if we have none, or if what we got was bogus.
* Don't override ci_package_id, etc, if cpu_topology_present is set.
* MD code also uses these.
*/
static void
cpu_topology_fake(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
cpu_topology_fake1(ci);
/* Undo (early boot) flag set so everything links OK. */
ci->ci_schedstate.spc_flags &=
~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
}
}
/*
* Fix up basic CPU topology info. Right now that means attach each CPU to
* circular lists of its siblings in the same core, and in the same package.
*/
void
cpu_topology_init(void)
{
CPU_INFO_ITERATOR cii, cii2;
struct cpu_info *ci, *ci2, *ci3;
u_int minsmt, mincore;
if (!cpu_topology_present) {
cpu_topology_fake();
goto linkit;
}
/* Find siblings in same core and package. */
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_schedstate.spc_flags &=
~(SPCF_CORE1ST | SPCF_PACKAGE1ST | SPCF_1STCLASS);
for (CPU_INFO_FOREACH(cii2, ci2)) {
/* Avoid bad things happening. */
if (ci2->ci_package_id == ci->ci_package_id &&
ci2->ci_core_id == ci->ci_core_id &&
ci2->ci_smt_id == ci->ci_smt_id &&
ci2 != ci) {
#ifdef DEBUG
printf("cpu%u %p pkg %u core %u smt %u same as "
"cpu%u %p pkg %u core %u smt %u\n",
cpu_index(ci), ci, ci->ci_package_id,
ci->ci_core_id, ci->ci_smt_id,
cpu_index(ci2), ci2, ci2->ci_package_id,
ci2->ci_core_id, ci2->ci_smt_id);
#endif
printf("cpu_topology_init: info bogus, "
"faking it\n");
cpu_topology_fake();
goto linkit;
}
if (ci2 == ci ||
ci2->ci_package_id != ci->ci_package_id) {
continue;
}
/* Find CPUs in the same core. */
if (ci->ci_nsibling[CPUREL_CORE] == 1 &&
ci->ci_core_id == ci2->ci_core_id) {
cpu_topology_link(ci, ci2, CPUREL_CORE);
}
/* Find CPUs in the same package. */
if (ci->ci_nsibling[CPUREL_PACKAGE] == 1) {
cpu_topology_link(ci, ci2, CPUREL_PACKAGE);
}
if (ci->ci_nsibling[CPUREL_CORE] > 1 &&
ci->ci_nsibling[CPUREL_PACKAGE] > 1) {
break;
}
}
}
linkit:
/* Identify lowest numbered SMT in each core. */
for (CPU_INFO_FOREACH(cii, ci)) {
ci2 = ci3 = ci;
minsmt = ci->ci_smt_id;
do {
if (ci2->ci_smt_id < minsmt) {
ci3 = ci2;
minsmt = ci2->ci_smt_id;
}
ci2 = ci2->ci_sibling[CPUREL_CORE];
} while (ci2 != ci);
ci3->ci_schedstate.spc_flags |= SPCF_CORE1ST;
}
/* Identify lowest numbered SMT in each package. */
ci3 = NULL;
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) == 0) {
continue;
}
ci2 = ci3 = ci;
mincore = ci->ci_core_id;
do {
if ((ci2->ci_schedstate.spc_flags &
SPCF_CORE1ST) != 0 &&
ci2->ci_core_id < mincore) {
ci3 = ci2;
mincore = ci2->ci_core_id;
}
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
if ((ci3->ci_schedstate.spc_flags & SPCF_PACKAGE1ST) != 0) {
/* Already identified - nothing more to do. */
continue;
}
ci3->ci_schedstate.spc_flags |= SPCF_PACKAGE1ST;
/* Walk through all CPUs in package and point to first. */
ci2 = ci3;
do {
ci2->ci_package1st = ci3;
ci2->ci_sibling[CPUREL_PACKAGE1ST] = ci3;
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci3);
/* Now look for somebody else to link to. */
for (CPU_INFO_FOREACH(cii2, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_PACKAGE1ST)
!= 0 && ci2 != ci3) {
cpu_topology_link(ci3, ci2, CPUREL_PACKAGE1ST);
break;
}
}
}
/* Walk through all packages, starting with value of ci3 from above. */
KASSERT(ci3 != NULL);
ci = ci3;
do {
/* Walk through CPUs in the package and copy in PACKAGE1ST. */
ci2 = ci;
do {
ci2->ci_sibling[CPUREL_PACKAGE1ST] =
ci->ci_sibling[CPUREL_PACKAGE1ST];
ci2->ci_nsibling[CPUREL_PACKAGE1ST] =
ci->ci_nsibling[CPUREL_PACKAGE1ST];
ci2 = ci2->ci_sibling[CPUREL_PACKAGE];
} while (ci2 != ci);
ci = ci->ci_sibling[CPUREL_PACKAGE1ST];
} while (ci != ci3);
if (cpu_topology_haveslow) {
/*
* For asymmetric systems where some CPUs are slower than
* others, mark first class CPUs for the scheduler. This
* conflicts with SMT right now so whinge if observed.
*/
if (curcpu()->ci_nsibling[CPUREL_CORE] > 1) {
printf("cpu_topology_init: asymmetric & SMT??\n");
}
for (CPU_INFO_FOREACH(cii, ci)) {
if (!ci->ci_is_slow) {
ci->ci_schedstate.spc_flags |= SPCF_1STCLASS;
}
}
} else {
/*
* For any other configuration mark the 1st CPU in each
* core as a first class CPU.
*/
for (CPU_INFO_FOREACH(cii, ci)) {
if ((ci->ci_schedstate.spc_flags & SPCF_CORE1ST) != 0) {
ci->ci_schedstate.spc_flags |= SPCF_1STCLASS;
}
}
}
cpu_topology_dump();
}
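/*
 * Added illustration: on a hypothetical single-package CPU with two cores of
 * two SMT siblings each (cpu0/cpu1 in core 0, cpu2/cpu3 in core 1) and no
 * slow CPUs, the passes above mark cpu0 and cpu2 SPCF_CORE1ST and
 * SPCF_1STCLASS, mark cpu0 SPCF_PACKAGE1ST, and point ci_package1st of all
 * four CPUs at cpu0.
 */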
/*
* Adjust one count, for a counter that's NOT updated from interrupt
* context. Hardly worth making an inline due to preemption stuff.
*/
void
cpu_count(enum cpu_count idx, int64_t delta)
{
lwp_t *l = curlwp;
KPREEMPT_DISABLE(l);
l->l_cpu->ci_counts[idx] += delta;
KPREEMPT_ENABLE(l);
}
/*
* Fetch fresh sum total for all counts. Expensive - don't call often.
*
* If poll is true, the caller is okay with less recent values (but
* no more than 1/hz seconds old). Where this is called very often, that
* should be the case.
*
* This should be reasonably quick so that any value collected isn't
* totally out of whack, and it can also be called from interrupt context,
* so go to splvm() while summing the counters. It's tempting to use a spin
* mutex here but this routine is called from DDB.
*/
void
cpu_count_sync(bool poll)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
int64_t sum[CPU_COUNT_MAX], *ptr;
static int lasttick;
int curtick, s;
enum cpu_count i;
KASSERT(sizeof(ci->ci_counts) == sizeof(cpu_counts));
if (__predict_false(!mp_online)) {
memcpy(cpu_counts, curcpu()->ci_counts, sizeof(cpu_counts));
return;
}
s = splvm();
curtick = getticks();
if (poll && atomic_load_acquire(&lasttick) == curtick) {
splx(s);
return;
}
memset(sum, 0, sizeof(sum));
curcpu()->ci_counts[CPU_COUNT_SYNC]++;
for (CPU_INFO_FOREACH(cii, ci)) {
ptr = ci->ci_counts;
for (i = 0; i < CPU_COUNT_MAX; i += 8) {
sum[i+0] += ptr[i+0];
sum[i+1] += ptr[i+1];
sum[i+2] += ptr[i+2];
sum[i+3] += ptr[i+3];
sum[i+4] += ptr[i+4];
sum[i+5] += ptr[i+5];
sum[i+6] += ptr[i+6];
sum[i+7] += ptr[i+7];
}
KASSERT(i == CPU_COUNT_MAX);
}
memcpy(cpu_counts, sum, sizeof(cpu_counts));
atomic_store_release(&lasttick, curtick);
splx(s);
}
/* $NetBSD: uipc_socket.c,v 1.309 2024/02/11 13:01:29 jdolecek Exp $ */
/*
* Copyright (c) 2002, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2004 The FreeBSD Foundation
* Copyright (c) 2004 Robert Watson
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
*/
/*
* Socket operation routines.
*
* These routines are called by the routines in sys_socket.c or from a
* system process, and implement the semantics of socket operations by
* switching out to the protocol specific routines.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket.c,v 1.309 2024/02/11 13:01:29 jdolecek Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_sock_counters.h"
#include "opt_sosend_loan.h"
#include "opt_mbuftrace.h"
#include "opt_somaxkva.h"
#include "opt_multiprocessor.h" /* XXX */
#include "opt_sctp.h"
#include "opt_pipe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/uidinfo.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/kthread.h>
#include <sys/compat_stub.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_loan.h>
#include <uvm/uvm_page.h>
#ifdef SCTP
#include <netinet/sctp_route.h>
#endif
MALLOC_DEFINE(M_SONAME, "soname", "socket name");
extern const struct fileops socketops;
static int sooptions;
extern int somaxconn; /* patchable (XXX sysctl) */
int somaxconn = SOMAXCONN;
kmutex_t *softnet_lock;
#ifdef SOSEND_COUNTERS
#include <sys/device.h>
static struct evcnt sosend_loan_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "loan big");
static struct evcnt sosend_copy_big = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "copy big");
static struct evcnt sosend_copy_small = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "copy small");
static struct evcnt sosend_kvalimit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "sosend", "kva limit");
#define SOSEND_COUNTER_INCR(ev) (ev)->ev_count++
EVCNT_ATTACH_STATIC(sosend_loan_big);
EVCNT_ATTACH_STATIC(sosend_copy_big);
EVCNT_ATTACH_STATIC(sosend_copy_small);
EVCNT_ATTACH_STATIC(sosend_kvalimit);
#else
#define SOSEND_COUNTER_INCR(ev) /* nothing */
#endif /* SOSEND_COUNTERS */
#if defined(SOSEND_NO_LOAN) || defined(MULTIPROCESSOR)
int sock_loan_thresh = -1;
#else
int sock_loan_thresh = 4096;
#endif
static kmutex_t so_pendfree_lock;
static struct mbuf *so_pendfree = NULL;
#ifndef SOMAXKVA
#define SOMAXKVA (16 * 1024 * 1024)
#endif
int somaxkva = SOMAXKVA;
static int socurkva;
static kcondvar_t socurkva_cv;
#ifndef SOFIXEDBUF
#define SOFIXEDBUF true
#endif
bool sofixedbuf = SOFIXEDBUF;
static kauth_listener_t socket_listener;
#define SOCK_LOAN_CHUNK 65536
static void sopendfree_thread(void *);
static kcondvar_t pendfree_thread_cv;
static lwp_t *sopendfree_lwp;
static void sysctl_kern_socket_setup(void);
static struct sysctllog *socket_sysctllog;
static vsize_t
sokvareserve(struct socket *so, vsize_t len)
{
int error;
mutex_enter(&so_pendfree_lock);
while (socurkva + len > somaxkva) {
SOSEND_COUNTER_INCR(&sosend_kvalimit);
error = cv_wait_sig(&socurkva_cv, &so_pendfree_lock);
if (error) {
len = 0;
break;
}
}
socurkva += len;
mutex_exit(&so_pendfree_lock);
return len;
}
static void
sokvaunreserve(vsize_t len)
{
mutex_enter(&so_pendfree_lock);
socurkva -= len;
cv_broadcast(&socurkva_cv);
mutex_exit(&so_pendfree_lock);
}
/*
* sokvaalloc: allocate kva for loan.
*/
vaddr_t
sokvaalloc(vaddr_t sva, vsize_t len, struct socket *so)
{
vaddr_t lva;
if (sokvareserve(so, len) == 0)
return 0;
lva = uvm_km_alloc(kernel_map, len, atop(sva) & uvmexp.colormask,
UVM_KMF_COLORMATCH | UVM_KMF_VAONLY | UVM_KMF_WAITVA);
if (lva == 0) {
sokvaunreserve(len);
return 0;
}
return lva;
}
/*
* sokvafree: free kva for loan.
*/
void
sokvafree(vaddr_t sva, vsize_t len)
{
uvm_km_free(kernel_map, sva, len, UVM_KMF_VAONLY);
sokvaunreserve(len);
}
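/*
 * Added usage sketch: the loan path below pairs these two as
 *
 *	lva = sokvaalloc(sva, len, so);
 *	if (lva == 0)
 *		(fall back to copying)
 *	... map the loaned pages at lva ...
 *	... later, from the mbuf-free path: sokvafree(lva, len);
 *
 * so the socurkva reservation taken in sokvareserve() is always returned and
 * waiters on socurkva_cv are woken once usage drops back below somaxkva.
 */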
static void
sodoloanfree(struct vm_page **pgs, void *buf, size_t size)
{
vaddr_t sva, eva;
vsize_t len;
int npgs;
KASSERT(pgs != NULL);
eva = round_page((vaddr_t) buf + size);
sva = trunc_page((vaddr_t) buf);
len = eva - sva;
npgs = len >> PAGE_SHIFT;
pmap_kremove(sva, len);
pmap_update(pmap_kernel());
uvm_unloan(pgs, npgs, UVM_LOAN_TOPAGE);
sokvafree(sva, len);
}
/*
* sopendfree_thread: free mbufs on "pendfree" list. Unlock and relock
* so_pendfree_lock when freeing mbufs.
*/
static void
sopendfree_thread(void *v)
{
struct mbuf *m, *next;
size_t rv;
mutex_enter(&so_pendfree_lock);
for (;;) {
rv = 0;
while (so_pendfree != NULL) {
m = so_pendfree;
so_pendfree = NULL;
mutex_exit(&so_pendfree_lock);
for (; m != NULL; m = next) {
next = m->m_next;
KASSERT((~m->m_flags & (M_EXT|M_EXT_PAGES)) ==
0);
KASSERT(m->m_ext.ext_refcnt == 0);
rv += m->m_ext.ext_size;
sodoloanfree(m->m_ext.ext_pgs, m->m_ext.ext_buf,
m->m_ext.ext_size);
pool_cache_put(mb_cache, m);
}
mutex_enter(&so_pendfree_lock);
}
if (rv)
cv_broadcast(&socurkva_cv);
cv_wait(&pendfree_thread_cv, &so_pendfree_lock);
}
panic("sopendfree_thread");
/* NOTREACHED */
}
void
soloanfree(struct mbuf *m, void *buf, size_t size, void *arg)
{
KASSERT(m != NULL);
/*
* postpone freeing mbuf.
*
* we can't do it in interrupt context
* because we need to put kva back to kernel_map.
*/
mutex_enter(&so_pendfree_lock);
m->m_next = so_pendfree;
so_pendfree = m;
cv_signal(&pendfree_thread_cv);
mutex_exit(&so_pendfree_lock);
}
static long
sosend_loan(struct socket *so, struct uio *uio, struct mbuf *m, long space)
{
struct iovec *iov = uio->uio_iov;
vaddr_t sva, eva;
vsize_t len;
vaddr_t lva;
int npgs, error;
vaddr_t va;
int i;
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace))
return 0;
if (iov->iov_len < (size_t) space)
space = iov->iov_len;
if (space > SOCK_LOAN_CHUNK)
space = SOCK_LOAN_CHUNK;
eva = round_page((vaddr_t) iov->iov_base + space);
sva = trunc_page((vaddr_t) iov->iov_base);
len = eva - sva;
npgs = len >> PAGE_SHIFT;
KASSERT(npgs <= M_EXT_MAXPAGES);
lva = sokvaalloc(sva, len, so);
if (lva == 0)
return 0;
error = uvm_loan(&uio->uio_vmspace->vm_map, sva, len,
m->m_ext.ext_pgs, UVM_LOAN_TOPAGE);
if (error) {
sokvafree(lva, len);
return 0;
}
for (i = 0, va = lva; i < npgs; i++, va += PAGE_SIZE)
pmap_kenter_pa(va, VM_PAGE_TO_PHYS(m->m_ext.ext_pgs[i]),
VM_PROT_READ, 0);
pmap_update(pmap_kernel());
lva += (vaddr_t) iov->iov_base & PAGE_MASK;
MEXTADD(m, (void *) lva, space, M_MBUF, soloanfree, so);
m->m_flags |= M_EXT_PAGES | M_EXT_ROMAP;
uio->uio_resid -= space;
/* uio_offset not updated, not set/used for write(2) */
uio->uio_iov->iov_base = (char *)uio->uio_iov->iov_base + space;
uio->uio_iov->iov_len -= space;
if (uio->uio_iov->iov_len == 0) { uio->uio_iov++;
uio->uio_iovcnt--;
}
return space;
}
static int
socket_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_network_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg0;
if ((action != KAUTH_NETWORK_SOCKET) &&
(action != KAUTH_NETWORK_BIND))
return result;
switch (req) {
case KAUTH_REQ_NETWORK_BIND_PORT:
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_NETWORK_SOCKET_DROP: {
/* Normal users can only drop their own connections. */
struct socket *so = (struct socket *)arg1;
if (so->so_cred && proc_uidmatch(cred, so->so_cred) == 0)
result = KAUTH_RESULT_ALLOW;
break;
}
case KAUTH_REQ_NETWORK_SOCKET_OPEN:
/* We allow "raw" routing/bluetooth sockets to anyone. */
switch ((u_long)arg1) {
case PF_ROUTE:
case PF_OROUTE:
case PF_BLUETOOTH:
case PF_CAN:
result = KAUTH_RESULT_ALLOW;
break;
default:
/* Privileged, let secmodel handle this. */
if ((u_long)arg2 == SOCK_RAW)
break;
result = KAUTH_RESULT_ALLOW;
break;
}
break;
case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return result;
}
void
soinit(void)
{
sysctl_kern_socket_setup();
#ifdef SCTP
/* Update the SCTP function hooks if necessary */
vec_sctp_add_ip_address = sctp_add_ip_address;
vec_sctp_delete_ip_address = sctp_delete_ip_address;
#endif
mutex_init(&so_pendfree_lock, MUTEX_DEFAULT, IPL_VM);
softnet_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
cv_init(&socurkva_cv, "sokva");
cv_init(&pendfree_thread_cv, "sopendfr");
soinit2();
/* Set the initial adjusted socket buffer size. */
if (sb_max_set(sb_max))
panic("bad initial sb_max value: %lu", sb_max);
socket_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
socket_listener_cb, NULL);
}
void
soinit1(void)
{
int error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
sopendfree_thread, NULL, &sopendfree_lwp, "sopendfree");
if (error)
panic("soinit1 %d", error);
}
/*
* socreate: create a new socket of the specified type and the protocol.
*
* => Caller may specify another socket for lock sharing (must not be held).
* => Returns the new socket without lock held.
*/
int
socreate(int dom, struct socket **aso, int type, int proto, struct lwp *l,
struct socket *lockso)
{
const struct protosw *prp;
struct socket *so;
uid_t uid;
int error;
kmutex_t *lock;
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_SOCKET,
KAUTH_REQ_NETWORK_SOCKET_OPEN, KAUTH_ARG(dom), KAUTH_ARG(type),
KAUTH_ARG(proto));
if (error != 0)
return error;
if (proto)
prp = pffindproto(dom, proto, type);
else
prp = pffindtype(dom, type);
if (prp == NULL) {
/* no support for domain */
if (pffinddomain(dom) == 0)
return EAFNOSUPPORT;
/* no support for socket type */
if (proto == 0 && type != 0)
return EPROTOTYPE;
return EPROTONOSUPPORT;
}
if (prp->pr_usrreqs == NULL)
return EPROTONOSUPPORT;
if (prp->pr_type != type)
return EPROTOTYPE;
so = soget(true);
so->so_type = type;
so->so_proto = prp;
so->so_send = sosend;
so->so_receive = soreceive;
so->so_options = sooptions;
#ifdef MBUFTRACE
so->so_rcv.sb_mowner = &prp->pr_domain->dom_mowner;
so->so_snd.sb_mowner = &prp->pr_domain->dom_mowner;
so->so_mowner = &prp->pr_domain->dom_mowner;
#endif
uid = kauth_cred_geteuid(l->l_cred);
so->so_uidinfo = uid_find(uid);
so->so_egid = kauth_cred_getegid(l->l_cred);
so->so_cpid = l->l_proc->p_pid;
/*
* Lock assigned and taken during PCB attach, unless we share
* the lock with another socket, e.g. socketpair(2) case.
*/
if (lockso) {
/*
* lockso->so_lock should be stable at this point, so
* no need for atomic_load_*.
*/
lock = lockso->so_lock;
so->so_lock = lock;
mutex_obj_hold(lock);
mutex_enter(lock);
}
/* Attach the PCB (returns with the socket lock held). */
error = (*prp->pr_usrreqs->pr_attach)(so, proto);
KASSERT(solocked(so));
if (error) {
KASSERT(so->so_pcb == NULL);
so->so_state |= SS_NOFDREF;
sofree(so);
return error;
}
so->so_cred = kauth_cred_hold(l->l_cred);
sounlock(so);
*aso = so;
return 0;
}
/*
* fsocreate: create a socket and a file descriptor associated with it.
* Returns the allocated file structure in *fpp, but the descriptor
* is not yet visible to the process.
* The caller is responsible for calling fd_affix() on the returned *fpp once
* its socket initialization has finished successfully, or fd_abort() if the
* initialization fails.
*
* => On success, write file descriptor to *fdout and *fpp and return zero.
* => On failure, return non-zero; *fdout and *fpp will be undefined.
*/
int
fsocreate(int domain, struct socket **sop, int type, int proto, int *fdout,
file_t **fpp, struct socket *lockso)
{
lwp_t *l = curlwp;
int error, fd, flags;
struct socket *so;
file_t *fp;
flags = type & SOCK_FLAGS_MASK;
type &= ~SOCK_FLAGS_MASK;
error = socreate(domain, &so, type, proto, l, lockso);
if (error) {
return error;
}
if ((error = fd_allocfile(&fp, &fd)) != 0) {
soclose(so);
return error;
}
fd_set_exclose(l, fd, (flags & SOCK_CLOEXEC) != 0);
fp->f_flag = FREAD|FWRITE|((flags & SOCK_NONBLOCK) ? FNONBLOCK : 0)|
((flags & SOCK_NOSIGPIPE) ? FNOSIGPIPE : 0);
fp->f_type = DTYPE_SOCKET;
fp->f_ops = &socketops;
if (flags & SOCK_NONBLOCK) {
so->so_state |= SS_NBIO;
}
fp->f_socket = so;
if (sop != NULL) {
*sop = so;
}
*fdout = fd;
*fpp = fp;
return error;
}
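/*
* Illustrative sketch (not code from this file): a socket(2)-style caller
* uses fsocreate() roughly as follows; "retval" and the argument names are
* hypothetical, taken from a generic syscall context. A caller that performs
* further initialization on *fpp would call fd_abort() instead of fd_affix()
* if that later initialization fails, as noted in the block comment above.
*
*   file_t *fp;
*   int fd, error;
*
*   error = fsocreate(domain, NULL, type, protocol, &fd, &fp, NULL);
*   if (error == 0) {
*       fd_affix(curproc, fp, fd);      // descriptor becomes visible here
*       *retval = fd;
*   }
*   return error;
*/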
int
sofamily(const struct socket *so)
{
const struct protosw *pr;
const struct domain *dom;
if ((pr = so->so_proto) == NULL)
return AF_UNSPEC;
if ((dom = pr->pr_domain) == NULL)
return AF_UNSPEC;
return dom->dom_family;
}
int
sobind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
int error;
solock(so);
if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
sounlock(so);
return EAFNOSUPPORT;
}
error = (*so->so_proto->pr_usrreqs->pr_bind)(so, nam, l);
sounlock(so);
return error;
}
int
solisten(struct socket *so, int backlog, struct lwp *l)
{
int error;
short oldopt, oldqlimit;
solock(so);
if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING |
SS_ISDISCONNECTING)) != 0) {
sounlock(so);
return EINVAL;
}
oldopt = so->so_options;
oldqlimit = so->so_qlimit;
if (TAILQ_EMPTY(&so->so_q))
so->so_options |= SO_ACCEPTCONN;
if (backlog < 0)
backlog = 0;
so->so_qlimit = uimin(backlog, somaxconn);
error = (*so->so_proto->pr_usrreqs->pr_listen)(so, l);
if (error != 0) {
so->so_options = oldopt;
so->so_qlimit = oldqlimit;
sounlock(so);
return error;
}
sounlock(so);
return 0;
}
void
sofree(struct socket *so)
{
u_int refs;
KASSERT(solocked(so));
if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) {
sounlock(so);
return;
}
if (so->so_head) {
/*
* We must not decommission a socket that's on the accept(2)
* queue. If we do, then accept(2) may hang after select(2)
* indicated that the listening socket was ready.
*/
if (!soqremque(so, 0)) {
sounlock(so);
return;
}
}
if (so->so_rcv.sb_hiwat)
(void)chgsbsize(so->so_uidinfo, &so->so_rcv.sb_hiwat, 0,
RLIM_INFINITY);
if (so->so_snd.sb_hiwat)
(void)chgsbsize(so->so_uidinfo, &so->so_snd.sb_hiwat, 0,
RLIM_INFINITY);
sbrelease(&so->so_snd, so);
KASSERT(!cv_has_waiters(&so->so_cv));
KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
sorflush(so);
refs = so->so_aborting; /* XXX */
/* Remove accept filter if one is present. */
if (so->so_accf != NULL)
(void)accept_filt_clear(so);
sounlock(so);
if (refs == 0) /* XXX */
soput(so);
}
/*
* soclose: close a socket on last file table reference removal.
* Initiate disconnect if connected. Free socket when disconnect complete.
*/
int
soclose(struct socket *so)
{
struct socket *so2;
int error = 0;
solock(so);
if (so->so_options & SO_ACCEPTCONN) {
for (;;) {
if ((so2 = TAILQ_FIRST(&so->so_q0)) != 0) {
KASSERT(solocked2(so, so2));
(void) soqremque(so2, 0);
/* soabort drops the lock. */
(void) soabort(so2);
solock(so);
continue;
}
if ((so2 = TAILQ_FIRST(&so->so_q)) != 0) {
KASSERT(solocked2(so, so2));
(void) soqremque(so2, 1);
/* soabort drops the lock. */
(void) soabort(so2);
solock(so);
continue;
}
break;
}
}
if (so->so_pcb == NULL)
goto discard;
if (so->so_state & SS_ISCONNECTED) {
if ((so->so_state & SS_ISDISCONNECTING) == 0) {
error = sodisconnect(so);
if (error)
goto drop;
}
if (so->so_options & SO_LINGER) {
if ((so->so_state & (SS_ISDISCONNECTING|SS_NBIO)) ==
(SS_ISDISCONNECTING|SS_NBIO))
goto drop;
while (so->so_state & SS_ISCONNECTED) {
error = sowait(so, true, so->so_linger * hz);
if (error)
break;
}
}
}
drop:
if (so->so_pcb) {
KASSERT(solocked(so));
(*so->so_proto->pr_usrreqs->pr_detach)(so);
}
discard:
KASSERT((so->so_state & SS_NOFDREF) == 0);
kauth_cred_free(so->so_cred);
so->so_cred = NULL;
so->so_state |= SS_NOFDREF;
sofree(so);
return error;
}
/*
* Must be called with the socket locked. Will return with it unlocked.
*/
int
soabort(struct socket *so)
{
u_int refs;
int error;
KASSERT(solocked(so));
KASSERT(so->so_head == NULL);
so->so_aborting++; /* XXX */
error = (*so->so_proto->pr_usrreqs->pr_abort)(so);
refs = --so->so_aborting; /* XXX */
if (error || (refs == 0)) {
sofree(so);
} else {
sounlock(so);
}
return error;
}
int
soaccept(struct socket *so, struct sockaddr *nam)
{
int error;
KASSERT(solocked(so));
KASSERT((so->so_state & SS_NOFDREF) != 0);
so->so_state &= ~SS_NOFDREF;
if ((so->so_state & SS_ISDISCONNECTED) == 0 ||
(so->so_proto->pr_flags & PR_ABRTACPTDIS) == 0)
error = (*so->so_proto->pr_usrreqs->pr_accept)(so, nam);
else
error = ECONNABORTED;
return error;
}
int
soconnect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
int error;
KASSERT(solocked(so));
if (so->so_options & SO_ACCEPTCONN)
return EOPNOTSUPP;
/*
* If protocol is connection-based, can only connect once.
* Otherwise, if connected, try to disconnect first.
* This allows user to disconnect by connecting to, e.g.,
* a null address.
*/
if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
(error = sodisconnect(so)))) {
error = EISCONN;
} else {
if (nam->sa_family != so->so_proto->pr_domain->dom_family) {
return EAFNOSUPPORT;
}
error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
}
return error;
}
int
soconnect2(struct socket *so1, struct socket *so2)
{
KASSERT(solocked2(so1, so2));
return (*so1->so_proto->pr_usrreqs->pr_connect2)(so1, so2);
}
int
sodisconnect(struct socket *so)
{
int error;
KASSERT(solocked(so));
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
} else if (so->so_state & SS_ISDISCONNECTING) {
error = EALREADY;
} else {
error = (*so->so_proto->pr_usrreqs->pr_disconnect)(so);
}
return error;
}
#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
/*
* Send on a socket.
* If send must go all at once and message is larger than
* send buffering, then hard error.
* Lock against other senders.
* If must go all at once and not enough room now, then
* inform user that this would block and do nothing.
* Otherwise, if nonblocking, send as much as possible.
* The data to be sent is described by "uio" if nonzero,
* otherwise by the mbuf chain "top" (which must be null
* if uio is not). Data provided in mbuf chain must be small
* enough to send all at once.
*
* Returns nonzero on error, timeout or signal; callers
* must check for short counts if EINTR/ERESTART are returned.
* Data and control buffers are freed on return.
*/
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
struct mbuf *top, struct mbuf *control, int flags, struct lwp *l)
{
struct mbuf **mp, *m;
long space, len, resid, clen, mlen;
int error, s, dontroute, atomic;
short wakeup_state = 0;
clen = 0;
/*
* solock() provides atomicity of access. splsoftnet() prevents
* protocol processing soft interrupts from interrupting us and
* blocking (expensive).
*/
s = splsoftnet();
solock(so);
atomic = sosendallatonce(so) || top;
if (uio)
resid = uio->uio_resid;
else
resid = top->m_pkthdr.len;
/*
* In theory resid should be unsigned.
* However, space must be signed, as it might be less than 0
* if we over-committed, and we must use a signed comparison
* of space and resid. On the other hand, a negative resid
* causes us to loop sending 0-length segments to the protocol.
*/
if (resid < 0) {
error = EINVAL;
goto out;
}
dontroute =
(flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
(so->so_proto->pr_flags & PR_ATOMIC);
l->l_ru.ru_msgsnd++;
if (control)
clen = control->m_len;
restart:
if ((error = sblock(&so->so_snd, SBLOCKWAIT(flags))) != 0)
goto out;
do {
if (so->so_state & SS_CANTSENDMORE) {
error = EPIPE;
goto release;
}
if (so->so_error) {
error = so->so_error;
if ((flags & MSG_PEEK) == 0)
so->so_error = 0;
goto release;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
if (resid || clen == 0) {
error = ENOTCONN;
goto release;
}
} else if (addr == NULL) {
error = EDESTADDRREQ;
goto release;
}
}
space = sbspace(&so->so_snd);
if (flags & MSG_OOB)
space += 1024;
if ((atomic && resid > so->so_snd.sb_hiwat) ||
clen > so->so_snd.sb_hiwat) {
error = EMSGSIZE;
goto release;
}
if (space < resid + clen &&
(atomic || space < so->so_snd.sb_lowat || space < clen)) {
if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
error = EWOULDBLOCK;
goto release;
}
sbunlock(&so->so_snd);
if (wakeup_state & SS_RESTARTSYS) {
error = ERESTART;
goto out;
}
error = sbwait(&so->so_snd);
if (error)
goto out;
wakeup_state = so->so_state;
goto restart;
}
wakeup_state = 0;
mp = &top;
space -= clen;
do {
if (uio == NULL) {
/*
* Data is prepackaged in "top".
*/
resid = 0;
if (flags & MSG_EOR)
top->m_flags |= M_EOR;
} else do {
sounlock(so);
splx(s);
if (top == NULL) {
m = m_gethdr(M_WAIT, MT_DATA);
mlen = MHLEN;
m->m_pkthdr.len = 0;
m_reset_rcvif(m);
} else {
m = m_get(M_WAIT, MT_DATA);
mlen = MLEN;
}
MCLAIM(m, so->so_snd.sb_mowner);
if (sock_loan_thresh >= 0 && uio->uio_iov->iov_len >= sock_loan_thresh &&
space >= sock_loan_thresh &&
(len = sosend_loan(so, uio, m,
space)) != 0) {
SOSEND_COUNTER_INCR(&sosend_loan_big);
space -= len;
goto have_data;
}
if (resid >= MINCLSIZE && space >= MCLBYTES) {
SOSEND_COUNTER_INCR(&sosend_copy_big);
m_clget(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0)
goto nopages;
mlen = MCLBYTES;
if (atomic && top == 0) {
len = lmin(MCLBYTES - max_hdr,
resid);
m->m_data += max_hdr;
} else
len = lmin(MCLBYTES, resid);
space -= len;
} else {
nopages:
SOSEND_COUNTER_INCR(&sosend_copy_small);
len = lmin(lmin(mlen, resid), space);
space -= len;
/*
* For datagram protocols, leave room
* for protocol headers in first mbuf.
*/
if (atomic && top == 0 && len < mlen)
m_align(m, len);
}
error = uiomove(mtod(m, void *), (int)len, uio);
have_data:
resid = uio->uio_resid;
m->m_len = len;
*mp = m;
top->m_pkthdr.len += len;
s = splsoftnet();
solock(so);
if (error != 0)
goto release;
mp = &m->m_next;
if (resid <= 0) {
if (flags & MSG_EOR)
top->m_flags |= M_EOR;
break;
}
} while (space > 0 && atomic);
if (so->so_state & SS_CANTSENDMORE) {
error = EPIPE;
goto release;
}
if (dontroute)
so->so_options |= SO_DONTROUTE;
if (resid > 0)
so->so_state |= SS_MORETOCOME;
if (flags & MSG_OOB) {
error = (*so->so_proto->pr_usrreqs->pr_sendoob)(
so, top, control);
} else {
error = (*so->so_proto->pr_usrreqs->pr_send)(so,
top, addr, control, l);
}
if (dontroute)
so->so_options &= ~SO_DONTROUTE;
if (resid > 0)
so->so_state &= ~SS_MORETOCOME;
clen = 0;
control = NULL;
top = NULL;
mp = &top;
if (error != 0)
goto release;
} while (resid && space > 0);
} while (resid);
release:
sbunlock(&so->so_snd);
out:
sounlock(so);
splx(s);
if (top)
m_freem(top);
if (control)
m_freem(control);
return error;
}
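/*
* Illustrative sketch (assumption: simplified soo_write()-style caller, not
* code from this file): writes reach sosend() through the so_send hook that
* socreate() installs, with no address, no control mbufs and no flags:
*
*   struct socket *so = fp->f_socket;
*
*   error = (*so->so_send)(so, NULL, uio, NULL, NULL, 0, curlwp);
*/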
/*
* Following replacement or removal of the first mbuf on the first
* mbuf chain of a socket buffer, push necessary state changes back
* into the socket buffer so that other consumers see the values
* consistently. 'nextrecord' is the caller's locally stored value of
* the original value of sb->sb_mb->m_nextpkt which must be restored
* when the lead mbuf changes. NOTE: 'nextrecord' may be NULL.
*/
static void
sbsync(struct sockbuf *sb, struct mbuf *nextrecord)
{
KASSERT(solocked(sb->sb_so));
/*
* First, update for the new value of nextrecord. If necessary,
* make it the first record.
*/
if (sb->sb_mb != NULL)
sb->sb_mb->m_nextpkt = nextrecord;
else
sb->sb_mb = nextrecord;
/*
* Now update any dependent socket buffer fields to reflect
* the new state. This is an inline of SB_EMPTY_FIXUP, with
* the addition of a second clause that takes care of the
* case where sb_mb has been updated, but remains the last
* record.
*/
if (sb->sb_mb == NULL) {
sb->sb_mbtail = NULL;
sb->sb_lastrecord = NULL;
} else if (sb->sb_mb->m_nextpkt == NULL)
sb->sb_lastrecord = sb->sb_mb;
}
/*
* Implement receive operations on a socket.
*
* We depend on the way that records are added to the sockbuf by sbappend*. In
* particular, each record (mbufs linked through m_next) must begin with an
* address if the protocol so specifies, followed by an optional mbuf or mbufs
* containing ancillary data, and then zero or more mbufs of data.
*
* In order to avoid blocking network interrupts for the entire time here, we
* splx() while doing the actual copy to user space. Although the sockbuf is
* locked, new data may still be appended, and thus we must maintain
* consistency of the sockbuf during that time.
*
* The caller may receive the data as a single mbuf chain by supplying an mbuf
* **mp0 for use in returning the chain. The uio is then used only for the
* count in uio_resid.
*/
int
soreceive(struct socket *so, struct mbuf **paddr, struct uio *uio,
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
struct lwp *l = curlwp;
struct mbuf *m, **mp, *mt;
size_t len, offset, moff, orig_resid;
int atomic, flags, error, s, type;
const struct protosw *pr;
struct mbuf *nextrecord;
int mbuf_removed = 0;
const struct domain *dom;
short wakeup_state = 0;
pr = so->so_proto;
atomic = pr->pr_flags & PR_ATOMIC;
dom = pr->pr_domain;
mp = mp0;
type = 0;
orig_resid = uio->uio_resid;
if (paddr != NULL)
*paddr = NULL;
if (controlp != NULL)
*controlp = NULL;
if (flagsp != NULL)
flags = *flagsp &~ MSG_EOR;
else
flags = 0;
if (flags & MSG_OOB) {
m = m_get(M_WAIT, MT_DATA);
solock(so);
error = (*pr->pr_usrreqs->pr_recvoob)(so, m, flags & MSG_PEEK);
sounlock(so);
if (error)
goto bad;
do {
error = uiomove(mtod(m, void *),
MIN(uio->uio_resid, m->m_len), uio);
m = m_free(m);
} while (uio->uio_resid > 0 && error == 0 && m);
bad:
if (m != NULL)
m_freem(m);
return error;
}
if (mp != NULL)
*mp = NULL;
/*
* solock() provides atomicity of access. splsoftnet() prevents
* protocol processing soft interrupts from interrupting us and
* blocking (expensive).
*/
s = splsoftnet();
solock(so);
restart:
if ((error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) != 0) {
sounlock(so);
splx(s);
return error;
}
m = so->so_rcv.sb_mb;
/*
* If we have less data than requested, block awaiting more
* (subject to any timeout) if:
* 1. the current count is less than the low water mark,
* 2. MSG_WAITALL is set, and it is possible to do the entire
* receive operation at once if we block (resid <= hiwat), or
* 3. MSG_DONTWAIT is not set.
* If MSG_WAITALL is set but resid is larger than the receive buffer,
* we have to do the receive in sections, and thus risk returning
* a short count if a timeout or signal occurs after we start.
*/
if (m == NULL ||
((flags & MSG_DONTWAIT) == 0 &&
so->so_rcv.sb_cc < uio->uio_resid &&
(so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
((flags & MSG_WAITALL) &&
uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
m->m_nextpkt == NULL && !atomic)) {
#ifdef DIAGNOSTIC
if (m == NULL && so->so_rcv.sb_cc)
panic("receive 1");
#endif
if (so->so_error || so->so_rerror) {
u_short *e;
if (m != NULL)
goto dontblock;
e = so->so_error ? &so->so_error : &so->so_rerror;
error = *e;
if ((flags & MSG_PEEK) == 0)
*e = 0;
goto release;
}
if (so->so_state & SS_CANTRCVMORE) {
if (m != NULL)
goto dontblock;
else
goto release;
}
for (; m != NULL; m = m->m_next)
if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
m = so->so_rcv.sb_mb;
goto dontblock;
}
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
(so->so_proto->pr_flags & PR_CONNREQUIRED)) {
error = ENOTCONN;
goto release;
}
if (uio->uio_resid == 0)
goto release;
if ((so->so_state & SS_NBIO) ||
(flags & (MSG_DONTWAIT|MSG_NBIO))) {
error = EWOULDBLOCK;
goto release;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
sbunlock(&so->so_rcv);
if (wakeup_state & SS_RESTARTSYS)
error = ERESTART;
else
error = sbwait(&so->so_rcv);
if (error != 0) {
sounlock(so);
splx(s);
return error;
}
wakeup_state = so->so_state;
goto restart;
}
dontblock:
/*
* On entry here, m points to the first record of the socket buffer.
* From this point onward, we maintain 'nextrecord' as a cache of the
* pointer to the next record in the socket buffer. We must keep the
* various socket buffer pointers and local stack versions of the
* pointers in sync, pushing out modifications before dropping the
* socket lock, and re-reading them when picking it up.
*
* Otherwise, we will race with the network stack appending new data
* or records onto the socket buffer by using inconsistent/stale
* versions of the field, possibly resulting in socket buffer
* corruption.
*
* By holding the high-level sblock(), we prevent simultaneous
* readers from pulling off the front of the socket buffer.
*/
if (l != NULL)
l->l_ru.ru_msgrcv++;
KASSERT(m == so->so_rcv.sb_mb);
SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
nextrecord = m->m_nextpkt;
if (pr->pr_flags & PR_ADDR) {
KASSERT(m->m_type == MT_SONAME);
orig_resid = 0;
if (flags & MSG_PEEK) {
if (paddr)
*paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
mbuf_removed = 1;
if (paddr != NULL) {
*paddr = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = NULL;
m = so->so_rcv.sb_mb;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
sbsync(&so->so_rcv, nextrecord);
}
}
if (pr->pr_flags & PR_ADDR_OPT) {
/*
* For SCTP we may be getting a whole message OR a partial
* delivery.
*/
if (m->m_type == MT_SONAME) {
orig_resid = 0;
if (flags & MSG_PEEK) {
if (paddr)
*paddr = m_copym(m, 0, m->m_len, M_DONTWAIT);
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
mbuf_removed = 1;
if (paddr) {
*paddr = m;
so->so_rcv.sb_mb = m->m_next;
m->m_next = 0;
m = so->so_rcv.sb_mb;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
sbsync(&so->so_rcv, nextrecord);
}
}
}
/*
* Process one or more MT_CONTROL mbufs present before any data mbufs
* in the first mbuf chain on the socket buffer. If MSG_PEEK, we
* just copy the data; if !MSG_PEEK, we call into the protocol to
* perform externalization (or freeing if controlp == NULL).
*/
if (__predict_false(m != NULL && m->m_type == MT_CONTROL)) {
struct mbuf *cm = NULL, *cmn;
struct mbuf **cme = &cm;
do {
if (flags & MSG_PEEK) {
if (controlp != NULL) {
*controlp = m_copym(m, 0, m->m_len, M_DONTWAIT);
controlp = (*controlp == NULL ? NULL :
&(*controlp)->m_next);
}
m = m->m_next;
} else {
sbfree(&so->so_rcv, m);
so->so_rcv.sb_mb = m->m_next;
m->m_next = NULL;
*cme = m;
cme = &(*cme)->m_next;
m = so->so_rcv.sb_mb;
}
} while (m != NULL && m->m_type == MT_CONTROL);
if ((flags & MSG_PEEK) == 0)
sbsync(&so->so_rcv, nextrecord);
for (; cm != NULL; cm = cmn) {
cmn = cm->m_next;
cm->m_next = NULL;
type = mtod(cm, struct cmsghdr *)->cmsg_type;
if (controlp != NULL) {
if (dom->dom_externalize != NULL &&
type == SCM_RIGHTS) {
sounlock(so);
splx(s);
error = (*dom->dom_externalize)(cm, l,
(flags & MSG_CMSG_CLOEXEC) ?
O_CLOEXEC : 0);
s = splsoftnet();
solock(so);
}
*controlp = cm;
while (*controlp != NULL)
controlp = &(*controlp)->m_next;
} else {
/*
* Dispose of any SCM_RIGHTS message that went
* through the read path rather than recv.
*/
if (dom->dom_dispose != NULL &&
type == SCM_RIGHTS) {
sounlock(so);
(*dom->dom_dispose)(cm);
solock(so);
}
m_freem(cm);
}
}
if (m != NULL)
nextrecord = so->so_rcv.sb_mb->m_nextpkt;
else
nextrecord = so->so_rcv.sb_mb;
orig_resid = 0;
}
/* If m is non-NULL, we have some data to read. */
if (__predict_true(m != NULL)) {
type = m->m_type;
if (type == MT_OOBDATA)
flags |= MSG_OOB;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
moff = 0;
offset = 0;
while (m != NULL && uio->uio_resid > 0 && error == 0) {
/*
* If the type of mbuf has changed, end the receive
* operation and do a short read.
*/
if (m->m_type == MT_OOBDATA) {
if (type != MT_OOBDATA)
break;
} else if (type == MT_OOBDATA) {
break;
} else if (m->m_type == MT_CONTROL) {
break;
}
#ifdef DIAGNOSTIC
else if (m->m_type != MT_DATA && m->m_type != MT_HEADER) {
panic("%s: m_type=%d", __func__, m->m_type);
}
#endif
so->so_state &= ~SS_RCVATMARK;
wakeup_state = 0;
len = uio->uio_resid;
if (so->so_oobmark && len > so->so_oobmark - offset)
len = so->so_oobmark - offset;
if (len > m->m_len - moff)
len = m->m_len - moff;
/*
* If mp is set, just pass back the mbufs.
* Otherwise copy them out via the uio, then free.
* Sockbuf must be consistent here (points to current mbuf,
* it points to next record) when we drop priority;
* we must note any additions to the sockbuf when we
* block interrupts again.
*/
if (mp == NULL) {
SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
sounlock(so);
splx(s);
error = uiomove(mtod(m, char *) + moff, len, uio);
s = splsoftnet();
solock(so);
if (error != 0) {
/*
* If any part of the record has been removed
* (such as the MT_SONAME mbuf, which will
* happen when PR_ADDR, and thus also
* PR_ATOMIC, is set), then drop the entire
* record to maintain the atomicity of the
* receive operation.
*
* This avoids a later panic("receive 1a")
* when compiled with DIAGNOSTIC.
*/
if (m && mbuf_removed && atomic)
(void) sbdroprecord(&so->so_rcv);
goto release;
}
} else {
uio->uio_resid -= len;
}
if (len == m->m_len - moff) {
if (m->m_flags & M_EOR)
flags |= MSG_EOR;
#ifdef SCTP
if (m->m_flags & M_NOTIFICATION)
flags |= MSG_NOTIFICATION;
#endif
if (flags & MSG_PEEK) {
m = m->m_next;
moff = 0;
} else {
nextrecord = m->m_nextpkt;
sbfree(&so->so_rcv, m);
if (mp) {
*mp = m;
mp = &m->m_next;
so->so_rcv.sb_mb = m = m->m_next;
*mp = NULL;
} else {
m = so->so_rcv.sb_mb = m_free(m);
}
/*
* If m != NULL, we also know that
* so->so_rcv.sb_mb != NULL.
*/
KASSERT(so->so_rcv.sb_mb == m);
if (m) {
m->m_nextpkt = nextrecord;
if (nextrecord == NULL)
so->so_rcv.sb_lastrecord = m;
} else {
so->so_rcv.sb_mb = nextrecord;
SB_EMPTY_FIXUP(&so->so_rcv);
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
}
} else if (flags & MSG_PEEK) {
moff += len;
} else {
if (mp != NULL) {
mt = m_copym(m, 0, len, M_NOWAIT);
if (__predict_false(mt == NULL)) {
sounlock(so);
mt = m_copym(m, 0, len, M_WAIT);
solock(so);
}
*mp = mt;
}
m->m_data += len;
m->m_len -= len;
so->so_rcv.sb_cc -= len;
}
if (so->so_oobmark) {
if ((flags & MSG_PEEK) == 0) {
so->so_oobmark -= len;
if (so->so_oobmark == 0) {
so->so_state |= SS_RCVATMARK;
break;
}
} else {
offset += len;
if (offset == so->so_oobmark)
break;
}
} else {
so->so_state &= ~SS_POLLRDBAND;
}
if (flags & MSG_EOR)
break;
/*
* If the MSG_WAITALL flag is set (for non-atomic socket),
* we must not quit until "uio->uio_resid == 0" or an error
* termination. If a signal/timeout occurs, return
* with a short count but without error.
* Keep sockbuf locked against other readers.
*/
while (flags & MSG_WAITALL && m == NULL &&
uio->uio_resid > 0 && !sosendallatonce(so) && !nextrecord) {
if (so->so_error || so->so_rerror ||
so->so_state & SS_CANTRCVMORE)
break;
/*
* If we are peeking and the socket receive buffer is
* full, stop since we can't get more data to peek at.
*/
if ((flags & MSG_PEEK) && sbspace(&so->so_rcv) <= 0)
break;
/*
* If we've drained the socket buffer, tell the
* protocol in case it needs to do something to
* get it filled again.
*/
if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb)
(*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
if (wakeup_state & SS_RESTARTSYS)
error = ERESTART;
else
error = sbwait(&so->so_rcv);
if (error != 0) {
sbunlock(&so->so_rcv);
sounlock(so);
splx(s);
return 0;
}
if ((m = so->so_rcv.sb_mb) != NULL)
nextrecord = m->m_nextpkt;
wakeup_state = so->so_state;
}
}
if (m && atomic) {
flags |= MSG_TRUNC;
if ((flags & MSG_PEEK) == 0)
(void) sbdroprecord(&so->so_rcv);
}
if ((flags & MSG_PEEK) == 0) {
if (m == NULL) {
/*
* First part is an inline SB_EMPTY_FIXUP(). Second
* part makes sure sb_lastrecord is up-to-date if
* there is still data in the socket buffer.
*/
so->so_rcv.sb_mb = nextrecord;
if (so->so_rcv.sb_mb == NULL) {
so->so_rcv.sb_mbtail = NULL;
so->so_rcv.sb_lastrecord = NULL;
} else if (nextrecord->m_nextpkt == NULL)
so->so_rcv.sb_lastrecord = nextrecord;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
(*pr->pr_usrreqs->pr_rcvd)(so, flags, l);
}
if (orig_resid == uio->uio_resid && orig_resid &&
(flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
sbunlock(&so->so_rcv);
goto restart;
}
if (flagsp != NULL)
*flagsp |= flags;
release:
sbunlock(&so->so_rcv);
sounlock(so);
splx(s);
return error;
}
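/*
* Illustrative sketch (assumption: simplified soo_read()-style caller, not
* code from this file): a plain read passes NULL for paddr, mp0, controlp
* and flagsp, so all data is returned through the uio only:
*
*   error = (*so->so_receive)(so, NULL, uio, NULL, NULL, NULL);
*/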
int
soshutdown(struct socket *so, int how)
{
const struct protosw *pr;
int error;
KASSERT(solocked(so));
pr = so->so_proto;
if (!(how == SHUT_RD || how == SHUT_WR || how == SHUT_RDWR))
return EINVAL;
if (how == SHUT_RD || how == SHUT_RDWR) {
sorflush(so);
error = 0;
}
if (how == SHUT_WR || how == SHUT_RDWR)
error = (*pr->pr_usrreqs->pr_shutdown)(so);
return error;
}
void
sorestart(struct socket *so)
{
/*
* An application has called close() on an fd on which another
* of its threads has called a socket system call.
* Mark this and wake everyone up, and code that would block again
* instead returns ERESTART.
* On system call re-entry the fd is validated and EBADF returned.
* Any other fd will block again on the 2nd syscall.
*/
solock(so);
so->so_state |= SS_RESTARTSYS;
cv_broadcast(&so->so_cv);
cv_broadcast(&so->so_snd.sb_cv);
cv_broadcast(&so->so_rcv.sb_cv);
sounlock(so);
}
void
sorflush(struct socket *so)
{
struct sockbuf *sb, asb;
const struct protosw *pr;
KASSERT(solocked(so));
sb = &so->so_rcv;
pr = so->so_proto;
socantrcvmore(so);
sb->sb_flags |= SB_NOINTR;
(void)sblock(sb, M_WAITOK);
sbunlock(sb);
asb = *sb;
/*
* Clear most of the sockbuf structure, but leave some of the
* fields valid.
*/
memset(&sb->sb_startzero, 0,
sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) {
sounlock(so);
(*pr->pr_domain->dom_dispose)(asb.sb_mb);
solock(so);
}
sbrelease(&asb, so);
}
/*
* internal set SOL_SOCKET options
*/
static int
sosetopt1(struct socket *so, const struct sockopt *sopt)
{
int error, opt;
int optval = 0; /* XXX: gcc */
struct linger l;
struct timeval tv;
opt = sopt->sopt_name;
switch (opt) {
case SO_ACCEPTFILTER:
error = accept_filt_setopt(so, sopt);
KASSERT(solocked(so));
break;
case SO_LINGER:
error = sockopt_get(sopt, &l, sizeof(l));
solock(so);
if (error)
break;
if (l.l_linger < 0 || l.l_linger > USHRT_MAX ||
l.l_linger > (INT_MAX / hz)) {
error = EDOM;
break;
}
so->so_linger = l.l_linger;
if (l.l_onoff)
so->so_options |= SO_LINGER;
else
so->so_options &= ~SO_LINGER;
break;
case SO_DEBUG:
case SO_KEEPALIVE:
case SO_DONTROUTE:
case SO_USELOOPBACK:
case SO_BROADCAST:
case SO_REUSEADDR:
case SO_REUSEPORT:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_NOSIGPIPE:
case SO_RERROR:
error = sockopt_getint(sopt, &optval);
solock(so);
if (error)
break;
if (optval)
so->so_options |= opt;
else
so->so_options &= ~opt;
break;
case SO_SNDBUF:
case SO_RCVBUF:
case SO_SNDLOWAT:
case SO_RCVLOWAT:
error = sockopt_getint(sopt, &optval);
solock(so);
if (error)
break;
/*
* Values < 1 make no sense for any of these
* options, so disallow them.
*/
if (optval < 1) {
error = EINVAL;
break;
}
switch (opt) {
case SO_SNDBUF:
if (sbreserve(&so->so_snd, (u_long)optval, so) == 0) {
error = ENOBUFS;
break;
}
if (sofixedbuf)
so->so_snd.sb_flags &= ~SB_AUTOSIZE;
break;
case SO_RCVBUF:
if (sbreserve(&so->so_rcv, (u_long)optval, so) == 0) {
error = ENOBUFS;
break;
}
if (sofixedbuf)
so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
break;
/*
* Make sure the low-water is never greater than
* the high-water.
*/
case SO_SNDLOWAT:
if (optval > so->so_snd.sb_hiwat)
optval = so->so_snd.sb_hiwat;
so->so_snd.sb_lowat = optval;
break;
case SO_RCVLOWAT:
if (optval > so->so_rcv.sb_hiwat)
optval = so->so_rcv.sb_hiwat;
so->so_rcv.sb_lowat = optval;
break;
}
break;
case SO_SNDTIMEO:
case SO_RCVTIMEO:
solock(so);
error = sockopt_get(sopt, &tv, sizeof(tv));
if (error)
break;
if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
error = EDOM;
break;
}
if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
error = EDOM;
break;
}
optval = tv.tv_sec * hz + tv.tv_usec / tick;
if (optval == 0 && tv.tv_usec != 0)
optval = 1;
switch (opt) {
case SO_SNDTIMEO:
so->so_snd.sb_timeo = optval;
break;
case SO_RCVTIMEO:
so->so_rcv.sb_timeo = optval;
break;
}
break;
default:
MODULE_HOOK_CALL(uipc_socket_50_setopt1_hook,
(opt, so, sopt), enosys(), error);
if (error == ENOSYS || error == EPASSTHROUGH) {
solock(so);
error = ENOPROTOOPT;
}
break;
}
KASSERT(solocked(so));
return error;
}
int
sosetopt(struct socket *so, struct sockopt *sopt)
{
int error, prerr;
if (sopt->sopt_level == SOL_SOCKET) {
error = sosetopt1(so, sopt);
KASSERT(solocked(so));
} else {
error = ENOPROTOOPT;
solock(so);
}
if ((error == 0 || error == ENOPROTOOPT) &&
so->so_proto != NULL && so->so_proto->pr_ctloutput != NULL) {
/* give the protocol stack a shot */
prerr = (*so->so_proto->pr_ctloutput)(PRCO_SETOPT, so, sopt);
if (prerr == 0)
error = 0;
else if (prerr != ENOPROTOOPT)
error = prerr;
}
sounlock(so);
return error;
}
/*
* so_setsockopt() is a wrapper providing a sockopt structure for sosetopt()
*/
int
so_setsockopt(struct lwp *l, struct socket *so, int level, int name,
const void *val, size_t valsize)
{
struct sockopt sopt;
int error;
KASSERT(valsize == 0 || val != NULL);
sockopt_init(&sopt, level, name, valsize);
sockopt_set(&sopt, val, valsize);
error = sosetopt(so, &sopt);
sockopt_destroy(&sopt);
return error;
}
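/*
* Illustrative sketch (hypothetical in-kernel caller): setting a SOL_SOCKET
* option without building the sockopt by hand, e.g. a five second receive
* timeout on an already created socket "so":
*
*   struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
*
*   error = so_setsockopt(curlwp, so, SOL_SOCKET, SO_RCVTIMEO,
*       &tv, sizeof(tv));
*/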
/*
* internal get SOL_SOCKET options
*/
static int
sogetopt1(struct socket *so, struct sockopt *sopt)
{
int error, optval, opt;
struct linger l;
struct timeval tv;
switch ((opt = sopt->sopt_name)) {
case SO_ACCEPTFILTER:
error = accept_filt_getopt(so, sopt);
break;
case SO_LINGER:
l.l_onoff = (so->so_options & SO_LINGER) ? 1 : 0;
l.l_linger = so->so_linger;
error = sockopt_set(sopt, &l, sizeof(l));
break;
case SO_USELOOPBACK:
case SO_DONTROUTE:
case SO_DEBUG:
case SO_KEEPALIVE:
case SO_REUSEADDR:
case SO_REUSEPORT:
case SO_BROADCAST:
case SO_OOBINLINE:
case SO_TIMESTAMP:
case SO_NOSIGPIPE:
case SO_RERROR:
case SO_ACCEPTCONN:
error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
break;
case SO_TYPE:
error = sockopt_setint(sopt, so->so_type);
break;
case SO_ERROR:
if (so->so_error == 0) {
so->so_error = so->so_rerror;
so->so_rerror = 0;
}
error = sockopt_setint(sopt, so->so_error);
so->so_error = 0;
break;
case SO_SNDBUF:
error = sockopt_setint(sopt, so->so_snd.sb_hiwat);
break;
case SO_RCVBUF:
error = sockopt_setint(sopt, so->so_rcv.sb_hiwat);
break;
case SO_SNDLOWAT:
error = sockopt_setint(sopt, so->so_snd.sb_lowat);
break;
case SO_RCVLOWAT:
error = sockopt_setint(sopt, so->so_rcv.sb_lowat);
break;
case SO_SNDTIMEO:
case SO_RCVTIMEO:
optval = (opt == SO_SNDTIMEO ?
so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
memset(&tv, 0, sizeof(tv));
tv.tv_sec = optval / hz;
tv.tv_usec = (optval % hz) * tick;
error = sockopt_set(sopt, &tv, sizeof(tv));
break;
case SO_OVERFLOWED:
error = sockopt_setint(sopt, so->so_rcv.sb_overflowed);
break;
default:
MODULE_HOOK_CALL(uipc_socket_50_getopt1_hook,
(opt, so, sopt), enosys(), error);
if (error)
error = ENOPROTOOPT;
break;
}
return error;
}
int
sogetopt(struct socket *so, struct sockopt *sopt)
{
int error;
solock(so);
if (sopt->sopt_level != SOL_SOCKET) {
if (so->so_proto && so->so_proto->pr_ctloutput) {
error = ((*so->so_proto->pr_ctloutput)
(PRCO_GETOPT, so, sopt));
} else
error = (ENOPROTOOPT);
} else {
error = sogetopt1(so, sopt);
}
sounlock(so);
return error;
}
/*
* alloc sockopt data buffer
* - will be released at destroy
*/
static int
sockopt_alloc(struct sockopt *sopt, size_t len, km_flag_t kmflag)
{
void *data;
KASSERT(sopt->sopt_size == 0);
if (len > sizeof(sopt->sopt_buf)) {
data = kmem_zalloc(len, kmflag);
if (data == NULL)
return ENOMEM;
sopt->sopt_data = data;
} else
sopt->sopt_data = sopt->sopt_buf;
sopt->sopt_size = len;
return 0;
}
/*
* initialise sockopt storage
* - MAY sleep during allocation
*/
void
sockopt_init(struct sockopt *sopt, int level, int name, size_t size)
{
memset(sopt, 0, sizeof(*sopt));
sopt->sopt_level = level;
sopt->sopt_name = name;
(void)sockopt_alloc(sopt, size, KM_SLEEP);
}
/*
* destroy sockopt storage
* - will release any held memory references
*/
void
sockopt_destroy(struct sockopt *sopt)
{
if (sopt->sopt_data != sopt->sopt_buf)
kmem_free(sopt->sopt_data, sopt->sopt_size);
memset(sopt, 0, sizeof(*sopt));
}
/*
* set sockopt value
* - value is copied into sockopt
* - memory is allocated when necessary, will not sleep
*/
int
sockopt_set(struct sockopt *sopt, const void *buf, size_t len)
{
int error;
if (sopt->sopt_size == 0) {
error = sockopt_alloc(sopt, len, KM_NOSLEEP);
if (error)
return error;
}
sopt->sopt_retsize = MIN(sopt->sopt_size, len);
if (sopt->sopt_retsize > 0) {
memcpy(sopt->sopt_data, buf, sopt->sopt_retsize);
}
return 0;
}
/*
* common case of set sockopt integer value
*/
int
sockopt_setint(struct sockopt *sopt, int val)
{
return sockopt_set(sopt, &val, sizeof(int));
}
/*
* get sockopt value
* - correct size must be given
*/
int
sockopt_get(const struct sockopt *sopt, void *buf, size_t len)
{
if (sopt->sopt_size != len)
return EINVAL;
memcpy(buf, sopt->sopt_data, len);
return 0;
}
/*
* common case of get sockopt integer value
*/
int
sockopt_getint(const struct sockopt *sopt, int *valp)
{
return sockopt_get(sopt, valp, sizeof(int));
}
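/*
* Illustrative sketch of the sockopt life cycle the helpers above provide
* (hypothetical caller; the heap buffer, if any, is released by
* sockopt_destroy()):
*
*   struct sockopt sopt;
*   int val;
*
*   sockopt_init(&sopt, SOL_SOCKET, SO_REUSEADDR, sizeof(int));
*   sockopt_setint(&sopt, 1);           // copy value into the sockopt
*   ...hand &sopt to sosetopt() or a pr_ctloutput routine...
*   sockopt_getint(&sopt, &val);        // size must match exactly
*   sockopt_destroy(&sopt);
*/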
/*
* set sockopt value from mbuf
* - ONLY for legacy code
* - mbuf is released by sockopt
* - will not sleep
*/
int
sockopt_setmbuf(struct sockopt *sopt, struct mbuf *m)
{
size_t len;
int error;
len = m_length(m);
if (sopt->sopt_size == 0) {
error = sockopt_alloc(sopt, len, KM_NOSLEEP);
if (error)
return error;
}
sopt->sopt_retsize = MIN(sopt->sopt_size, len);
m_copydata(m, 0, sopt->sopt_retsize, sopt->sopt_data);
m_freem(m);
return 0;
}
/*
* get sockopt value into mbuf
* - ONLY for legacy code
* - mbuf to be released by the caller
* - will not sleep
*/
struct mbuf *
sockopt_getmbuf(const struct sockopt *sopt)
{
struct mbuf *m;
if (sopt->sopt_size > MCLBYTES)
return NULL;
m = m_get(M_DONTWAIT, MT_SOOPTS);
if (m == NULL)
return NULL;
if (sopt->sopt_size > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
memcpy(mtod(m, void *), sopt->sopt_data, sopt->sopt_size);
m->m_len = sopt->sopt_size;
return m;
}
void
sohasoutofband(struct socket *so)
{
so->so_state |= SS_POLLRDBAND;
fownsignal(so->so_pgid, SIGURG, POLL_PRI, POLLPRI|POLLRDBAND, so);
selnotify(&so->so_rcv.sb_sel, POLLPRI | POLLRDBAND, NOTE_SUBMIT);
}
static void
filt_sordetach(struct knote *kn)
{
struct socket *so;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
if (selremove_knote(&so->so_rcv.sb_sel, kn))
so->so_rcv.sb_flags &= ~SB_KNOTE;
sounlock(so);
}
/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = so->so_rcv.sb_cc;
if (so->so_state & SS_CANTRCVMORE) {
knote_set_eof(kn, 0);
kn->kn_fflags = so->so_error;
rv = 1;
} else if (so->so_error || so->so_rerror)
rv = 1;
else if (kn->kn_sfflags & NOTE_LOWAT)
rv = (kn->kn_data >= kn->kn_sdata);
else
rv = (kn->kn_data >= so->so_rcv.sb_lowat);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static void
filt_sowdetach(struct knote *kn)
{
struct socket *so;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
if (selremove_knote(&so->so_snd.sb_sel, kn))
so->so_snd.sb_flags &= ~SB_KNOTE;
sounlock(so);
}
/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = sbspace(&so->so_snd);
if (so->so_state & SS_CANTSENDMORE) {
knote_set_eof(kn, 0);
kn->kn_fflags = so->so_error;
rv = 1;
} else if (so->so_error)
rv = 1;
else if (((so->so_state & SS_ISCONNECTED) == 0) &&
(so->so_proto->pr_flags & PR_CONNREQUIRED))
rv = 0;
else if (kn->kn_sfflags & NOTE_LOWAT)
rv = (kn->kn_data >= kn->kn_sdata);
else
rv = (kn->kn_data >= so->so_snd.sb_lowat);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static int
filt_soempty(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
if (hint != NOTE_SUBMIT)
solock(so);
rv = (kn->kn_data = sbused(&so->so_snd)) == 0 ||
(so->so_options & SO_ACCEPTCONN) != 0;
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
/*ARGSUSED*/
static int
filt_solisten(struct knote *kn, long hint)
{
struct socket *so;
int rv;
so = ((file_t *)kn->kn_obj)->f_socket;
/*
* Set kn_data to number of incoming connections, not
* counting partial (incomplete) connections.
*/
if (hint != NOTE_SUBMIT)
solock(so);
kn->kn_data = so->so_qlen;
rv = (kn->kn_data > 0);
if (hint != NOTE_SUBMIT)
sounlock(so);
return rv;
}
static const struct filterops solisten_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sordetach,
.f_event = filt_solisten,
};
static const struct filterops soread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sordetach,
.f_event = filt_soread,
};
static const struct filterops sowrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sowdetach,
.f_event = filt_sowrite,
};
static const struct filterops soempty_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_sowdetach,
.f_event = filt_soempty,
};
int
soo_kqfilter(struct file *fp, struct knote *kn)
{
struct socket *so;
struct sockbuf *sb;
so = ((file_t *)kn->kn_obj)->f_socket;
solock(so);
switch (kn->kn_filter) {
case EVFILT_READ:
if (so->so_options & SO_ACCEPTCONN)
kn->kn_fop = &solisten_filtops;
else
kn->kn_fop = &soread_filtops;
sb = &so->so_rcv;
break;
case EVFILT_WRITE:
kn->kn_fop = &sowrite_filtops;
sb = &so->so_snd;
#ifdef PIPE_SOCKETPAIR
if (so->so_state & SS_ISAPIPE) {
/* Other end of pipe has been closed. */
if (so->so_state & SS_ISDISCONNECTED) {
sounlock(so);
return EBADF;
}
}
#endif
break;
case EVFILT_EMPTY:
kn->kn_fop = &soempty_filtops;
sb = &so->so_snd;
break;
default:
sounlock(so);
return EINVAL;
}
selrecord_knote(&sb->sb_sel, kn);
sb->sb_flags |= SB_KNOTE;
sounlock(so);
return 0;
}
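/*
* Illustrative userland sketch (assumption: standard kqueue(2)/kevent(2)
* usage, not code from this file) of driving the filters attached above;
* "sockfd" is a hypothetical socket descriptor:
*
*   struct kevent ev;
*   int kq = kqueue();
*
*   EV_SET(&ev, sockfd, EVFILT_READ, EV_ADD, 0, 0, 0);
*   if (kevent(kq, &ev, 1, NULL, 0, NULL) == -1)  // registers via soo_kqfilter()
*       err(1, "kevent");
*/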
static int
sodopoll(struct socket *so, int events)
{
int revents;
revents = 0;
if (events & (POLLIN | POLLRDNORM))
if (soreadable(so))
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (sowritable(so))
revents |= events & (POLLOUT | POLLWRNORM);
if (events & (POLLPRI | POLLRDBAND))
if (so->so_state & SS_POLLRDBAND)
revents |= events & (POLLPRI | POLLRDBAND);
return revents;
}
int
sopoll(struct socket *so, int events)
{
int revents = 0;
#ifndef DIAGNOSTIC
/*
* Do a quick, unlocked check in expectation that the socket
* will be ready for I/O. Don't do this check if DIAGNOSTIC,
* as the solocked() assertions will fail.
*/
if ((revents = sodopoll(so, events)) != 0)
return revents;
#endif
solock(so);
if ((revents = sodopoll(so, events)) == 0) {
if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
selrecord(curlwp, &so->so_rcv.sb_sel);
so->so_rcv.sb_flags |= SB_NOTIFY;
}
if (events & (POLLOUT | POLLWRNORM)) {
selrecord(curlwp, &so->so_snd.sb_sel);
so->so_snd.sb_flags |= SB_NOTIFY;
}
}
sounlock(so);
return revents;
}
struct mbuf **
sbsavetimestamp(int opt, struct mbuf **mp)
{
struct timeval tv;
int error;
memset(&tv, 0, sizeof(tv));
microtime(&tv);
MODULE_HOOK_CALL(uipc_socket_50_sbts_hook, (opt, &mp), enosys(), error);
if (error == 0)
return mp;
if (opt & SO_TIMESTAMP) {
*mp = sbcreatecontrol(&tv, sizeof(tv),
SCM_TIMESTAMP, SOL_SOCKET);
if (*mp)
mp = &(*mp)->m_next;
}
return mp;
}
#include <sys/sysctl.h>
static int sysctl_kern_somaxkva(SYSCTLFN_PROTO);
static int sysctl_kern_sbmax(SYSCTLFN_PROTO);
/*
* sysctl helper routine for kern.somaxkva. ensures that the given
* value is not too small.
* (XXX should we maybe make sure it's not too large as well?)
*/
static int
sysctl_kern_somaxkva(SYSCTLFN_ARGS)
{
int error, new_somaxkva;
struct sysctlnode node;
new_somaxkva = somaxkva;
node = *rnode;
node.sysctl_data = &new_somaxkva;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (new_somaxkva < (16 * 1024 * 1024)) /* sanity */
return EINVAL;
mutex_enter(&so_pendfree_lock);
somaxkva = new_somaxkva;
cv_broadcast(&socurkva_cv);
mutex_exit(&so_pendfree_lock);
return error;
}
/*
* sysctl helper routine for kern.sbmax. Basically just ensures that
* any new value is not too small.
*/
static int
sysctl_kern_sbmax(SYSCTLFN_ARGS)
{
int error, new_sbmax;
struct sysctlnode node;
new_sbmax = sb_max;
node = *rnode;
node.sysctl_data = &new_sbmax;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
KERNEL_LOCK(1, NULL);
error = sb_max_set(new_sbmax);
KERNEL_UNLOCK_ONE(NULL);
return error;
}
/*
* sysctl helper routine for kern.sooptions. Ensures that only allowed
* options can be set.
*/
static int
sysctl_kern_sooptions(SYSCTLFN_ARGS)
{
int error, new_options;
struct sysctlnode node;
new_options = sooptions;
node = *rnode;
node.sysctl_data = &new_options;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (new_options & ~SO_DEFOPTS)
return EINVAL;
sooptions = new_options;
return 0;
}
static void
sysctl_kern_socket_setup(void)
{
KASSERT(socket_sysctllog == NULL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "somaxkva",
SYSCTL_DESCR("Maximum amount of kernel memory to be "
"used for socket buffers"),
sysctl_kern_somaxkva, 0, NULL, 0,
CTL_KERN, KERN_SOMAXKVA, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_BOOL, "sofixedbuf",
SYSCTL_DESCR("Prevent scaling of fixed socket buffers"),
NULL, 0, &sofixedbuf, 0,
CTL_KERN, KERN_SOFIXEDBUF, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sbmax",
SYSCTL_DESCR("Maximum socket buffer size"),
sysctl_kern_sbmax, 0, NULL, 0,
CTL_KERN, KERN_SBMAX, CTL_EOL);
sysctl_createv(&socket_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sooptions",
SYSCTL_DESCR("Default socket options"),
sysctl_kern_sooptions, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
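/*
* Illustrative userland sketch (assumption: standard sysctlbyname(3), not
* code from this file) of tuning one of the nodes created above; values
* below 16MB are rejected by sysctl_kern_somaxkva():
*
*   int newkva = 32 * 1024 * 1024;
*
*   if (sysctlbyname("kern.somaxkva", NULL, NULL,
*       &newkva, sizeof(newkva)) == -1)
*       err(1, "sysctlbyname");
*/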
/* $NetBSD: ptyfs_subr.c,v 1.34 2020/11/27 14:43:57 christos Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ptyfs_subr.c 8.6 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ptyfs_subr.c,v 1.34 2020/11/27 14:43:57 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/select.h>
#include <sys/tty.h>
#include <sys/pty.h>
#include <sys/kauth.h>
#include <sys/lwp.h>
#include <fs/ptyfs/ptyfs.h>
static kmutex_t ptyfs_hashlock;
static SLIST_HEAD(ptyfs_hashhead, ptyfsnode) *ptyfs_node_tbl;
static u_long ptyfs_node_mask; /* size of hash table - 1 */
/*
* allocate a ptyfsnode/vnode pair. the vnode is referenced.
*
* the pty, ptyfs_type, and mount point uniquely
* identify a ptyfsnode. the mount point is needed
* because someone might mount this filesystem
* twice.
*/
int
ptyfs_allocvp(struct mount *mp, struct vnode **vpp, ptyfstype type, int pty)
{
struct ptyfskey key;
memset(&key, 0, sizeof(key));
key.ptk_pty = pty;
key.ptk_type = type;
return vcache_get(mp, &key, sizeof(key), vpp);
}
/*
* Initialize ptyfsnode hash table.
*/
void
ptyfs_hashinit(void)
{
ptyfs_node_tbl = hashinit(16, HASH_SLIST, true, &ptyfs_node_mask);
mutex_init(&ptyfs_hashlock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Free ptyfsnode hash table.
*/
void
ptyfs_hashdone(void)
{
mutex_destroy(&ptyfs_hashlock);
hashdone(ptyfs_node_tbl, HASH_SLIST, ptyfs_node_mask);
}
/*
* Get a ptyfsnode from the hash table, or allocate one.
*/
struct ptyfsnode *
ptyfs_get_node(ptyfstype type, int pty)
{
struct ptyfs_hashhead *ppp;
struct ptyfsnode *pp;
ppp = &ptyfs_node_tbl[PTYFS_FILENO(type, pty) & ptyfs_node_mask];
mutex_enter(&ptyfs_hashlock);
SLIST_FOREACH(pp, ppp, ptyfs_hash) {
if (pty == pp->ptyfs_pty && pp->ptyfs_type == type) {
mutex_exit(&ptyfs_hashlock);
return pp;
}
}
mutex_exit(&ptyfs_hashlock);
pp = malloc(sizeof(struct ptyfsnode), M_TEMP, M_WAITOK);
pp->ptyfs_pty = pty;
pp->ptyfs_type = type;
pp->ptyfs_fileno = PTYFS_FILENO(type, pty);
if (pp->ptyfs_type == PTYFSroot)
pp->ptyfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|
S_IROTH|S_IXOTH;
else
pp->ptyfs_mode = S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP|
S_IROTH|S_IWOTH;
pp->ptyfs_uid = pp->ptyfs_gid = 0;
pp->ptyfs_status = PTYFS_CHANGE;
PTYFS_ITIMES(pp, NULL, NULL, NULL);
pp->ptyfs_birthtime = pp->ptyfs_mtime =
pp->ptyfs_atime = pp->ptyfs_ctime;
pp->ptyfs_flags = 0;
mutex_enter(&ptyfs_hashlock);
/*
* XXX There is a small race condition when the master side is
* opened for the first time, if other threads try to open the
* same device through other mount points. As a result we may
* end up with a few unused list entries.
*/
SLIST_INSERT_HEAD(ppp, pp, ptyfs_hash);
mutex_exit(&ptyfs_hashlock);
return pp;
}
/*
* Mark this controlling pty as active.
*/
void
ptyfs_set_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
/* Reallocate map if needed. */
if (pty >= pmnt->pmnt_bitmap_size * NBBY) {
int osize, nsize;
uint8_t *obitmap, *nbitmap;
nsize = roundup(howmany(pty + 1, NBBY), 64);
nbitmap = kmem_alloc(nsize, KM_SLEEP);
mutex_enter(&pmnt->pmnt_lock);
if (pty < pmnt->pmnt_bitmap_size * NBBY) {
mutex_exit(&pmnt->pmnt_lock);
kmem_free(nbitmap, nsize);
} else {
osize = pmnt->pmnt_bitmap_size;
obitmap = pmnt->pmnt_bitmap;
pmnt->pmnt_bitmap_size = nsize;
pmnt->pmnt_bitmap = nbitmap;
if (osize > 0)
memcpy(pmnt->pmnt_bitmap, obitmap, osize);
memset(pmnt->pmnt_bitmap + osize, 0, nsize - osize);
mutex_exit(&pmnt->pmnt_lock);
if (osize > 0)
kmem_free(obitmap, osize);
}
}
mutex_enter(&pmnt->pmnt_lock);
setbit(pmnt->pmnt_bitmap, pty);
mutex_exit(&pmnt->pmnt_lock);
}
/*
* Mark this controlling pty as inactive.
*/
void
ptyfs_clr_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
mutex_enter(&pmnt->pmnt_lock);
if (pty >= 0 && pty < pmnt->pmnt_bitmap_size * NBBY)
clrbit(pmnt->pmnt_bitmap, pty);
mutex_exit(&pmnt->pmnt_lock);
}
/*
* Lookup the next active controlling pty greater than or equal to "pty".
* Return -1 if not found.
*/
int
ptyfs_next_active(struct mount *mp, int pty)
{
struct ptyfsmount *pmnt = VFSTOPTY(mp);
KASSERT(pty >= 0);
mutex_enter(&pmnt->pmnt_lock);
while (pty < pmnt->pmnt_bitmap_size * NBBY) {
if (isset(pmnt->pmnt_bitmap, pty)) {
mutex_exit(&pmnt->pmnt_lock);
return pty;
}
pty++;
}
mutex_exit(&pmnt->pmnt_lock);
return -1;
}
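/*
 * Illustrative sketch (kernel-only context, not compiled here): how a
 * caller such as the ptyfs readdir code might walk every active pty by
 * repeatedly asking for the next set bit at or above the current
 * index.  The visit_pty() callback is hypothetical.
 */
#if 0
static void
ptyfs_walk_active(struct mount *mp, void (*visit_pty)(int))
{
        int pty = 0;

        while ((pty = ptyfs_next_active(mp, pty)) != -1) {
                visit_pty(pty);
                pty++;          /* resume the scan after this entry */
        }
}
#endif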
/* $NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Matt Thomas <matt@3am-software.com>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <sys/types.h>
#include <stddef.h>
#include <assert.h>
#include <stdbool.h>
#ifdef RBDEBUG
#define KASSERT(s) assert(s)
#define __rbt_unused
#else
#define KASSERT(s) do { } while (/*CONSTCOND*/ 0)
#define __rbt_unused __unused
#endif
__RCSID("$NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $");
#else
#include <lib/libkern/libkern.h>
__KERNEL_RCSID(0, "$NetBSD: rb.c,v 1.16 2021/09/16 21:29:41 andvar Exp $");
#ifndef DIAGNOSTIC
#define __rbt_unused __unused
#else
#define __rbt_unused
#endif
#endif
#ifdef _LIBC
__weak_alias(rb_tree_init, _rb_tree_init)
__weak_alias(rb_tree_find_node, _rb_tree_find_node)
__weak_alias(rb_tree_find_node_geq, _rb_tree_find_node_geq)
__weak_alias(rb_tree_find_node_leq, _rb_tree_find_node_leq)
__weak_alias(rb_tree_insert_node, _rb_tree_insert_node)
__weak_alias(rb_tree_remove_node, _rb_tree_remove_node)
__weak_alias(rb_tree_iterate, _rb_tree_iterate)
#ifdef RBDEBUG
__weak_alias(rb_tree_check, _rb_tree_check)
__weak_alias(rb_tree_depths, _rb_tree_depths)
#endif
#include "namespace.h"
#endif
#ifdef RBTEST
#include "rbtree.h"
#else
#include <sys/rbtree.h>
#endif
static void rb_tree_insert_rebalance(struct rb_tree *, struct rb_node *);
static void rb_tree_removal_rebalance(struct rb_tree *, struct rb_node *,
unsigned int);
#ifdef RBDEBUG
static const struct rb_node *rb_tree_iterate_const(const struct rb_tree *,
const struct rb_node *, const unsigned int);
static bool rb_tree_check_node(const struct rb_tree *, const struct rb_node *,
const struct rb_node *, bool);
#else
#define rb_tree_check_node(a, b, c, d) true
#endif
#define RB_NODETOITEM(rbto, rbn) \
((void *)((uintptr_t)(rbn) - (rbto)->rbto_node_offset))
#define RB_ITEMTONODE(rbto, rbn) \
((rb_node_t *)((uintptr_t)(rbn) + (rbto)->rbto_node_offset))
#define RB_SENTINEL_NODE NULL
void
rb_tree_init(struct rb_tree *rbt, const rb_tree_ops_t *ops)
{
rbt->rbt_ops = ops;
rbt->rbt_root = RB_SENTINEL_NODE;
RB_TAILQ_INIT(&rbt->rbt_nodes);
#ifndef RBSMALL
rbt->rbt_minmax[RB_DIR_LEFT] = rbt->rbt_root; /* minimum node */
rbt->rbt_minmax[RB_DIR_RIGHT] = rbt->rbt_root; /* maximum node */
#endif
#ifdef RBSTATS
rbt->rbt_count = 0;
rbt->rbt_insertions = 0;
rbt->rbt_removals = 0;
rbt->rbt_insertion_rebalance_calls = 0;
rbt->rbt_insertion_rebalance_passes = 0;
rbt->rbt_removal_rebalance_calls = 0;
rbt->rbt_removal_rebalance_passes = 0;
#endif
}
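/*
 * Illustrative sketch (userland, against <sys/rbtree.h>): embedding an
 * rb_node_t in a caller structure and describing it to the tree with
 * an rb_tree_ops_t whose rbto_node_offset points at that member.  The
 * struct intnode type and both comparators are examples, not part of
 * this file.
 */
#if 0
#include <sys/rbtree.h>
#include <stddef.h>

struct intnode {
        rb_node_t in_rbnode;    /* linkage used by the tree */
        int in_key;             /* payload we sort on */
};

static signed int
intnode_compare_nodes(void *ctx, const void *a, const void *b)
{
        const struct intnode *ia = a, *ib = b;

        return (ia->in_key < ib->in_key) ? -1 : (ia->in_key > ib->in_key);
}

static signed int
intnode_compare_key(void *ctx, const void *n, const void *key)
{
        const struct intnode *in = n;
        const int k = *(const int *)key;

        return (in->in_key < k) ? -1 : (in->in_key > k);
}

static const rb_tree_ops_t intnode_tree_ops = {
        .rbto_compare_nodes = intnode_compare_nodes,
        .rbto_compare_key = intnode_compare_key,
        .rbto_node_offset = offsetof(struct intnode, in_rbnode),
        .rbto_context = NULL,
};
#endif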
void *
rb_tree_find_node(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
parent = parent->rb_nodes[diff < 0];
}
return NULL;
}
void *
rb_tree_find_node_geq(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root, *last = NULL;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
if (diff > 0)
last = parent;
parent = parent->rb_nodes[diff < 0];
}
return last == NULL ? NULL : RB_NODETOITEM(rbto, last);
}
void *
rb_tree_find_node_leq(struct rb_tree *rbt, const void *key)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_key_fn compare_key = rbto->rbto_compare_key;
struct rb_node *parent = rbt->rbt_root, *last = NULL;
while (!RB_SENTINEL_P(parent)) {
void *pobj = RB_NODETOITEM(rbto, parent);
const signed int diff = (*compare_key)(rbto->rbto_context,
pobj, key);
if (diff == 0)
return pobj;
if (diff < 0)
last = parent;
parent = parent->rb_nodes[diff < 0];
}
return last == NULL ? NULL : RB_NODETOITEM(rbto, last);
}
void *
rb_tree_insert_node(struct rb_tree *rbt, void *object)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_nodes_fn compare_nodes = rbto->rbto_compare_nodes;
struct rb_node *parent, *tmp, *self = RB_ITEMTONODE(rbto, object);
unsigned int position;
bool rebalance;
RBSTAT_INC(rbt->rbt_insertions);
tmp = rbt->rbt_root;
/*
* This is a hack. Because rbt->rbt_root is just a struct rb_node *,
* just like rb_node->rb_nodes[RB_DIR_LEFT], we can use this fact to
* avoid a lot of tests for root and know that even at root,
* updating RB_FATHER(rb_node)->rb_nodes[RB_POSITION(rb_node)] will
* update rbt->rbt_root.
*/
parent = (struct rb_node *)(void *)&rbt->rbt_root;
position = RB_DIR_LEFT;
/*
* Find out where to place this new leaf.
*/
while (!RB_SENTINEL_P(tmp)) {
void *tobj = RB_NODETOITEM(rbto, tmp);
const signed int diff = (*compare_nodes)(rbto->rbto_context,
tobj, object);
if (__predict_false(diff == 0)) {
/*
* Node already exists; return it.
*/
return tobj;
}
parent = tmp;
position = (diff < 0);
tmp = parent->rb_nodes[position];
}
#ifdef RBDEBUG
{
struct rb_node *prev = NULL, *next = NULL;
if (position == RB_DIR_RIGHT)
prev = parent;
else if (tmp != rbt->rbt_root)
next = parent;
/*
* Verify our sequential position
*/
KASSERT(prev == NULL || !RB_SENTINEL_P(prev));
KASSERT(next == NULL || !RB_SENTINEL_P(next));
if (prev != NULL && next == NULL)
next = TAILQ_NEXT(prev, rb_link);
if (prev == NULL && next != NULL)
prev = TAILQ_PREV(next, rb_node_qh, rb_link);
KASSERT(prev == NULL || !RB_SENTINEL_P(prev));
KASSERT(next == NULL || !RB_SENTINEL_P(next));
KASSERT(prev == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, prev), RB_NODETOITEM(rbto, self)) < 0);
KASSERT(next == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self), RB_NODETOITEM(rbto, next)) < 0);
}
#endif
/*
* Initialize the node and insert as a leaf into the tree.
*/
RB_SET_FATHER(self, parent);
RB_SET_POSITION(self, position);
if (__predict_false(parent == (struct rb_node *)(void *)&rbt->rbt_root)) {
RB_MARK_BLACK(self); /* root is always black */
#ifndef RBSMALL
rbt->rbt_minmax[RB_DIR_LEFT] = self;
rbt->rbt_minmax[RB_DIR_RIGHT] = self;
#endif
rebalance = false;
} else {
KASSERT(position == RB_DIR_LEFT || position == RB_DIR_RIGHT);
#ifndef RBSMALL
/*
* Keep track of the minimum and maximum nodes. If our
* parent is a minmax node and we are on its min/max side,
* we must be the new min/max node.
*/
if (parent == rbt->rbt_minmax[position])
rbt->rbt_minmax[position] = self;
#endif /* !RBSMALL */
/*
* All new nodes are colored red. We only need to rebalance
* if our parent is also red.
*/
RB_MARK_RED(self);
rebalance = RB_RED_P(parent);
}
KASSERT(RB_SENTINEL_P(parent->rb_nodes[position]));
self->rb_left = parent->rb_nodes[position];
self->rb_right = parent->rb_nodes[position];
parent->rb_nodes[position] = self;
KASSERT(RB_CHILDLESS_P(self));
/*
* Insert the new node into a sorted list for easy sequential access
*/
RBSTAT_INC(rbt->rbt_count);
#ifdef RBDEBUG
if (RB_ROOT_P(rbt, self)) {
RB_TAILQ_INSERT_HEAD(&rbt->rbt_nodes, self, rb_link);
} else if (position == RB_DIR_LEFT) {
KASSERT((*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self),
RB_NODETOITEM(rbto, RB_FATHER(self))) < 0);
RB_TAILQ_INSERT_BEFORE(RB_FATHER(self), self, rb_link);
} else {
KASSERT((*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, RB_FATHER(self)),
RB_NODETOITEM(rbto, self)) < 0);
RB_TAILQ_INSERT_AFTER(&rbt->rbt_nodes, RB_FATHER(self),
self, rb_link);
}
#endif
KASSERT(rb_tree_check_node(rbt, self, NULL, !rebalance));
/*
* Rebalance tree after insertion
*/
if (rebalance) {
rb_tree_insert_rebalance(rbt, self);
KASSERT(rb_tree_check_node(rbt, self, NULL, true));
}
/* Successfully inserted, return our node pointer. */
return object;
}
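/*
 * Illustrative sketch (userland), continuing the hypothetical intnode
 * example given after rb_tree_init() above: rb_tree_insert_node()
 * returns the object that ended up in the tree, so a duplicate key can
 * be detected by comparing the return value with what was passed in.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

static void
intnode_insert_example(rb_tree_t *tree, int key)
{
        struct intnode *in = malloc(sizeof(*in)), *ret;

        in->in_key = key;
        ret = rb_tree_insert_node(tree, in);
        if (ret != in) {
                /* An equal node was already present; keep the old one. */
                free(in);
        }
        printf("key %d -> node %p\n", key, ret);
}
#endif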
/*
* Swap the location and colors of 'self' and its child @ which. The child
* cannot be a sentinel node. This is our rotation function. However,
* since it preserves coloring, it greatly simplifies both insertion and
* removal, since rotation almost always involves the exchanging of colors
* as a separate step.
*/
static void
rb_tree_reparent_nodes(__rbt_unused struct rb_tree *rbt,
struct rb_node *old_father, const unsigned int which)
{
const unsigned int other = which ^ RB_DIR_OTHER;
struct rb_node * const grandpa = RB_FATHER(old_father);
struct rb_node * const old_child = old_father->rb_nodes[which];
struct rb_node * const new_father = old_child;
struct rb_node * const new_child = old_father;
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
KASSERT(!RB_SENTINEL_P(old_child));
KASSERT(RB_FATHER(old_child) == old_father);
KASSERT(rb_tree_check_node(rbt, old_father, NULL, false));
KASSERT(rb_tree_check_node(rbt, old_child, NULL, false));
KASSERT(RB_ROOT_P(rbt, old_father) ||
rb_tree_check_node(rbt, grandpa, NULL, false));
/*
* Exchange descendant linkages.
*/
grandpa->rb_nodes[RB_POSITION(old_father)] = new_father;
new_child->rb_nodes[which] = old_child->rb_nodes[other];
new_father->rb_nodes[other] = new_child;
/*
* Update ancestor linkages
*/
RB_SET_FATHER(new_father, grandpa);
RB_SET_FATHER(new_child, new_father);
/*
* Exchange properties between new_father and new_child. The only
* change is that new_child's position is now on the other side.
*/
#if 0
{
struct rb_node tmp;
tmp.rb_info = 0;
RB_COPY_PROPERTIES(&tmp, old_child);
RB_COPY_PROPERTIES(new_father, old_father);
RB_COPY_PROPERTIES(new_child, &tmp);
}
#else
RB_SWAP_PROPERTIES(new_father, new_child);
#endif
RB_SET_POSITION(new_child, other);
/*
* Make sure to reparent the new child to ourself.
*/
if (!RB_SENTINEL_P(new_child->rb_nodes[which])) {
RB_SET_FATHER(new_child->rb_nodes[which], new_child);
RB_SET_POSITION(new_child->rb_nodes[which], which);
}
KASSERT(rb_tree_check_node(rbt, new_father, NULL, false));
KASSERT(rb_tree_check_node(rbt, new_child, NULL, false));
KASSERT(RB_ROOT_P(rbt, new_father) ||
rb_tree_check_node(rbt, grandpa, NULL, false));
}
static void
rb_tree_insert_rebalance(struct rb_tree *rbt, struct rb_node *self)
{
struct rb_node * father = RB_FATHER(self);
struct rb_node * grandpa = RB_FATHER(father);
struct rb_node * uncle;
unsigned int which;
unsigned int other;
KASSERT(!RB_ROOT_P(rbt, self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
RBSTAT_INC(rbt->rbt_insertion_rebalance_calls);
for (;;) {
KASSERT(!RB_SENTINEL_P(self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
/*
* We are red and our parent is red, therefore we must have a
* grandfather and he must be black.
*/
grandpa = RB_FATHER(father);
KASSERT(RB_BLACK_P(grandpa));
KASSERT(RB_DIR_RIGHT == 1 && RB_DIR_LEFT == 0);
which = (father == grandpa->rb_right);
other = which ^ RB_DIR_OTHER;
uncle = grandpa->rb_nodes[other];
if (RB_BLACK_P(uncle))
break;
RBSTAT_INC(rbt->rbt_insertion_rebalance_passes);
/*
* Case 1: our uncle is red
* Simply invert the colors of our parent and
* uncle and make our grandparent red. And
* then solve the problem up at his level.
*/
RB_MARK_BLACK(uncle);
RB_MARK_BLACK(father);
if (__predict_false(RB_ROOT_P(rbt, grandpa))) {
/*
* If our grandpa is root, don't bother
* setting him to red, just return.
*/
KASSERT(RB_BLACK_P(grandpa));
return;
}
RB_MARK_RED(grandpa);
self = grandpa;
father = RB_FATHER(self);
KASSERT(RB_RED_P(self));
if (RB_BLACK_P(father)) {
/*
* If our greatgrandpa is black, we're done.
*/
KASSERT(RB_BLACK_P(rbt->rbt_root));
return;
}
}
KASSERT(!RB_ROOT_P(rbt, self));
KASSERT(RB_RED_P(self));
KASSERT(RB_RED_P(father));
KASSERT(RB_BLACK_P(uncle));
KASSERT(RB_BLACK_P(grandpa));
/*
* Case 2&3: our uncle is black.
*/
if (self == father->rb_nodes[other]) {
/*
* Case 2: we are on the same side as our uncle
* Swap ourselves with our parent so this case
* becomes case 3. Basically our parent becomes our
* child.
*/
rb_tree_reparent_nodes(rbt, father, other);
KASSERT(RB_FATHER(father) == self);
KASSERT(self->rb_nodes[which] == father);
KASSERT(RB_FATHER(self) == grandpa);
self = father;
father = RB_FATHER(self);
}
KASSERT(RB_RED_P(self) && RB_RED_P(father));
KASSERT(grandpa->rb_nodes[which] == father);
/*
* Case 3: we are opposite a child of a black uncle.
* Swap our parent and grandparent. Since our grandfather
* is black, our father will become black and our new sibling
* (former grandparent) will become red.
*/
rb_tree_reparent_nodes(rbt, grandpa, which);
KASSERT(RB_FATHER(self) == father);
KASSERT(RB_FATHER(self)->rb_nodes[RB_POSITION(self) ^ RB_DIR_OTHER] == grandpa);
KASSERT(RB_RED_P(self));
KASSERT(RB_BLACK_P(father));
KASSERT(RB_RED_P(grandpa));
/*
* Final step: Set the root to black.
*/
RB_MARK_BLACK(rbt->rbt_root);
}
static void
rb_tree_prune_node(struct rb_tree *rbt, struct rb_node *self, bool rebalance)
{
const unsigned int which = RB_POSITION(self);
struct rb_node *father = RB_FATHER(self);
#ifndef RBSMALL
const bool was_root = RB_ROOT_P(rbt, self);
#endif
KASSERT(rebalance || (RB_ROOT_P(rbt, self) || RB_RED_P(self)));
KASSERT(!rebalance || RB_BLACK_P(self));
KASSERT(RB_CHILDLESS_P(self));
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
/*
* Since we are childless, we know that self->rb_left is pointing
* to the sentinel node.
*/
father->rb_nodes[which] = self->rb_left;
/*
* Remove ourselves from the node list, decrement the count,
* and update min/max.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(rbt->rbt_minmax[RB_POSITION(self)] == self)) {
rbt->rbt_minmax[RB_POSITION(self)] = father;
/*
* When removing the root, rbt->rbt_minmax[RB_DIR_LEFT] is
* updated automatically, but we also need to update
* rbt->rbt_minmax[RB_DIR_RIGHT];
*/
if (__predict_false(was_root)) {
rbt->rbt_minmax[RB_DIR_RIGHT] = father;
}
}
RB_SET_FATHER(self, NULL);
#endif
/*
* Rebalance if requested.
*/
if (rebalance)
rb_tree_removal_rebalance(rbt, father, which);
KASSERT(was_root || rb_tree_check_node(rbt, father, NULL, true));
}
/*
* When deleting an interior node, we swap it with its standin (an
* adjacent node in sorted order), prune the standin's old position,
* and rebalance if needed.
*/
static void
rb_tree_swap_prune_and_rebalance(struct rb_tree *rbt, struct rb_node *self,
struct rb_node *standin)
{
const unsigned int standin_which = RB_POSITION(standin);
unsigned int standin_other = standin_which ^ RB_DIR_OTHER;
struct rb_node *standin_son;
struct rb_node *standin_father = RB_FATHER(standin);
bool rebalance = RB_BLACK_P(standin);
if (standin_father == self) {
/*
* As a child of self, any children would be opposite of
* our parent.
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_other]));
standin_son = standin->rb_nodes[standin_which];
} else {
/*
* Since we aren't a child of self, any children would be
* on the same side as our parent.
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_which]));
standin_son = standin->rb_nodes[standin_other];
}
/*
* the node we are removing must have two children.
*/
KASSERT(RB_TWOCHILDREN_P(self));
/*
* If standin has a child, it must be red.
*/
KASSERT(RB_SENTINEL_P(standin_son) || RB_RED_P(standin_son));
/*
* Verify things are sane.
*/
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
KASSERT(rb_tree_check_node(rbt, standin, NULL, false));
if (__predict_false(RB_RED_P(standin_son))) {
/*
* We know we have a red child so if we flip it to black
* we don't have to rebalance.
*/
KASSERT(rb_tree_check_node(rbt, standin_son, NULL, true));
RB_MARK_BLACK(standin_son);
rebalance = false;
if (standin_father == self) {
KASSERT(RB_POSITION(standin_son) == standin_which);
} else {
KASSERT(RB_POSITION(standin_son) == standin_other);
/*
* Change the son's parentage to point to his grandpa.
*/
RB_SET_FATHER(standin_son, standin_father);
RB_SET_POSITION(standin_son, standin_which);
}
}
if (standin_father == self) {
/*
* If we are about to delete the standin's father, then when
* we call rebalance, we need to use ourselves as our father.
* Otherwise remember our original father. Also, since we are
* our standin's father we only need to reparent the standin's
* brother.
*
* | R --> S |
* | Q S --> Q T |
* | t --> |
*/
KASSERT(RB_SENTINEL_P(standin->rb_nodes[standin_other]));
KASSERT(!RB_SENTINEL_P(self->rb_nodes[standin_other]));
KASSERT(self->rb_nodes[standin_which] == standin);
/*
* Have our son/standin adopt his brother as his new son.
*/
standin_father = standin;
} else {
/*
* | R --> S . |
* | / \ | T --> / \ | / |
* | ..... | S --> ..... | T |
*
* Sever standin's connection to his father.
*/
standin_father->rb_nodes[standin_which] = standin_son;
/*
* Adopt the far son.
*/
standin->rb_nodes[standin_other] = self->rb_nodes[standin_other];
RB_SET_FATHER(standin->rb_nodes[standin_other], standin);
KASSERT(RB_POSITION(self->rb_nodes[standin_other]) == standin_other);
/*
* Use standin_other because we need to preserve standin_which
* for the removal_rebalance.
*/
standin_other = standin_which;
}
/*
* Move the only remaining son to our standin. If our standin is our
* son, this will be the only son needed to be moved.
*/
KASSERT(standin->rb_nodes[standin_other] != self->rb_nodes[standin_other]);
standin->rb_nodes[standin_other] = self->rb_nodes[standin_other];
RB_SET_FATHER(standin->rb_nodes[standin_other], standin);
/*
* Now copy the result of self to standin and then replace
* self with standin in the tree.
*/
RB_COPY_PROPERTIES(standin, self);
RB_SET_FATHER(standin, RB_FATHER(self));
RB_FATHER(standin)->rb_nodes[RB_POSITION(standin)] = standin;
/*
* Remove ourselves from the node list, decrement the count,
* and update min/max.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(rbt->rbt_minmax[RB_POSITION(self)] == self))
rbt->rbt_minmax[RB_POSITION(self)] = RB_FATHER(self);
RB_SET_FATHER(self, NULL);
#endif
KASSERT(rb_tree_check_node(rbt, standin, NULL, false));
KASSERT(RB_FATHER_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin_father, NULL, false));
KASSERT(RB_LEFT_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin->rb_left, NULL, false));
KASSERT(RB_RIGHT_SENTINEL_P(standin)
|| rb_tree_check_node(rbt, standin->rb_right, NULL, false));
if (!rebalance)
return;
rb_tree_removal_rebalance(rbt, standin_father, standin_which);
KASSERT(rb_tree_check_node(rbt, standin, NULL, true));
}
/*
* We could do this by doing
* rb_tree_node_swap(rbt, self, which);
* rb_tree_prune_node(rbt, self, false);
*
* But it's more efficient to just elevate and recolor the child.
*/
static void
rb_tree_prune_blackred_branch(struct rb_tree *rbt, struct rb_node *self,
unsigned int which)
{
struct rb_node *father = RB_FATHER(self);
struct rb_node *son = self->rb_nodes[which];
#ifndef RBSMALL
const bool was_root = RB_ROOT_P(rbt, self);
#endif
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
KASSERT(RB_BLACK_P(self) && RB_RED_P(son));
KASSERT(!RB_TWOCHILDREN_P(son));
KASSERT(RB_CHILDLESS_P(son));
KASSERT(rb_tree_check_node(rbt, self, NULL, false));
KASSERT(rb_tree_check_node(rbt, son, NULL, false));
/*
* Remove ourselves from the tree and give our former child our
* properties (position, color, root).
*/
RB_COPY_PROPERTIES(son, self);
father->rb_nodes[RB_POSITION(son)] = son;
RB_SET_FATHER(son, father);
/*
* Remove ourselves from the node list, decrement the count,
* and update minmax.
*/
RB_TAILQ_REMOVE(&rbt->rbt_nodes, self, rb_link);
RBSTAT_DEC(rbt->rbt_count);
#ifndef RBSMALL
if (__predict_false(was_root)) {
KASSERT(rbt->rbt_minmax[which] == son);
rbt->rbt_minmax[which ^ RB_DIR_OTHER] = son;
} else if (rbt->rbt_minmax[RB_POSITION(self)] == self) {
rbt->rbt_minmax[RB_POSITION(self)] = son;
}
RB_SET_FATHER(self, NULL);
#endif
KASSERT(was_root || rb_tree_check_node(rbt, father, NULL, true));
KASSERT(rb_tree_check_node(rbt, son, NULL, true));
}
void
rb_tree_remove_node(struct rb_tree *rbt, void *object)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
struct rb_node *standin, *self = RB_ITEMTONODE(rbto, object);
unsigned int which;
KASSERT(!RB_SENTINEL_P(self));
RBSTAT_INC(rbt->rbt_removals);
/*
* In the following diagrams, we (the node to be removed) are S. Red
* nodes are lowercase. T could be either red or black.
*
* Remember the major axiom of the red-black tree: the number of
* black nodes from the root to each leaf is constant across all
* leaves, only the number of red nodes varies.
*
* Thus removing a red leaf doesn't require any other changes to a
* red-black tree. So if we must remove a node, attempt to rearrange
* the tree so we can remove a red node.
*
* The simplest case is a childless red node or a childless root node:
*
* | T --> T | or | R --> * |
* | s --> * |
*/
if (RB_CHILDLESS_P(self)) {
const bool rebalance = RB_BLACK_P(self) && !RB_ROOT_P(rbt, self);
rb_tree_prune_node(rbt, self, rebalance);
return;
}
KASSERT(!RB_CHILDLESS_P(self));
if (!RB_TWOCHILDREN_P(self)) {
/*
* The next simplest case is when the node we are deleting is
* black and has one red child.
*
* | T --> T --> T |
* | S --> R --> R |
* | r --> s --> * |
*/
which = RB_LEFT_SENTINEL_P(self) ? RB_DIR_RIGHT : RB_DIR_LEFT;
KASSERT(RB_BLACK_P(self));
KASSERT(RB_RED_P(self->rb_nodes[which]));
KASSERT(RB_CHILDLESS_P(self->rb_nodes[which]));
rb_tree_prune_blackred_branch(rbt, self, which);
return;
}
KASSERT(RB_TWOCHILDREN_P(self));
/*
* We invert these because we prefer to remove from the inside of
* the tree.
*/
which = RB_POSITION(self) ^ RB_DIR_OTHER;
/*
* Let's find the node closest to us on the side opposite our parent.
* Now swap it with ourselves, "prune" it, and rebalance, if needed.
*/
standin = RB_ITEMTONODE(rbto, rb_tree_iterate(rbt, object, which));
rb_tree_swap_prune_and_rebalance(rbt, self, standin);
}
static void
rb_tree_removal_rebalance(struct rb_tree *rbt, struct rb_node *parent,
unsigned int which)
{
KASSERT(!RB_SENTINEL_P(parent));
KASSERT(RB_SENTINEL_P(parent->rb_nodes[which]));
KASSERT(which == RB_DIR_LEFT || which == RB_DIR_RIGHT);
RBSTAT_INC(rbt->rbt_removal_rebalance_calls);
while (RB_BLACK_P(parent->rb_nodes[which])) {
unsigned int other = which ^ RB_DIR_OTHER;
struct rb_node *brother = parent->rb_nodes[other];
RBSTAT_INC(rbt->rbt_removal_rebalance_passes);
KASSERT(!RB_SENTINEL_P(brother));
/*
* For cases 1, 2a, and 2b, our brother's children must
* be black and our father must be black
*/
if (RB_BLACK_P(parent) && RB_BLACK_P(brother->rb_left) &&
RB_BLACK_P(brother->rb_right)) {
if (RB_RED_P(brother)) {
/*
* Case 1: Our brother is red, swap its
* position (and colors) with our parent.
* This should now be case 2b (unless C or E
* has a red child which is case 3; thus no
* explicit branch to case 2b).
*
* B -> D
* A d -> b E
* C E -> A C
*/
KASSERT(RB_BLACK_P(parent));
rb_tree_reparent_nodes(rbt, parent, other);
brother = parent->rb_nodes[other];
KASSERT(!RB_SENTINEL_P(brother));
KASSERT(RB_RED_P(parent));
KASSERT(RB_BLACK_P(brother));
KASSERT(rb_tree_check_node(rbt, brother, NULL, false));
KASSERT(rb_tree_check_node(rbt, parent, NULL, false));
} else {
/*
* Both our parent and brother are black.
* Change our brother to red, advance up rank
* and go through the loop again.
*
* B -> *B
* *A D -> A d
* C E -> C E
*/
RB_MARK_RED(brother);
KASSERT(RB_BLACK_P(brother->rb_left));
KASSERT(RB_BLACK_P(brother->rb_right));
if (RB_ROOT_P(rbt, parent))
return; /* root == parent == black */
KASSERT(rb_tree_check_node(rbt, brother, NULL, false));
KASSERT(rb_tree_check_node(rbt, parent, NULL, false));
which = RB_POSITION(parent);
parent = RB_FATHER(parent);
continue;
}
}
/*
* Avoid an else here so that case 2a above can hit either
* case 2b, 3, or 4.
*/
if (RB_RED_P(parent) && RB_BLACK_P(brother) &&
RB_BLACK_P(brother->rb_left) && RB_BLACK_P(brother->rb_right)) {
KASSERT(RB_RED_P(parent));
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_BLACK_P(brother->rb_left));
KASSERT(RB_BLACK_P(brother->rb_right));
/*
* We are black, our father is red, our brother and
* both nephews are black. Simply invert/exchange the
* colors of our father and brother (to black and red
* respectively).
*
* | f --> F |
* | * B --> * b |
* | N N --> N N |
*/
RB_MARK_BLACK(parent);
RB_MARK_RED(brother);
KASSERT(rb_tree_check_node(rbt, brother, NULL, true));
break; /* We're done! */
} else {
/*
* Our brother must be black and have at least one
* red child (it may have two).
*/
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_RED_P(brother->rb_nodes[which]) ||
RB_RED_P(brother->rb_nodes[other]));
if (RB_BLACK_P(brother->rb_nodes[other])) {
/*
* Case 3: our brother is black, our near
* nephew is red, and our far nephew is black.
* Swap our brother with our near nephew.
* This results in a tree that matches case 4.
* (Our father could be red or black).
*
* | F --> F |
* | x B --> x B |
* | n --> n |
*/
KASSERT(RB_RED_P(brother->rb_nodes[which]));
rb_tree_reparent_nodes(rbt, brother, which);
KASSERT(RB_FATHER(brother) == parent->rb_nodes[other]);
brother = parent->rb_nodes[other];
KASSERT(RB_RED_P(brother->rb_nodes[other]));
}
/*
* Case 4: our brother is black and our far nephew
* is red. Swap our father and brother locations and
* change our far nephew to black. (these can be
* done in either order so we change the color first).
* The result is a valid red-black tree and is a
* terminal case. (again we don't care about the
* father's color)
*
* If the father is red, we will get a red-black-black
* tree:
* | f -> f --> b |
* | B -> B --> F N |
* | n -> N --> |
*
* If the father is black, we will get an all black
* tree:
* | F -> F --> B |
* | B -> B --> F N |
* | n -> N --> |
*
* If we had two red nephews, then after the swap,
* our former father would have a red grandson.
*/
KASSERT(RB_BLACK_P(brother));
KASSERT(RB_RED_P(brother->rb_nodes[other]));
RB_MARK_BLACK(brother->rb_nodes[other]);
rb_tree_reparent_nodes(rbt, parent, other);
break; /* We're done! */
}
}
KASSERT(rb_tree_check_node(rbt, parent, NULL, true));
}
void *
rb_tree_iterate(struct rb_tree *rbt, void *object, const unsigned int direction)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
const unsigned int other = direction ^ RB_DIR_OTHER;
struct rb_node *self;
KASSERT(direction == RB_DIR_LEFT || direction == RB_DIR_RIGHT);
if (object == NULL) {
#ifndef RBSMALL
if (RB_SENTINEL_P(rbt->rbt_root))
return NULL;
return RB_NODETOITEM(rbto, rbt->rbt_minmax[direction]);
#else
self = rbt->rbt_root;
if (RB_SENTINEL_P(self))
return NULL;
while (!RB_SENTINEL_P(self->rb_nodes[direction]))
self = self->rb_nodes[direction];
return RB_NODETOITEM(rbto, self);
#endif /* !RBSMALL */
}
self = RB_ITEMTONODE(rbto, object);
KASSERT(!RB_SENTINEL_P(self));
/*
* We can't go any further in this direction. We proceed up in the
* opposite direction until our parent is in direction we want to go.
*/
if (RB_SENTINEL_P(self->rb_nodes[direction])) {
while (!RB_ROOT_P(rbt, self)) {
if (other == RB_POSITION(self))
return RB_NODETOITEM(rbto, RB_FATHER(self));
self = RB_FATHER(self);
}
return NULL;
}
/*
* Advance down one in current direction and go down as far as possible
* in the opposite direction.
*/
self = self->rb_nodes[direction];
KASSERT(!RB_SENTINEL_P(self));
while (!RB_SENTINEL_P(self->rb_nodes[other]))
self = self->rb_nodes[other];
return RB_NODETOITEM(rbto, self);
}
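/*
 * Illustrative sketch (userland), again using the hypothetical intnode
 * example: passing NULL to rb_tree_iterate() yields the minimum (or
 * maximum) element, and passing the previous object walks the tree in
 * sorted order without any explicit stack or recursion.
 */
#if 0
static void
intnode_dump_ascending(rb_tree_t *tree)
{
        struct intnode *in;

        for (in = rb_tree_iterate(tree, NULL, RB_DIR_LEFT);
            in != NULL;
            in = rb_tree_iterate(tree, in, RB_DIR_RIGHT))
                printf("%d\n", in->in_key);
}
#endif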
#ifdef RBDEBUG
static const struct rb_node *
rb_tree_iterate_const(const struct rb_tree *rbt, const struct rb_node *self,
const unsigned int direction)
{
const unsigned int other = direction ^ RB_DIR_OTHER;
KASSERT(direction == RB_DIR_LEFT || direction == RB_DIR_RIGHT);
if (self == NULL) {
#ifndef RBSMALL
if (RB_SENTINEL_P(rbt->rbt_root))
return NULL;
return rbt->rbt_minmax[direction];
#else
self = rbt->rbt_root;
if (RB_SENTINEL_P(self))
return NULL;
while (!RB_SENTINEL_P(self->rb_nodes[direction]))
self = self->rb_nodes[direction];
return self;
#endif /* !RBSMALL */
}
KASSERT(!RB_SENTINEL_P(self));
/*
* We can't go any further in this direction. We proceed up in the
* opposite direction until our parent is in direction we want to go.
*/
if (RB_SENTINEL_P(self->rb_nodes[direction])) {
while (!RB_ROOT_P(rbt, self)) {
if (other == RB_POSITION(self))
return RB_FATHER(self);
self = RB_FATHER(self);
}
return NULL;
}
/*
* Advance down one in current direction and go down as far as possible
* in the opposite direction.
*/
self = self->rb_nodes[direction];
KASSERT(!RB_SENTINEL_P(self));
while (!RB_SENTINEL_P(self->rb_nodes[other]))
self = self->rb_nodes[other];
return self;
}
static unsigned int
rb_tree_count_black(const struct rb_node *self)
{
unsigned int left, right;
if (RB_SENTINEL_P(self))
return 0;
left = rb_tree_count_black(self->rb_left);
right = rb_tree_count_black(self->rb_right);
KASSERT(left == right);
return left + RB_BLACK_P(self);
}
static bool
rb_tree_check_node(const struct rb_tree *rbt, const struct rb_node *self,
const struct rb_node *prev, bool red_check)
{
const rb_tree_ops_t *rbto = rbt->rbt_ops;
rbto_compare_nodes_fn compare_nodes = rbto->rbto_compare_nodes;
KASSERT(!RB_SENTINEL_P(self));
KASSERT(prev == NULL || (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, prev), RB_NODETOITEM(rbto, self)) < 0);
/*
* Verify our relationship to our parent.
*/
if (RB_ROOT_P(rbt, self)) {
KASSERT(self == rbt->rbt_root);
KASSERT(RB_POSITION(self) == RB_DIR_LEFT);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_LEFT] == self);
KASSERT(RB_FATHER(self) == (const struct rb_node *) &rbt->rbt_root);
} else {
int diff = (*compare_nodes)(rbto->rbto_context,
RB_NODETOITEM(rbto, self),
RB_NODETOITEM(rbto, RB_FATHER(self)));
KASSERT(self != rbt->rbt_root);
KASSERT(!RB_FATHER_SENTINEL_P(self));
if (RB_POSITION(self) == RB_DIR_LEFT) {
KASSERT(diff < 0);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_LEFT] == self);
} else {
KASSERT(diff > 0);
KASSERT(RB_FATHER(self)->rb_nodes[RB_DIR_RIGHT] == self);
}
}
/*
* Verify our position in the linked list against the tree itself.
*/
{
const struct rb_node *prev0 = rb_tree_iterate_const(rbt, self, RB_DIR_LEFT);
const struct rb_node *next0 = rb_tree_iterate_const(rbt, self, RB_DIR_RIGHT);
KASSERT(prev0 == TAILQ_PREV(self, rb_node_qh, rb_link));
KASSERT(next0 == TAILQ_NEXT(self, rb_link));
#ifndef RBSMALL
KASSERT(prev0 != NULL || self == rbt->rbt_minmax[RB_DIR_LEFT]);
KASSERT(next0 != NULL || self == rbt->rbt_minmax[RB_DIR_RIGHT]);
#endif
}
/*
* The root must be black.
* There can never be two adjacent red nodes.
*/
if (red_check) {
KASSERT(!RB_ROOT_P(rbt, self) || RB_BLACK_P(self));
(void) rb_tree_count_black(self);
if (RB_RED_P(self)) {
const struct rb_node *brother;
KASSERT(!RB_ROOT_P(rbt, self));
brother = RB_FATHER(self)->rb_nodes[RB_POSITION(self) ^ RB_DIR_OTHER];
KASSERT(RB_BLACK_P(RB_FATHER(self)));
/*
* If I'm red and have no children, then I must either
* have no brother, or my brother must also be red and
* also have no children (black count == 0).
*/
KASSERT(!RB_CHILDLESS_P(self)
|| RB_SENTINEL_P(brother)
|| RB_RED_P(brother)
|| RB_CHILDLESS_P(brother));
/*
* If I'm not childless, I must have two children
* and they must both be black.
*/
KASSERT(RB_CHILDLESS_P(self)
|| (RB_TWOCHILDREN_P(self)
&& RB_BLACK_P(self->rb_left)
&& RB_BLACK_P(self->rb_right)));
/*
* If I'm not childless, and thus have black children,
* then my brother must either be black or have two
* black children.
*/
KASSERT(RB_CHILDLESS_P(self)
|| RB_BLACK_P(brother)
|| (RB_TWOCHILDREN_P(brother)
&& RB_BLACK_P(brother->rb_left)
&& RB_BLACK_P(brother->rb_right)));
} else {
/*
* If I'm black and have one child, that child must
* be red and childless.
*/
KASSERT(RB_CHILDLESS_P(self)
|| RB_TWOCHILDREN_P(self)
|| (!RB_LEFT_SENTINEL_P(self)
&& RB_RIGHT_SENTINEL_P(self)
&& RB_RED_P(self->rb_left)
&& RB_CHILDLESS_P(self->rb_left))
|| (!RB_RIGHT_SENTINEL_P(self)
&& RB_LEFT_SENTINEL_P(self)
&& RB_RED_P(self->rb_right)
&& RB_CHILDLESS_P(self->rb_right)));
/*
* If I'm a childless black node and my parent is
* black, my 2nd closest relative away from my parent
* is either red or has a red parent or red children.
*/
if (!RB_ROOT_P(rbt, self)
&& RB_CHILDLESS_P(self)
&& RB_BLACK_P(RB_FATHER(self))) {
const unsigned int which = RB_POSITION(self);
const unsigned int other = which ^ RB_DIR_OTHER;
const struct rb_node *relative0, *relative;
relative0 = rb_tree_iterate_const(rbt,
self, other);
KASSERT(relative0 != NULL);
relative = rb_tree_iterate_const(rbt,
relative0, other);
KASSERT(relative != NULL);
KASSERT(RB_SENTINEL_P(relative->rb_nodes[which]));
#if 0
KASSERT(RB_RED_P(relative)
|| RB_RED_P(relative->rb_left)
|| RB_RED_P(relative->rb_right)
|| RB_RED_P(RB_FATHER(relative)));
#endif
}
}
/*
* A grandparent's children must be real nodes and not
* sentinels. First check our grandparent.
*/
KASSERT(RB_ROOT_P(rbt, self)
|| RB_ROOT_P(rbt, RB_FATHER(self))
|| RB_TWOCHILDREN_P(RB_FATHER(RB_FATHER(self))));
/*
* If we have grandchildren on our left, then
* we must have a child on our right.
*/
KASSERT(RB_LEFT_SENTINEL_P(self)
|| RB_CHILDLESS_P(self->rb_left)
|| !RB_RIGHT_SENTINEL_P(self));
/*
* If we have grandchildren on our right, then
* we must have a child on our left.
*/
KASSERT(RB_RIGHT_SENTINEL_P(self)
|| RB_CHILDLESS_P(self->rb_right)
|| !RB_LEFT_SENTINEL_P(self));
/*
* If we have a child on the left and it doesn't have two
* children make sure we don't have great-great-grandchildren on
* the right.
*/
KASSERT(RB_TWOCHILDREN_P(self->rb_left)
|| RB_CHILDLESS_P(self->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_right)
|| RB_CHILDLESS_P(self->rb_right->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_right->rb_right->rb_right));
/*
* If we have a child on the right and it doesn't have two
* children make sure we don't have great-great-grandchildren on
* the left.
*/
KASSERT(RB_TWOCHILDREN_P(self->rb_right)
|| RB_CHILDLESS_P(self->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_left->rb_right)
|| RB_CHILDLESS_P(self->rb_left->rb_right->rb_left)
|| RB_CHILDLESS_P(self->rb_left->rb_right->rb_right));
/*
* If we are a fully interior node, then our predecessors and
* successors must have no children in our direction.
*/
if (RB_TWOCHILDREN_P(self)) {
const struct rb_node *prev0;
const struct rb_node *next0;
prev0 = rb_tree_iterate_const(rbt, self, RB_DIR_LEFT);
KASSERT(prev0 != NULL);
KASSERT(RB_RIGHT_SENTINEL_P(prev0));
next0 = rb_tree_iterate_const(rbt, self, RB_DIR_RIGHT);
KASSERT(next0 != NULL);
KASSERT(RB_LEFT_SENTINEL_P(next0));
}
}
return true;
}
void
rb_tree_check(const struct rb_tree *rbt, bool red_check)
{
const struct rb_node *self;
const struct rb_node *prev;
#ifdef RBSTATS
unsigned int count = 0;
#endif
KASSERT(rbt->rbt_root != NULL);
KASSERT(RB_LEFT_P(rbt->rbt_root));
#if defined(RBSTATS) && !defined(RBSMALL)
KASSERT(rbt->rbt_count > 1
|| rbt->rbt_minmax[RB_DIR_LEFT] == rbt->rbt_minmax[RB_DIR_RIGHT]);
#endif
prev = NULL;
TAILQ_FOREACH(self, &rbt->rbt_nodes, rb_link) {
rb_tree_check_node(rbt, self, prev, false);
#ifdef RBSTATS
count++;
#endif
}
#ifdef RBSTATS
KASSERT(rbt->rbt_count == count);
#endif
if (red_check) {
KASSERT(RB_BLACK_P(rbt->rbt_root));
KASSERT(RB_SENTINEL_P(rbt->rbt_root)
|| rb_tree_count_black(rbt->rbt_root));
/*
* The root must be black.
* There can never be two adjacent red nodes.
*/
TAILQ_FOREACH(self, &rbt->rbt_nodes, rb_link) {
rb_tree_check_node(rbt, self, NULL, true);
}
}
}
#endif /* RBDEBUG */
#ifdef RBSTATS
static void
rb_tree_mark_depth(const struct rb_tree *rbt, const struct rb_node *self,
size_t *depths, size_t depth)
{
if (RB_SENTINEL_P(self))
return;
if (RB_TWOCHILDREN_P(self)) {
rb_tree_mark_depth(rbt, self->rb_left, depths, depth + 1);
rb_tree_mark_depth(rbt, self->rb_right, depths, depth + 1);
return;
}
depths[depth]++;
if (!RB_LEFT_SENTINEL_P(self)) {
rb_tree_mark_depth(rbt, self->rb_left, depths, depth + 1);
}
if (!RB_RIGHT_SENTINEL_P(self)) {
rb_tree_mark_depth(rbt, self->rb_right, depths, depth + 1);
}
}
void
rb_tree_depths(const struct rb_tree *rbt, size_t *depths)
{
rb_tree_mark_depth(rbt, rbt->rbt_root, depths, 1);
}
#endif /* RBSTATS */
/* $NetBSD: subr_blist.c,v 1.15 2022/05/31 08:43:16 andvar Exp $ */
/*-
* Copyright (c) 1998 Matthew Dillon. All Rights Reserved.
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting
*
* This module implements a general bitmap allocator/deallocator. The
* allocator eats around 2 bits per 'block'. The module does not
* try to interpret the meaning of a 'block' other than to return
* BLIST_NONE on an allocation failure.
*
* A radix tree is used to maintain the bitmap. Two radix constants are
* involved: One for the bitmaps contained in the leaf nodes (typically
* 32), and one for the meta nodes (typically 16). Both meta and leaf
* nodes have a hint field. This field gives us a hint as to the largest
* free contiguous range of blocks under the node. It may contain a
* value that is too high, but will never contain a value that is too
* low. When the radix tree is searched, allocation failures in subtrees
* update the hint.
*
* The radix tree also implements two collapsed states for meta nodes:
* the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is
* in either of these two states, all information contained underneath
* the node is considered stale. These states are used to optimize
* allocation and freeing operations.
*
* The hinting greatly increases code efficiency for allocations while
* the general radix structure optimizes both allocations and frees. The
* radix tree should be able to operate well no matter how much
* fragmentation there is and no matter how large a bitmap is used.
*
* Unlike the rlist code, the blist code wires all necessary memory at
* creation time. Neither allocations nor frees require interaction with
* the memory subsystem. In contrast, the rlist code may allocate memory
* on an rlist_free() call. The non-blocking features of the blist code
* are used to great advantage in the swap code (vm/nswap_pager.c). The
* rlist code uses a little less overall memory than the blist code (but
* due to swap interleaving not all that much less), but the blist code
* scales much, much better.
*
* LAYOUT: The radix tree is laid out recursively using a
* linear array. Each meta node is immediately followed (laid out
* sequentially in memory) by BLIST_META_RADIX lower level nodes. This
* is a recursive structure but one that can be easily scanned through
* a very simple 'skip' calculation. In order to support large radixes,
* portions of the tree may reside outside our memory allocation. We
* handle this with an early-termination optimization (when bighint is
* set to -1) on the scan. The memory allocation is only large enough
* to cover the number of blocks requested at creation time even if it
* must be encompassed in a larger root-node radix.
*
* NOTE: the allocator cannot currently allocate more than
* BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too
* large' if you try. This is an area that could use improvement. The
* radix is large enough that this restriction does not affect the swap
* system, though. Currently only the allocation code is affected by
* this algorithmic unfeature. The freeing code can handle arbitrary
* ranges.
*
* This code can be compiled stand-alone for debugging.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_blist.c,v 1.15 2022/05/31 08:43:16 andvar Exp $");
#if 0
__FBSDID("$FreeBSD: src/sys/kern/subr_blist.c,v 1.17 2004/06/04 04:03:25 alc Exp $");
#endif
#ifdef _KERNEL
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/blist.h>
#include <sys/kmem.h>
#else
#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif
#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <inttypes.h>
#define KM_SLEEP 1
#define kmem_zalloc(a,b) calloc(1, (a))
#define kmem_alloc(a,b) malloc(a)
#define kmem_free(a,b) free(a)
#include "../sys/blist.h"
void panic(const char *ctl, ...) __printflike(1, 2);
#endif
/*
* blmeta and blist_bitmap_t MUST be a power of 2 in size.
*/
typedef struct blmeta {
union {
blist_blkno_t bmu_avail; /* space available under us */
blist_bitmap_t bmu_bitmap; /* bitmap if we are a leaf */
} u;
blist_blkno_t bm_bighint; /* biggest contiguous block hint*/
} blmeta_t;
struct blist {
blist_blkno_t bl_blocks; /* area of coverage */
blist_blkno_t bl_radix; /* coverage radix */
blist_blkno_t bl_skip; /* starting skip */
blist_blkno_t bl_free; /* number of free blocks */
blmeta_t *bl_root; /* root of radix tree */
blist_blkno_t bl_rootblks; /* blks allocated for tree */
};
#define BLIST_META_RADIX 16
/*
* static support functions
*/
static blist_blkno_t blst_leaf_alloc(blmeta_t *scan, blist_blkno_t blk,
int count);
static blist_blkno_t blst_meta_alloc(blmeta_t *scan, blist_blkno_t blk,
blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip);
static void blst_leaf_free(blmeta_t *scan, blist_blkno_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, blist_blkno_t freeBlk,
blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip,
blist_blkno_t blk);
static void blst_copy(blmeta_t *scan, blist_blkno_t blk, blist_blkno_t radix,
blist_blkno_t skip, blist_t dest, blist_blkno_t count);
static int blst_leaf_fill(blmeta_t *scan, blist_blkno_t blk, int count);
static blist_blkno_t blst_meta_fill(blmeta_t *scan, blist_blkno_t allocBlk,
blist_blkno_t count, blist_blkno_t radix, blist_blkno_t skip,
blist_blkno_t blk);
static blist_blkno_t blst_radix_init(blmeta_t *scan, blist_blkno_t radix,
blist_blkno_t skip, blist_blkno_t count);
#ifndef _KERNEL
static void blst_radix_print(blmeta_t *scan, blist_blkno_t blk,
blist_blkno_t radix, blist_blkno_t skip, int tab);
#endif
/*
* blist_create() - create a blist capable of handling up to the specified
* number of blocks
*
* blocks must be greater than 0
*
* The smallest blist consists of a single leaf node capable of
* managing BLIST_BMAP_RADIX blocks.
*/
blist_t
blist_create(blist_blkno_t blocks)
{
blist_t bl;
blist_blkno_t radix;
blist_blkno_t skip = 0;
/*
* Calculate radix and skip field used for scanning.
*
* XXX check overflow
*/
radix = BLIST_BMAP_RADIX;
while (radix < blocks) {
radix *= BLIST_META_RADIX;
skip = (skip + 1) * BLIST_META_RADIX;
}
bl = kmem_zalloc(sizeof(struct blist), KM_SLEEP);
bl->bl_blocks = blocks;
bl->bl_radix = radix;
bl->bl_skip = skip;
bl->bl_rootblks = 1 +
blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
bl->bl_root = kmem_alloc(sizeof(blmeta_t) * bl->bl_rootblks, KM_SLEEP);
#if defined(BLIST_DEBUG)
printf(
"BLIST representing %" PRIu64 " blocks (%" PRIu64 " MB of swap)"
", requiring %" PRIu64 "K of ram\n",
(uint64_t)bl->bl_blocks,
(uint64_t)bl->bl_blocks * 4 / 1024,
((uint64_t)bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
);
printf("BLIST raw radix tree contains %" PRIu64 " records\n",
(uint64_t)bl->bl_rootblks);
#endif
blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
return(bl);
}
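/*
 * Illustrative sketch (standalone, not part of the allocator): the same
 * radix/skip recurrence as blist_create(), worked out for a hypothetical
 * request of 100000 blocks and assuming BLIST_BMAP_RADIX is 32.  Each
 * pass widens the root's coverage by BLIST_META_RADIX and accounts for
 * the nodes skipped over in the linear array layout.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static void
blist_radix_skip_example(void)
{
        uint64_t radix = 32;            /* BLIST_BMAP_RADIX assumed 32 */
        uint64_t skip = 0;
        const uint64_t blocks = 100000;

        while (radix < blocks) {
                radix *= 16;            /* BLIST_META_RADIX */
                skip = (skip + 1) * 16;
        }
        /* Three widenings by 16: 32 -> 512 -> 8192 -> 131072. */
        assert(radix == 131072);
        assert(skip == 4368);
}
#endif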
void
blist_destroy(blist_t bl)
{
kmem_free(bl->bl_root, sizeof(blmeta_t) * bl->bl_rootblks);
kmem_free(bl, sizeof(struct blist));
}
/*
* blist_alloc() - reserve space in the block bitmap. Return the base
* of a contiguous region or BLIST_NONE if space could
* not be allocated.
*/
blist_blkno_t
blist_alloc(blist_t bl, blist_blkno_t count)
{
blist_blkno_t blk = BLIST_NONE;
if (bl) {
if (bl->bl_radix == BLIST_BMAP_RADIX)
blk = blst_leaf_alloc(bl->bl_root, 0, count);
else
blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
if (blk != BLIST_NONE)
bl->bl_free -= count;
}
return(blk);
}
/*
* blist_free() - free up space in the block bitmap. Return the base
* of a contiguous region. Panic if an inconsistency is
* found.
*/
void
blist_free(blist_t bl, blist_blkno_t blkno, blist_blkno_t count)
{
if (bl) {
if (bl->bl_radix == BLIST_BMAP_RADIX)
blst_leaf_free(bl->bl_root, blkno, count);
else
blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
bl->bl_free += count;
}
}
/*
* blist_fill() - mark a region in the block bitmap as off-limits
* to the allocator (i.e. allocate it), ignoring any
* existing allocations. Return the number of blocks
* actually filled that were free before the call.
*/
blist_blkno_t
blist_fill(blist_t bl, blist_blkno_t blkno, blist_blkno_t count)
{
blist_blkno_t filled;
if (bl) {
if (bl->bl_radix == BLIST_BMAP_RADIX)
filled = blst_leaf_fill(bl->bl_root, blkno, count);
else
filled = blst_meta_fill(bl->bl_root, blkno, count,
bl->bl_radix, bl->bl_skip, 0);
bl->bl_free -= filled;
return filled;
} else
return 0;
}
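/*
 * Illustrative sketch (relying on the stand-alone build described at
 * the top of this file): a typical blist life cycle.  A new blist
 * starts fully allocated (blst_radix_init() zeroes the bitmaps), so the
 * usable range is freed first.  The block counts are arbitrary
 * examples.
 */
#if 0
static void
blist_usage_example(void)
{
        blist_t bl = blist_create(2048);
        blist_blkno_t blk;

        blist_free(bl, 0, 2048);                /* everything starts allocated */
        blk = blist_alloc(bl, 16);              /* grab 16 contiguous blocks */
        if (blk != BLIST_NONE)
                blist_free(bl, blk, 16);        /* and give them back */
        (void)blist_fill(bl, 100, 8);           /* force blocks 100..107 busy */
        blist_destroy(bl);
}
#endif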
/*
* blist_resize() - resize an existing radix tree to handle the
* specified number of blocks. This will reallocate
* the tree and transfer the previous bitmap to the new
* one. When extending the tree you can specify whether
* the new blocks are to be left allocated or freed.
*/
void
blist_resize(blist_t *pbl, blist_blkno_t count, int freenew)
{
blist_t newbl = blist_create(count);
blist_t save = *pbl;
*pbl = newbl;
if (count > save->bl_blocks)
count = save->bl_blocks;
blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
/*
* If resizing upwards, should we free the new space or not?
*/
if (freenew && count < newbl->bl_blocks) {
blist_free(newbl, count, newbl->bl_blocks - count);
}
blist_destroy(save);
}
#ifdef BLIST_DEBUG
/*
* blist_print() - dump radix tree
*/
void
blist_print(blist_t bl)
{
printf("BLIST {\n");
blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
printf("}\n");
}
#endif
/************************************************************************
* ALLOCATION SUPPORT FUNCTIONS *
************************************************************************
*
* These support functions do all the actual work. They may seem
* rather longish, but that's because I've commented them up. The
* actual code is straightforward.
*
*/
/*
* blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap).
*
* This is the core of the allocator and is optimized for the 1 block
* and the BLIST_BMAP_RADIX block allocation cases. Other cases are
* somewhat slower. The 1 block allocation case is log2 and extremely
* quick.
*/
static blist_blkno_t
blst_leaf_alloc(
blmeta_t *scan,
blist_blkno_t blk,
int count
) {
blist_bitmap_t orig = scan->u.bmu_bitmap;
if (orig == 0) {
/*
* Optimize bitmap all-allocated case. Also, count = 1
* case assumes at least 1 bit is free in the bitmap, so
* we have to take care of this case here.
*/
scan->bm_bighint = 0;
return(BLIST_NONE);
}
if (count == 1) {
/*
* Optimized code to allocate one bit out of the bitmap
*/
blist_bitmap_t mask;
int j = BLIST_BMAP_RADIX/2;
int r = 0;
mask = (blist_bitmap_t)-1 >> (BLIST_BMAP_RADIX/2);
while (j) {
if ((orig & mask) == 0) {
r += j;
orig >>= j;
}
j >>= 1;
mask >>= j;
}
scan->u.bmu_bitmap &= ~((blist_bitmap_t)1 << r);
return(blk + r);
}
if (count <= BLIST_BMAP_RADIX) {
/*
* non-optimized code to allocate N bits out of the bitmap.
* The more bits, the faster the code runs. It will run
* the slowest allocating 2 bits, but since there aren't any
* memory ops in the core loop (or shouldn't be, anyway),
* you probably won't notice the difference.
*/
int j;
int n = BLIST_BMAP_RADIX - count;
blist_bitmap_t mask;
mask = (blist_bitmap_t)-1 >> n;
for (j = 0; j <= n; ++j) {
if ((orig & mask) == mask) {
scan->u.bmu_bitmap &= ~mask;
return(blk + j);
}
mask = (mask << 1);
}
}
/*
* We couldn't allocate count in this subtree, update bighint.
*/
scan->bm_bighint = count - 1;
return(BLIST_NONE);
}
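/*
 * Illustrative sketch (standalone): the same halving search used by the
 * count == 1 path above, finding the index of the lowest set bit of a
 * 32-bit word in log2 steps.  The sample word is arbitrary.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static int
find_set_bit_example(void)
{
        uint32_t orig = 0x00400000;     /* only bit 22 set */
        uint32_t mask = (uint32_t)-1 >> 16;
        int j = 16, r = 0;

        while (j) {
                if ((orig & mask) == 0) {       /* low half empty: skip it */
                        r += j;
                        orig >>= j;
                }
                j >>= 1;
                mask >>= j;
        }
        assert(r == 22);
        return r;
}
#endif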
/*
* blist_meta_alloc() - allocate at a meta in the radix tree.
*
* Attempt to allocate at a meta node. If we can't, we update
* bighint and return a failure. Updating bighint optimizes future
* calls that hit this node. We have to check for our collapse cases
* and we have a few optimizations strewn in as well.
*/
static blist_blkno_t
blst_meta_alloc(
blmeta_t *scan,
blist_blkno_t blk,
blist_blkno_t count,
blist_blkno_t radix,
blist_blkno_t skip
) {
blist_blkno_t i;
blist_blkno_t next_skip = (skip / BLIST_META_RADIX);
if (scan->u.bmu_avail == 0) {
/*
* ALL-ALLOCATED special case
*/
scan->bm_bighint = count;
return(BLIST_NONE);
}
if (scan->u.bmu_avail == radix) {
radix /= BLIST_META_RADIX;
/*
* ALL-FREE special case, initialize uninitialized
* sublevel.
*/
for (i = 1; i <= skip; i += next_skip) {
if (scan[i].bm_bighint == (blist_blkno_t)-1)
break;
if (next_skip == 1) {
scan[i].u.bmu_bitmap = (blist_bitmap_t)-1;
scan[i].bm_bighint = BLIST_BMAP_RADIX;
} else {
scan[i].bm_bighint = radix;
scan[i].u.bmu_avail = radix;
}
}
} else {
radix /= BLIST_META_RADIX;
}
for (i = 1; i <= skip; i += next_skip) {
if (scan[i].bm_bighint == (blist_blkno_t)-1) {
/*
* Terminator
*/
break;
} else if (count <= scan[i].bm_bighint) {
/*
* count fits in object
*/
blist_blkno_t r;
if (next_skip == 1) {
r = blst_leaf_alloc(&scan[i], blk, count);
} else {
r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
}
if (r != BLIST_NONE) {
scan->u.bmu_avail -= count;
if (scan->bm_bighint > scan->u.bmu_avail)
scan->bm_bighint = scan->u.bmu_avail;
return(r);
}
} else if (count > radix) {
/*
* count does not fit in this object even if it were
* completely free.
*/
panic("blist_meta_alloc: allocation too large");
}
blk += radix;
}
/*
* We couldn't allocate count in this subtree, update bighint.
*/
if (scan->bm_bighint >= count)
scan->bm_bighint = count - 1;
return(BLIST_NONE);
}
/*
* BLST_LEAF_FREE() - free allocated block from leaf bitmap
*
*/
static void
blst_leaf_free(
blmeta_t *scan,
blist_blkno_t blk,
int count
) {
/*
* free some data in this bitmap
*
* e.g.
* 0000111111111110000
* \_________/\__/
* v n
*/
int n = blk & (BLIST_BMAP_RADIX - 1);
blist_bitmap_t mask;
mask = ((blist_bitmap_t)-1 << n) &
((blist_bitmap_t)-1 >> (BLIST_BMAP_RADIX - count - n));
if (scan->u.bmu_bitmap & mask)
panic("blst_radix_free: freeing free block"); scan->u.bmu_bitmap |= mask;
/*
* We could probably do a better job here. We are required to make
* bighint at least as large as the biggest contiguous block of
* data. If we just shoehorn it, a little extra overhead will
* be incurred on the next allocation (but only that one typically).
*/
scan->bm_bighint = BLIST_BMAP_RADIX;
}
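/*
 * Illustrative sketch (standalone): the mask construction used by
 * blst_leaf_free() above and blst_leaf_fill() below.  For a 32-bit
 * bitmap, an offset n and a length count, the two shifts intersect to a
 * run of count consecutive bits starting at bit n.  Values are
 * arbitrary examples.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static void
leaf_mask_example(void)
{
        const int n = 4, count = 3;     /* bits 4..6 of a 32-bit leaf */
        uint32_t mask;

        mask = ((uint32_t)-1 << n) &
            ((uint32_t)-1 >> (32 - count - n));
        assert(mask == 0x70);           /* bits 4, 5 and 6 set */
}
#endif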
/*
* BLST_META_FREE() - free allocated blocks from radix tree meta info
*
* This support routine frees a range of blocks from the bitmap.
* The range must be entirely enclosed by this radix node. If a
* meta node, we break the range down recursively to free blocks
* in subnodes (which means that this code can free an arbitrary
* range whereas the allocation code cannot allocate an arbitrary
* range).
*/
static void
blst_meta_free(
blmeta_t *scan,
blist_blkno_t freeBlk,
blist_blkno_t count,
blist_blkno_t radix,
blist_blkno_t skip,
blist_blkno_t blk
) {
blist_blkno_t i;
blist_blkno_t next_skip = (skip / BLIST_META_RADIX);
#if 0
printf("FREE (%" PRIx64 ",%" PRIu64
") FROM (%" PRIx64 ",%" PRIu64 ")\n",
(uint64_t)freeBlk, (uint64_t)count,
(uint64_t)blk, (uint64_t)radix
);
#endif
if (scan->u.bmu_avail == 0) {
/*
* ALL-ALLOCATED special case, with possible
* shortcut to ALL-FREE special case.
*/
scan->u.bmu_avail = count;
scan->bm_bighint = count;
if (count != radix) {
for (i = 1; i <= skip; i += next_skip) {
if (scan[i].bm_bighint == (blist_blkno_t)-1)
break;
scan[i].bm_bighint = 0;
if (next_skip == 1) {
scan[i].u.bmu_bitmap = 0;
} else {
scan[i].u.bmu_avail = 0;
}
}
/* fall through */
}
} else {
scan->u.bmu_avail += count;
/* scan->bm_bighint = radix; */
}
/*
* ALL-FREE special case.
*/
if (scan->u.bmu_avail == radix)
return;
if (scan->u.bmu_avail > radix)
panic("blst_meta_free: freeing already free blocks (%"
PRIu64 ") %" PRIu64 "/%" PRIu64,
(uint64_t)count,
(uint64_t)scan->u.bmu_avail,
(uint64_t)radix);
/*
* Break the free down into its components
*/
radix /= BLIST_META_RADIX;
i = (freeBlk - blk) / radix;
blk += i * radix;
i = i * next_skip + 1;
while (i <= skip && blk < freeBlk + count) {
blist_blkno_t v;
v = blk + radix - freeBlk;
if (v > count)
v = count;
if (scan->bm_bighint == (blist_blkno_t)-1)
panic("blst_meta_free: freeing unexpected range");
if (next_skip == 1) {
blst_leaf_free(&scan[i], freeBlk, v);
} else {
blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
}
if (scan->bm_bighint < scan[i].bm_bighint)
scan->bm_bighint = scan[i].bm_bighint;
count -= v;
freeBlk += v;
blk += radix;
i += next_skip;
}
}
/*
* BLIST_RADIX_COPY() - copy one radix tree to another
*
* Locates free space in the source tree and frees it in the destination
* tree. The space may not already be free in the destination.
*/
static void blst_copy(
blmeta_t *scan,
blist_blkno_t blk,
blist_blkno_t radix,
blist_blkno_t skip,
blist_t dest,
blist_blkno_t count
) {
blist_blkno_t next_skip;
blist_blkno_t i;
/*
* Leaf node
*/
if (radix == BLIST_BMAP_RADIX) {
blist_bitmap_t v = scan->u.bmu_bitmap;
if (v == (blist_bitmap_t)-1) {
blist_free(dest, blk, count);
} else if (v != 0) {
int j;
for (j = 0; j < BLIST_BMAP_RADIX && j < count; ++j) {
if (v & ((blist_bitmap_t)1 << j))
blist_free(dest, blk + j, 1);
}
}
return;
}
/*
* Meta node
*/
if (scan->u.bmu_avail == 0) {
/*
* Source all allocated, leave dest allocated
*/
return;
}
if (scan->u.bmu_avail == radix) {
/*
* Source all free, free entire dest
*/
if (count < radix)
blist_free(dest, blk, count);
else
blist_free(dest, blk, radix);
return;
}
radix /= BLIST_META_RADIX;
next_skip = (skip / BLIST_META_RADIX);
for (i = 1; count && i <= skip; i += next_skip) {
if (scan[i].bm_bighint == (blist_blkno_t)-1)
break;
if (count >= radix) {
blst_copy(
&scan[i],
blk,
radix,
next_skip - 1,
dest,
radix
);
count -= radix;
} else {
if (count) {
blst_copy(
&scan[i],
blk,
radix,
next_skip - 1,
dest,
count
);
}
count = 0;
}
blk += radix;
}
}
/*
* BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap
*
* This routine allocates all blocks in the specified range
* regardless of any existing allocations in that range. Returns
* the number of blocks allocated by the call.
*/
static int
blst_leaf_fill(blmeta_t *scan, blist_blkno_t blk, int count)
{
int n = blk & (BLIST_BMAP_RADIX - 1);
int nblks;
blist_bitmap_t mask, bitmap;
mask = ((blist_bitmap_t)-1 << n) &
((blist_bitmap_t)-1 >> (BLIST_BMAP_RADIX - count - n));
/* Count the number of blocks we're about to allocate */
bitmap = scan->u.bmu_bitmap & mask;
for (nblks = 0; bitmap != 0; nblks++)
bitmap &= bitmap - 1;
scan->u.bmu_bitmap &= ~mask;
return nblks;
}
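/*
 * The bit-counting loop in blst_leaf_fill() uses the classic trick of
 * repeatedly clearing the lowest set bit.  A minimal stand-alone sketch
 * of the same idiom is shown below (illustrative only, not part of the
 * original source):
 */
#if 0
static int
popcount32_sketch(uint32_t w)
{
	int n;

	/* each iteration clears exactly one set bit of w */
	for (n = 0; w != 0; n++)
		w &= w - 1;
	return n;
}
#endif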
/*
 * BLST_META_FILL() - allocate specific blocks at a meta node
*
* This routine allocates the specified range of blocks,
* regardless of any existing allocations in the range. The
* range must be within the extent of this node. Returns the
* number of blocks allocated by the call.
*/
static blist_blkno_t
blst_meta_fill(
blmeta_t *scan,
blist_blkno_t allocBlk,
blist_blkno_t count,
blist_blkno_t radix,
blist_blkno_t skip,
blist_blkno_t blk
) {
blist_blkno_t i;
blist_blkno_t next_skip = (skip / BLIST_META_RADIX);
blist_blkno_t nblks = 0;
if (count == radix || scan->u.bmu_avail == 0) {
/*
* ALL-ALLOCATED special case
*/
nblks = scan->u.bmu_avail;
scan->u.bmu_avail = 0;
scan->bm_bighint = count;
return nblks;
}
if (count > radix)
panic("blist_meta_fill: allocation too large");
if (scan->u.bmu_avail == radix) {
radix /= BLIST_META_RADIX;
/*
* ALL-FREE special case, initialize sublevel
*/
for (i = 1; i <= skip; i += next_skip) {
if (scan[i].bm_bighint == (blist_blkno_t)-1)
break;
if (next_skip == 1) {
scan[i].u.bmu_bitmap = (blist_bitmap_t)-1;
scan[i].bm_bighint = BLIST_BMAP_RADIX;
} else {
scan[i].bm_bighint = radix;
scan[i].u.bmu_avail = radix;
}
}
} else {
radix /= BLIST_META_RADIX;
}
i = (allocBlk - blk) / radix;
blk += i * radix;
i = i * next_skip + 1;
while (i <= skip && blk < allocBlk + count) {
blist_blkno_t v;
v = blk + radix - allocBlk;
if (v > count)
v = count;
if (scan->bm_bighint == (blist_blkno_t)-1)
panic("blst_meta_fill: filling unexpected range");
if (next_skip == 1) {
nblks += blst_leaf_fill(&scan[i], allocBlk, v);
} else {
nblks += blst_meta_fill(&scan[i], allocBlk, v,
radix, next_skip - 1, blk);
}
count -= v;
allocBlk += v;
blk += radix;
i += next_skip;
}
scan->u.bmu_avail -= nblks;
return nblks;
}
/*
* BLST_RADIX_INIT() - initialize radix tree
*
* Initialize our meta structures and bitmaps and calculate the exact
* amount of space required to manage 'count' blocks - this space may
* be considerably less than the calculated radix due to the large
* RADIX values we use.
*/
static blist_blkno_t
blst_radix_init(blmeta_t *scan, blist_blkno_t radix, blist_blkno_t skip,
blist_blkno_t count)
{
blist_blkno_t i;
blist_blkno_t next_skip;
blist_blkno_t memindex = 0;
/*
* Leaf node
*/
if (radix == BLIST_BMAP_RADIX) {
	if (scan) {
		scan->bm_bighint = 0;
scan->u.bmu_bitmap = 0;
}
return(memindex);
}
/*
* Meta node. If allocating the entire object we can special
* case it. However, we need to figure out how much memory
* is required to manage 'count' blocks, so we continue on anyway.
*/
	if (scan) {
		scan->bm_bighint = 0;
scan->u.bmu_avail = 0;
}
radix /= BLIST_META_RADIX;
next_skip = (skip / BLIST_META_RADIX);
for (i = 1; i <= skip; i += next_skip) {
if (count >= radix) {
/*
* Allocate the entire object
*/
memindex = i + blst_radix_init(
((scan) ? &scan[i] : NULL),
radix,
next_skip - 1,
radix
);
count -= radix;
} else if (count > 0) {
/*
* Allocate a partial object
*/
memindex = i + blst_radix_init(
((scan) ? &scan[i] : NULL),
radix,
next_skip - 1,
count
);
count = 0;
} else {
/*
* Add terminator and break out
*/
			if (scan)
				scan[i].bm_bighint = (blist_blkno_t)-1;
break;
}
}
if (memindex < i)
memindex = i;
return(memindex);
}
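/*
 * Sketch of how the return value is normally consumed: a first pass with
 * scan == NULL only counts how many blmeta_t nodes are needed, the node
 * array is then allocated, and a second pass initializes it.  This is
 * illustrative only; the allocator call and the radix/skip arguments shown
 * here are assumptions, not the exact code used by the blist constructor.
 */
#if 0
static blmeta_t *
blist_nodes_sketch(blist_blkno_t count, blist_blkno_t radix,
    blist_blkno_t skip)
{
	blist_blkno_t nodes;
	blmeta_t *root;

	/* pass 1: count nodes without touching memory */
	nodes = blst_radix_init(NULL, radix, skip, count);
	root = kmem_zalloc(nodes * sizeof(*root), KM_SLEEP);
	/* pass 2: initialize the freshly allocated tree */
	(void)blst_radix_init(root, radix, skip, count);
	return root;
}
#endif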
#ifdef BLIST_DEBUG
static void
blst_radix_print(blmeta_t *scan, blist_blkno_t blk, blist_blkno_t radix,
blist_blkno_t skip, int tab)
{
blist_blkno_t i;
blist_blkno_t next_skip;
int lastState = 0;
if (radix == BLIST_BMAP_RADIX) {
printf(
"%*.*s(%0*" PRIx64 ",%" PRIu64
"): bitmap %0*" PRIx64 " big=%" PRIu64 "\n",
tab, tab, "",
sizeof(blk) * 2,
(uint64_t)blk,
(uint64_t)radix,
sizeof(scan->u.bmu_bitmap) * 2,
(uint64_t)scan->u.bmu_bitmap,
(uint64_t)scan->bm_bighint
);
return;
}
if (scan->u.bmu_avail == 0) {
printf(
"%*.*s(%0*" PRIx64 ",%" PRIu64") ALL ALLOCATED\n",
tab, tab, "",
sizeof(blk) * 2,
(uint64_t)blk,
(uint64_t)radix
);
return;
}
if (scan->u.bmu_avail == radix) {
printf(
"%*.*s(%0*" PRIx64 ",%" PRIu64 ") ALL FREE\n",
tab, tab, "",
sizeof(blk) * 2,
(uint64_t)blk,
(uint64_t)radix
);
return;
}
printf(
"%*.*s(%0*" PRIx64 ",%" PRIu64 "): subtree (%" PRIu64 "/%"
PRIu64 ") big=%" PRIu64 " {\n",
tab, tab, "",
sizeof(blk) * 2,
(uint64_t)blk,
(uint64_t)radix,
(uint64_t)scan->u.bmu_avail,
(uint64_t)radix,
(uint64_t)scan->bm_bighint
);
radix /= BLIST_META_RADIX;
next_skip = (skip / BLIST_META_RADIX);
tab += 4;
for (i = 1; i <= skip; i += next_skip) {
if (scan[i].bm_bighint == (blist_blkno_t)-1) {
printf(
"%*.*s(%0*" PRIx64 ",%" PRIu64 "): Terminator\n",
tab, tab, "",
sizeof(blk) * 2,
(uint64_t)blk,
(uint64_t)radix
);
lastState = 0;
break;
}
blst_radix_print(
&scan[i],
blk,
radix,
next_skip - 1,
tab
);
blk += radix;
}
tab -= 4;
printf(
"%*.*s}\n",
tab, tab, ""
);
}
#endif
#ifdef BLIST_DEBUG
int
main(int ac, char **av)
{
blist_blkno_t size = 1024;
int i;
blist_t bl;
for (i = 1; i < ac; ++i) {
const char *ptr = av[i];
if (*ptr != '-') {
size = strtol(ptr, NULL, 0);
continue;
}
ptr += 2;
fprintf(stderr, "Bad option: %s\n", ptr - 2);
exit(1);
}
bl = blist_create(size);
blist_free(bl, 0, size);
for (;;) {
char buf[1024];
uint64_t da = 0;
uint64_t count = 0;
printf("%" PRIu64 "/%" PRIu64 "/%" PRIu64 "> ",
(uint64_t)bl->bl_free,
(uint64_t)size,
(uint64_t)bl->bl_radix);
fflush(stdout);
if (fgets(buf, sizeof(buf), stdin) == NULL)
break;
switch(buf[0]) {
case 'r':
if (sscanf(buf + 1, "%" SCNu64, &count) == 1) {
blist_resize(&bl, count, 1);
} else {
printf("?\n");
			}
			/* fall through */
case 'p':
blist_print(bl);
break;
case 'a':
if (sscanf(buf + 1, "%" SCNu64, &count) == 1) {
blist_blkno_t blk = blist_alloc(bl, count);
printf(" R=%0*" PRIx64 "\n",
sizeof(blk) * 2,
(uint64_t)blk);
} else {
printf("?\n");
}
break;
case 'f':
if (sscanf(buf + 1, "%" SCNx64 " %" SCNu64,
&da, &count) == 2) {
blist_free(bl, da, count);
} else {
printf("?\n");
}
break;
case 'l':
if (sscanf(buf + 1, "%" SCNx64 " %" SCNu64,
&da, &count) == 2) {
printf(" n=%" PRIu64 "\n",
(uint64_t)blist_fill(bl, da, count));
} else {
printf("?\n");
}
break;
case '?':
case 'h':
puts(
"p -print\n"
"a %d -allocate\n"
"f %x %d -free\n"
"l %x %d -fill\n"
"r %d -resize\n"
"h/? -help"
);
break;
default:
printf("?\n");
break;
}
}
return(0);
}
void
panic(const char *ctl, ...)
{
va_list va;
va_start(va, ctl);
vfprintf(stderr, ctl, va);
fprintf(stderr, "\n");
va_end(va);
exit(1);
}
#endif
/* $NetBSD: strnlen.c,v 1.2 2014/01/09 11:25:11 apb Exp $ */
/*-
* Copyright (c) 2009 David Schultz <das@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strnlen.c,v 1.2 2014/01/09 11:25:11 apb Exp $");
#endif /* LIBC_SCCS and not lint */
/* FreeBSD: src/lib/libc/string/strnlen.c,v 1.1 2009/02/28 06:00:58 das Exp */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
#if !HAVE_STRNLEN
size_t
strnlen(const char *s, size_t maxlen)
{
size_t len;
	for (len = 0; len < maxlen; len++, s++) {
		if (!*s)
break;
}
return (len);
}
#endif /* !HAVE_STRNLEN */
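/*
 * Usage sketch (illustrative only): strnlen() never reads past maxlen
 * bytes, so it is safe on fixed-size fields that may lack a NUL.
 */
#if 0
static size_t
label_length(const char buf[8])
{
	/* never reads beyond the 8-byte field, even without a NUL */
	return strnlen(buf, 8);
}
#endif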
/* $NetBSD: strlcpy.c,v 1.3 2007/06/04 18:19:27 christos Exp $ */
/* $OpenBSD: strlcpy.c,v 1.7 2003/04/12 21:56:39 millert Exp $ */
/*
* Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND TODD C. MILLER DISCLAIMS ALL
* WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL TODD C. MILLER BE LIABLE
* FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if !defined(_KERNEL) && !defined(_STANDALONE)
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strlcpy.c,v 1.3 2007/06/04 18:19:27 christos Exp $");
#endif /* LIBC_SCCS and not lint */
#ifdef _LIBC
#include "namespace.h"
#endif
#include <sys/types.h>
#include <assert.h>
#include <string.h>
#ifdef _LIBC
# ifdef __weak_alias
__weak_alias(strlcpy, _strlcpy)
# endif
#endif
#else
#include <lib/libkern/libkern.h>
#endif /* !_KERNEL && !_STANDALONE */
#if !HAVE_STRLCPY
/*
* Copy src to string dst of size siz. At most siz-1 characters
* will be copied. Always NUL terminates (unless siz == 0).
* Returns strlen(src); if retval >= siz, truncation occurred.
*/
size_t
strlcpy(char *dst, const char *src, size_t siz)
{
char *d = dst;
const char *s = src;
size_t n = siz;
	_DIAGASSERT(dst != NULL);
	_DIAGASSERT(src != NULL);
/* Copy as many bytes as will fit */
if (n != 0 && --n != 0) {
do {
if ((*d++ = *s++) == 0)
break;
} while (--n != 0);
}
/* Not enough room in dst, add NUL and traverse rest of src */
if (n == 0) {
		if (siz != 0)
			*d = '\0';		/* NUL-terminate dst */
		while (*s++)
			;
}
return(s - src - 1); /* count does not include NUL */
}
#endif
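/*
 * Usage sketch (illustrative only): because the return value is
 * strlen(src), truncation is detected by comparing it with the buffer
 * size rather than re-scanning the destination.
 */
#if 0
static int
copy_name(char *dst, size_t dstsize, const char *src)
{
	/* returns 0 on success, -1 if src did not fit */
	if (strlcpy(dst, src, dstsize) >= dstsize)
		return -1;
	return 0;
}
#endif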
/* $NetBSD: ufs_vfsops.c,v 1.61 2023/02/22 21:49:45 riastradh Exp $ */
/*
* Copyright (c) 1991, 1993, 1994
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_vfsops.c 8.8 (Berkeley) 5/20/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_vfsops.c,v 1.61 2023/02/22 21:49:45 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_wapbl.h"
#endif
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/module.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <miscfs/specfs/specdev.h>
#include <sys/quotactl.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
/* how many times ufs_init() was called */
static int ufs_initcount = 0;
pool_cache_t ufs_direct_cache;
/*
* Make a filesystem operational.
* Nothing to do at the moment.
*/
/* ARGSUSED */
int
ufs_start(struct mount *mp, int flags)
{
return (0);
}
/*
* Return the root of a filesystem.
*/
int
ufs_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct vnode *nvp;
int error;
if ((error = VFS_VGET(mp, (ino_t)UFS_ROOTINO, lktype, &nvp)) != 0)
return (error);
*vpp = nvp;
return (0);
}
/*
* Look up and return a vnode/inode pair by inode number.
*/
int
ufs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
int error;
error = vcache_get(mp, &ino, sizeof(ino), vpp);
if (error)
return error;
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
/*
* Do operations associated with quotas
*/
int
ufs_quotactl(struct mount *mp, struct quotactl_args *args)
{
#if !defined(QUOTA) && !defined(QUOTA2)
(void) mp;
(void) args;
return (EOPNOTSUPP);
#else
struct lwp *l = curlwp;
int error;
/* Mark the mount busy, as we're passing it to kauth(9). */
error = vfs_busy(mp);
if (error) {
return (error);
}
mutex_enter(mp->mnt_updating);
error = quota_handle_cmd(mp, l, args);
mutex_exit(mp->mnt_updating);
vfs_unbusy(mp);
return (error);
#endif
}
#if 0
switch (cmd) {
case Q_SYNC:
break;
case Q_GETQUOTA:
/* The user can always query about his own quota. */
if (uid == kauth_cred_getuid(l->l_cred))
break;
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
KAUTH_REQ_SYSTEM_FS_QUOTA_GET, mp, KAUTH_ARG(uid), NULL);
break;
case Q_QUOTAON:
case Q_QUOTAOFF:
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF, mp, NULL, NULL);
break;
case Q_SETQUOTA:
case Q_SETUSE:
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_QUOTA,
KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE, mp, KAUTH_ARG(uid), NULL);
break;
default:
error = EINVAL;
break;
}
type = cmds & SUBCMDMASK;
if (!error) {
/* Only check if there was no error above. */
if ((u_int)type >= MAXQUOTAS)
error = EINVAL;
}
if (error) {
vfs_unbusy(mp);
return (error);
}
mutex_enter(mp->mnt_updating);
switch (cmd) {
case Q_QUOTAON:
error = quotaon(l, mp, type, arg);
break;
case Q_QUOTAOFF:
error = quotaoff(l, mp, type);
break;
case Q_SETQUOTA:
error = setquota(mp, uid, type, arg);
break;
case Q_SETUSE:
error = setuse(mp, uid, type, arg);
break;
case Q_GETQUOTA:
error = getquota(mp, uid, type, arg);
break;
case Q_SYNC:
error = qsync(mp);
break;
default:
error = EINVAL;
}
mutex_exit(mp->mnt_updating);
vfs_unbusy(mp);
return (error);
#endif
/*
* This is the generic part of fhtovp called after the underlying
* filesystem has validated the file handle.
*/
int
ufs_fhtovp(struct mount *mp, struct ufid *ufhp, int lktype, struct vnode **vpp)
{
struct vnode *nvp;
struct inode *ip;
int error;
if ((error = VFS_VGET(mp, ufhp->ufid_ino, lktype, &nvp)) != 0) {
if (error == ENOENT)
error = ESTALE;
*vpp = NULLVP;
return (error);
}
ip = VTOI(nvp);
KASSERT(ip != NULL);
if (ip->i_mode == 0 || ip->i_gen != ufhp->ufid_gen ||
((ip->i_mode & IFMT) == IFDIR && ip->i_size == 0)) {
vput(nvp);
*vpp = NULLVP;
return (ESTALE);
}
*vpp = nvp;
return (0);
}
/*
* Initialize UFS filesystems, done only once.
*/
void
ufs_init(void)
{
if (ufs_initcount++ > 0)
return;
ufs_direct_cache = pool_cache_init(sizeof(struct direct), 0, 0, 0,
"ufsdir", NULL, IPL_NONE, NULL, NULL, NULL);
#if defined(QUOTA) || defined(QUOTA2)
dqinit();
#endif
#ifdef UFS_DIRHASH
ufsdirhash_init();
#endif
#ifdef UFS_EXTATTR
ufs_extattr_init();
#endif
}
void
ufs_reinit(void)
{
#if defined(QUOTA) || defined(QUOTA2)
dqreinit();
#endif
}
/*
* Free UFS filesystem resources, done only once.
*/
void
ufs_done(void)
{
if (--ufs_initcount > 0)
return;
#if defined(QUOTA) || defined(QUOTA2)
dqdone();
#endif
pool_cache_destroy(ufs_direct_cache);
#ifdef UFS_DIRHASH
ufsdirhash_done();
#endif
#ifdef UFS_EXTATTR
ufs_extattr_done();
#endif
}
/*
* module interface
*/
#ifdef WAPBL
MODULE(MODULE_CLASS_MISC, ufs, "wapbl");
#else
MODULE(MODULE_CLASS_MISC, ufs, NULL);
#endif
static int
ufs_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
ufs_init();
error = 0;
break;
case MODULE_CMD_FINI:
ufs_done();
error = 0;
break;
default:
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: raw_ip6.c,v 1.184 2024/02/24 21:41:13 mlelstv Exp $ */
/* $KAME: raw_ip6.c,v 1.82 2001/07/23 18:57:56 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_ip6.c,v 1.184 2024/02/24 21:41:13 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/net_stats.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/ip6_mroute.h>
#include <netinet/icmp6.h>
#include <netinet6/icmp6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#include <netinet6/raw_ip6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#endif
#include "faith.h"
#if defined(NFAITH) && 0 < NFAITH
#include <net/if_faith.h>
#endif
extern struct inpcbtable rawcbtable;
struct inpcbtable raw6cbtable;
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
/*
* Raw interface to IP6 protocol.
*/
static percpu_t *rip6stat_percpu;
#define RIP6_STATINC(x) _NET_STATINC(rip6stat_percpu, x)
static void sysctl_net_inet6_raw6_setup(struct sysctllog **);
/*
* Initialize raw connection block queue.
*/
void
rip6_init(void)
{
sysctl_net_inet6_raw6_setup(NULL);
in6pcb_init(&raw6cbtable, 1, 1);
rip6stat_percpu = percpu_alloc(sizeof(uint64_t) * RIP6_NSTATS);
}
static void
rip6_sbappendaddr(struct inpcb *last, struct ip6_hdr *ip6,
const struct sockaddr *sa, int hlen, struct mbuf *n)
{
struct mbuf *opts = NULL;
if (last->inp_flags & IN6P_CONTROLOPTS ||
SOOPT_TIMESTAMP(last->inp_socket->so_options))
ip6_savecontrol(last, &opts, ip6, n);
m_adj(n, hlen);
if (sbappendaddr(&last->inp_socket->so_rcv, sa, n, opts) == 0) {
soroverflow(last->inp_socket);
m_freem(n);
if (opts)
m_freem(opts);
RIP6_STATINC(RIP6_STAT_FULLSOCK);
} else {
sorwakeup(last->inp_socket);
}
}
/*
* Setup generic address and protocol structures
* for raw_input routine, then pass them along with
* mbuf chain.
*/
int
rip6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb *inp;
struct inpcb *last = NULL;
struct sockaddr_in6 rip6src;
struct mbuf *n;
RIP6_STATINC(RIP6_STAT_IPACKETS);
#if defined(NFAITH) && 0 < NFAITH
if (faithprefix(&ip6->ip6_dst)) {
/* send icmp6 host unreach? */
m_freem(m);
return IPPROTO_DONE;
}
#endif
sockaddr_in6_init(&rip6src, &ip6->ip6_src, 0, 0, 0);
if (sa6_recoverscope(&rip6src) != 0) {
/* XXX: should be impossible. */
m_freem(m);
return IPPROTO_DONE;
}
TAILQ_FOREACH(inp, &raw6cbtable.inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if (in6p_ip6(inp).ip6_nxt &&
in6p_ip6(inp).ip6_nxt != proto)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &ip6->ip6_dst))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) &&
!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &ip6->ip6_src))
continue;
if (in6p_cksum(inp) != -1) {
RIP6_STATINC(RIP6_STAT_ISUM);
/*
* Although in6_cksum() does not need the position of
* the checksum field for verification, enforce that it
			 * is located within the packet.  Userland has supplied
			 * a checksum offset; a packet too short to hold it is
			 * invalid.  Take care to avoid overflow with the
			 * user-supplied offset.
*/
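			/*
			 * Worked example (illustrative numbers only): with
			 * m_pkthdr.len == 60, *offp == 40 and a user offset
			 * of 16, the payload holds 60 - 40 == 20 bytes and
			 * 20 - 2 == 18 >= 16, so the 2-byte checksum field
			 * fits; an offset of 19 would fail the second test.
			 */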
if (m->m_pkthdr.len < *offp + 2 ||
m->m_pkthdr.len - *offp - 2 < in6p_cksum(inp) ||
in6_cksum(m, proto, *offp,
m->m_pkthdr.len - *offp)) {
RIP6_STATINC(RIP6_STAT_BADSUM);
continue;
}
}
if (last == NULL) {
;
}
#ifdef IPSEC
else if (ipsec_used && ipsec_in_reject(m, last)) {
/* do not inject data into pcb */
}
#endif
else if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) {
rip6_sbappendaddr(last, ip6, sin6tosa(&rip6src),
*offp, n);
}
last = inp;
}
#ifdef IPSEC
if (ipsec_used && last && ipsec_in_reject(m, last)) {
m_freem(m);
IP6_STATDEC(IP6_STAT_DELIVERED);
/* do not inject data into pcb */
} else
#endif
if (last != NULL) {
rip6_sbappendaddr(last, ip6, sin6tosa(&rip6src), *offp, m);
} else {
RIP6_STATINC(RIP6_STAT_NOSOCK);
if (m->m_flags & M_MCAST)
RIP6_STATINC(RIP6_STAT_NOSOCKMCAST);
if (proto == IPPROTO_NONE)
m_freem(m);
else {
int s;
struct ifnet *rcvif = m_get_rcvif(m, &s);
const int prvnxt = ip6_get_prevhdr(m, *offp);
in6_ifstat_inc(rcvif, ifs6_in_protounknown);
m_put_rcvif(rcvif, &s);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_NEXTHEADER,
prvnxt);
}
IP6_STATDEC(IP6_STAT_DELIVERED);
}
return IPPROTO_DONE;
}
void *
rip6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct ip6_hdr *ip6;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
void (*notify)(struct inpcb *, int) = in6pcb_rtchange;
int nxt;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
ip6 = ip6cp->ip6c_ip6;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
nxt = ip6cp->ip6c_nxt;
} else {
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
nxt = -1;
}
if (ip6 && cmd == PRC_MSGSIZE) {
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
int valid = 0;
struct inpcb *inp;
/*
* Check to see if we have a valid raw IPv6 socket
* corresponding to the address in the ICMPv6 message
* payload, and the protocol (ip6_nxt) meets the socket.
* XXX chase extension headers, or pass final nxt value
* from icmp6_notify_error()
*/
inp = NULL;
inp = in6pcb_lookup(&raw6cbtable, &sa6->sin6_addr, 0,
(const struct in6_addr *)&sa6_src->sin6_addr, 0, 0, 0);
#if 0
if (!inp) {
/*
* As the use of sendto(2) is fairly popular,
* we may want to allow non-connected pcb too.
* But it could be too weak against attacks...
* We should at least check if the local
* address (= s) is really ours.
*/
inp = in6pcb_lookup_bound(&raw6cbtable,
&sa6->sin6_addr, 0, 0);
}
#endif
if (inp && in6p_ip6(inp).ip6_nxt &&
in6p_ip6(inp).ip6_nxt == nxt)
valid++;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* regardless of if we called icmp6_mtudisc_update(),
* we need to call in6pcb_notify(), to notify path MTU
* change to the userland (RFC3542), because some
* unconnected sockets may share the same destination
* and want to know the path MTU.
*/
}
(void) in6pcb_notify(&raw6cbtable, sa, 0,
sin6tocsa(sa6_src), 0, cmd, cmdarg, notify);
return NULL;
}
/*
* Generate IPv6 header and pass packet to ip6_output.
* Tack on options user may have setup with control call.
*/
int
rip6_output(struct mbuf *m, struct socket * const so,
struct sockaddr_in6 * const dstsock, struct mbuf * const control)
{
struct in6_addr *dst;
struct ip6_hdr *ip6;
struct inpcb *inp;
u_int plen = m->m_pkthdr.len;
int error = 0;
struct ip6_pktopts opt, *optp = NULL;
struct ifnet *oifp = NULL;
int type, code; /* for ICMPv6 output statistics only */
int scope_ambiguous = 0;
int bound = curlwp_bind();
struct psref psref;
inp = sotoinpcb(so);
dst = &dstsock->sin6_addr;
	if (control) {
		if ((error = ip6_setpktopts(control, &opt,
in6p_outputopts(inp),
kauth_cred_get(), so->so_proto->pr_protocol)) != 0) {
goto bad;
}
optp = &opt;
} else
optp = in6p_outputopts(inp);
/*
* Check and convert scope zone ID into internal form.
* XXX: we may still need to determine the zone later.
*/
if (!(so->so_state & SS_ISCONNECTED)) {
if (dstsock->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(dstsock, ip6_use_defzone)) != 0)
goto bad;
}
/*
* For an ICMPv6 packet, we should know its type and code
* to update statistics.
*/
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
if (m->m_len < sizeof(struct icmp6_hdr) &&
(m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
error = ENOBUFS;
goto bad;
}
icmp6 = mtod(m, struct icmp6_hdr *);
type = icmp6->icmp6_type;
code = icmp6->icmp6_code;
} else {
type = 0;
code = 0;
}
M_PREPEND(m, sizeof(*ip6), M_DONTWAIT);
if (!m) {
error = ENOBUFS;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Next header might not be ICMP6 but use its pseudo header anyway.
*/
ip6->ip6_dst = *dst;
/*
* Source address selection.
*/
error = in6_selectsrc(dstsock, optp, in6p_moptions(inp),
&inp->inp_route, &in6p_laddr(inp), &oifp, &psref, &ip6->ip6_src);
if (error != 0)
goto bad;
if (oifp && scope_ambiguous) {
/*
* Application should provide a proper zone ID or the use of
* default zone IDs should be enabled. Unfortunately, some
		 * applications do not behave as they should, so we need a
		 * workaround.  Even if an appropriate ID is not determined
		 * (when it's required), if we can determine the outgoing
		 * interface, determine the zone ID based on the interface.
*/
error = in6_setscope(&dstsock->sin6_addr, oifp, NULL);
if (error != 0)
goto bad;
}
ip6->ip6_dst = dstsock->sin6_addr;
/* fill in the rest of the IPv6 header fields */
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be filled in ip6_output, so not fill it here. */
ip6->ip6_nxt = in6p_ip6(inp).ip6_nxt;
ip6->ip6_hlim = in6pcb_selecthlim(inp, oifp);
if_put(oifp, &psref);
oifp = NULL;
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
in6p_cksum(inp) != -1) {
const uint8_t nxt = ip6->ip6_nxt;
int off;
u_int16_t sum;
/* compute checksum */
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
off = offsetof(struct icmp6_hdr, icmp6_cksum);
else
off = in6p_cksum(inp);
if (plen < 2 || plen - 2 < off) {
error = EINVAL;
goto bad;
}
off += sizeof(struct ip6_hdr);
sum = 0;
m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum,
M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
sum = in6_cksum(m, nxt, sizeof(*ip6), plen);
m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum,
M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
}
{
struct ifnet *ret_oifp = NULL;
error = ip6_output(m, optp, &inp->inp_route, 0,
in6p_moptions(inp), inp, &ret_oifp);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
			if (ret_oifp)
				icmp6_ifoutstat_inc(ret_oifp, type, code);
ICMP6_STATINC(ICMP6_STAT_OUTHIST + type);
} else
RIP6_STATINC(RIP6_STAT_OPACKETS);
}
goto freectl;
bad:
if (m)
m_freem(m);
freectl:
	if (control) {
		ip6_clearpktopts(&opt, -1);
m_freem(control);
}
if_put(oifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Raw IPv6 socket option processing.
*/
int
rip6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int error = 0;
if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) {
int optval;
/* need to fiddle w/ opt(IPPROTO_IPV6, IPV6_CHECKSUM)? */
if (op == PRCO_GETOPT) {
optval = 1;
error = sockopt_set(sopt, &optval, sizeof(optval));
} else if (op == PRCO_SETOPT) {
error = sockopt_getint(sopt, &optval);
if (error)
goto out;
if (optval == 0)
error = EINVAL;
}
goto out;
} else if (sopt->sopt_level != IPPROTO_IPV6)
return ip6_ctloutput(op, so, sopt);
switch (sopt->sopt_name) {
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
		if (op == PRCO_SETOPT)
			error = ip6_mrouter_set(so, sopt);
else if (op == PRCO_GETOPT)
error = ip6_mrouter_get(so, sopt);
else
error = EINVAL;
break;
case IPV6_CHECKSUM:
return ip6_raw_ctloutput(op, so, sopt);
default:
return ip6_ctloutput(op, so, sopt);
}
out:
return error;
}
extern u_long rip6_sendspace;
extern u_long rip6_recvspace;
int
rip6_attach(struct socket *so, int proto)
{
struct inpcb *inp;
int s, error;
KASSERT(sotoinpcb(so) == NULL);
sosetlock(so);
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_RAWSOCK,
KAUTH_ARG(AF_INET6),
KAUTH_ARG(SOCK_RAW),
KAUTH_ARG(so->so_proto->pr_protocol));
if (error) {
return error;
}
s = splsoftnet();
error = soreserve(so, rip6_sendspace, rip6_recvspace);
if (error) {
splx(s);
return error;
}
if ((error = inpcb_create(so, &raw6cbtable)) != 0) {
splx(s);
return error;
}
splx(s);
inp = sotoinpcb(so);
in6p_ip6(inp).ip6_nxt = proto;
in6p_cksum(inp) = -1;
in6p_icmp6filt(inp) = kmem_alloc(sizeof(struct icmp6_filter), KM_SLEEP);
ICMP6_FILTER_SETPASSALL(in6p_icmp6filt(inp));
KASSERT(solocked(so));
return error;
}
static void
rip6_detach(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
	KASSERT(solocked(so));
	KASSERT(inp != NULL);

	if (so == ip6_mrouter) {
		ip6_mrouter_done();
}
/* xxx: RSVP */
	if (in6p_icmp6filt(inp) != NULL) {
		kmem_free(in6p_icmp6filt(inp), sizeof(struct icmp6_filter));
in6p_icmp6filt(inp) = NULL;
}
inpcb_destroy(inp);
}
static int
rip6_accept(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct ifaddr *ifa = NULL;
int error = 0;
int s;
	KASSERT(solocked(so));
	KASSERT(inp != NULL);
	KASSERT(nam != NULL);

	if (addr->sin6_len != sizeof(*addr))
return EINVAL;
if (IFNET_READER_EMPTY() || addr->sin6_family != AF_INET6)
return EADDRNOTAVAIL;
if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0)
return error;
/*
* we don't support mapped address here, it would confuse
* users so reject it
*/
if (IN6_IS_ADDR_V4MAPPED(&addr->sin6_addr))
return EADDRNOTAVAIL;
s = pserialize_read_enter();
if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
(ifa = ifa_ifwithaddr(sin6tosa(addr))) == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
if (ifa && (ifatoia6(ifa))->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) {
error = EADDRNOTAVAIL;
goto out;
}
in6p_laddr(inp) = addr->sin6_addr;
error = 0;
out:
pserialize_read_exit(s);
return error;
}
static int
rip6_listen(struct socket *so, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *addr = (struct sockaddr_in6 *)nam;
struct in6_addr in6a;
struct ifnet *ifp = NULL;
int scope_ambiguous = 0;
int error = 0;
struct psref psref;
int bound;
	KASSERT(solocked(so));
	KASSERT(inp != NULL);
	KASSERT(nam != NULL);

	if (IFNET_READER_EMPTY())
return EADDRNOTAVAIL;
if (addr->sin6_family != AF_INET6)
return EAFNOSUPPORT;
if (addr->sin6_len != sizeof(*addr))
return EINVAL;
/*
* Application should provide a proper zone ID or the use of
* default zone IDs should be enabled. Unfortunately, some
	 * applications do not behave as they should, so we need a
* workaround. Even if an appropriate ID is not determined,
* we'll see if we can determine the outgoing interface. If we
* can, determine the zone ID based on the interface below.
*/
if (addr->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0)
return error;
bound = curlwp_bind();
/* Source address selection. XXX: need pcblookup? */
error = in6_selectsrc(addr, in6p_outputopts(inp),
in6p_moptions(inp), &inp->inp_route,
&in6p_laddr(inp), &ifp, &psref, &in6a);
if (error != 0)
goto out;
/* XXX: see above */
if (ifp && scope_ambiguous &&
(error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) {
goto out;
}
in6p_laddr(inp) = in6a;
in6p_faddr(inp) = addr->sin6_addr;
soisconnected(so);
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
static int
rip6_connect2(struct socket *so, struct socket *so2)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_disconnect(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
	KASSERT(solocked(so));
	KASSERT(inp != NULL);

	if ((so->so_state & SS_ISCONNECTED) == 0)
return ENOTCONN;
in6p_faddr(inp) = in6addr_any;
so->so_state &= ~SS_ISCONNECTED; /* XXX */
return 0;
}
static int
rip6_shutdown(struct socket *so)
{
KASSERT(solocked(so));
/*
	 * Mark the connection as being incapable of further output.
*/
socantsendmore(so);
return 0;
}
static int
rip6_abort(struct socket *so)
{
KASSERT(solocked(so));
soisdisconnected(so);
rip6_detach(so);
return 0;
}
static int
rip6_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp)
{
return in6_control(so, cmd, nam, ifp);
}
static int
rip6_stat(struct socket *so, struct stat *ub)
{
KASSERT(solocked(so));
/* stat: don't bother with a blocksize */
return 0;
}
static int
rip6_peeraddr(struct socket *so, struct sockaddr *nam)
{
	KASSERT(solocked(so));
	KASSERT(sotoinpcb(so) != NULL);
	KASSERT(nam != NULL);
in6pcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
rip6_sockaddr(struct socket *so, struct sockaddr *nam)
{
	KASSERT(solocked(so));
	KASSERT(sotoinpcb(so) != NULL);
	KASSERT(nam != NULL);
in6pcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
rip6_rcvd(struct socket *so, int flags, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
rip6_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 tmp;
struct sockaddr_in6 *dst;
int error = 0;
	KASSERT(solocked(so));
	KASSERT(inp != NULL);
	KASSERT(m != NULL);
/*
* Ship a packet out. The appropriate raw output
* routine handles any messaging necessary.
*/
/* always copy sockaddr to avoid overwrites */
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
error = EISCONN;
goto release;
}
/* XXX */
sockaddr_in6_init(&tmp, &in6p_faddr(inp), 0, 0, 0);
dst = &tmp;
} else {
if (nam == NULL) {
error = ENOTCONN;
goto release;
}
tmp = *(struct sockaddr_in6 *)nam;
dst = &tmp;
if (dst->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
goto release;
}
if (dst->sin6_len != sizeof(*dst)) {
error = EINVAL;
goto release;
}
}
error = rip6_output(m, so, dst, control);
m = NULL;
release:
	if (m)
		m_freem(m);
return error;
}
static int
rip6_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
static int
rip6_purgeif(struct socket *so, struct ifnet *ifp)
{
mutex_enter(softnet_lock);
in6pcb_purgeif0(&raw6cbtable, ifp);
#ifdef NET_MPSAFE
mutex_exit(softnet_lock);
#endif
in6_purgeif(ifp);
#ifdef NET_MPSAFE
mutex_enter(softnet_lock);
#endif
in6pcb_purgeif(&raw6cbtable, ifp);
mutex_exit(softnet_lock);
return 0;
}
static int
sysctl_net_inet6_raw6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(rip6stat_percpu, RIP6_NSTATS));
}
static void
sysctl_net_inet6_raw6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "raw6",
SYSCTL_DESCR("Raw IPv6 settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_RAW, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pcblist",
SYSCTL_DESCR("Raw IPv6 control block list"),
sysctl_inpcblist, 0, &raw6cbtable, 0,
CTL_NET, PF_INET6, IPPROTO_RAW,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("Raw IPv6 statistics"),
sysctl_net_inet6_raw6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_RAW, RAW6CTL_STATS,
CTL_EOL);
}
PR_WRAP_USRREQS(rip6)
#define rip6_attach rip6_attach_wrapper
#define rip6_detach rip6_detach_wrapper
#define rip6_accept rip6_accept_wrapper
#define rip6_bind rip6_bind_wrapper
#define rip6_listen rip6_listen_wrapper
#define rip6_connect rip6_connect_wrapper
#define rip6_connect2 rip6_connect2_wrapper
#define rip6_disconnect rip6_disconnect_wrapper
#define rip6_shutdown rip6_shutdown_wrapper
#define rip6_abort rip6_abort_wrapper
#define rip6_ioctl rip6_ioctl_wrapper
#define rip6_stat rip6_stat_wrapper
#define rip6_peeraddr rip6_peeraddr_wrapper
#define rip6_sockaddr rip6_sockaddr_wrapper
#define rip6_rcvd rip6_rcvd_wrapper
#define rip6_recvoob rip6_recvoob_wrapper
#define rip6_send rip6_send_wrapper
#define rip6_sendoob rip6_sendoob_wrapper
#define rip6_purgeif rip6_purgeif_wrapper
const struct pr_usrreqs rip6_usrreqs = {
.pr_attach = rip6_attach,
.pr_detach = rip6_detach,
.pr_accept = rip6_accept,
.pr_bind = rip6_bind,
.pr_listen = rip6_listen,
.pr_connect = rip6_connect,
.pr_connect2 = rip6_connect2,
.pr_disconnect = rip6_disconnect,
.pr_shutdown = rip6_shutdown,
.pr_abort = rip6_abort,
.pr_ioctl = rip6_ioctl,
.pr_stat = rip6_stat,
.pr_peeraddr = rip6_peeraddr,
.pr_sockaddr = rip6_sockaddr,
.pr_rcvd = rip6_rcvd,
.pr_recvoob = rip6_recvoob,
.pr_send = rip6_send,
.pr_sendoob = rip6_sendoob,
.pr_purgeif = rip6_purgeif,
};
/* $NetBSD: vfs_syscalls_90.c,v 1.1 2019/09/22 22:59:38 christos Exp $ */
/*-
* Copyright (c) 2005, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_90.c,v 1.1 2019/09/22 22:59:38 christos Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/socketvar.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/dirent.h>
#include <sys/malloc.h>
#include <sys/kauth.h>
#include <sys/vfs_syscalls.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/common/compat_mod.h>
#include <compat/common/compat_util.h>
#include <compat/sys/statvfs.h>
static const struct syscall_package vfs_syscalls_90_syscalls[] = {
{ SYS_compat_90_getvfsstat, 0, (sy_call_t *)compat_90_sys_getvfsstat },
{ SYS_compat_90_statvfs1, 0, (sy_call_t *)compat_90_sys_statvfs1 },
{ SYS_compat_90_fstatvfs1, 0, (sy_call_t *)compat_90_sys_fstatvfs1 },
{ SYS_compat_90_fhstatvfs1, 0, (sy_call_t *)compat_90_sys_fhstatvfs1 },
{ 0,0, NULL }
};
int
compat_90_sys_getvfsstat(struct lwp *l,
const struct compat_90_sys_getvfsstat_args *uap, register_t *retval)
{
/* {
syscallarg(struct statvfs90 *) buf;
syscallarg(size_t) bufsize;
syscallarg(int) flags;
} */
return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
SCARG(uap, flags), statvfs_to_statvfs90_copy,
sizeof(struct statvfs90), retval);
}
int
compat_90_sys_statvfs1(struct lwp *l,
const struct compat_90_sys_statvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_sys_pstatvfs(l, SCARG(uap, path), SCARG(uap, flags), sb);
	if (!error)
		error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
		    sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
compat_90_sys_fstatvfs1(struct lwp *l,
const struct compat_90_sys_fstatvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_sys_fstatvfs(l, SCARG(uap, fd), SCARG(uap, flags), sb);
	if (!error)
		error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
		    sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
compat_90_sys_fhstatvfs1(struct lwp *l,
const struct compat_90_sys_fhstatvfs1_args *uap, register_t *retval)
{
/* {
syscallarg(const void *) fhp;
syscallarg(size_t) fh_size;
syscallarg(struct statvfs90 *) buf;
syscallarg(int) flags;
} */
struct statvfs *sb = STATVFSBUF_GET();
int error = do_fhstatvfs(l, SCARG(uap, fhp), SCARG(uap, fh_size),
sb, SCARG(uap, flags));
	if (!error)
		error = statvfs_to_statvfs90_copy(sb, SCARG(uap, buf),
		    sizeof(struct statvfs90));
STATVFSBUF_PUT(sb);
return error;
}
int
vfs_syscalls_90_init(void)
{
return syscall_establish(NULL, vfs_syscalls_90_syscalls);
}
int
vfs_syscalls_90_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_90_syscalls);
}
/* $NetBSD: uvm_page_status.c,v 1.6 2020/08/14 09:06:15 chs Exp $ */
/*-
* Copyright (c)2011 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page_status.c,v 1.6 2020/08/14 09:06:15 chs Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm.h>
/*
* page dirtiness status tracking
*
* separated from uvm_page.c mainly for rump
*/
/*
* these constants are chosen to match so that we can convert between
* them quickly.
*/
__CTASSERT(UVM_PAGE_STATUS_UNKNOWN == 0);
__CTASSERT(UVM_PAGE_STATUS_DIRTY == PG_DIRTY);
__CTASSERT(UVM_PAGE_STATUS_CLEAN == PG_CLEAN);
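/*
 * E.g. because UVM_PAGE_STATUS_DIRTY == PG_DIRTY and
 * UVM_PAGE_STATUS_CLEAN == PG_CLEAN, "pg->flags & (PG_CLEAN|PG_DIRTY)"
 * below already is the status value; no lookup table or shift is needed.
 */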
/*
* uvm_pagegetdirty: return the dirtiness status (one of UVM_PAGE_STATUS_
* values) of the page.
*
* called with the owner locked.
*/
unsigned int
uvm_pagegetdirty(struct vm_page *pg)
{
struct uvm_object * const uobj __diagused = pg->uobject;
	KASSERT((~pg->flags & (PG_CLEAN|PG_DIRTY)) != 0);
	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(uobj == NULL || ((pg->flags & PG_CLEAN) == 0) ==
uvm_obj_page_dirty_p(pg));
return pg->flags & (PG_CLEAN|PG_DIRTY);
}
/*
* uvm_pagemarkdirty: set the dirtiness status (one of UVM_PAGE_STATUS_ values)
* of the page.
*
* called with the owner locked.
*
* update the radix tree tag for object-owned page.
*
* if new status is UVM_PAGE_STATUS_UNKNOWN, clear pmap-level dirty bit
* so that later uvm_pagecheckdirty() can notice modifications on the page.
*/
void
uvm_pagemarkdirty(struct vm_page *pg, unsigned int newstatus)
{
struct uvm_object * const uobj = pg->uobject;
const unsigned int oldstatus = uvm_pagegetdirty(pg);
enum cpu_count base;
	KASSERT((~newstatus & (PG_CLEAN|PG_DIRTY)) != 0);
	KASSERT((newstatus & ~(PG_CLEAN|PG_DIRTY)) == 0);
	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(uobj == NULL || ((pg->flags & PG_CLEAN) == 0) ==
uvm_obj_page_dirty_p(pg));
if (oldstatus == newstatus) {
return;
}
/*
* set UVM_PAGE_DIRTY_TAG tag unless known CLEAN so that putpages can
* find possibly-dirty pages quickly.
*/
if (uobj != NULL) {
if (newstatus == UVM_PAGE_STATUS_CLEAN) {
			uvm_obj_page_clear_dirty(pg);
		} else if (oldstatus == UVM_PAGE_STATUS_CLEAN) {
/*
* on first dirty page, mark the object dirty.
* for vnodes this inserts to the syncer worklist.
*/
if (uvm_obj_clean_p(uobj) &&
uobj->pgops->pgo_markdirty != NULL) {
(*uobj->pgops->pgo_markdirty)(uobj);
}
uvm_obj_page_set_dirty(pg);
}
}
if (newstatus == UVM_PAGE_STATUS_UNKNOWN) {
/*
* start relying on pmap-level dirtiness tracking.
*/
pmap_clear_modify(pg);
}
pg->flags &= ~(PG_CLEAN|PG_DIRTY);
pg->flags |= newstatus;
KASSERT(uobj == NULL || ((pg->flags & PG_CLEAN) == 0) ==
uvm_obj_page_dirty_p(pg));
if ((pg->flags & PG_STAT) != 0) {
if ((pg->flags & PG_SWAPBACKED) != 0) {
base = CPU_COUNT_ANONUNKNOWN;
} else {
base = CPU_COUNT_FILEUNKNOWN;
}
kpreempt_disable();
		CPU_COUNT(base + oldstatus, -1);
		CPU_COUNT(base + newstatus, +1);
kpreempt_enable();
}
}
/*
* uvm_pagecheckdirty: check if page is dirty, and remove its dirty bit.
*
* called with the owner locked.
*
 * returns whether the page was dirty.
*
* if protected is true, mark the page CLEAN. otherwise, mark the page UNKNOWN.
* ("mark" in the sense of uvm_pagemarkdirty().)
*/
bool
uvm_pagecheckdirty(struct vm_page *pg, bool pgprotected)
{
const unsigned int oldstatus = uvm_pagegetdirty(pg);
bool modified;
KASSERT(uvm_page_owner_locked_p(pg, true));
/*
* if pgprotected is true, mark the page CLEAN.
* otherwise mark the page UNKNOWN unless it's CLEAN.
*
* possible transitions:
*
* CLEAN -> CLEAN , modified = false
* UNKNOWN -> UNKNOWN, modified = true
* UNKNOWN -> UNKNOWN, modified = false
* UNKNOWN -> CLEAN , modified = true
* UNKNOWN -> CLEAN , modified = false
* DIRTY -> UNKNOWN, modified = true
* DIRTY -> CLEAN , modified = true
*
* pmap_clear_modify is necessary if either of
* oldstatus or newstatus is UVM_PAGE_STATUS_UNKNOWN.
*/
if (oldstatus == UVM_PAGE_STATUS_CLEAN) {
modified = false;
} else {
const unsigned int newstatus = pgprotected ?
UVM_PAGE_STATUS_CLEAN : UVM_PAGE_STATUS_UNKNOWN;
if (oldstatus == UVM_PAGE_STATUS_DIRTY) {
modified = true;
			if (newstatus == UVM_PAGE_STATUS_UNKNOWN) {
				pmap_clear_modify(pg);
}
} else {
KASSERT(oldstatus == UVM_PAGE_STATUS_UNKNOWN);
modified = pmap_clear_modify(pg);
}
uvm_pagemarkdirty(pg, newstatus);
}
return modified;
}
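/*
 * Usage sketch (illustrative only; the caller and its argument names are
 * assumptions): a pageout-style caller asks whether the page was modified
 * and, via pgprotected, chooses whether the page may be marked CLEAN
 * outright or must drop back to UNKNOWN and pmap-level tracking.
 */
#if 0
static bool
flush_one_page(struct vm_page *pg, bool removed_all_mappings)
{
	/* DIRTY/UNKNOWN -> CLEAN if protected, otherwise -> UNKNOWN */
	return uvm_pagecheckdirty(pg, removed_all_mappings);
}
#endif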
/* $NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $ */
/*-
* Copyright (c)2007,2008 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* per-cpu storage.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_percpu.c,v 1.25 2020/05/11 21:37:31 riastradh Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/rwlock.h>
#include <sys/vmem.h>
#include <sys/xcall.h>
#define PERCPU_QUANTUM_SIZE (ALIGNBYTES + 1)
#define PERCPU_QCACHE_MAX 0
#define PERCPU_IMPORT_SIZE 2048
struct percpu {
unsigned pc_offset;
size_t pc_size;
percpu_callback_t pc_ctor;
percpu_callback_t pc_dtor;
void *pc_cookie;
LIST_ENTRY(percpu) pc_list;
};
static krwlock_t percpu_swap_lock __cacheline_aligned;
static vmem_t * percpu_offset_arena __read_mostly;
static struct {
kmutex_t lock;
unsigned int nextoff;
LIST_HEAD(, percpu) ctor_list;
struct lwp *busy;
kcondvar_t cv;
} percpu_allocation __cacheline_aligned;
static percpu_cpu_t *
cpu_percpu(struct cpu_info *ci)
{
return &ci->ci_data.cpu_percpu;
}
static unsigned int
percpu_offset(percpu_t *pc)
{
const unsigned int off = pc->pc_offset;
KASSERT(off < percpu_allocation.nextoff);
return off;
}
/*
* percpu_cpu_swap: crosscall handler for percpu_cpu_enlarge
*/
__noubsan
static void
percpu_cpu_swap(void *p1, void *p2)
{
struct cpu_info * const ci = p1;
percpu_cpu_t * const newpcc = p2;
percpu_cpu_t * const pcc = cpu_percpu(ci);
KASSERT(ci == curcpu() || !mp_online);
/*
* swap *pcc and *newpcc unless anyone has beaten us.
*/
rw_enter(&percpu_swap_lock, RW_WRITER);
if (newpcc->pcc_size > pcc->pcc_size) {
percpu_cpu_t tmp;
int s;
tmp = *pcc;
/*
* block interrupts so that we don't lose their modifications.
*/
s = splhigh();
/*
* copy data to new storage.
*/
memcpy(newpcc->pcc_data, pcc->pcc_data, pcc->pcc_size);
/*
* this assignment needs to be atomic for percpu_getptr_remote.
*/
pcc->pcc_data = newpcc->pcc_data;
splx(s);
pcc->pcc_size = newpcc->pcc_size;
*newpcc = tmp;
}
rw_exit(&percpu_swap_lock);
}
/*
 * percpu_cpu_enlarge: ensure that the percpu_cpu_t of each cpu has enough space
*/
static void
percpu_cpu_enlarge(size_t size)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
percpu_cpu_t pcc;
pcc.pcc_data = kmem_alloc(size, KM_SLEEP); /* XXX cacheline */
pcc.pcc_size = size;
if (!mp_online) {
percpu_cpu_swap(ci, &pcc);
} else {
uint64_t where;
where = xc_unicast(0, percpu_cpu_swap, ci, &pcc, ci);
xc_wait(where);
}
KASSERT(pcc.pcc_size <= size);
if (pcc.pcc_data != NULL) {
kmem_free(pcc.pcc_data, pcc.pcc_size);
}
}
}
/*
* percpu_backend_alloc: vmem import callback for percpu_offset_arena
*/
static int
percpu_backend_alloc(vmem_t *dummy, vmem_size_t size, vmem_size_t *resultsize,
vm_flag_t vmflags, vmem_addr_t *addrp)
{
unsigned int offset;
unsigned int nextoff;
ASSERT_SLEEPABLE();
KASSERT(dummy == NULL);
if ((vmflags & VM_NOSLEEP) != 0)
return ENOMEM;
size = roundup(size, PERCPU_IMPORT_SIZE);
mutex_enter(&percpu_allocation.lock);
offset = percpu_allocation.nextoff;
percpu_allocation.nextoff = nextoff = percpu_allocation.nextoff + size;
mutex_exit(&percpu_allocation.lock);
percpu_cpu_enlarge(nextoff);
*resultsize = size;
*addrp = (vmem_addr_t)offset;
return 0;
}
static void
percpu_zero_cb(void *vp, void *vp2, struct cpu_info *ci)
{
size_t sz = (uintptr_t)vp2;
memset(vp, 0, sz);
}
/*
* percpu_zero: initialize percpu storage with zero.
*/
static void
percpu_zero(percpu_t *pc, size_t sz)
{
percpu_foreach(pc, percpu_zero_cb, (void *)(uintptr_t)sz);
}
/*
* percpu_init: subsystem initialization
*/
void
percpu_init(void)
{
ASSERT_SLEEPABLE();
rw_init(&percpu_swap_lock);
mutex_init(&percpu_allocation.lock, MUTEX_DEFAULT, IPL_NONE);
percpu_allocation.nextoff = PERCPU_QUANTUM_SIZE;
LIST_INIT(&percpu_allocation.ctor_list);
percpu_allocation.busy = NULL;
cv_init(&percpu_allocation.cv, "percpu");
percpu_offset_arena = vmem_xcreate("percpu", 0, 0, PERCPU_QUANTUM_SIZE,
percpu_backend_alloc, NULL, NULL, PERCPU_QCACHE_MAX, VM_SLEEP,
IPL_NONE);
}
/*
* percpu_init_cpu: cpu initialization
*
* => should be called before the cpu appears on the list for CPU_INFO_FOREACH.
* => may be called for static CPUs afterward (typically just primary CPU)
*/
void
percpu_init_cpu(struct cpu_info *ci)
{
percpu_cpu_t * const pcc = cpu_percpu(ci);
struct percpu *pc;
size_t size = percpu_allocation.nextoff; /* XXX racy */
ASSERT_SLEEPABLE();
/*
* For the primary CPU, prior percpu_create may have already
* triggered allocation, so there's nothing more for us to do
* here.
*/
if (pcc->pcc_size)
return;
KASSERT(pcc->pcc_data == NULL);
/*
* Otherwise, allocate storage and, while the constructor list
* is locked, run constructors for all percpus on this CPU.
*/
pcc->pcc_size = size;
if (size) {
pcc->pcc_data = kmem_zalloc(pcc->pcc_size, KM_SLEEP);
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
percpu_allocation.busy = curlwp;
LIST_FOREACH(pc, &percpu_allocation.ctor_list, pc_list) {
KASSERT(pc->pc_ctor);
mutex_exit(&percpu_allocation.lock);
(*pc->pc_ctor)((char *)pcc->pcc_data + pc->pc_offset,
pc->pc_cookie, ci);
mutex_enter(&percpu_allocation.lock);
}
KASSERT(percpu_allocation.busy == curlwp);
percpu_allocation.busy = NULL;
cv_broadcast(&percpu_allocation.cv);
mutex_exit(&percpu_allocation.lock);
}
}
/*
* percpu_alloc: allocate percpu storage
*
* => called in thread context.
* => considered as an expensive and rare operation.
* => allocated storage is initialized with zeros.
*/
percpu_t *
percpu_alloc(size_t size)
{
return percpu_create(size, NULL, NULL, NULL);
}
/*
* percpu_create: allocate percpu storage and associate ctor/dtor with it
*
* => called in thread context.
* => considered as an expensive and rare operation.
* => allocated storage is initialized by ctor, or zeros if ctor is null
* => percpu_free will call dtor first, if dtor is nonnull
* => ctor or dtor may sleep, even on allocation
*/
percpu_t *
percpu_create(size_t size, percpu_callback_t ctor, percpu_callback_t dtor,
void *cookie)
{
vmem_addr_t offset;
percpu_t *pc;
ASSERT_SLEEPABLE();
(void)vmem_alloc(percpu_offset_arena, size, VM_SLEEP | VM_BESTFIT,
&offset);
pc = kmem_alloc(sizeof(*pc), KM_SLEEP);
pc->pc_offset = offset;
pc->pc_size = size;
pc->pc_ctor = ctor;
pc->pc_dtor = dtor;
pc->pc_cookie = cookie;
if (ctor) {
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
void *buf;
/*
* Wait until nobody is using the list of percpus with
* constructors.
*/
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
percpu_allocation.busy = curlwp;
mutex_exit(&percpu_allocation.lock);
/*
* Run the constructor for all CPUs. We use a
* temporary buffer so that we need not hold the
* percpu_swap_lock while running the constructor.
*/
buf = kmem_alloc(size, KM_SLEEP);
for (CPU_INFO_FOREACH(cii, ci)) {
memset(buf, 0, size);
(*ctor)(buf, cookie, ci);
percpu_traverse_enter();
memcpy(percpu_getptr_remote(pc, ci), buf, size);
percpu_traverse_exit();
}
explicit_memset(buf, 0, size);
kmem_free(buf, size);
/*
* Insert the percpu into the list of percpus with
* constructors. We are now done using the list, so it
* is safe for concurrent percpu_create or concurrent
* percpu_init_cpu to run.
*/
mutex_enter(&percpu_allocation.lock);
KASSERT(percpu_allocation.busy == curlwp);
percpu_allocation.busy = NULL;
cv_broadcast(&percpu_allocation.cv);
LIST_INSERT_HEAD(&percpu_allocation.ctor_list, pc, pc_list);
mutex_exit(&percpu_allocation.lock);
} else {
percpu_zero(pc, size);
}
return pc;
}
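/*
* Usage sketch (hypothetical "foo" names, not an existing consumer):
*
* struct foo_pcpu { uint64_t fp_count; };
*
* static void
* foo_ctor(void *ptr, void *cookie, struct cpu_info *ci)
* {
* struct foo_pcpu *fp = ptr;
*
* fp->fp_count = 0;
* }
*
* percpu_t *foo_percpu = percpu_create(sizeof(struct foo_pcpu),
*     foo_ctor, NULL, NULL);
*
* The matching percpu_free(foo_percpu, sizeof(struct foo_pcpu)) runs
* the dtor, when one was given, on every CPU's copy before the offset
* is returned to the arena.
*/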
/*
* percpu_free: free percpu storage
*
* => called in thread context.
* => considered as an expensive and rare operation.
*/
void
percpu_free(percpu_t *pc, size_t size)
{
ASSERT_SLEEPABLE();
KASSERT(size == pc->pc_size);
/*
* If there's a constructor, take the percpu off the list of
* percpus with constructors, but first wait until nobody is
* using the list.
*/
if (pc->pc_ctor) {
mutex_enter(&percpu_allocation.lock);
while (percpu_allocation.busy)
cv_wait(&percpu_allocation.cv,
&percpu_allocation.lock);
LIST_REMOVE(pc, pc_list);
mutex_exit(&percpu_allocation.lock);
}
/* If there's a destructor, run it now for all CPUs. */
if (pc->pc_dtor) {
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
void *buf;
buf = kmem_alloc(size, KM_SLEEP);
for (CPU_INFO_FOREACH(cii, ci)) {
percpu_traverse_enter();
memcpy(buf, percpu_getptr_remote(pc, ci), size);
explicit_memset(percpu_getptr_remote(pc, ci), 0, size);
percpu_traverse_exit();
(*pc->pc_dtor)(buf, pc->pc_cookie, ci);
}
explicit_memset(buf, 0, size);
kmem_free(buf, size);
}
vmem_free(percpu_offset_arena, (vmem_addr_t)percpu_offset(pc), size);
kmem_free(pc, sizeof(*pc));
}
/*
* percpu_getref:
*
* => safe to be used in either thread or interrupt context
* => disables preemption; must be bracketed with a percpu_putref()
*/
void *
percpu_getref(percpu_t *pc)
{
kpreempt_disable();
return percpu_getptr_remote(pc, curcpu());
}
/*
* percpu_putref:
*
* => drops the preemption-disabled count after caller is done with per-cpu
* data
*/
void
percpu_putref(percpu_t *pc)
{
kpreempt_enable();
}
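/*
* Typical usage sketch (reuses the hypothetical foo_percpu above):
*
* struct foo_pcpu *fp;
*
* fp = percpu_getref(foo_percpu);
* fp->fp_count++;
* percpu_putref(foo_percpu);
*
* Preemption stays disabled between the two calls, so the pointer is
* only valid for the local CPU and must not be cached across
* percpu_putref().
*/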
/*
* percpu_traverse_enter, percpu_traverse_exit, percpu_getptr_remote:
* helpers to access remote cpu's percpu data.
*
* => called in thread context.
* => percpu_traverse_enter can block low-priority xcalls.
* => typical usage would be:
*
* sum = 0;
* percpu_traverse_enter();
* for (CPU_INFO_FOREACH(cii, ci)) {
* unsigned int *p = percpu_getptr_remote(pc, ci);
* sum += *p;
* }
* percpu_traverse_exit();
*/
void
percpu_traverse_enter(void)
{
ASSERT_SLEEPABLE();
rw_enter(&percpu_swap_lock, RW_READER);
}
void
percpu_traverse_exit(void)
{
rw_exit(&percpu_swap_lock);
}
void *
percpu_getptr_remote(percpu_t *pc, struct cpu_info *ci)
{
return &((char *)cpu_percpu(ci)->pcc_data)[percpu_offset(pc)];
}
/*
* percpu_foreach: call the specified callback function for each CPU.
*
* => must be called from thread context.
* => the callback executes on the **current** CPU (or, really, an
* arbitrary CPU in case of preemption)
* => the caller should not rely on the cpu iteration order.
* => the callback function should be minimal because it is executed
* while holding a global lock, which can block low-priority xcalls.
* e.g. it's illegal for a callback function to sleep for memory allocation.
*/
void
percpu_foreach(percpu_t *pc, percpu_callback_t cb, void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
percpu_traverse_enter();
for (CPU_INFO_FOREACH(cii, ci)) {
(*cb)(percpu_getptr_remote(pc, ci), arg, ci);
}
percpu_traverse_exit();
}
struct percpu_xcall_ctx {
percpu_callback_t ctx_cb;
void *ctx_arg;
};
static void
percpu_xcfunc(void * const v1, void * const v2)
{
percpu_t * const pc = v1;
struct percpu_xcall_ctx * const ctx = v2;
(*ctx->ctx_cb)(percpu_getref(pc), ctx->ctx_arg, curcpu());
percpu_putref(pc);
}
/*
* percpu_foreach_xcall: call the specified callback function for each
* cpu. This version uses an xcall to run the callback on each cpu.
*
* => must be called from thread context.
* => callback executes on **remote** CPU in soft-interrupt context
* (at the specified soft interrupt priority).
* => caller should not rely on the cpu iteration order.
* => the callback function should be minimal because it may be
* executed in soft-interrupt context. e.g. it's illegal for
* a callback function to sleep for memory allocation.
*/
void
percpu_foreach_xcall(percpu_t *pc, u_int xcflags, percpu_callback_t cb,
void *arg)
{
struct percpu_xcall_ctx ctx = {
.ctx_cb = cb,
.ctx_arg = arg,
};
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
xc_wait(xc_unicast(xcflags, percpu_xcfunc, pc, &ctx, ci));
}
}
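/*
* Usage sketch (hypothetical callback; XC_HIGHPRI_IPL() is assumed to
* be the xcall(9) flag a caller would pass here):
*
* static void
* foo_reset_cb(void *ptr, void *arg, struct cpu_info *ci)
* {
* struct foo_pcpu *fp = ptr;
*
* fp->fp_count = 0;
* }
*
* percpu_foreach_xcall(foo_percpu, XC_HIGHPRI_IPL(IPL_SOFTSERIAL),
*     foo_reset_cb, NULL);
*/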
/* $NetBSD: union_vfsops.c,v 1.87 2023/02/13 08:39:40 hannken Exp $ */
/*
* Copyright (c) 1994 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1994 Jan-Simon Pendry.
* All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_vfsops.c 8.20 (Berkeley) 5/20/95
*/
/*
* Union Layer
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_vfsops.c,v 1.87 2023/02/13 08:39:40 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/genfs/genfs.h>
#include <fs/union/union.h>
MODULE(MODULE_CLASS_VFS, union, NULL);
/*
* Mount union filesystem
*/
int
union_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0;
struct union_args *args = data;
struct vnode *lowerrootvp = NULLVP;
struct vnode *upperrootvp = NULLVP;
struct union_mount *um = 0;
const char *cp;
char *xp;
int len;
size_t size;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p)\n", __func__, mp);
#endif
if (mp->mnt_flag & MNT_GETARGS) {
um = MOUNTTOUNIONMOUNT(mp);
if (um == NULL)
return EIO;
args->target = NULL;
args->mntflags = um->um_op;
*data_len = sizeof *args;
return 0;
}
/*
* Updates are not supported yet
*/
if (mp->mnt_flag & MNT_UPDATE) {
/*
* Need to provide:
* 1. a way to convert between rdonly and rdwr mounts.
* 2. support for nfs exports.
*/
error = EOPNOTSUPP;
goto bad;
}
lowerrootvp = mp->mnt_vnodecovered;
vref(lowerrootvp);
/*
* Find upper node.
*/
error = namei_simple_user(args->target,
NSM_FOLLOW_NOEMULROOT, &upperrootvp);
if (error != 0)
goto bad;
if (upperrootvp->v_type != VDIR) {
error = EINVAL;
goto bad;
}
um = kmem_zalloc(sizeof(*um), KM_SLEEP);
/*
* Keep a held reference to the target vnodes.
* They are vrele'd in union_unmount.
*
* Depending on the _BELOW flag, the filesystems are
* viewed in a different order. In effect, this is the
* same as providing a mount under option to the mount syscall.
*/
um->um_op = args->mntflags & UNMNT_OPMASK;
switch (um->um_op) {
case UNMNT_ABOVE:
um->um_lowervp = lowerrootvp;
um->um_uppervp = upperrootvp;
break;
case UNMNT_BELOW:
um->um_lowervp = upperrootvp;
um->um_uppervp = lowerrootvp;
break;
case UNMNT_REPLACE:
vrele(lowerrootvp);
lowerrootvp = NULLVP;
um->um_uppervp = upperrootvp;
um->um_lowervp = lowerrootvp;
break;
default:
error = EINVAL;
goto bad;
}
/*
* This mount is mp-safe if both lower mounts are mp-safe.
*/
if (((um->um_lowervp == NULLVP) || (um->um_lowervp->v_mount->mnt_iflag & IMNT_MPSAFE)) &&
(um->um_uppervp->v_mount->mnt_iflag & IMNT_MPSAFE))
mp->mnt_iflag |= IMNT_MPSAFE;
/*
* Unless the mount is readonly, ensure that the top layer
* supports whiteout operations
*/
if ((mp->mnt_flag & MNT_RDONLY) == 0) {
static struct componentname nullcn = {
.cn_nameiop = LOOKUP,
.cn_cred = NOCRED
};
vn_lock(um->um_uppervp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_WHITEOUT(um->um_uppervp, &nullcn, LOOKUP);
VOP_UNLOCK(um->um_uppervp);
if (error)
goto bad;
}
um->um_cred = l->l_cred;
kauth_cred_hold(um->um_cred);
um->um_cmode = UN_DIRMODE &~ l->l_proc->p_cwdi->cwdi_cmask;
/*
* Depending on what you think the MNT_LOCAL flag might mean,
* you may want the && to be || on the conditional below.
* At the moment it has been defined that the filesystem is
* only local if it is all local, ie the MNT_LOCAL flag implies
* that the entire namespace is local. If you think the MNT_LOCAL
* flag implies that some of the files might be stored locally
* then you will want to change the conditional.
*/
if (um->um_op == UNMNT_ABOVE) {
if (((um->um_lowervp == NULLVP) ||
(um->um_lowervp->v_mount->mnt_flag & MNT_LOCAL)) &&
(um->um_uppervp->v_mount->mnt_flag & MNT_LOCAL))
mp->mnt_flag |= MNT_LOCAL;
}
/*
* Copy in the upper layer's RDONLY flag. This is for the benefit
* of lookup() which explicitly checks the flag, rather than asking
* the filesystem for its own opinion. This means, that an update
* mount of the underlying filesystem to go from rdonly to rdwr
* will leave the unioned view as read-only.
*/
mp->mnt_flag |= (um->um_uppervp->v_mount->mnt_flag & MNT_RDONLY);
mp->mnt_data = um;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, NULL, UIO_USERSPACE,
mp->mnt_op->vfs_name, mp, l);
if (error)
goto bad;
error = vfs_set_lowermount(mp, um->um_uppervp->v_mount);
if (error)
goto bad;
switch (um->um_op) {
case UNMNT_ABOVE:
cp = "<above>:";
break;
case UNMNT_BELOW:
cp = "<below>:";
break;
case UNMNT_REPLACE:
cp = "";
break;
default:
cp = "<invalid>:";
#ifdef DIAGNOSTIC
panic("%s: bad um_op", __func__);
#endif
break;
}
len = strlen(cp);
memcpy(mp->mnt_stat.f_mntfromname, cp, len);
xp = mp->mnt_stat.f_mntfromname + len;
len = MNAMELEN - len;
(void) copyinstr(args->target, xp, len - 1, &size);
memset(xp + size, 0, len - size);
#ifdef UNION_DIAGNOSTIC
printf("%s: from %s, on %s\n", __func__,
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
/* Set up the readdir hook if it's not set already */
if (!vn_union_readdir_hook)
vn_union_readdir_hook = union_readdirhook;
return 0;
bad:
if (um) {
if (um->um_cred)
kauth_cred_free(um->um_cred);
kmem_free(um, sizeof(*um));
}
if (upperrootvp)
vrele(upperrootvp);
if (lowerrootvp)
vrele(lowerrootvp);
return error;
}
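/*
* Userland view (illustrative sketch of what mount_union(8) passes in;
* error handling omitted):
*
* struct union_args args = {
* .target = "/upper",
* .mntflags = UNMNT_ABOVE,
* };
*
* mount(MOUNT_UNION, "/lower", 0, &args, sizeof(args));
*
* The covered directory ("/lower" here) becomes the lower layer and
* args.target the upper layer; UNMNT_BELOW swaps the two, and
* UNMNT_REPLACE drops the covered directory from the view entirely.
*/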
/*
* VFS start. Nothing needed here - the start routine
* on the underlying filesystem(s) will have been called
* when that filesystem was mounted.
*/
/*ARGSUSED*/
int
union_start(struct mount *mp, int flags)
{
return 0;
}
/*
* Free reference to union layer
*/
static bool
union_unmount_selector(void *cl, struct vnode *vp)
{
int *count = cl;
KASSERT(mutex_owned(vp->v_interlock));
*count += 1;
return false;
}
int
union_unmount(struct mount *mp, int mntflags)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
int freeing;
int error;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p)\n", __func__, mp);
#endif
/*
* Keep flushing vnodes from the mount list.
* This is needed because of the un_pvp held
* reference to the parent vnode.
* If more vnodes have been freed on a given pass,
* then try again. The loop will iterate at most
* (d) times, where (d) is the maximum tree depth
* in the filesystem.
*/
for (freeing = 0; (error = vflush(mp, NULL, 0)) != 0;) {
struct vnode_iterator *marker;
int n;
/* count #vnodes held on mount list */
n = 0;
vfs_vnode_iterator_init(mp, &marker);
vfs_vnode_iterator_next(marker, union_unmount_selector, &n);
vfs_vnode_iterator_destroy(marker);
/* if this is unchanged then stop */
if (n == freeing)
break;
/* otherwise try one more time */
freeing = n;
}
/*
* Ok, now that we've tried doing it gently, get out the hammer.
*/
if (mntflags & MNT_FORCE)
error = vflush(mp, NULL, FORCECLOSE);
if (error)
return error;
/*
* Discard references to upper and lower target vnodes.
*/
if (um->um_lowervp)
vrele(um->um_lowervp);
vrele(um->um_uppervp);
kauth_cred_free(um->um_cred);
/*
* Finally, throw away the union_mount structure
*/
kmem_free(um, sizeof(*um));
mp->mnt_data = NULL;
return 0;
}
int
union_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
int error;
/*
* Return locked reference to root.
*/
vref(um->um_uppervp);
if (um->um_lowervp)
vref(um->um_lowervp);
error = union_allocvp(vpp, mp, NULL, NULL, NULL,
um->um_uppervp, um->um_lowervp, 1);
if (error) {
vrele(um->um_uppervp);
if (um->um_lowervp)
vrele(um->um_lowervp);
return error;
}
vn_lock(*vpp, lktype | LK_RETRY);
return 0;
}
int
union_statvfs(struct mount *mp, struct statvfs *sbp)
{
int error;
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
struct statvfs *sbuf = kmem_zalloc(sizeof(*sbuf), KM_SLEEP);
unsigned long lbsize;
#ifdef UNION_DIAGNOSTIC
printf("%s(mp = %p, lvp = %p, uvp = %p)\n", __func__, mp,
um->um_lowervp, um->um_uppervp);
#endif
if (um->um_lowervp) {
error = VFS_STATVFS(um->um_lowervp->v_mount, sbuf);
if (error)
goto done;
}
/* now copy across the "interesting" information and fake the rest */
lbsize = sbuf->f_bsize;
sbp->f_blocks = sbuf->f_blocks - sbuf->f_bfree;
sbp->f_files = sbuf->f_files - sbuf->f_ffree;
error = VFS_STATVFS(um->um_uppervp->v_mount, sbuf);
if (error)
goto done;
sbp->f_flag = sbuf->f_flag;
sbp->f_bsize = sbuf->f_bsize;
sbp->f_frsize = sbuf->f_frsize;
sbp->f_iosize = sbuf->f_iosize;
/*
* The "total" fields count total resources in all layers,
* the "free" fields count only those resources which are
* free in the upper layer (since only the upper layer
* is writable).
*/
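/*
* e.g. with a 512-byte-block lower layer under a 2048-byte-block upper
* layer, the lower-layer block count gathered above is scaled by
* 512/2048 (i.e. divided by 4) so that both layers are totalled in
* upper-layer block units.
*/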
if (sbuf->f_bsize != lbsize)
sbp->f_blocks = sbp->f_blocks * lbsize / sbuf->f_bsize;
sbp->f_blocks += sbuf->f_blocks;
sbp->f_bfree = sbuf->f_bfree;
sbp->f_bavail = sbuf->f_bavail;
sbp->f_bresvd = sbuf->f_bresvd;
sbp->f_files += sbuf->f_files;
sbp->f_ffree = sbuf->f_ffree;
sbp->f_favail = sbuf->f_favail;
sbp->f_fresvd = sbuf->f_fresvd;
copy_statvfs_info(sbp, mp);
done:
kmem_free(sbuf, sizeof(*sbuf));
return error;
}
/*ARGSUSED*/
int
union_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
{
/*
* XXX - Assumes no data cached at union layer.
*/
return 0;
}
/*ARGSUSED*/
int
union_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
return EOPNOTSUPP;
}
static int
union_renamelock_enter(struct mount *mp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
/* Lock just the upper fs, where the action happens. */
return VFS_RENAMELOCK_ENTER(um->um_uppervp->v_mount);
}
static void
union_renamelock_exit(struct mount *mp)
{
struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
VFS_RENAMELOCK_EXIT(um->um_uppervp->v_mount);
}
extern const struct vnodeopv_desc union_vnodeop_opv_desc;
const struct vnodeopv_desc * const union_vnodeopv_descs[] = {
&union_vnodeop_opv_desc,
NULL,
};
struct vfsops union_vfsops = {
.vfs_name = MOUNT_UNION,
.vfs_min_mount_data = sizeof (struct union_args),
.vfs_mount = union_mount,
.vfs_start = union_start,
.vfs_unmount = union_unmount,
.vfs_root = union_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = union_statvfs,
.vfs_sync = union_sync,
.vfs_vget = union_vget,
.vfs_loadvnode = union_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = union_init,
.vfs_reinit = union_reinit,
.vfs_done = union_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = union_renamelock_enter,
.vfs_renamelock_exit = union_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = union_vnodeopv_descs
};
SYSCTL_SETUP(unionfs_sysctl_setup, "unionfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "union",
SYSCTL_DESCR("Union file system"),
NULL, 0, NULL, 0,
CTL_VFS, 15, CTL_EOL);
/*
* XXX the "15" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "15" is the order as taken from sys/mount.h
*/
}
static int
union_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return vfs_attach(&union_vfsops);
case MODULE_CMD_FINI:
return vfs_detach(&union_vfsops);
default:
return ENOTTY;
}
}
/* $NetBSD: ufs_readwrite.c,v 1.128 2022/02/21 17:07:45 hannken Exp $ */
/*-
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_readwrite.c 8.11 (Berkeley) 5/8/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: ufs_readwrite.c,v 1.128 2022/02/21 17:07:45 hannken Exp $");
#define FS struct fs
#define I_FS i_fs
#define READ ffs_read
#define READ_S "ffs_read"
#define WRITE ffs_write
#define WRITE_S "ffs_write"
#define BUFRD ffs_bufrd
#define BUFWR ffs_bufwr
#define ufs_blkoff ffs_blkoff
#define ufs_blksize ffs_blksize
#define ufs_lblkno ffs_lblkno
#define ufs_lblktosize ffs_lblktosize
#define ufs_blkroundup ffs_blkroundup
static int ufs_post_read_update(struct vnode *, int, int);
static int ufs_post_write_update(struct vnode *, struct uio *, int,
kauth_cred_t, off_t, int, int);
/*
* Vnode op for reading.
*/
/* ARGSUSED */
int
READ(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp;
struct inode *ip;
struct uio *uio;
struct ufsmount *ump;
vsize_t bytelen;
int error, ioflag, advice;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ip->i_ump;
uio = ap->a_uio;
ioflag = ap->a_ioflag;
error = 0;
KASSERT(uio->uio_rw == UIO_READ);
KASSERT(vp->v_type == VREG || vp->v_type == VDIR);
/* XXX Eliminate me by refusing directory reads from userland. */
if (vp->v_type == VDIR)
return BUFRD(vp, uio, ioflag, ap->a_cred);
if ((u_int64_t)uio->uio_offset > ump->um_maxfilesize)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT)
return ffs_snapshot_read(vp, uio, ioflag);
if (uio->uio_offset >= ip->i_size)
goto out;
KASSERT(vp->v_type == VREG);
advice = IO_ADV_DECODE(ap->a_ioflag);
while (uio->uio_resid > 0) {
if (ioflag & IO_DIRECT) {
genfs_directio(vp, uio, ioflag);
}
bytelen = MIN(ip->i_size - uio->uio_offset, uio->uio_resid);
if (bytelen == 0)
break;
error = ubc_uiomove(&vp->v_uobj, uio, bytelen, advice,
UBC_READ | UBC_PARTIALOK | UBC_VNODE_FLAGS(vp));
if (error)
break;
}
out:
error = ufs_post_read_update(vp, ap->a_ioflag, error);
return (error);
}
/*
* UFS op for reading via the buffer cache
*/
int
BUFRD(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
{
struct inode *ip;
struct ufsmount *ump;
FS *fs;
struct buf *bp;
daddr_t lbn, nextlbn;
off_t bytesinfile;
long size, xfersize, blkoffset;
int error;
KASSERT(VOP_ISLOCKED(vp));
KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
KASSERT(uio->uio_rw == UIO_READ);
ip = VTOI(vp);
ump = ip->i_ump;
fs = ip->I_FS;
error = 0;
KASSERT(vp->v_type != VLNK || ip->i_size >= ump->um_maxsymlinklen);
KASSERT(vp->v_type != VLNK || ump->um_maxsymlinklen != 0 ||
DIP(ip, blocks) != 0);
if (uio->uio_offset > ump->um_maxfilesize)
return EFBIG;
if (uio->uio_resid == 0)
return 0;
KASSERT(!ISSET(ip->i_flags, (SF_SNAPSHOT | SF_SNAPINVAL)));
if (uio->uio_offset >= ip->i_size)
goto out;
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
bytesinfile = ip->i_size - uio->uio_offset;
if (bytesinfile <= 0)
break;
lbn = ufs_lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
size = ufs_blksize(fs, ip, lbn);
blkoffset = ufs_blkoff(fs, uio->uio_offset);
xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
bytesinfile);
if (ufs_lblktosize(fs, nextlbn) >= ip->i_size)
error = bread(vp, lbn, size, 0, &bp);
else {
int nextsize = ufs_blksize(fs, ip, nextlbn);
error = breadn(vp, lbn,
size, &nextlbn, &nextsize, 1, 0, &bp);
}
if (error)
break;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < xfersize) {
if (size == 0)
break;
xfersize = size;
}
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp, 0);
}
if (bp != NULL)
brelse(bp, 0);
out:
error = ufs_post_read_update(vp, ioflag, error);
return (error);
}
static int
ufs_post_read_update(struct vnode *vp, int ioflag, int oerror)
{
struct inode *ip = VTOI(vp);
int error = oerror;
if (!(vp->v_mount->mnt_flag & MNT_NOATIME)) {
ip->i_flag |= IN_ACCESS;
if ((ioflag & IO_SYNC) == IO_SYNC) {
error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error)
goto out;
error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
UFS_WAPBL_END(vp->v_mount);
}
}
out:
/* Read error overrides any inode update error. */
if (oerror)
error = oerror;
return error;
}
/*
* Vnode op for writing.
*/
int
WRITE(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp;
struct uio *uio;
struct inode *ip;
FS *fs;
kauth_cred_t cred;
off_t osize, origoff, oldoff, preallocoff, endallocoff, nsize;
int blkoffset, error, flags, ioflag, resid;
int aflag;
vsize_t bytelen;
bool async;
struct ufsmount *ump;
cred = ap->a_cred;
ioflag = ap->a_ioflag;
uio = ap->a_uio;
vp = ap->a_vp;
ip = VTOI(vp);
ump = ip->i_ump;
KASSERT(vp->v_size == ip->i_size);
KASSERT(uio->uio_rw == UIO_WRITE);
KASSERT(vp->v_type == VREG);
KASSERT(!ISSET(ioflag, IO_JOURNALLOCKED));
UFS_WAPBL_JUNLOCK_ASSERT(vp->v_mount);
if (ioflag & IO_APPEND)
uio->uio_offset = ip->i_size;
if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
return (EPERM);
fs = ip->I_FS;
if (uio->uio_offset < 0 ||
(u_int64_t)uio->uio_offset + uio->uio_resid > ump->um_maxfilesize)
return (EFBIG);
if (uio->uio_resid == 0)
return (0);
flags = ioflag & IO_SYNC ? B_SYNC : 0;
async = vp->v_mount->mnt_flag & MNT_ASYNC;
origoff = uio->uio_offset;
resid = uio->uio_resid;
osize = ip->i_size;
error = 0;
KASSERT(vp->v_type == VREG);
/*
* XXX The entire write operation must occur in a single WAPBL
* transaction because it may allocate disk blocks, if
* appending or filling holes, which is allowed to happen only
* if the write fully succeeds.
*
* If ubc_uiomove fails in the middle with EFAULT, we can clean
* up at the end with UFS_TRUNCATE. But if the power fails in
* the middle, there would be nobody to deallocate the blocks,
* without an fsck to globally analyze the file system.
*
* If the increasingly inaccurately named WAPBL were augmented
* with rollback records for block allocations, then we could
* split this into multiple transactions and commit the
* allocations in the last one.
*
* But WAPBL doesn't have that notion now, so we'll have to
* live with gigantic transactions and WAPBL tentacles in
* genfs_getpages/putpages to cope with the possibility that
* the transaction may or may not be locked on entry to the
* page cache.
*
* And even if we added that notion to WAPBL, it wouldn't help
* us get rid of the tentacles in genfs_getpages/putpages
* because we'd have to interoperate with old implementations
* that assume they can replay the log without fsck.
*/
error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error) {
return error;
}
preallocoff = round_page(ufs_blkroundup(fs, MAX(osize, uio->uio_offset)));
aflag = ioflag & IO_SYNC ? B_SYNC : 0;
nsize = MAX(osize, uio->uio_offset + uio->uio_resid);
endallocoff = nsize - ufs_blkoff(fs, nsize);
/*
* if we're increasing the file size, deal with expanding
* the fragment if there is one.
*/
if (nsize > osize && ufs_lblkno(fs, osize) < UFS_NDADDR &&
ufs_lblkno(fs, osize) != ufs_lblkno(fs, nsize) &&
ufs_blkroundup(fs, osize) != osize) {
off_t eob;
eob = ufs_blkroundup(fs, osize);
uvm_vnp_setwritesize(vp, eob);
error = ufs_balloc_range(vp, osize, eob - osize, cred, aflag);
if (error)
goto out;
if (flags & B_SYNC) {
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
VOP_PUTPAGES(vp, trunc_page(osize & fs->fs_bmask),
round_page(eob),
PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
}
}
while (uio->uio_resid > 0) {
int ubc_flags = UBC_WRITE;
bool overwrite; /* if we're overwriting a whole block */
off_t newoff;
if (ioflag & IO_DIRECT) {
genfs_directio(vp, uio, ioflag | IO_JOURNALLOCKED);
}
oldoff = uio->uio_offset;
blkoffset = ufs_blkoff(fs, uio->uio_offset);
bytelen = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
if (bytelen == 0) {
break;
}
/*
* if we're filling in a hole, allocate the blocks now and
* initialize the pages first. if we're extending the file,
* we can safely allocate blocks without initializing pages
* since the new blocks will be inaccessible until the write
* is complete.
*/
overwrite = uio->uio_offset >= preallocoff &&
uio->uio_offset < endallocoff;
if (!overwrite && (vp->v_vflag & VV_MAPPED) == 0 &&
ufs_blkoff(fs, uio->uio_offset) == 0 &&
(uio->uio_offset & PAGE_MASK) == 0) {
vsize_t len;
len = trunc_page(bytelen);
len -= ufs_blkoff(fs, len);
if (len > 0) {
overwrite = true;
bytelen = len;
}
}
newoff = oldoff + bytelen;
if (vp->v_size < newoff) {
uvm_vnp_setwritesize(vp, newoff);
}
if (!overwrite) {
error = ufs_balloc_range(vp, uio->uio_offset, bytelen,
cred, aflag);
if (error)
break;
} else {
genfs_node_wrlock(vp);
error = GOP_ALLOC(vp, uio->uio_offset, bytelen,
aflag, cred);
genfs_node_unlock(vp);
if (error)
break;
ubc_flags |= UBC_FAULTBUSY;
}
/*
* copy the data.
*/
error = ubc_uiomove(&vp->v_uobj, uio, bytelen,
IO_ADV_DECODE(ioflag), ubc_flags | UBC_VNODE_FLAGS(vp));
/*
* update UVM's notion of the size now that we've
* copied the data into the vnode's pages.
*
* we should update the size even when uiomove failed.
*/
if (vp->v_size < newoff) {
uvm_vnp_setsize(vp, newoff);
}
if (error)
break;
/*
* flush what we just wrote if necessary.
* XXXUBC simplistic async flushing.
*/
if (!async && oldoff >> 16 != uio->uio_offset >> 16) {
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, (oldoff >> 16) << 16,
(uio->uio_offset >> 16) << 16,
PGO_CLEANIT | PGO_JOURNALLOCKED | PGO_LAZY);
if (error)
break;
}
}
if (error == 0 && ioflag & IO_SYNC) {
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, trunc_page(origoff & fs->fs_bmask),
round_page(ufs_blkroundup(fs, uio->uio_offset)),
PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED);
}
out:
error = ufs_post_write_update(vp, uio, ioflag, cred, osize, resid,
error);
UFS_WAPBL_END(vp->v_mount);
return (error);
}
/*
* UFS op for writing via the buffer cache
*/
int
BUFWR(struct vnode *vp, struct uio *uio, int ioflag, kauth_cred_t cred)
{
struct inode *ip;
struct ufsmount *ump;
FS *fs;
int flags;
struct buf *bp;
off_t osize;
int resid, xfersize, size, blkoffset;
daddr_t lbn;
int error;
KASSERT(ISSET(ioflag, IO_NODELOCKED));
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(vp->v_type == VDIR || vp->v_type == VLNK);
KASSERT(vp->v_type != VDIR || ISSET(ioflag, IO_SYNC));
KASSERT(uio->uio_rw == UIO_WRITE);
KASSERT(ISSET(ioflag, IO_JOURNALLOCKED));
UFS_WAPBL_JLOCK_ASSERT(vp->v_mount);
ip = VTOI(vp);
ump = ip->i_ump;
fs = ip->I_FS;
KASSERT(vp->v_size == ip->i_size);
if (uio->uio_offset < 0 || uio->uio_resid > ump->um_maxfilesize ||
uio->uio_offset > (ump->um_maxfilesize - uio->uio_resid))
return EFBIG;
if (uio->uio_resid == 0)
return 0;
flags = ioflag & IO_SYNC ? B_SYNC : 0;
resid = uio->uio_resid;
osize = ip->i_size;
error = 0;
KASSERT(vp->v_type != VREG);
/* XXX Should never have pages cached here. */
KASSERT(vp->v_uobj.uo_npages == 0);
while (uio->uio_resid > 0) {
lbn = ufs_lblkno(fs, uio->uio_offset);
blkoffset = ufs_blkoff(fs, uio->uio_offset);
xfersize = MIN(fs->fs_bsize - blkoffset, uio->uio_resid);
if (fs->fs_bsize > xfersize)
flags |= B_CLRBUF;
else
flags &= ~B_CLRBUF;
error = UFS_BALLOC(vp, uio->uio_offset, xfersize, cred, flags,
&bp);
if (error)
break;
if (uio->uio_offset + xfersize > ip->i_size) {
ip->i_size = uio->uio_offset + xfersize;
DIP_ASSIGN(ip, size, ip->i_size);
uvm_vnp_setsize(vp, ip->i_size);
}
size = ufs_blksize(fs, ip, lbn) - bp->b_resid;
if (xfersize > size)
xfersize = size;
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
/*
* if we didn't clear the block and the uiomove failed,
* the buf will now contain part of some other file,
* so we need to invalidate it.
*/
if (error && (flags & B_CLRBUF) == 0) {
brelse(bp, BC_INVAL);
break;
}
if (ioflag & IO_SYNC)
(void)bwrite(bp);
else if (xfersize + blkoffset == fs->fs_bsize)
bawrite(bp);
else
bdwrite(bp);
if (error || xfersize == 0)
break;
}
error = ufs_post_write_update(vp, uio, ioflag, cred, osize, resid,
error);
return (error);
}
static int
ufs_post_write_update(struct vnode *vp, struct uio *uio, int ioflag,
kauth_cred_t cred, off_t osize, int resid, int oerror)
{
struct inode *ip = VTOI(vp);
int error = oerror;
/* Trigger ctime and mtime updates, and atime if MNT_RELATIME. */
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (vp->v_mount->mnt_flag & MNT_RELATIME)
ip->i_flag |= IN_ACCESS;
/*
* If we successfully wrote any data and we are not the superuser,
* we clear the setuid and setgid bits as a precaution against
* tampering.
*/
if (resid > uio->uio_resid && cred) {
if (ip->i_mode & ISUID) {
if (kauth_authorize_vnode(cred,
KAUTH_VNODE_RETAIN_SUID, vp, NULL, EPERM) != 0) {
ip->i_mode &= ~ISUID;
DIP_ASSIGN(ip, mode, ip->i_mode);
}
}
if (ip->i_mode & ISGID) {
if (kauth_authorize_vnode(cred,
KAUTH_VNODE_RETAIN_SGID, vp, NULL, EPERM) != 0) {
ip->i_mode &= ~ISGID;
DIP_ASSIGN(ip, mode, ip->i_mode);
}
}
}
/*
* Update the size on disk: truncate back to original size on
* error, or reflect the new size on success.
*/
if (error) {
(void) UFS_TRUNCATE(vp, osize, ioflag & IO_SYNC, cred);
uio->uio_offset -= resid - uio->uio_resid;
uio->uio_resid = resid;
} else if (resid > uio->uio_resid && (ioflag & IO_SYNC) == IO_SYNC)
error = UFS_UPDATE(vp, NULL, NULL, UPDATE_WAIT);
else
UFS_WAPBL_UPDATE(vp, NULL, NULL, 0);
/* Make sure the vnode uvm size matches the inode file size. */
KASSERT(vp->v_size == ip->i_size);
/* Write error overrides any inode update error. */
if (oerror)
error = oerror;
return error;
}
/* $NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $ */
/*-
* Copyright (c) 2001, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Nathan J. Williams, and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Overview
*
* Lightweight processes (LWPs) are the basic unit or thread of
* execution within the kernel. The core state of an LWP is described
* by "struct lwp", also known as lwp_t.
*
* Each LWP is contained within a process (described by "struct proc").
* Every process contains at least one LWP, but may contain more. The
* process describes attributes shared among all of its LWPs such as a
* private address space, global execution state (stopped, active,
* zombie, ...), signal disposition and so on. On a multiprocessor
* machine, multiple LWPs may be executing concurrently in the kernel.
*
* Execution states
*
* At any given time, an LWP has overall state that is described by
* lwp::l_stat. The states are broken into two sets below. The first
* set is guaranteed to represent the absolute, current state of the
* LWP:
*
* LSONPROC
*
* On processor: the LWP is executing on a CPU, either in the
* kernel or in user space.
*
* LSRUN
*
* Runnable: the LWP is parked on a run queue, and may soon be
* chosen to run by an idle processor, or by a processor that
* has been asked to preempt a currently running but lower
* priority LWP.
*
* LSIDL
*
* Idle: the LWP has been created but has not yet executed, or
* it has ceased executing a unit of work and is waiting to be
* started again. This state exists so that the LWP can occupy
* a slot in the process & PID table, but without having to
* worry about being touched; lookups of the LWP by ID will
* fail while in this state. The LWP will become visible for
* lookup once its state transitions further. Some special
* kernel threads also (ab)use this state to indicate that they
* are idle (soft interrupts and idle LWPs).
*
* LSSUSPENDED:
*
* Suspended: the LWP has had its execution suspended by
* another LWP in the same process using the _lwp_suspend()
* system call. User-level LWPs also enter the suspended
* state when the system is shutting down.
*
* The second set represents a "statement of intent" on behalf of the
* LWP. The LWP may in fact be executing on a processor, may be
* sleeping or idle. It is expected to take the necessary action to
* stop executing or become "running" again within a short timeframe.
* The LP_RUNNING flag in lwp::l_pflag indicates that an LWP is running.
* Importantly, it indicates that its state is tied to a CPU.
*
* LSZOMB:
*
* Dead or dying: the LWP has released most of its resources
* and is about to switch away into oblivion, or has already
* switched away. When it switches away, its few remaining
* resources can be collected.
*
* LSSLEEP:
*
* Sleeping: the LWP has entered itself onto a sleep queue, and
* has switched away or will switch away shortly to allow other
* LWPs to run on the CPU.
*
* LSSTOP:
*
* Stopped: the LWP has been stopped as a result of a job
* control signal, or as a result of the ptrace() interface.
*
* Stopped LWPs may run briefly within the kernel to handle
* signals that they receive, but will not return to user space
* until their process' state is changed away from stopped.
*
* Single LWPs within a process can not be set stopped
* selectively: all actions that can stop or continue LWPs
* occur at the process level.
*
* State transitions
*
* Note that the LSSTOP state may only be set when returning to
* user space in userret(), or when sleeping interruptibly. The
* LSSUSPENDED state may only be set in userret(). Before setting
* those states, we try to ensure that the LWPs will release all
* locks that they hold, and at a minimum try to ensure that the
* LWP can be set runnable again by a signal.
*
* LWPs may transition states in the following ways:
*
* RUN -------> ONPROC            ONPROC -----> RUN
*                                            > SLEEP
*                                            > STOPPED
*                                            > SUSPENDED
*                                            > ZOMB
*                                            > IDL (special cases)
*
* STOPPED ---> RUN               SUSPENDED --> RUN
*            > SLEEP
*
* SLEEP -----> ONPROC            IDL --------> RUN
*            > RUN                           > SUSPENDED
*            > STOPPED                       > STOPPED
*                                            > ONPROC (special cases)
*
* Some state transitions are only possible with kernel threads (eg
* ONPROC -> IDL) and happen under tightly controlled circumstances
* free of unwanted side effects.
*
* Migration
*
* Migration of threads from one CPU to another could be performed
* internally by the scheduler via sched_takecpu() or sched_catchlwp()
* functions. The universal lwp_migrate() function should be used for
* any other cases. Subsystems in the kernel must be aware that the
* CPU of an LWP may change while it is not locked.
*
* Locking
*
* The majority of fields in 'struct lwp' are covered by a single,
* general spin lock pointed to by lwp::l_mutex. The locks covering
* each field are documented in sys/lwp.h.
*
* State transitions must be made with the LWP's general lock held,
* and may cause the LWP's lock pointer to change. Manipulation of
* the general lock is not performed directly, but through calls to
* lwp_lock(), lwp_unlock() and others. It should be noted that the
* adaptive locks are not allowed to be released while the LWP's lock
* is being held (unlike for other spin-locks).
*
* States and their associated locks:
*
* LSIDL, LSONPROC, LSZOMB, LSSUSPENDED:
*
* Always covered by spc_lwplock, which protects LWPs not
* associated with any other sync object. This is a per-CPU
* lock and matches lwp::l_cpu.
*
* LSRUN:
*
* Always covered by spc_mutex, which protects the run queues.
* This is a per-CPU lock and matches lwp::l_cpu.
*
* LSSLEEP:
*
* Covered by a lock associated with the sleep queue (sometimes
* a turnstile sleep queue) that the LWP resides on. This can
* be spc_lwplock for SOBJ_SLEEPQ_NULL (an "untracked" sleep).
*
* LSSTOP:
*
* If the LWP was previously sleeping (l_wchan != NULL), then
* l_mutex references the sleep queue lock. If the LWP was
* runnable or on the CPU when halted, or has been removed from
* the sleep queue since halted, then the lock is spc_lwplock.
*
* The lock order is as follows:
*
* sleepq -> turnstile -> spc_lwplock -> spc_mutex
*
* Each process has a scheduler state lock (proc::p_lock), and a
* number of counters on LWPs and their states: p_nzlwps, p_nrlwps, and
* so on. When an LWP is to be entered into or removed from one of the
* following states, p_lock must be held and the process wide counters
* adjusted:
*
* LSIDL, LSZOMB, LSSTOP, LSSUSPENDED
*
* (But not always for kernel threads. There are some special cases
* as mentioned above: soft interrupts, and the idle loops.)
*
* Note that an LWP is considered running or likely to run soon if in
* one of the following states. This affects the value of p_nrlwps:
*
* LSRUN, LSONPROC, LSSLEEP
*
* p_lock does not need to be held when transitioning among these
* three states, hence p_lock is rarely taken for state transitions.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lwp.c,v 1.269 2023/12/20 21:03:50 andvar Exp $");
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_dtrace.h"
#define _LWP_API_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cprng.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/futex.h>
#include <sys/intr.h>
#include <sys/kauth.h>
#include <sys/kcov.h>
#include <sys/kmem.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/msan.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/psref.h>
#include <sys/ptrace.h>
#include <sys/sdt.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uidinfo.h>
#include <sys/xcall.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
static pool_cache_t lwp_cache __read_mostly;
struct lwplist alllwp __cacheline_aligned;
static int lwp_ctor(void *, void *, int);
static void lwp_dtor(void *, void *);
/* DTrace proc provider probes */
SDT_PROVIDER_DEFINE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , lwp__create, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__start, "struct lwp *");
SDT_PROBE_DEFINE1(proc, kernel, , lwp__exit, "struct lwp *");
struct turnstile turnstile0 __cacheline_aligned;
struct lwp lwp0 __aligned(MIN_LWP_ALIGNMENT) = {
#ifdef LWP0_CPU_INFO
.l_cpu = LWP0_CPU_INFO,
#endif
#ifdef LWP0_MD_INITIALIZER
.l_md = LWP0_MD_INITIALIZER,
#endif
.l_proc = &proc0,
.l_lid = 0, /* we own proc0's slot in the pid table */
.l_flag = LW_SYSTEM,
.l_stat = LSONPROC,
.l_ts = &turnstile0,
.l_syncobj = &sched_syncobj,
.l_refcnt = 0,
.l_priority = PRI_USER + NPRI_USER - 1,
.l_inheritedprio = -1,
.l_class = SCHED_OTHER,
.l_psid = PS_NONE,
.l_pi_lenders = SLIST_HEAD_INITIALIZER(&lwp0.l_pi_lenders),
.l_name = __UNCONST("swapper"),
.l_fd = &filedesc0,
};
static int
lwp_maxlwp(void)
{
/* Assume 1 LWP per 1MiB. */
uint64_t lwps_per = ctob(physmem) / (1024 * 1024);
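/*
* e.g. 8 GiB of RAM gives lwps_per = 8192; the result below is then
* capped at MAXMAXLWP and never allowed to fall below MAXLWP.
*/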
return MAX(MIN(MAXMAXLWP, lwps_per), MAXLWP);
}
static int sysctl_kern_maxlwp(SYSCTLFN_PROTO);
/*
* sysctl helper routine for kern.maxlwp. Ensures that the new
* values are not too low or too high.
*/
static int
sysctl_kern_maxlwp(SYSCTLFN_ARGS)
{
int error, nmaxlwp;
struct sysctlnode node;
nmaxlwp = maxlwp;
node = *rnode;
node.sysctl_data = &nmaxlwp;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (nmaxlwp < 0 || nmaxlwp >= MAXMAXLWP)
return EINVAL;
if (nmaxlwp > lwp_maxlwp())
return EINVAL;
maxlwp = nmaxlwp;
return 0;
}
static void
sysctl_kern_lwp_setup(void)
{
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlwp",
SYSCTL_DESCR("Maximum number of simultaneous threads"),
sysctl_kern_maxlwp, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
void
lwpinit(void)
{
LIST_INIT(&alllwp);
lwpinit_specificdata();
/*
* Provide a barrier to ensure that all mutex_oncpu() and rw_oncpu()
* calls will exit before memory of LWPs is returned to the pool, where
* KVA of LWP structure might be freed and re-used for other purposes.
* Kernel preemption is disabled around mutex_oncpu() and rw_oncpu()
* callers, therefore a regular passive serialization barrier will
* do the job.
*/
lwp_cache = pool_cache_init(sizeof(lwp_t), MIN_LWP_ALIGNMENT, 0,
PR_PSERIALIZE, "lwppl", NULL, IPL_NONE, lwp_ctor, lwp_dtor, NULL);
maxlwp = lwp_maxlwp();
sysctl_kern_lwp_setup();
}
void
lwp0_init(void)
{
struct lwp *l = &lwp0;
KASSERT((void *)uvm_lwp_getuarea(l) != NULL);
LIST_INSERT_HEAD(&alllwp, l, l_list);
callout_init(&l->l_timeout_ch, CALLOUT_MPSAFE);
callout_setfunc(&l->l_timeout_ch, sleepq_timeout, l);
cv_init(&l->l_sigcv, "sigwait");
cv_init(&l->l_waitcv, "vfork");
l->l_cred = kauth_cred_hold(proc0.p_cred);
kdtrace_thread_ctor(NULL, l);
lwp_initspecific(l);
SYSCALL_TIME_LWP_INIT(l);
}
/*
* Initialize the non-zeroed portion of an lwp_t.
*/
static int
lwp_ctor(void *arg, void *obj, int flags)
{
lwp_t *l = obj;
l->l_stat = LSIDL;
l->l_cpu = curcpu();
l->l_mutex = l->l_cpu->ci_schedstate.spc_lwplock;
l->l_ts = kmem_alloc(sizeof(*l->l_ts), flags == PR_WAITOK ?
KM_SLEEP : KM_NOSLEEP);
if (l->l_ts == NULL) {
return ENOMEM;
} else {
turnstile_ctor(l->l_ts);
return 0;
}
}
static void
lwp_dtor(void *arg, void *obj)
{
lwp_t *l = obj;
/*
* The value of l->l_cpu must still be valid at this point.
*/
KASSERT(l->l_cpu != NULL);
/*
* We can't return turnstile0 to the pool (it didn't come from it),
* so if it comes up just drop it quietly and move on.
*/
if (l->l_ts != &turnstile0)
kmem_free(l->l_ts, sizeof(*l->l_ts));
}
/*
* Set an LWP suspended.
*
* Must be called with p_lock held, and the LWP locked. Will unlock the
* LWP before return.
*/
int
lwp_suspend(struct lwp *curl, struct lwp *t)
{
int error;
KASSERT(mutex_owned(t->l_proc->p_lock));
KASSERT(lwp_locked(t, NULL));
KASSERT(curl != t || curl->l_stat == LSONPROC);
/*
* If the current LWP has been told to exit, we must not suspend anyone
* else or deadlock could occur. We won't return to userspace.
*/
if ((curl->l_flag & (LW_WEXIT | LW_WCORE)) != 0) {
lwp_unlock(t);
return (EDEADLK);
}
if ((t->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(t);
return 0;
}
error = 0;
switch (t->l_stat) {
case LSRUN:
case LSONPROC:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
lwp_unlock(t);
break;
case LSSLEEP:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
/*
* Kick the LWP and try to get it to the kernel boundary
* so that it will release any locks that it holds.
* setrunnable() will release the lock.
*/
if ((t->l_flag & LW_SINTR) != 0)
setrunnable(t);
else
lwp_unlock(t);
break;
case LSSUSPENDED:
lwp_unlock(t);
break;
case LSSTOP:
t->l_flag |= LW_WSUSPEND;
lwp_need_userret(t);
setrunnable(t);
break;
case LSIDL:
case LSZOMB:
error = EINTR; /* It's what Solaris does..... */
lwp_unlock(t);
break;
}
return (error);
}
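/*
* Caller sketch (modelled on sys__lwp_suspend(); names and error
* handling are illustrative, not verbatim):
*
* mutex_enter(p->p_lock);
* if ((t = lwp_find(p, lid)) == NULL) {
* mutex_exit(p->p_lock);
* return ESRCH;
* }
* lwp_lock(t);
* error = lwp_suspend(curlwp, t);
* mutex_exit(p->p_lock);
*
* lwp_suspend() itself drops the LWP lock; only p_lock remains to be
* released by the caller.
*/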
/*
* Restart a suspended LWP.
*
* Must be called with p_lock held, and the LWP locked. Will unlock the
* LWP before return.
*/
void
lwp_continue(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
KASSERT(lwp_locked(l, NULL));
/* If rebooting or not suspended, then just bail out. */
if ((l->l_flag & LW_WREBOOT) != 0) {
lwp_unlock(l);
return;
}
l->l_flag &= ~LW_WSUSPEND;
if (l->l_stat != LSSUSPENDED || (l->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(l);
return;
}
/* setrunnable() will release the lock. */
setrunnable(l);
}
/*
* Restart a stopped LWP.
*
* Must be called with p_lock held, and the LWP NOT locked. Will unlock the
* LWP before return.
*/
void
lwp_unstop(struct lwp *l)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
lwp_lock(l);
KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
/* If not stopped, then just bail out. */
if (l->l_stat != LSSTOP) {
lwp_unlock(l);
return;
}
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
if (!p->p_waited)
p->p_pptr->p_nstopchild--;
if (l->l_wchan == NULL) {
/* setrunnable() will release the lock. */
setrunnable(l);
} else if (p->p_xsig && (l->l_flag & LW_SINTR) != 0) {
/* setrunnable() so we can receive the signal */
setrunnable(l);
} else {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
lwp_unlock(l);
}
}
/*
* Wait for an LWP within the current process to exit. If 'lid' is
* non-zero, we are waiting for a specific LWP.
*
* Must be called with p->p_lock held.
*/
int
lwp_wait(struct lwp *l, lwpid_t lid, lwpid_t *departed, bool exiting)
{
const lwpid_t curlid = l->l_lid;
proc_t *p = l->l_proc;
lwp_t *l2, *next;
int error;
KASSERT(mutex_owned(p->p_lock));
p->p_nlwpwait++;
l->l_waitingfor = lid;
for (;;) {
int nfound;
/*
* Avoid a race between exit1() and sigexit(): if the
* process is dumping core, then we need to bail out: call
* into lwp_userret() where we will be suspended until the
* deed is done.
*/
if ((p->p_sflag & PS_WCORE) != 0) {
mutex_exit(p->p_lock);
lwp_userret(l);
KASSERT(false);
}
/*
* First off, drain any detached LWP that is waiting to be
* reaped.
*/
if ((l2 = p->p_zomblwp) != NULL) {
p->p_zomblwp = NULL;
lwp_free(l2, false, false);/* releases proc mutex */
mutex_enter(p->p_lock);
continue;
}
/*
* Now look for an LWP to collect. If the whole process is
* exiting, count detached LWPs as eligible to be collected,
* but don't drain them here.
*/
nfound = 0;
error = 0;
/*
* If given a specific LID, go via pid_table and make sure
* it's not detached.
*/
if (lid != 0) {
l2 = proc_find_lwp(p, lid);
if (l2 == NULL) {
error = ESRCH;
break;
}
KASSERT(l2->l_lid == lid);
if ((l2->l_prflag & LPR_DETACHED) != 0) {
error = EINVAL;
break;
}
} else {
l2 = LIST_FIRST(&p->p_lwps);
}
for (; l2 != NULL; l2 = next) {
next = (lid != 0 ? NULL : LIST_NEXT(l2, l_sibling));
/*
* If a specific wait and the target is waiting on
* us, then avoid deadlock. This also traps LWPs
* that try to wait on themselves.
*
* Note that this does not handle more complicated
* cycles, like: t1 -> t2 -> t3 -> t1. The process
* can still be killed so it is not a major problem.
*/
if (l2->l_lid == lid && l2->l_waitingfor == curlid) {
error = EDEADLK;
break;
}
if (l2 == l)
continue;
if ((l2->l_prflag & LPR_DETACHED) != 0) {
nfound += exiting;
continue;
}
if (lid != 0) {
/*
* Mark this LWP as the first waiter, if there
* is no other.
*/
if (l2->l_waiter == 0)
l2->l_waiter = curlid;
} else if (l2->l_waiter != 0) {
/*
* It already has a waiter - so don't
* collect it. If the waiter doesn't
* grab it we'll get another chance
* later.
*/
nfound++;
continue;
}
nfound++;
/* No need to lock the LWP in order to see LSZOMB. */
if (l2->l_stat != LSZOMB)
continue;
/*
* We're no longer waiting. Reset the "first waiter"
* pointer on the target, in case it was us.
*/
l->l_waitingfor = 0;
l2->l_waiter = 0;
p->p_nlwpwait--;
if (departed)
*departed = l2->l_lid;
sched_lwp_collect(l2);
/* lwp_free() releases the proc lock. */
lwp_free(l2, false, false);
mutex_enter(p->p_lock);
return 0;
}
if (error != 0)
break;
if (nfound == 0) {
error = ESRCH;
break;
}
/*
* Note: since the lock will be dropped, need to restart on
* wakeup to run all LWPs again, e.g. there may be new LWPs.
*/
if (exiting) {
KASSERT(p->p_nlwps > 1);
error = cv_timedwait(&p->p_lwpcv, p->p_lock, 1);
break;
}
/*
* Break out if all LWPs are in _lwp_wait(). There are
* other ways to hang the process with _lwp_wait(), but the
* sleep is interruptible, so there is little point checking for them.
*/
if (p->p_nlwpwait == p->p_nlwps) {
error = EDEADLK;
break;
}
/*
* Sit around and wait for something to happen. We'll be
* awoken if any of the conditions examined change: if an
* LWP exits, is collected, or is detached.
*/
if ((error = cv_wait_sig(&p->p_lwpcv, p->p_lock)) != 0)
break;
}
/*
* We didn't find any LWPs to collect, we may have received a
* signal, or some other condition has caused us to bail out.
*
* If waiting on a specific LWP, clear the waiters marker: some
* other LWP may want it. Then, kick all the remaining waiters
* so that they can re-check for zombies and for deadlock.
*/
if (lid != 0) {
l2 = proc_find_lwp(p, lid);
KASSERT(l2 == NULL || l2->l_lid == lid);
if (l2 != NULL && l2->l_waiter == curlid)
l2->l_waiter = 0;
}
p->p_nlwpwait--;
l->l_waitingfor = 0;
cv_broadcast(&p->p_lwpcv);
return error;
}
/*
* Create a new LWP within process 'p2', using LWP 'l1' as a template.
* The new LWP is created in state LSIDL and must be set running,
* suspended, or stopped by the caller.
*/
int
lwp_create(lwp_t *l1, proc_t *p2, vaddr_t uaddr, int flags,
void *stack, size_t stacksize, void (*func)(void *), void *arg,
lwp_t **rnewlwpp, int sclass, const sigset_t *sigmask,
const stack_t *sigstk)
{
struct lwp *l2;
KASSERT(l1 == curlwp || l1->l_proc == &proc0);
/*
* Enforce limits, excluding the first lwp and kthreads. We must
* use the process credentials here when adjusting the limit, as
* they are what's tied to the accounting entity. However for
* authorizing the action, we'll use the LWP's credentials.
*/
mutex_enter(p2->p_lock);
if (p2->p_nlwps != 0 && p2 != &proc0) {
uid_t uid = kauth_cred_getuid(p2->p_cred);
int count = chglwpcnt(uid, 1);
if (__predict_false(count >
p2->p_rlimit[RLIMIT_NTHR].rlim_cur)) {
if (kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_RLIMIT, p2,
KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p2->p_rlimit[RLIMIT_NTHR], KAUTH_ARG(RLIMIT_NTHR))
!= 0) {
(void)chglwpcnt(uid, -1);
mutex_exit(p2->p_lock);
return EAGAIN;
}
}
}
/*
* First off, reap any detached LWP waiting to be collected.
* We can re-use its LWP structure and turnstile.
*/
if ((l2 = p2->p_zomblwp) != NULL) {
p2->p_zomblwp = NULL;
lwp_free(l2, true, false);
/* p2 now unlocked by lwp_free() */
KASSERT(l2->l_ts != NULL);
KASSERT(l2->l_inheritedprio == -1);
KASSERT(SLIST_EMPTY(&l2->l_pi_lenders));
memset(&l2->l_startzero, 0, sizeof(*l2) -
offsetof(lwp_t, l_startzero));
} else {
mutex_exit(p2->p_lock);
l2 = pool_cache_get(lwp_cache, PR_WAITOK);
memset(&l2->l_startzero, 0, sizeof(*l2) -
offsetof(lwp_t, l_startzero));
SLIST_INIT(&l2->l_pi_lenders);
}
/*
* Because of lockless lookup via pid_table, the LWP can be locked
* and inspected briefly even after it's freed, so a few fields are
* kept stable.
*/
KASSERT(l2->l_stat == LSIDL);
KASSERT(l2->l_cpu != NULL);
KASSERT(l2->l_ts != NULL);
KASSERT(l2->l_mutex == l2->l_cpu->ci_schedstate.spc_lwplock);
l2->l_proc = p2;
l2->l_refcnt = 0;
l2->l_class = sclass;
/*
* Allocate a process ID for this LWP. We need to do this now
* while we can still unwind if it fails. Because we're marked
* as LSIDL, no lookups by the ID will succeed.
*
* N.B. this will always succeed for the first LWP in a process,
* because proc_alloc_lwpid() will usurp the slot. Also note
* that l2->l_proc MUST be valid so that lookups of the proc
* will succeed, even if the LWP itself is not visible.
*/
if (__predict_false(proc_alloc_lwpid(p2, l2) == -1)) {
pool_cache_put(lwp_cache, l2);
return EAGAIN;
}
/*
* If vfork(), we want the LWP to run fast and on the same CPU
* as its parent, so that it can reuse the VM context and cache
* footprint on the local CPU.
*/
l2->l_boostpri = ((flags & LWP_VFORK) ? PRI_KERNEL : PRI_USER);
l2->l_priority = l1->l_priority;
l2->l_inheritedprio = -1;
l2->l_protectprio = -1;
l2->l_auxprio = -1;
l2->l_flag = 0;
l2->l_pflag = LP_MPSAFE;
TAILQ_INIT(&l2->l_ld_locks);
l2->l_psrefs = 0;
kmsan_lwp_alloc(l2);
/*
* For vfork, borrow parent's lwpctl context if it exists.
* This also causes us to return via lwp_userret.
*/
if ((flags & LWP_VFORK) && l1->l_lwpctl) {
l2->l_lwpctl = l1->l_lwpctl;
l2->l_flag |= LW_LWPCTL;
}
/*
* If not the first LWP in the process, grab a reference to the
* descriptor table.
*/
l2->l_fd = p2->p_fd;
if (p2->p_nlwps != 0) {
KASSERT(l1->l_proc == p2);
fd_hold(l2);
} else {
KASSERT(l1->l_proc != p2);
}
if (p2->p_flag & PK_SYSTEM) {
/* Mark it as a system LWP. */
l2->l_flag |= LW_SYSTEM;
}
kdtrace_thread_ctor(NULL, l2);
lwp_initspecific(l2);
sched_lwp_fork(l1, l2);
callout_init(&l2->l_timeout_ch, CALLOUT_MPSAFE);
callout_setfunc(&l2->l_timeout_ch, sleepq_timeout, l2);
cv_init(&l2->l_sigcv, "sigwait");
cv_init(&l2->l_waitcv, "vfork");
l2->l_syncobj = &sched_syncobj;
PSREF_DEBUG_INIT_LWP(l2);
if (rnewlwpp != NULL) *rnewlwpp = l2;
/*
* PCU state needs to be saved before calling uvm_lwp_fork() so that
* the MD cpu_lwp_fork() can copy the saved state to the new LWP.
*/
pcu_save_all(l1);
#if PCU_UNIT_COUNT > 0
l2->l_pcu_valid = l1->l_pcu_valid;
#endif
uvm_lwp_setuarea(l2, uaddr);
uvm_lwp_fork(l1, l2, stack, stacksize, func, (arg != NULL) ? arg : l2);
mutex_enter(p2->p_lock);
l2->l_cred = kauth_cred_hold(p2->p_cred);
if ((flags & LWP_DETACHED) != 0) {
l2->l_prflag = LPR_DETACHED;
p2->p_ndlwps++;
} else
l2->l_prflag = 0;
if (l1->l_proc == p2) {
/*
* These flags are set while p_lock is held. Copy with
* p_lock held too, so the LWP doesn't sneak into the
* process without them being set.
*/
l2->l_flag |= (l1->l_flag & (LW_WEXIT | LW_WREBOOT | LW_WCORE));
} else {
/* fork(): pending core/exit doesn't apply to child. */
l2->l_flag |= (l1->l_flag & LW_WREBOOT);
}
l2->l_sigstk = *sigstk;
l2->l_sigmask = *sigmask;
TAILQ_INIT(&l2->l_sigpend.sp_info);
sigemptyset(&l2->l_sigpend.sp_set);
LIST_INSERT_HEAD(&p2->p_lwps, l2, l_sibling);
p2->p_nlwps++;
p2->p_nrlwps++;
KASSERT(l2->l_affinity == NULL);
/* Inherit the affinity mask. */
if (l1->l_affinity) {
/*
* Note that we hold the state lock while inheriting
* the affinity to avoid race with sched_setaffinity().
*/
lwp_lock(l1);
if (l1->l_affinity) {
kcpuset_use(l1->l_affinity);
l2->l_affinity = l1->l_affinity;
}
lwp_unlock(l1);
}
/* Ensure a trip through lwp_userret() if needed. */
if ((l2->l_flag & LW_USERRET) != 0) {
lwp_need_userret(l2);
}
/* This marks the end of the "must be atomic" section. */
mutex_exit(p2->p_lock);
SDT_PROBE(proc, kernel, , lwp__create, l2, 0, 0, 0, 0);
mutex_enter(&proc_lock);
LIST_INSERT_HEAD(&alllwp, l2, l_list);
/* Inherit a processor-set */
l2->l_psid = l1->l_psid;
mutex_exit(&proc_lock);
SYSCALL_TIME_LWP_INIT(l2);
if (p2->p_emul->e_lwp_fork) (*p2->p_emul->e_lwp_fork)(l1, l2);
return (0);
}
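/*
 * Illustrative sketch, not taken from this file: a typical caller pairs
 * lwp_create() with lwp_start() below, since the new LWP is left in
 * LSIDL.  The uarea address "uaddr", entry point "start_func" and its
 * argument "arg" are hypothetical, and error handling is elided.
 *
 *	lwp_t *l2;
 *	int error;
 *
 *	error = lwp_create(curlwp, p, uaddr, LWP_DETACHED, NULL, 0,
 *	    start_func, arg, &l2, curlwp->l_class, &curlwp->l_sigmask,
 *	    &curlwp->l_sigstk);
 *	if (error == 0)
 *		lwp_start(l2, 0);
 */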
/*
* Set a new LWP running. If the process is stopping, then the LWP is
* created stopped.
*/
void
lwp_start(lwp_t *l, int flags)
{
proc_t *p = l->l_proc;
mutex_enter(p->p_lock);
lwp_lock(l);
KASSERT(l->l_stat == LSIDL);
if ((flags & LWP_SUSPENDED) != 0) {
/* It'll suspend itself in lwp_userret(). */
l->l_flag |= LW_WSUSPEND;
lwp_need_userret(l);
}
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
KASSERT(l->l_wchan == NULL);
l->l_stat = LSSTOP;
p->p_nrlwps--;
lwp_unlock(l);
} else {
setrunnable(l);
/* LWP now unlocked */
}
mutex_exit(p->p_lock);
}
/*
* Called by MD code when a new LWP begins execution. Must be called
* with the previous LWP locked (so at splsched), or if there is no
* previous LWP, at splsched.
*/
void
lwp_startup(struct lwp *prev, struct lwp *new_lwp)
{
kmutex_t *lock;
KASSERTMSG(new_lwp == curlwp, "l %p curlwp %p prevlwp %p", new_lwp, curlwp, prev);
KASSERT(kpreempt_disabled());
KASSERT(prev != NULL);
KASSERT((prev->l_pflag & LP_RUNNING) != 0);
KASSERT(curcpu()->ci_mtx_count == -2);
/*
* Immediately mark the previous LWP as no longer running and
* unlock (to keep lock wait times as short as possible). If a
* zombie, don't touch after clearing LP_RUNNING as it could be
* reaped by another CPU. Use atomic_store_release to ensure
* this -- matches atomic_load_acquire in lwp_free.
*/
lock = prev->l_mutex;
if (__predict_false(prev->l_stat == LSZOMB)) {
atomic_store_release(&prev->l_pflag,
prev->l_pflag & ~LP_RUNNING);
} else {
prev->l_pflag &= ~LP_RUNNING;
}
mutex_spin_exit(lock);
/* Correct spin mutex count after mi_switch(). */
curcpu()->ci_mtx_count = 0;
/* Install new VM context. */
if (__predict_true(new_lwp->l_proc->p_vmspace)) {
pmap_activate(new_lwp);
}
/* We remain at IPL_SCHED from mi_switch() - reset it. */
spl0();
LOCKDEBUG_BARRIER(NULL, 0);
SDT_PROBE(proc, kernel, , lwp__start, new_lwp, 0, 0, 0, 0);
/* For kthreads, acquire kernel lock if not MPSAFE. */
if (__predict_false((new_lwp->l_pflag & LP_MPSAFE) == 0)) {
KERNEL_LOCK(1, new_lwp);
}
}
/*
* Exit an LWP.
*
* *** WARNING *** This can be called with (l != curlwp) in error paths.
*/
void
lwp_exit(struct lwp *l)
{
struct proc *p = l->l_proc;
struct lwp *l2;
bool current;
current = (l == curlwp);
KASSERT(current || l->l_stat == LSIDL);
KASSERT(current || l->l_target_cpu == NULL);
KASSERT(p == curproc);
SDT_PROBE(proc, kernel, , lwp__exit, l, 0, 0, 0, 0);
/* Verify that we hold no locks; for DIAGNOSTIC check kernel_lock. */
LOCKDEBUG_BARRIER(NULL, 0);
KASSERTMSG(curcpu()->ci_biglock_count == 0, "kernel_lock leaked");
/*
* If we are the last live LWP in a process, we need to exit the
* entire process. We do so with an exit status of zero, because
* it's a "controlled" exit, and because that's what Solaris does.
*
* We are not quite a zombie yet, but for accounting purposes we
* must increment the count of zombies here.
*
* Note: the last LWP's specificdata will be deleted here.
*/
mutex_enter(p->p_lock);
if (p->p_nlwps - p->p_nzlwps == 1) {
KASSERT(current == true);
KASSERT(p != &proc0);
exit1(l, 0, 0);
/* NOTREACHED */
}
p->p_nzlwps++;
/*
* Perform any required thread cleanup. Do this early so
* anyone wanting to look us up with lwp_getref_lwpid() will
* fail to find us before we become a zombie.
*
* N.B. this will unlock p->p_lock on our behalf.
*/
lwp_thread_cleanup(l);
if (p->p_emul->e_lwp_exit)
(*p->p_emul->e_lwp_exit)(l);
/* Drop filedesc reference. */
fd_free();
/* Release fstrans private data. */
fstrans_lwp_dtor(l);
/* Delete the specificdata while it's still safe to sleep. */
lwp_finispecific(l);
/*
* Release our cached credentials.
*/
kauth_cred_free(l->l_cred);
callout_destroy(&l->l_timeout_ch);
/*
* If traced, report LWP exit event to the debugger.
*
* Remove the LWP from the global list.
* Free its LID from the PID namespace if needed.
*/
mutex_enter(&proc_lock);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACELWP_EXIT)) ==
(PSL_TRACED|PSL_TRACELWP_EXIT)) {
mutex_enter(p->p_lock);
if (ISSET(p->p_sflag, PS_WEXIT)) {
mutex_exit(p->p_lock);
/*
* We are exiting, bail out without informing parent
* about a terminating LWP as it would deadlock.
*/
} else {
eventswitch(TRAP_LWP, PTRACE_LWP_EXIT, l->l_lid);
mutex_enter(&proc_lock);
}
}
LIST_REMOVE(l, l_list);
mutex_exit(&proc_lock);
/*
* Get rid of all references to the LWP that others (e.g. procfs)
* may have, and mark the LWP as a zombie. If the LWP is detached,
* mark it waiting for collection in the proc structure. Note that
* before we can do that, we need to free any other dead, detached
* LWP waiting to meet its maker.
*
* All conditions need to be observed under the same hold of
* p_lock, because if the lock is dropped any of them can change.
*/
mutex_enter(p->p_lock);
for (;;) {
if (lwp_drainrefs(l))
continue;
if ((l->l_prflag & LPR_DETACHED) != 0) {
if ((l2 = p->p_zomblwp) != NULL) {
p->p_zomblwp = NULL;
lwp_free(l2, false, false);
/* proc now unlocked */
mutex_enter(p->p_lock);
continue;
}
p->p_zomblwp = l;
}
break;
}
/*
* If we find a pending signal for the process and we have been
* asked to check for signals, then we lose: arrange to have
* all other LWPs in the process check for signals.
*/
if ((l->l_flag & LW_PENDSIG) != 0 &&
firstsig(&p->p_sigpend.sp_set) != 0) {
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
lwp_lock(l2);
signotify(l2);
lwp_unlock(l2);
}
}
/*
* Release any PCU resources before becoming a zombie.
*/
pcu_discard_all(l);
lwp_lock(l);
l->l_stat = LSZOMB;
if (l->l_name != NULL) {
strcpy(l->l_name, "(zombie)");
}
lwp_unlock(l);
p->p_nrlwps--;
if (l->l_lwpctl != NULL) l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;
mutex_exit(p->p_lock);
cv_broadcast(&p->p_lwpcv);
/*
* We can no longer block. At this point, lwp_free() may already
* be gunning for us. On a multi-CPU system, we may be off p_lwps.
*
* Free MD LWP resources.
*/
cpu_lwp_free(l, 0);
if (current) {
/* Switch away into oblivion. */
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
panic("lwp_exit");
}
}
/*
* Free a dead LWP's remaining resources.
*
* XXXLWP limits.
*/
void
lwp_free(struct lwp *l, bool recycle, bool last)
{
struct proc *p = l->l_proc;
struct rusage *ru;
ksiginfoq_t kq;
KASSERT(l != curlwp);
KASSERT(last || mutex_owned(p->p_lock));
/*
* We use the process credentials instead of the lwp credentials here
* because the lwp credentials may be cached (just after a setuid call)
* and we don't want to pay for syncing, since the lwp is going away
* anyway.
*/
if (p != &proc0 && p->p_nlwps != 1)
(void)chglwpcnt(kauth_cred_getuid(p->p_cred), -1);
/*
* In the unlikely event that the LWP is still on the CPU,
* then spin until it has switched away.
*
* atomic_load_acquire matches atomic_store_release in
* lwp_startup and mi_switch.
*/
while (__predict_false((atomic_load_acquire(&l->l_pflag) & LP_RUNNING)
!= 0)) {
SPINLOCK_BACKOFF_HOOK;
}
/*
* Now that the LWP's known off the CPU, reset its state back to
* LSIDL, which defeats anything that might have gotten a hold on
* the LWP via pid_table before the ID was freed. It's important
* to do this with both the LWP locked and p_lock held.
*
* Also reset the CPU and lock pointer back to curcpu(), since the
* LWP will in all likelihood be cached with the current CPU in
* lwp_cache when we free it and later allocated from there again
* (avoid incidental lock contention).
*/
lwp_lock(l);
l->l_stat = LSIDL;
l->l_cpu = curcpu();
lwp_unlock_to(l, l->l_cpu->ci_schedstate.spc_lwplock);
/*
* If this was not the last LWP in the process, then adjust counters
* and unlock. This is done differently for the last LWP in exit1().
*/
if (!last) {
/*
* Add the LWP's run time to the process' base value.
* This needs to coincide with coming off p_lwps.
*/
bintime_add(&p->p_rtime, &l->l_rtime);
p->p_pctcpu += l->l_pctcpu;
ru = &p->p_stats->p_ru;
ruadd(ru, &l->l_ru);
LIST_REMOVE(l, l_sibling);
p->p_nlwps--;
p->p_nzlwps--;
if ((l->l_prflag & LPR_DETACHED) != 0) p->p_ndlwps--;
mutex_exit(p->p_lock);
/*
* Have any LWPs sleeping in lwp_wait() recheck for
* deadlock.
*/
cv_broadcast(&p->p_lwpcv);
/* Free the LWP ID. */
mutex_enter(&proc_lock);
proc_free_lwpid(p, l->l_lid);
mutex_exit(&proc_lock);
}
/*
* Destroy the LWP's remaining signal information.
*/
ksiginfo_queue_init(&kq);
sigclear(&l->l_sigpend, NULL, &kq);
ksiginfo_queue_drain(&kq);
cv_destroy(&l->l_sigcv);
cv_destroy(&l->l_waitcv);
/*
* Free lwpctl structure and affinity.
*/
if (l->l_lwpctl) {
lwp_ctl_free(l);
}
if (l->l_affinity) {
kcpuset_unuse(l->l_affinity, NULL);
l->l_affinity = NULL;
}
/*
* Free remaining data structures and the LWP itself unless the
* caller wants to recycle.
*/
if (l->l_name != NULL) kmem_free(l->l_name, MAXCOMLEN);
kmsan_lwp_free(l);
kcov_lwp_free(l);
cpu_lwp_free2(l);
uvm_lwp_exit(l);
KASSERT(SLIST_EMPTY(&l->l_pi_lenders));
KASSERT(l->l_inheritedprio == -1);
KASSERT(l->l_blcnt == 0);
kdtrace_thread_dtor(NULL, l);
if (!recycle)
pool_cache_put(lwp_cache, l);
}
/*
* Migrate the LWP to another CPU. Unlocks the LWP.
*/
void
lwp_migrate(lwp_t *l, struct cpu_info *tci)
{
struct schedstate_percpu *tspc;
int lstat = l->l_stat;
KASSERT(lwp_locked(l, NULL));
KASSERT(tci != NULL);
/* If LWP is still on the CPU, it must be handled like LSONPROC */
if ((l->l_pflag & LP_RUNNING) != 0) {
lstat = LSONPROC;
}
/*
* A previous migration may not have finished yet; in that case,
* just update the destination CPU.
*/
if (l->l_target_cpu != NULL) {
l->l_target_cpu = tci;
lwp_unlock(l);
return;
}
/* Nothing to do if trying to migrate to the same CPU */
if (l->l_cpu == tci) {
lwp_unlock(l);
return;
}
KASSERT(l->l_target_cpu == NULL);
tspc = &tci->ci_schedstate;
switch (lstat) {
case LSRUN:
l->l_target_cpu = tci;
break;
case LSSLEEP:
l->l_cpu = tci;
break;
case LSIDL:
case LSSTOP:
case LSSUSPENDED:
l->l_cpu = tci;
if (l->l_wchan == NULL) {
lwp_unlock_to(l, tspc->spc_lwplock);
return;
}
break;
case LSONPROC:
l->l_target_cpu = tci;
spc_lock(l->l_cpu);
sched_resched_cpu(l->l_cpu, PRI_USER_RT, true);
/* spc now unlocked */
break;
}
lwp_unlock(l);
}
#define lwp_find_exclude(l) \
((l)->l_stat == LSIDL || (l)->l_stat == LSZOMB)
/*
* Find the LWP in the process. Arguments may be zero, in which case
* the calling process and the first LWP in the list will be used.
* On success - returns proc locked.
*
* => pid == 0 -> look in curproc.
* => pid == -1 -> match any proc.
* => otherwise look up the proc.
*
* => lid == 0 -> first LWP in the proc
* => otherwise specific LWP
*/
struct lwp *
lwp_find2(pid_t pid, lwpid_t lid)
{
proc_t *p;
lwp_t *l;
/* First LWP of specified proc. */
if (lid == 0) {
switch (pid) {
case -1:
/* No lookup keys. */
return NULL;
case 0:
p = curproc;
mutex_enter(p->p_lock);
break;
default:
mutex_enter(&proc_lock);
p = proc_find(pid);
if (__predict_false(p == NULL)) {
mutex_exit(&proc_lock);
return NULL;
}
mutex_enter(p->p_lock);
mutex_exit(&proc_lock);
break;
}
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (__predict_true(!lwp_find_exclude(l)))
break;
}
goto out;
}
l = proc_find_lwp_acquire_proc(lid, &p);
if (l == NULL)
return NULL;
KASSERT(p != NULL);
KASSERT(mutex_owned(p->p_lock));
if (__predict_false(lwp_find_exclude(l))) {
l = NULL;
goto out;
}
/* Apply proc filter, if applicable. */
switch (pid) {
case -1:
/* Match anything. */
break;
case 0:
if (p != curproc)
l = NULL;
break;
default:
if (p->p_pid != pid)
l = NULL;
break;
}
out:
if (__predict_false(l == NULL)) {
mutex_exit(p->p_lock);
}
return l;
}
/*
* Look up a live LWP within the specified process.
*
* Must be called with p->p_lock held (as it looks at the radix tree,
* and also wants to exclude idle and zombie LWPs).
*/
struct lwp *
lwp_find(struct proc *p, lwpid_t id)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
l = proc_find_lwp(p, id);
KASSERT(l == NULL || l->l_lid == id);
/*
* No need to lock - all of these conditions will
* be visible with the process level mutex held.
*/
if (__predict_false(l != NULL && lwp_find_exclude(l)))
l = NULL;
return l;
}
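/*
 * Hedged usage sketch: lwp_find() must be bracketed by p->p_lock, and
 * the returned LWP is only guaranteed stable while that lock is held
 * (take a reference with lwp_addref() to keep it beyond that).  The
 * LID "lid" below is a placeholder.
 *
 *	mutex_enter(p->p_lock);
 *	if ((l = lwp_find(p, lid)) != NULL) {
 *		... inspect the live LWP under p_lock ...
 *	}
 *	mutex_exit(p->p_lock);
 */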
/*
* Verify that an LWP is locked, and optionally verify that the lock matches
* one we specify.
*/
int
lwp_locked(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *cur = l->l_mutex;
return mutex_owned(cur) && (mtx == cur || mtx == NULL);
}
/*
* Lend a new mutex to an LWP. The old mutex must be held.
*/
kmutex_t *
lwp_setlock(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *oldmtx = l->l_mutex;
KASSERT(mutex_owned(oldmtx));
atomic_store_release(&l->l_mutex, mtx);
return oldmtx;
}
/*
* Lend a new mutex to an LWP, and release the old mutex. The old mutex
* must be held.
*/
void
lwp_unlock_to(struct lwp *l, kmutex_t *mtx)
{
kmutex_t *old;
KASSERT(lwp_locked(l, NULL));
old = l->l_mutex;
atomic_store_release(&l->l_mutex, mtx);
mutex_spin_exit(old);
}
int
lwp_trylock(struct lwp *l)
{
kmutex_t *old;
for (;;) {
if (!mutex_tryenter(old = atomic_load_consume(&l->l_mutex)))
return 0;
if (__predict_true(atomic_load_relaxed(&l->l_mutex) == old))
return 1;
mutex_spin_exit(old);
}
}
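/*
 * Sketch of intended use (an assumption, not a caller in this file):
 * lwp_trylock() suits paths that must not spin waiting for the LWP
 * lock, e.g. while another spin mutex is already held.
 *
 *	if (lwp_trylock(l)) {
 *		... brief inspection of l ...
 *		lwp_unlock(l);
 *	}
 */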
void
lwp_unsleep(lwp_t *l, bool unlock)
{
KASSERT(mutex_owned(l->l_mutex));
(*l->l_syncobj->sobj_unsleep)(l, unlock);
}
/*
* Lock an LWP.
*/
void
lwp_lock(lwp_t *l)
{
kmutex_t *old = atomic_load_consume(&l->l_mutex);
/*
* Note: mutex_spin_enter() will have posted a read barrier.
* Re-test l->l_mutex. If it has changed, we need to try again.
*/
mutex_spin_enter(old);
while (__predict_false(atomic_load_relaxed(&l->l_mutex) != old)) {
mutex_spin_exit(old);
old = atomic_load_consume(&l->l_mutex);
mutex_spin_enter(old);
}
}
/*
* Unlock an LWP.
*/
void
lwp_unlock(lwp_t *l)
{
mutex_spin_exit(l->l_mutex);
}
void
lwp_changepri(lwp_t *l, pri_t pri)
{
KASSERT(mutex_owned(l->l_mutex));
if (l->l_priority == pri)
return;
(*l->l_syncobj->sobj_changepri)(l, pri);
KASSERT(l->l_priority == pri);
}
void
lwp_lendpri(lwp_t *l, pri_t pri)
{
KASSERT(mutex_owned(l->l_mutex));
(*l->l_syncobj->sobj_lendpri)(l, pri);
KASSERT(l->l_inheritedprio == pri);
}
pri_t
lwp_eprio(lwp_t *l)
{
pri_t pri = l->l_priority;
KASSERT(mutex_owned(l->l_mutex));
/*
* Timeshared/user LWPs get a temporary priority boost for blocking
* in kernel. This is key to good interactive response on a loaded
* system: without it, things will seem very sluggish to the user.
*
* The function of the boost is to get the LWP onto a CPU and
* running quickly. Once that happens the LWP loses the priority
* boost and could be preempted very quickly by another LWP but that
* won't happen often enough to be an annoyance.
*/
if (pri <= MAXPRI_USER && l->l_boostpri > MAXPRI_USER)
pri = (pri >> 1) + l->l_boostpri;
return MAX(l->l_auxprio, pri);
}
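/*
 * Worked reading of the boost above (informal, derived from the code):
 * for a user LWP whose static priority pri is at or below MAXPRI_USER
 * and whose l_boostpri is a kernel-level priority, the effective
 * priority is
 *
 *	eprio = MAX(l_auxprio, (pri / 2) + l_boostpri)
 *
 * so the kernel-level boost dominates, while half of the static user
 * priority still differentiates between boosted LWPs.
 */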
/*
* Handle exceptions for mi_userret(). Called if a member of LW_USERRET is
* set or a preemption is required.
*/
void
lwp_userret(struct lwp *l)
{
struct proc *p;
int sig, f;
KASSERT(l == curlwp);
KASSERT(l->l_stat == LSONPROC);
p = l->l_proc;
for (;;) {
/*
* This is the main location that user preemptions are
* processed.
*/
preempt_point();
/*
* It is safe to do this unlocked and without raised SPL,
* since whenever a flag of interest is added to l_flag the
* LWP will take an AST and come down this path again. If a
* remote CPU posts the AST, it will be done with an IPI
* (strongly synchronising).
*/
if ((f = atomic_load_relaxed(&l->l_flag) & LW_USERRET) == 0) {
return;
}
/*
* Start out with the correct credentials.
*/
if ((f & LW_CACHECRED) != 0) {
kauth_cred_t oc = l->l_cred;
mutex_enter(p->p_lock);
l->l_cred = kauth_cred_hold(p->p_cred);
lwp_lock(l);
l->l_flag &= ~LW_CACHECRED;
lwp_unlock(l);
mutex_exit(p->p_lock);
kauth_cred_free(oc);
}
/*
* Process pending signals first, unless the process
* is dumping core or exiting, where we will instead
* enter the LW_WSUSPEND case below.
*/
if ((f & (LW_PENDSIG | LW_WCORE | LW_WEXIT)) == LW_PENDSIG) {
mutex_enter(p->p_lock);
while ((sig = issignal(l)) != 0)
postsig(sig);
mutex_exit(p->p_lock);
continue;
}
/*
* Core-dump or suspend pending.
*
* In case of core dump, suspend ourselves, so that the kernel
* stack and therefore the userland registers saved in the
* trapframe are around for coredump() to write them out.
* We also need to save any PCU resources that we have so that
* they are accessible for coredump(). We issue a wakeup on
* p->p_lwpcv so that sigexit() will write the core file out
* once all other LWPs are suspended.
*/
if ((f & LW_WSUSPEND) != 0) {
pcu_save_all(l);
mutex_enter(p->p_lock);
p->p_nrlwps--;
lwp_lock(l);
l->l_stat = LSSUSPENDED;
lwp_unlock(l);
mutex_exit(p->p_lock);
cv_broadcast(&p->p_lwpcv);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
continue;
}
/*
* Process is exiting. The core dump and signal cases must
* be handled first.
*/
if ((f & LW_WEXIT) != 0) {
lwp_exit(l);
KASSERT(0);
/* NOTREACHED */
}
/*
* Update lwpctl processor (for vfork child_return).
*/
if ((f & LW_LWPCTL) != 0) {
lwp_lock(l);
KASSERT(kpreempt_disabled());
l->l_lwpctl->lc_curcpu = (int)cpu_index(l->l_cpu);
l->l_lwpctl->lc_pctr++;
l->l_flag &= ~LW_LWPCTL;
lwp_unlock(l);
continue;
}
}
}
/*
* Force an LWP to enter the kernel, to take a trip through lwp_userret().
*/
void
lwp_need_userret(struct lwp *l)
{
KASSERT(!cpu_intr_p());
KASSERT(lwp_locked(l, NULL) || l->l_stat == LSIDL);
/*
* If the LWP is in any state other than LSONPROC, we know that it
* is executing in-kernel and will hit userret() on the way out.
*
* If the LWP is curlwp, then we know we'll be back out to userspace
* soon (can't be called from a hardware interrupt here).
*
* Otherwise, we can't be sure what the LWP is doing, so first make
* sure the update to l_flag will be globally visible, and then
* force the LWP to take a trip through trap() where it will do
* userret().
*/
if (l->l_stat == LSONPROC && l != curlwp) {
membar_producer();
cpu_signotify(l);
}
}
/*
* Add one reference to an LWP. This will prevent the LWP from
* exiting, thus keeping the lwp structure and PCB around to inspect.
*/
void
lwp_addref(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
KASSERT(l->l_stat != LSZOMB);
l->l_refcnt++;
}
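/*
 * Hedged usage sketch (e.g. an inspector in the style of procfs): the
 * reference is taken under p->p_lock and dropped with lwp_delref(),
 * which re-acquires that lock itself.
 *
 *	mutex_enter(p->p_lock);
 *	if ((l = lwp_find(p, lid)) != NULL)
 *		lwp_addref(l);
 *	mutex_exit(p->p_lock);
 *	if (l != NULL) {
 *		... inspect the LWP/PCB without p_lock held ...
 *		lwp_delref(l);
 *	}
 */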
/*
* Remove one reference to an LWP. If this is the last reference,
* then we must finalize the LWP's death.
*/
void
lwp_delref(struct lwp *l)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
lwp_delref2(l);
mutex_exit(p->p_lock);
}
/*
* Remove one reference to an LWP. If this is the last reference,
* then we must finalize the LWP's death. The proc mutex is held
* on entry.
*/
void
lwp_delref2(struct lwp *l)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_stat != LSZOMB);
KASSERT(l->l_refcnt > 0);
if (--l->l_refcnt == 0)
cv_broadcast(&p->p_lwpcv);
}
/*
* Drain all references to the current LWP. Returns true if
* we blocked.
*/
bool
lwp_drainrefs(struct lwp *l)
{
struct proc *p = l->l_proc;
bool rv = false;
KASSERT(mutex_owned(p->p_lock));
l->l_prflag |= LPR_DRAINING;
while (l->l_refcnt > 0) {
rv = true;
cv_wait(&p->p_lwpcv, p->p_lock);
}
return rv;
}
/*
* Return true if the specified LWP is 'alive'. Only p->p_lock need
* be held.
*/
bool
lwp_alive(lwp_t *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
switch (l->l_stat) {
case LSSLEEP:
case LSRUN:
case LSONPROC:
case LSSTOP:
case LSSUSPENDED:
return true;
default:
return false;
}
}
/*
* Return first live LWP in the process.
*/
lwp_t *
lwp_find_first(proc_t *p)
{
lwp_t *l;
KASSERT(mutex_owned(p->p_lock));
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (lwp_alive(l)) {
return l;
}
}
return NULL;
}
/*
* Allocate a new lwpctl structure for a user LWP.
*/
int
lwp_ctl_alloc(vaddr_t *uaddr)
{
lcproc_t *lp;
u_int bit, i, offset;
struct uvm_object *uao;
int error;
lcpage_t *lcp;
proc_t *p;
lwp_t *l;
l = curlwp;
p = l->l_proc;
/* don't allow a vforked process to create lwp ctls */
if (p->p_lflag & PL_PPWAIT)
return EBUSY;
if (l->l_lcpage != NULL) {
lcp = l->l_lcpage;
*uaddr = lcp->lcp_uaddr + (vaddr_t)l->l_lwpctl - lcp->lcp_kaddr;
return 0;
}
/* First time around, allocate header structure for the process. */
if ((lp = p->p_lwpctl) == NULL) {
lp = kmem_alloc(sizeof(*lp), KM_SLEEP);
mutex_init(&lp->lp_lock, MUTEX_DEFAULT, IPL_NONE);
lp->lp_uao = NULL;
TAILQ_INIT(&lp->lp_pages);
mutex_enter(p->p_lock);
if (p->p_lwpctl == NULL) {
p->p_lwpctl = lp;
mutex_exit(p->p_lock);
} else {
mutex_exit(p->p_lock);
mutex_destroy(&lp->lp_lock);
kmem_free(lp, sizeof(*lp));
lp = p->p_lwpctl;
}
}
/*
* Set up an anonymous memory region to hold the shared pages.
* Map them into the process' address space. The user vmspace
* gets the first reference on the UAO.
*/
mutex_enter(&lp->lp_lock);
if (lp->lp_uao == NULL) {
lp->lp_uao = uao_create(LWPCTL_UAREA_SZ, 0);
lp->lp_cur = 0;
lp->lp_max = LWPCTL_UAREA_SZ;
lp->lp_uva = p->p_emul->e_vm_default_addr(p,
(vaddr_t)p->p_vmspace->vm_daddr, LWPCTL_UAREA_SZ,
p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
error = uvm_map(&p->p_vmspace->vm_map, &lp->lp_uva,
LWPCTL_UAREA_SZ, lp->lp_uao, 0, 0, UVM_MAPFLAG(UVM_PROT_RW,
UVM_PROT_RW, UVM_INH_NONE, UVM_ADV_NORMAL, 0));
if (error != 0) {
uao_detach(lp->lp_uao);
lp->lp_uao = NULL;
mutex_exit(&lp->lp_lock);
return error;
}
}
/* Get a free block and allocate for this LWP. */
TAILQ_FOREACH(lcp, &lp->lp_pages, lcp_chain) {
if (lcp->lcp_nfree != 0)
break;
}
if (lcp == NULL) {
/* Nothing available - try to set up a free page. */
if (lp->lp_cur == lp->lp_max) {
mutex_exit(&lp->lp_lock);
return ENOMEM;
}
lcp = kmem_alloc(LWPCTL_LCPAGE_SZ, KM_SLEEP);
/*
* Wire the next page down in kernel space. Since this
* is a new mapping, we must add a reference.
*/
uao = lp->lp_uao;
(*uao->pgops->pgo_reference)(uao);
lcp->lcp_kaddr = vm_map_min(kernel_map);
error = uvm_map(kernel_map, &lcp->lcp_kaddr, PAGE_SIZE,
uao, lp->lp_cur, PAGE_SIZE,
UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
UVM_INH_NONE, UVM_ADV_RANDOM, 0));
if (error != 0) {
mutex_exit(&lp->lp_lock);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
(*uao->pgops->pgo_detach)(uao);
return error;
}
error = uvm_map_pageable(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE, FALSE, 0);
if (error != 0) {
mutex_exit(&lp->lp_lock);
uvm_unmap(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
return error;
}
/* Prepare the page descriptor and link into the list. */
lcp->lcp_uaddr = lp->lp_uva + lp->lp_cur;
lp->lp_cur += PAGE_SIZE;
lcp->lcp_nfree = LWPCTL_PER_PAGE;
lcp->lcp_rotor = 0;
memset(lcp->lcp_bitmap, 0xff, LWPCTL_BITMAP_SZ);
TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain);
}
for (i = lcp->lcp_rotor; lcp->lcp_bitmap[i] == 0;) {
if (++i >= LWPCTL_BITMAP_ENTRIES)
i = 0;
}
bit = ffs(lcp->lcp_bitmap[i]) - 1;
lcp->lcp_bitmap[i] ^= (1U << bit);
lcp->lcp_rotor = i;
lcp->lcp_nfree--;
l->l_lcpage = lcp;
offset = (i << 5) + bit;
l->l_lwpctl = (lwpctl_t *)lcp->lcp_kaddr + offset;
*uaddr = lcp->lcp_uaddr + offset * sizeof(lwpctl_t);
mutex_exit(&lp->lp_lock);
KPREEMPT_DISABLE(l);
l->l_lwpctl->lc_curcpu = (int)cpu_index(curcpu());
KPREEMPT_ENABLE(l);
return 0;
}
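/*
 * Worked example of the slot arithmetic above (derived from the code,
 * not normative): each 32-bit bitmap word covers 32 lwpctl slots and a
 * set bit means "free".  With i == 2 and bit == 5, the slot index is
 * (2 << 5) + 5 == 69, so l_lwpctl points at slot 69 of the wired
 * kernel mapping and *uaddr is lcp_uaddr + 69 * sizeof(lwpctl_t) in
 * the user mapping of the same page.
 */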
/*
* Free an lwpctl structure back to the per-process list.
*/
void
lwp_ctl_free(lwp_t *l)
{
struct proc *p = l->l_proc;
lcproc_t *lp;
lcpage_t *lcp;
u_int map, offset;
/* don't free a lwp context we borrowed for vfork */
if (p->p_lflag & PL_PPWAIT) {
l->l_lwpctl = NULL;
return;
}
lp = p->p_lwpctl;
KASSERT(lp != NULL);
lcp = l->l_lcpage;
offset = (u_int)((lwpctl_t *)l->l_lwpctl - (lwpctl_t *)lcp->lcp_kaddr);
KASSERT(offset < LWPCTL_PER_PAGE);
mutex_enter(&lp->lp_lock);
lcp->lcp_nfree++;
map = offset >> 5;
lcp->lcp_bitmap[map] |= (1U << (offset & 31));
if (lcp->lcp_bitmap[lcp->lcp_rotor] == 0)
lcp->lcp_rotor = map;
if (TAILQ_FIRST(&lp->lp_pages)->lcp_nfree == 0) {
TAILQ_REMOVE(&lp->lp_pages, lcp, lcp_chain);
TAILQ_INSERT_HEAD(&lp->lp_pages, lcp, lcp_chain);
}
mutex_exit(&lp->lp_lock);
}
/*
* Process is exiting; tear down lwpctl state. This can only be safely
* called by the last LWP in the process.
*/
void
lwp_ctl_exit(void)
{
lcpage_t *lcp, *next;
lcproc_t *lp;
proc_t *p;
lwp_t *l;
l = curlwp;
l->l_lwpctl = NULL;
l->l_lcpage = NULL;
p = l->l_proc;
lp = p->p_lwpctl;
KASSERT(lp != NULL);
KASSERT(p->p_nlwps == 1);
for (lcp = TAILQ_FIRST(&lp->lp_pages); lcp != NULL; lcp = next) {
next = TAILQ_NEXT(lcp, lcp_chain);
uvm_unmap(kernel_map, lcp->lcp_kaddr,
lcp->lcp_kaddr + PAGE_SIZE);
kmem_free(lcp, LWPCTL_LCPAGE_SZ);
}
if (lp->lp_uao != NULL) {
uvm_unmap(&p->p_vmspace->vm_map, lp->lp_uva,
lp->lp_uva + LWPCTL_UAREA_SZ);
}
mutex_destroy(&lp->lp_lock);
kmem_free(lp, sizeof(*lp));
p->p_lwpctl = NULL;
}
/*
* Return the current LWP's "preemption counter". Used to detect
* preemption across operations that can tolerate preemption without
* crashing, but which may generate incorrect results if preempted.
*
* We do arithmetic in unsigned long to avoid undefined behaviour in
* the event of arithmetic overflow on LP32, and issue __insn_barrier()
* on both sides so this can safely be used to detect changes to the
* preemption counter in loops around other memory accesses even in the
* event of whole-program optimization (e.g., gcc -flto).
*/
long
lwp_pctr(void)
{
unsigned long pctr;
__insn_barrier();
pctr = curlwp->l_ru.ru_nvcsw;
pctr += curlwp->l_ru.ru_nivcsw;
__insn_barrier();
return pctr;
}
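/*
 * Hedged usage sketch: callers snapshot the counter around a
 * preemption-sensitive computation and retry if it changed, e.g.
 *
 *	long pctr;
 *
 *	do {
 *		pctr = lwp_pctr();
 *		... compute something tied to the current CPU ...
 *	} while (pctr != lwp_pctr());
 */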
/*
* Set an LWP's private data pointer.
*/
int
lwp_setprivate(struct lwp *l, void *ptr)
{
int error = 0;
l->l_private = ptr;
#ifdef __HAVE_CPU_LWP_SETPRIVATE
error = cpu_lwp_setprivate(l, ptr);
#endif
return error;
}
/*
* Perform any thread-related cleanup on LWP exit.
* N.B. l->l_proc->p_lock must be HELD on entry but will
* be released before returning!
*/
void
lwp_thread_cleanup(struct lwp *l)
{
KASSERT(mutex_owned(l->l_proc->p_lock));
mutex_exit(l->l_proc->p_lock);
/*
* If the LWP has robust futexes, release them all
* now.
*/
if (__predict_false(l->l_robust_head != 0)) {
futex_release_all_lwp(l);
}
}
#if defined(DDB)
#include <machine/pcb.h>
void
lwp_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
lwp_t *l;
LIST_FOREACH(l, &alllwp, l_list) {
uintptr_t stack = (uintptr_t)KSTACK_LOWEST_ADDR(l);
if (addr < stack || stack + KSTACK_SIZE <= addr) {
continue;
}
(*pr)("%p is %p+%zu, LWP %p's stack\n",
(void *)addr, (void *)stack,
(size_t)(addr - stack), l);
}
}
#endif /* defined(DDB) */
/* $NetBSD: hash.h,v 1.8 2014/09/05 05:46:15 matt Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Luke Mewburn.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_HASH_H_
#define _SYS_HASH_H_
#include <sys/types.h>
#ifdef __HAVE_MACHINE_HASH_H
#include <machine/hash.h>
#endif
#ifndef __HAVE_HASH32_BUF /* not overridden by MD hash */
#define HASH32_BUF_INIT 5381
/*
* uint32_t
* hash32_buf(const void *bf, size_t len, uint32_t hash)
* return a 32 bit hash of the binary buffer buf (size len),
* seeded with an initial hash value of hash (usually HASH32_BUF_INIT).
*/
static __inline uint32_t
hash32_buf(const void *bf, size_t len, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
while (len-- != 0) /* "nemesi": k=257, r=r*257 */
hash = hash * 257 + *s++;
return (hash * 257);
}
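/*
 * Example use (illustrative only; "key" and "HASHMASK" are
 * placeholders chosen by the caller):
 *
 *	uint32_t h;
 *
 *	h = hash32_buf(&key, sizeof(key), HASH32_BUF_INIT);
 *	idx = h & HASHMASK;		with HASHMASK == 2^n - 1
 */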
#endif /* __HAVE_HASH32_BUF */
#ifndef __HAVE_HASH32_STR /* not overridden by MD hash */
#define HASH32_STR_INIT 5381
/*
* uint32_t
* hash32_str(const void *bf, uint32_t hash)
* return a 32 bit hash of NUL terminated ASCII string buf,
* seeded with an initial hash value of hash (usually HASH32_STR_INIT).
*/
static __inline uint32_t
hash32_str(const void *bf, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
uint8_t c;
while ((c = *s++) != 0)
hash = hash * 33 + c; /* "perl": k=33, r=r+r/32 */
return (hash + (hash >> 5));
}
/*
* uint32_t
* hash32_strn(const void *bf, size_t len, uint32_t hash)
* return a 32 bit hash of NUL terminated ASCII string buf up to
* a maximum of len bytes,
* seeded with an initial hash value of hash (usually HASH32_STR_INIT).
*/
static __inline uint32_t
hash32_strn(const void *bf, size_t len, uint32_t hash)
{
const uint8_t *s = (const uint8_t *)bf;
uint8_t c;
while ((c = *s++) != 0 && len-- != 0)
hash = hash * 33 + c; /* "perl": k=33, r=r+r/32 */
return (hash + (hash >> 5));
}
#endif /* __HAVE_HASH32_STR */
__BEGIN_DECLS
uint32_t murmurhash2(const void *, size_t, uint32_t);
__END_DECLS
#endif /* !_SYS_HASH_H_ */
/* $NetBSD: rfcomm_upper.c,v 1.23 2018/09/03 16:29:36 riastradh Exp $ */
/*-
* Copyright (c) 2006 Itronix Inc.
* All rights reserved.
*
* Written by Iain Hibbert for Itronix Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Itronix Inc. may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rfcomm_upper.c,v 1.23 2018/09/03 16:29:36 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/kmem.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <netbt/bluetooth.h>
#include <netbt/hci.h>
#include <netbt/l2cap.h>
#include <netbt/rfcomm.h>
/****************************************************************************
*
* RFCOMM DLC - Upper Protocol API
*
* Currently the only 'Port Emulation Entity' is the RFCOMM socket code
* but it should be possible to provide a pseudo-device for a direct
* tty interface.
*/
/*
* rfcomm_attach_pcb(handle, proto, upper)
*
* attach a new RFCOMM DLC to handle, populate with reasonable defaults
*/
int
rfcomm_attach_pcb(struct rfcomm_dlc **handle,
const struct btproto *proto, void *upper)
{
struct rfcomm_dlc *dlc;
KASSERT(handle != NULL);
KASSERT(proto != NULL);
KASSERT(upper != NULL);
dlc = kmem_intr_zalloc(sizeof(struct rfcomm_dlc), KM_NOSLEEP);
if (dlc == NULL)
return ENOMEM;
dlc->rd_state = RFCOMM_DLC_CLOSED;
dlc->rd_mtu = rfcomm_mtu_default;
dlc->rd_proto = proto;
dlc->rd_upper = upper;
dlc->rd_laddr.bt_len = sizeof(struct sockaddr_bt);
dlc->rd_laddr.bt_family = AF_BLUETOOTH;
dlc->rd_laddr.bt_psm = L2CAP_PSM_RFCOMM;
dlc->rd_raddr.bt_len = sizeof(struct sockaddr_bt);
dlc->rd_raddr.bt_family = AF_BLUETOOTH;
dlc->rd_raddr.bt_psm = L2CAP_PSM_RFCOMM;
dlc->rd_lmodem = RFCOMM_MSC_RTC | RFCOMM_MSC_RTR | RFCOMM_MSC_DV;
callout_init(&dlc->rd_timeout, 0);
callout_setfunc(&dlc->rd_timeout, rfcomm_dlc_timeout, dlc);
*handle = dlc;
return 0;
}
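/*
 * Hedged call-sequence sketch for an upper layer: "proto", "upper",
 * "laddr" and "raddr" are placeholders supplied by the Port Emulation
 * Entity, and error handling is abbreviated.
 *
 *	struct rfcomm_dlc *dlc;
 *
 *	error = rfcomm_attach_pcb(&dlc, proto, upper);
 *	if (error == 0)
 *		error = rfcomm_bind_pcb(dlc, &laddr);
 *	if (error == 0)
 *		error = rfcomm_connect_pcb(dlc, &raddr);
 *	...
 *	rfcomm_detach_pcb(&dlc);
 */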
/*
* rfcomm_bind_pcb(dlc, sockaddr)
*
* bind DLC to local address
*/
int
rfcomm_bind_pcb(struct rfcomm_dlc *dlc, struct sockaddr_bt *addr)
{
if (dlc->rd_state != RFCOMM_DLC_CLOSED)
return EINVAL;
memcpy(&dlc->rd_laddr, addr, sizeof(struct sockaddr_bt));
return 0;
}
/*
* rfcomm_sockaddr_pcb(dlc, sockaddr)
*
* return local address
*/
int
rfcomm_sockaddr_pcb(struct rfcomm_dlc *dlc, struct sockaddr_bt *addr)
{
memcpy(addr, &dlc->rd_laddr, sizeof(struct sockaddr_bt));
return 0;
}
/*
* rfcomm_connect_pcb(dlc, sockaddr)
*
* Initiate connection of RFCOMM DLC to remote address.
*/
int
rfcomm_connect_pcb(struct rfcomm_dlc *dlc, struct sockaddr_bt *dest)
{
struct rfcomm_session *rs;
int err = 0;
if (dlc->rd_state != RFCOMM_DLC_CLOSED)
return EISCONN;
memcpy(&dlc->rd_raddr, dest, sizeof(struct sockaddr_bt));
if (dlc->rd_raddr.bt_channel < RFCOMM_CHANNEL_MIN
|| dlc->rd_raddr.bt_channel > RFCOMM_CHANNEL_MAX
|| bdaddr_any(&dlc->rd_raddr.bt_bdaddr))
return EDESTADDRREQ;
if (dlc->rd_raddr.bt_psm == L2CAP_PSM_ANY)
dlc->rd_raddr.bt_psm = L2CAP_PSM_RFCOMM;
else if (dlc->rd_raddr.bt_psm != L2CAP_PSM_RFCOMM
&& (dlc->rd_raddr.bt_psm < 0x1001
|| L2CAP_PSM_INVALID(dlc->rd_raddr.bt_psm)))
return EINVAL;
/*
* We are allowed only one RFCOMM session between any 2 Bluetooth
* devices, so see if there is a session already otherwise create
* one and set it connecting.
*/
rs = rfcomm_session_lookup(&dlc->rd_laddr, &dlc->rd_raddr);
if (rs == NULL) {
rs = rfcomm_session_alloc(&rfcomm_session_active,
&dlc->rd_laddr);
if (rs == NULL)
return ENOMEM;
rs->rs_flags |= RFCOMM_SESSION_INITIATOR;
rs->rs_state = RFCOMM_SESSION_WAIT_CONNECT;
err = l2cap_connect_pcb(rs->rs_l2cap, &dlc->rd_raddr);
if (err) {
rfcomm_session_free(rs);
return err;
}
/*
* This session will start up automatically when its
* L2CAP channel is connected.
*/
}
/* construct DLC */
dlc->rd_dlci = RFCOMM_MKDLCI(IS_INITIATOR(rs) ? 0:1, dest->bt_channel);
if (rfcomm_dlc_lookup(rs, dlc->rd_dlci))
return EBUSY;
l2cap_sockaddr_pcb(rs->rs_l2cap, &dlc->rd_laddr);
/*
* attach the DLC to the session and start it off
*/
dlc->rd_session = rs;
dlc->rd_state = RFCOMM_DLC_WAIT_SESSION;
LIST_INSERT_HEAD(&rs->rs_dlcs, dlc, rd_next);
if (rs->rs_state == RFCOMM_SESSION_OPEN)
err = rfcomm_dlc_connect(dlc);
return err;
}
/*
* rfcomm_peeraddr_pcb(dlc, sockaddr)
*
* return remote address
*/
int
rfcomm_peeraddr_pcb(struct rfcomm_dlc *dlc, struct sockaddr_bt *addr)
{
memcpy(addr, &dlc->rd_raddr, sizeof(struct sockaddr_bt));
return 0;
}
/*
* rfcomm_disconnect_pcb(dlc, linger)
*
* disconnect RFCOMM DLC
*/
int
rfcomm_disconnect_pcb(struct rfcomm_dlc *dlc, int linger)
{
struct rfcomm_session *rs = dlc->rd_session;
int err = 0;
KASSERT(dlc != NULL);
switch (dlc->rd_state) {
case RFCOMM_DLC_CLOSED:
case RFCOMM_DLC_LISTEN:
return EINVAL;
case RFCOMM_DLC_WAIT_SEND_UA:
err = rfcomm_session_send_frame(rs,
RFCOMM_FRAME_DM, dlc->rd_dlci);
/* fall through */
case RFCOMM_DLC_WAIT_SESSION:
case RFCOMM_DLC_WAIT_CONNECT:
case RFCOMM_DLC_WAIT_SEND_SABM:
rfcomm_dlc_close(dlc, 0);
break;
case RFCOMM_DLC_OPEN:
if (dlc->rd_txbuf != NULL && linger != 0) {
dlc->rd_flags |= RFCOMM_DLC_SHUTDOWN;
break;
}
/* else fall through */
case RFCOMM_DLC_WAIT_RECV_UA:
dlc->rd_state = RFCOMM_DLC_WAIT_DISCONNECT;
err = rfcomm_session_send_frame(rs, RFCOMM_FRAME_DISC,
dlc->rd_dlci);
callout_schedule(&dlc->rd_timeout, rfcomm_ack_timeout * hz);
break;
case RFCOMM_DLC_WAIT_DISCONNECT:
err = EALREADY;
break;
default:
UNKNOWN(dlc->rd_state);
break;
}
return err;
}
/*
* rfcomm_detach_pcb(handle)
*
* detach RFCOMM DLC from handle
*/
void
rfcomm_detach_pcb(struct rfcomm_dlc **handle)
{
struct rfcomm_dlc *dlc = *handle;
if (dlc->rd_state != RFCOMM_DLC_CLOSED)
rfcomm_dlc_close(dlc, 0);
if (dlc->rd_txbuf != NULL) {
m_freem(dlc->rd_txbuf);
dlc->rd_txbuf = NULL;
}
dlc->rd_upper = NULL;
*handle = NULL;
/*
* If the callout is currently invoking we can't free the DLC, so
* mark it and let the callout release it.
*/
if (callout_invoking(&dlc->rd_timeout))
dlc->rd_flags |= RFCOMM_DLC_DETACH;
else {
callout_destroy(&dlc->rd_timeout);
kmem_intr_free(dlc, sizeof(*dlc));
}
}
/*
* rfcomm_listen_pcb(dlc)
*
* This DLC is a listener. We look for an existing listening session
* with a matching address to attach to or else create a new one on
* the listeners list. If the ANY channel is given, allocate the first
* available for the session.
*/
int
rfcomm_listen_pcb(struct rfcomm_dlc *dlc)
{
struct rfcomm_session *rs;
struct rfcomm_dlc *used;
struct sockaddr_bt addr;
int err, channel;
if (dlc->rd_state != RFCOMM_DLC_CLOSED)
return EISCONN;
if (dlc->rd_laddr.bt_channel != RFCOMM_CHANNEL_ANY
&& (dlc->rd_laddr.bt_channel < RFCOMM_CHANNEL_MIN
|| dlc->rd_laddr.bt_channel > RFCOMM_CHANNEL_MAX))
return EADDRNOTAVAIL;
if (dlc->rd_laddr.bt_psm == L2CAP_PSM_ANY)
dlc->rd_laddr.bt_psm = L2CAP_PSM_RFCOMM;
else if (dlc->rd_laddr.bt_psm != L2CAP_PSM_RFCOMM
&& (dlc->rd_laddr.bt_psm < 0x1001
|| L2CAP_PSM_INVALID(dlc->rd_laddr.bt_psm)))
return EADDRNOTAVAIL;
LIST_FOREACH(rs, &rfcomm_session_listen, rs_next) {
l2cap_sockaddr_pcb(rs->rs_l2cap, &addr);
if (addr.bt_psm != dlc->rd_laddr.bt_psm)
continue;
if (bdaddr_same(&dlc->rd_laddr.bt_bdaddr, &addr.bt_bdaddr))
break;
}
if (rs == NULL) {
rs = rfcomm_session_alloc(&rfcomm_session_listen,
&dlc->rd_laddr);
if (rs == NULL)
return ENOMEM;
rs->rs_state = RFCOMM_SESSION_LISTEN;
err = l2cap_listen_pcb(rs->rs_l2cap);
if (err) {
rfcomm_session_free(rs);
return err;
}
}
if (dlc->rd_laddr.bt_channel == RFCOMM_CHANNEL_ANY) {
channel = RFCOMM_CHANNEL_MIN;
used = LIST_FIRST(&rs->rs_dlcs);
while (used != NULL) {
if (used->rd_laddr.bt_channel == channel) {
if (channel++ == RFCOMM_CHANNEL_MAX)
return EADDRNOTAVAIL;
used = LIST_FIRST(&rs->rs_dlcs);
} else {
used = LIST_NEXT(used, rd_next);
}
}
dlc->rd_laddr.bt_channel = channel;
}
dlc->rd_session = rs;
dlc->rd_state = RFCOMM_DLC_LISTEN;
LIST_INSERT_HEAD(&rs->rs_dlcs, dlc, rd_next);
return 0;
}
/*
* rfcomm_send_pcb(dlc, mbuf)
*
* Output data on DLC. This is streamed data, so we add it
* to our buffer and start the DLC, which will assemble
* packets and send them if it can.
*/
int
rfcomm_send_pcb(struct rfcomm_dlc *dlc, struct mbuf *m)
{
if (dlc->rd_txbuf != NULL) {
dlc->rd_txbuf->m_pkthdr.len += m->m_pkthdr.len;
m_cat(dlc->rd_txbuf, m);
} else {
dlc->rd_txbuf = m;
}
if (dlc->rd_state == RFCOMM_DLC_OPEN)
rfcomm_dlc_start(dlc);
return 0;
}
/*
* rfcomm_rcvd_pcb(dlc, space)
*
* Indicate space now available in receive buffer
*
* This should be used to give an initial value of the receive buffer
* size when the DLC is attached and anytime data is cleared from the
* buffer after that.
*/
int
rfcomm_rcvd_pcb(struct rfcomm_dlc *dlc, size_t space)
{
KASSERT(dlc != NULL);
dlc->rd_rxsize = space;
/*
* if we are using credit based flow control, we may
* want to send some credits..
*/
if (dlc->rd_state == RFCOMM_DLC_OPEN &&
(dlc->rd_session->rs_flags & RFCOMM_SESSION_CFC))
rfcomm_dlc_start(dlc);
return 0;
}
/*
* rfcomm_setopt(dlc, sopt)
*
* set DLC options
*/
int
rfcomm_setopt(struct rfcomm_dlc *dlc, const struct sockopt *sopt)
{
int mode, err = 0;
uint16_t mtu;
switch (sopt->sopt_name) {
case SO_RFCOMM_MTU:
err = sockopt_get(sopt, &mtu, sizeof(mtu));
if (err)
break;
if (mtu < RFCOMM_MTU_MIN || mtu > RFCOMM_MTU_MAX)
err = EINVAL;
else if (dlc->rd_state == RFCOMM_DLC_CLOSED)
dlc->rd_mtu = mtu;
else
err = EBUSY;
break;
case SO_RFCOMM_LM:
err = sockopt_getint(sopt, &mode);
if (err)
break;
mode &= (RFCOMM_LM_SECURE | RFCOMM_LM_ENCRYPT | RFCOMM_LM_AUTH);
if (mode & RFCOMM_LM_SECURE)
mode |= RFCOMM_LM_ENCRYPT;
if (mode & RFCOMM_LM_ENCRYPT)
mode |= RFCOMM_LM_AUTH;
dlc->rd_mode = mode;
if (dlc->rd_state == RFCOMM_DLC_OPEN)
err = rfcomm_dlc_setmode(dlc);
break;
default:
err = ENOPROTOOPT;
break;
}
return err;
}
/*
* rfcomm_getopt(dlc, sopt)
*
* get DLC options
*/
int
rfcomm_getopt(struct rfcomm_dlc *dlc, struct sockopt *sopt)
{
struct rfcomm_fc_info fc;
switch (sopt->sopt_name) {
case SO_RFCOMM_MTU:
return sockopt_set(sopt, &dlc->rd_mtu, sizeof(uint16_t));
case SO_RFCOMM_FC_INFO:
memset(&fc, 0, sizeof(fc));
fc.lmodem = dlc->rd_lmodem;
fc.rmodem = dlc->rd_rmodem;
fc.tx_cred = uimax(dlc->rd_txcred, 0xff);
fc.rx_cred = uimax(dlc->rd_rxcred, 0xff);
if (dlc->rd_session &&
(dlc->rd_session->rs_flags & RFCOMM_SESSION_CFC))
fc.cfc = 1;
return sockopt_set(sopt, &fc, sizeof(fc));
case SO_RFCOMM_LM:
return sockopt_setint(sopt, dlc->rd_mode);
default:
break;
}
return ENOPROTOOPT;
}
/* $NetBSD: ip6_output.c,v 1.235 2024/04/19 00:55:35 riastradh Exp $ */
/* $KAME: ip6_output.c,v 1.172 2001/03/25 09:55:56 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_output.c,v 1.235 2024/04/19 00:55:35 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>
#include <netinet/icmp6.h>
#include <netinet/in_offload.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_offload.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif
extern pfil_head_t *inet6_pfil_hook; /* XXX */
struct ip6_exthdrs {
struct mbuf *ip6e_ip6;
struct mbuf *ip6e_hbh;
struct mbuf *ip6e_dest1;
struct mbuf *ip6e_rthdr;
struct mbuf *ip6e_dest2;
};
static int ip6_pcbopt(int, u_char *, int, struct ip6_pktopts **,
kauth_cred_t, int);
static int ip6_getpcbopt(struct ip6_pktopts *, int, struct sockopt *);
static int ip6_setpktopt(int, u_char *, int, struct ip6_pktopts *, kauth_cred_t,
int, int, int);
static int ip6_setmoptions(const struct sockopt *, struct inpcb *);
static int ip6_getmoptions(struct sockopt *, struct inpcb *);
static int ip6_copyexthdr(struct mbuf **, void *, int);
static int ip6_insertfraghdr(struct mbuf *, struct mbuf *, int,
struct ip6_frag **);
static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
static int ip6_getpmtu(struct rtentry *, struct ifnet *, u_long *, int *);
static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
static int ip6_ifaddrvalid(const struct in6_addr *, const struct in6_addr *);
static int ip6_handle_rthdr(struct ip6_rthdr *, struct ip6_hdr *);
#ifdef RFC2292
static int ip6_pcbopts(struct ip6_pktopts **, struct socket *, struct sockopt *);
#endif
static int
ip6_handle_rthdr(struct ip6_rthdr *rh, struct ip6_hdr *ip6)
{
int error = 0;
switch (rh->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
/* Dropped, RFC5095. */
default: /* is it possible? */
error = EINVAL;
}
return error;
}
/*
* Send an IP packet to a host.
*/
int
ip6_if_output(struct ifnet * const ifp, struct ifnet * const origifp,
struct mbuf * const m, const struct sockaddr_in6 * const dst,
const struct rtentry *rt)
{
int error = 0;
if (rt != NULL) {
error = rt_check_reject_route(rt, ifp);
if (error != 0) {
IP6_STATINC(IP6_STAT_RTREJECT);
m_freem(m);
return error;
}
}
/* discard the packet if IPv6 operation is disabled on the interface */
if ((ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED)) {
m_freem(m);
return ENETDOWN; /* better error? */
}
if ((ifp->if_flags & IFF_LOOPBACK) != 0)
error = if_output_lock(ifp, origifp, m, sin6tocsa(dst), rt);
else
error = if_output_lock(ifp, ifp, m, sin6tocsa(dst), rt);
return error;
}
/*
* IP6 output. The packet in mbuf chain m contains a skeletal IP6
* header (with pri, len, nxt, hlim, src, dst).
*
* This function may modify ver and hlim only. The mbuf chain containing the
* packet will be freed. The mbuf opt, if present, will not be freed.
*
* Type of "mtu": rt_rmx.rmx_mtu is u_long, ifnet.ifr_mtu is int, and
* nd_ifinfo.linkmtu is u_int32_t. So we use u_long to hold largest one,
* which is rt_rmx.rmx_mtu.
*/
int
ip6_output(
struct mbuf *m0,
struct ip6_pktopts *opt,
struct route *ro,
int flags,
struct ip6_moptions *im6o,
struct inpcb *inp,
struct ifnet **ifpp /* XXX: just for statistics */
)
{
struct ip6_hdr *ip6, *mhip6;
struct ifnet *ifp = NULL, *origifp = NULL;
struct mbuf *m = m0;
int tlen, len, off;
bool tso;
struct route ip6route;
struct rtentry *rt = NULL, *rt_pmtu;
const struct sockaddr_in6 *dst;
struct sockaddr_in6 src_sa, dst_sa;
int error = 0;
struct in6_ifaddr *ia = NULL;
u_long mtu;
int alwaysfrag, dontfrag;
u_int32_t optlen = 0, plen = 0, unfragpartlen = 0;
struct ip6_exthdrs exthdrs;
struct in6_addr finaldst, src0, dst0;
u_int32_t zone;
struct route *ro_pmtu = NULL;
int hdrsplit = 0;
int needipsec = 0;
#ifdef IPSEC
struct secpolicy *sp = NULL;
#endif
struct psref psref, psref_ia;
int bound = curlwp_bind();
bool release_psref_ia = false;
#ifdef DIAGNOSTIC
if ((m->m_flags & M_PKTHDR) == 0)
panic("ip6_output: no HDR");
if ((m->m_pkthdr.csum_flags &
(M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_TSOv4)) != 0) {
panic("ip6_output: IPv4 checksum offload flags: %d",
m->m_pkthdr.csum_flags);
}
if ((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) ==
(M_CSUM_TCPv6|M_CSUM_UDPv6)) {
panic("ip6_output: conflicting checksum offload flags: %d",
m->m_pkthdr.csum_flags);
}
#endif
M_CSUM_DATA_IPv6_SET(m->m_pkthdr.csum_data, sizeof(struct ip6_hdr));
#define MAKE_EXTHDR(hp, mp) \
do { \
if (hp) { \
struct ip6_ext *eh = (struct ip6_ext *)(hp); \
error = ip6_copyexthdr((mp), (void *)(hp), \
((eh)->ip6e_len + 1) << 3); \
if (error) \
goto freehdrs; \
} \
} while (/*CONSTCOND*/ 0)
memset(&exthdrs, 0, sizeof(exthdrs));
if (opt) {
/* Hop-by-Hop options header */
MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh);
/* Destination options header (1st part) */
MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1);
/* Routing header */
MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr);
/* Destination options header (2nd part) */
MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2);
}
/*
* Calculate the total length of the extension header chain.
* Keep the length of the unfragmentable part for fragmentation.
*/
optlen = 0;
if (exthdrs.ip6e_hbh)
optlen += exthdrs.ip6e_hbh->m_len;
if (exthdrs.ip6e_dest1)
optlen += exthdrs.ip6e_dest1->m_len;
if (exthdrs.ip6e_rthdr)
optlen += exthdrs.ip6e_rthdr->m_len;
unfragpartlen = optlen + sizeof(struct ip6_hdr);
/* NOTE: we don't add AH/ESP length here. do that later. */
if (exthdrs.ip6e_dest2)
optlen += exthdrs.ip6e_dest2->m_len;
#ifdef IPSEC
if (ipsec_used) {
/* Check the security policy (SP) for the packet */
sp = ipsec6_check_policy(m, inp, flags, &needipsec, &error);
if (error != 0) {
/*
* Hack: -EINVAL is used to signal that a packet
* should be silently discarded. This is typically
* because we asked key management for an SA and
* it was delayed (e.g. kicked up to IKE).
*/
if (error == -EINVAL)
error = 0;
IP6_STATINC(IP6_STAT_IPSECDROP_OUT);
goto freehdrs;
}
}
#endif
if (needipsec &&
(m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) {
in6_undefer_cksum_tcpudp(m);
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
/*
* If we need IPsec, or there is at least one extension header,
* separate IP6 header from the payload.
*/
if ((needipsec || optlen) && !hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/* adjust mbuf packet header length */
m->m_pkthdr.len += optlen;
plen = m->m_pkthdr.len - sizeof(*ip6);
/* If this is a jumbo payload, insert a jumbo payload option. */
if (plen > IPV6_MAXPACKET) {
if (!hdrsplit) {
if ((error = ip6_splithdr(m, &exthdrs)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
m = NULL;
goto freehdrs;
}
m = exthdrs.ip6e_ip6;
hdrsplit++;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
if ((error = ip6_insert_jumboopt(&exthdrs, plen)) != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto freehdrs;
}
optlen += 8; /* XXX JUMBOOPTLEN */
ip6->ip6_plen = 0;
} else
ip6->ip6_plen = htons(plen);
/*
* Concatenate headers and fill in next header fields.
* Here we have, on "m"
* IPv6 payload
* and we insert headers accordingly. Finally, we should be getting:
* IPv6 hbh dest1 rthdr ah* [esp* dest2 payload]
*
* during the header composing process, "m" points to IPv6 header.
* "mprev" points to an extension header prior to esp.
*/
{
u_char *nexthdrp = &ip6->ip6_nxt;
struct mbuf *mprev = m;
/*
* we treat dest2 specially. this makes IPsec processing
* much easier. the goal here is to make mprev point the
* mbuf prior to dest2.
*
* result: IPv6 dest2 payload
* m and mprev will point to IPv6 header.
*/
if (exthdrs.ip6e_dest2) {
	if (!hdrsplit)
		panic("assumption failed: hdr not split");
exthdrs.ip6e_dest2->m_next = m->m_next;
m->m_next = exthdrs.ip6e_dest2;
*mtod(exthdrs.ip6e_dest2, u_char *) = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_DSTOPTS;
}
#define MAKE_CHAIN(m, mp, p, i)\
do {\
if (m) {\
if (!hdrsplit) \
panic("assumption failed: hdr not split"); \
*mtod((m), u_char *) = *(p);\
*(p) = (i);\
p = mtod((m), u_char *);\
(m)->m_next = (mp)->m_next;\
(mp)->m_next = (m);\
(mp) = (m);\
}\
} while (/*CONSTCOND*/ 0)
/*
* result: IPv6 hbh dest1 rthdr dest2 payload
* m will point to IPv6 header. mprev will point to the
* extension header prior to dest2 (rthdr in the above case).
*/
MAKE_CHAIN(exthdrs.ip6e_hbh, mprev, nexthdrp, IPPROTO_HOPOPTS);
MAKE_CHAIN(exthdrs.ip6e_dest1, mprev, nexthdrp,
IPPROTO_DSTOPTS);
MAKE_CHAIN(exthdrs.ip6e_rthdr, mprev, nexthdrp,
IPPROTO_ROUTING);
M_CSUM_DATA_IPv6_SET(m->m_pkthdr.csum_data,
sizeof(struct ip6_hdr) + optlen);
}
/* Need to save for pmtu */
finaldst = ip6->ip6_dst;
/*
* If there is a routing header, replace destination address field
* with the first hop of the routing header.
*/
if (exthdrs.ip6e_rthdr) {
struct ip6_rthdr *rh;
rh = mtod(exthdrs.ip6e_rthdr, struct ip6_rthdr *);
error = ip6_handle_rthdr(rh, ip6);
if (error != 0) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
}
/* Source address validation */
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src) &&
(flags & IPV6_UNSPECSRC) == 0) {
error = EOPNOTSUPP;
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
error = EOPNOTSUPP;
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
IP6_STATINC(IP6_STAT_LOCALOUT);
/*
* Route packet.
*/
/* initialize cached route */
if (ro == NULL) {
	memset(&ip6route, 0, sizeof(ip6route));
ro = &ip6route;
}
ro_pmtu = ro;
if (opt && opt->ip6po_rthdr)
	ro = &opt->ip6po_route;
/*
* if specified, try to fill in the traffic class field.
* do not override if a non-zero value is already set.
* we check the diffserv field and the ecn field separately.
*/
if (opt && opt->ip6po_tclass >= 0) {
int mask = 0;
if ((ip6->ip6_flow & htonl(0xfc << 20)) == 0)
mask |= 0xfc;
if ((ip6->ip6_flow & htonl(0x03 << 20)) == 0)
mask |= 0x03;
if (mask != 0)
	ip6->ip6_flow |= htonl((opt->ip6po_tclass & mask) << 20);
}
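/*
 * Illustrative sketch, not compiled: where the traffic class sits in
 * ip6_flow.  In host byte order the 32-bit word is laid out as
 * version(4) | traffic class(8) | flow label(20), so the traffic class
 * occupies bits 20-27; 0xfc << 20 selects its DSCP bits and
 * 0x03 << 20 its ECN bits, matching the masks used above.
 */
#if 0
static uint32_t
tclass_to_flow_sketch(uint8_t tclass)
{

	return htonl((uint32_t)tclass << 20);
}
#endif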
/* fill in or override the hop limit field, if necessary. */
if (opt && opt->ip6po_hlim != -1)
	ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
if (im6o != NULL)
ip6->ip6_hlim = im6o->im6o_multicast_hlim;
else
ip6->ip6_hlim = ip6_defmcasthlim;
}
#ifdef IPSEC
if (needipsec) {
error = ipsec6_process_packet(m, sp->req, flags);
/*
* Preserve KAME behaviour: ENOENT can be returned
* when an SA acquire is in progress. Don't propagate
* this to user-level; it confuses applications.
* XXX this will go away when the SADB is redone.
*/
if (error == ENOENT)
error = 0;
goto done;
}
#endif
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0);
/* We do not need a route for multicast */
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
struct in6_pktinfo *pi = NULL;
/*
* If the outgoing interface for the address is specified by
* the caller, use it.
*/
if (opt && (pi = opt->ip6po_pktinfo) != NULL) {
/* XXX boundary check is assumed to be already done. */
ifp = if_get_byindex(pi->ipi6_ifindex, &psref);
} else if (im6o != NULL) {
	ifp = if_get_byindex(im6o->im6o_multicast_if_index,
&psref);
}
}
if (ifp == NULL) {
error = in6_selectroute(&dst_sa, opt, &ro, &rt, true);
if (error != 0)
goto bad;
ifp = if_get_byindex(rt->rt_ifp->if_index, &psref);
}
if (rt == NULL) {
/*
* If in6_selectroute() does not return a route entry,
* dst may not have been updated.
*/
error = rtcache_setdst(ro, sin6tosa(&dst_sa));
if (error) {
	IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
}
/*
* then rt (for unicast) and ifp must be non-NULL valid values.
*/
if ((flags & IPV6_FORWARDING) == 0) {
/* XXX: the FORWARDING flag can be set for mrouting. */
in6_ifstat_inc(ifp, ifs6_out_request);
}
if (rt != NULL) {
ia = (struct in6_ifaddr *)(rt->rt_ifa);
rt->rt_use++;
}
/*
* The outgoing interface must be in the zone of source and
* destination addresses. We should use ia_ifp to support the
* case of sending packets to an address of our own.
*/
if (ia != NULL) {
origifp = ia->ia_ifp;
if (if_is_deactivated(origifp)) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto bad;
}
if_acquire(origifp, &psref_ia);
release_psref_ia = true;
} else
origifp = ifp;
src0 = ip6->ip6_src;
if (in6_setscope(&src0, origifp, &zone))
goto badscope;
sockaddr_in6_init(&src_sa, &ip6->ip6_src, 0, 0, 0);
if (sa6_recoverscope(&src_sa) || zone != src_sa.sin6_scope_id)
goto badscope;
dst0 = ip6->ip6_dst;
if (in6_setscope(&dst0, origifp, &zone))
goto badscope;
/* re-initialize to be sure */
sockaddr_in6_init(&dst_sa, &ip6->ip6_dst, 0, 0, 0);
if (sa6_recoverscope(&dst_sa) || zone != dst_sa.sin6_scope_id)
goto badscope;
/* scope check is done. */
/* Ensure we only send from a valid address. */
if ((ifp->if_flags & IFF_LOOPBACK) == 0 &&
(flags & IPV6_FORWARDING) == 0 &&
(error = ip6_ifaddrvalid(&src0, &dst0)) != 0)
{
char ip6buf[INET6_ADDRSTRLEN];
nd6log(LOG_ERR,
"refusing to send from invalid address %s (pid %d)\n",
IN6_PRINT(ip6buf, &src0), curproc->p_pid);
IP6_STATINC(IP6_STAT_ODROPPED);
in6_ifstat_inc(origifp, ifs6_out_discard);
if (error == 1)
/*
* Address exists, but is tentative or detached.
* We can't send from it because it's invalid,
* so we drop the packet.
*/
error = 0;
else
error = EADDRNOTAVAIL;
goto bad;
}
if (rt != NULL && (rt->rt_flags & RTF_GATEWAY) &&
!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
dst = satocsin6(rt->rt_gateway);
else
dst = satocsin6(rtcache_getdst(ro));
/*
* XXXXXX: original code follows:
*/
if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst))
m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
else {
bool ingroup;
m->m_flags = (m->m_flags & ~M_BCAST) | M_MCAST;
in6_ifstat_inc(ifp, ifs6_out_mcast);
/*
* Confirm that the outgoing interface supports multicast.
*/
if (!(ifp->if_flags & IFF_MULTICAST)) {
IP6_STATINC(IP6_STAT_NOROUTE);
in6_ifstat_inc(ifp, ifs6_out_discard);
error = ENETUNREACH;
goto bad;
}
ingroup = in6_multi_group(&ip6->ip6_dst, ifp);
if (ingroup && (im6o == NULL || im6o->im6o_multicast_loop)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
*/
KASSERT(dst != NULL);
ip6_mloopback(ifp, m, dst);
} else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IPV6_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip6_mloopback(),
* above, will be forwarded by the ip6_input() routine,
* if necessary.
*/
if (ip6_mrouter && (flags & IPV6_FORWARDING) == 0) {
	if (ip6_mforward(ip6, ifp, m) != 0) {
		m_freem(m);
goto done;
}
}
}
/*
* Multicasts with a hoplimit of zero may be looped back,
* above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip6_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip6->ip6_hlim == 0 || (ifp->if_flags & IFF_LOOPBACK) ||
    IN6_IS_ADDR_MC_INTFACELOCAL(&ip6->ip6_dst)) {
m_freem(m);
goto done;
}
}
/*
* Fill the outgoing interface to tell the upper layer
* to increment per-interface statistics.
*/
if (ifpp)
	*ifpp = ifp;
/* Determine path MTU. */
/*
* ro_pmtu represents the final destination, while
* ro might represent an immediate destination.
* Use ro_pmtu destination since MTU might differ.
*/
if (ro_pmtu != ro) {
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
/* ro_pmtu may not have a cache */
sockaddr_in6_init(&u.dst6, &finaldst, 0, 0, 0);
rt_pmtu = rtcache_lookup(ro_pmtu, &u.dst);
} else
rt_pmtu = rt;
error = ip6_getpmtu(rt_pmtu, ifp, &mtu, &alwaysfrag);
if (rt_pmtu != NULL && rt_pmtu != rt)
rtcache_unref(rt_pmtu, ro_pmtu);
KASSERT(error == 0); /* ip6_getpmtu never fails if ifp is passed */
/*
* The caller of this function may specify to use the minimum MTU
* in some cases.
* An advanced API option (IPV6_USE_MIN_MTU) can also override MTU
* setting. The logic is a bit complicated; by default, unicast
* packets will follow path MTU while multicast packets will be sent at
* the minimum MTU. If IP6PO_MINMTU_ALL is specified, all packets
* including unicast ones will be sent at the minimum MTU. Multicast
* packets will always be sent at the minimum MTU unless
* IP6PO_MINMTU_DISABLE is explicitly specified.
* See RFC 3542 for more details.
*/
if (mtu > IPV6_MMTU) {
	if ((flags & IPV6_MINMTU))
mtu = IPV6_MMTU;
else if (opt && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
mtu = IPV6_MMTU;
else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) &&
(opt == NULL ||
opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE)) {
mtu = IPV6_MMTU;
}
}
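/*
 * A minimal sketch of the minimum-MTU policy described above; not
 * compiled, and the helper name is purely illustrative.  "mcast" stands
 * for IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst).
 */
#if 0
static u_long
minmtu_policy_sketch(u_long pmtu, int flags, const struct ip6_pktopts *opt,
    bool mcast)
{

	if (pmtu <= IPV6_MMTU)
		return pmtu;
	if ((flags & IPV6_MINMTU) != 0)
		return IPV6_MMTU;
	if (opt != NULL && opt->ip6po_minmtu == IP6PO_MINMTU_ALL)
		return IPV6_MMTU;
	if (mcast && (opt == NULL ||
	    opt->ip6po_minmtu != IP6PO_MINMTU_DISABLE))
		return IPV6_MMTU;
	return pmtu;
}
#endif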
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
/*
* If the outgoing packet contains a hop-by-hop options header,
* it must be examined and processed even by the source node.
* (RFC 2460, section 4.)
*
* XXX Is this really necessary?
*/
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
u_int32_t dummy1 = 0; /* XXX unused */
u_int32_t dummy2; /* XXX unused */
int hoff = sizeof(struct ip6_hdr);
if (ip6_hopopts_input(&dummy1, &dummy2, &m, &hoff)) {
/* m was already freed at this point */
error = EINVAL;
goto done;
}
ip6 = mtod(m, struct ip6_hdr *);
}
/*
* Run through list of hooks for output packets.
*/
error = pfil_run_hooks(inet6_pfil_hook, &m, ifp, PFIL_OUT);
if (error != 0 || m == NULL) {
IP6_STATINC(IP6_STAT_PFILDROP_OUT);
goto done;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Send the packet to the outgoing interface.
* If necessary, do IPv6 fragmentation before sending.
*
* the logic here is rather complex:
* 1: normal case (dontfrag == 0, alwaysfrag == 0)
* 1-a: send as is if tlen <= path mtu
* 1-b: fragment if tlen > path mtu
*
* 2: if user asks us not to fragment (dontfrag == 1)
* 2-a: send as is if tlen <= interface mtu
* 2-b: error if tlen > interface mtu
*
* 3: if we always need to attach fragment header (alwaysfrag == 1)
* always fragment
*
* 4: if dontfrag == 1 && alwaysfrag == 1
* error, as we cannot handle this conflicting request
*/
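/*
 * A minimal sketch of the four cases enumerated above; not compiled,
 * and the helper name is illustrative only.  Returns 0 to send as-is,
 * 1 to fragment, or an errno, assuming tlen, mtu (path MTU), ifmtu,
 * dontfrag, alwaysfrag and tso are computed as in the code below.
 */
#if 0
static int
frag_decision_sketch(int tlen, u_long mtu, u_long ifmtu,
    bool dontfrag, bool alwaysfrag, bool tso)
{

	if (dontfrag && alwaysfrag)
		return EMSGSIZE;			/* case 4 */
	if (dontfrag)
		return (tso || tlen <= ifmtu) ? 0 : EMSGSIZE; /* case 2 */
	if (!alwaysfrag && (tlen <= mtu || tso))
		return 0;				/* case 1-a */
	return tso ? EINVAL : 1;			/* cases 1-b and 3 */
}
#endif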
tlen = m->m_pkthdr.len;
tso = (m->m_pkthdr.csum_flags & M_CSUM_TSOv6) != 0;
if (opt && (opt->ip6po_flags & IP6PO_DONTFRAG))
dontfrag = 1;
else
dontfrag = 0;
if (dontfrag && alwaysfrag) { /* case 4 */
/* conflicting request - can't transmit */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
goto bad;
}
if (dontfrag && (!tso && tlen > ifp->if_mtu)) { /* case 2-b */
/*
* Even if the DONTFRAG option is specified, we cannot send the
* packet when the data length is larger than the MTU of the
* outgoing interface.
* Notify the error by sending IPV6_PATHMTU ancillary data as
* well as returning an error code (the latter is not described
* in the API spec.)
*/
u_int32_t mtu32;
struct ip6ctlparam ip6cp;
mtu32 = (u_int32_t)mtu;
memset(&ip6cp, 0, sizeof(ip6cp));
ip6cp.ip6c_cmdarg = (void *)&mtu32;
pfctlinput2(PRC_MSGSIZE,
rtcache_getdst(ro_pmtu), &ip6cp);
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
goto bad;
}
/*
* transmit packet without fragmentation
*/
if (dontfrag || (!alwaysfrag && (tlen <= mtu || tso))) {
/* case 1-a and 2-a */
struct in6_ifaddr *ia6;
int sw_csum;
int s;
ip6 = mtod(m, struct ip6_hdr *);
s = pserialize_read_enter();
ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
if (ia6) {
/* Record statistics for this interface address. */
ia6->ia_ifa.ifa_data.ifad_outbytes += m->m_pkthdr.len;
}
pserialize_read_exit(s);
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
if ((sw_csum & (M_CSUM_UDPv6|M_CSUM_TCPv6)) != 0) {
	if (IN6_NEED_CHECKSUM(ifp,
sw_csum & (M_CSUM_UDPv6|M_CSUM_TCPv6))) {
in6_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
KASSERT(dst != NULL);
if (__predict_false(sw_csum & M_CSUM_TSOv6)) {
/*
* TSO6 is required by a packet, but disabled for
* the interface.
*/
error = ip6_tso_output(ifp, origifp, m, dst, rt);
} else
error = ip6_if_output(ifp, origifp, m, dst, rt);
goto done;
}
if (tso) {
IP6_STATINC(IP6_STAT_CANTFRAG); /* XXX */
error = EINVAL; /* XXX */
goto bad;
}
/*
* try to fragment the packet. case 1-b and 3
*/
if (mtu < IPV6_MMTU) {
/* path MTU cannot be less than IPV6_MMTU */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
} else if (ip6->ip6_plen == 0) {
/* jumbo payload cannot be fragmented */
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
} else {
const uint32_t id = ip6_randomid();
struct mbuf **mnext, *m_frgpart;
const int hlen = unfragpartlen;
struct ip6_frag *ip6f;
u_char nextproto;
if (mtu > IPV6_MAXPACKET)
mtu = IPV6_MAXPACKET;
/*
* Must be able to put at least 8 bytes per fragment.
*/
len = (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
if (len < 8) {
IP6_STATINC(IP6_STAT_CANTFRAG);
error = EMSGSIZE;
in6_ifstat_inc(ifp, ifs6_out_fragfail);
goto bad;
}
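/*
 * Worked example, illustrative only and not compiled: with mtu = 1500
 * and an unfragmentable part of hlen = 40 (a bare IPv6 header), the
 * per-fragment payload is (1500 - 40 - 8) & ~7 = 1452 & ~7 = 1448
 * bytes, keeping every fragment offset a multiple of 8 octets.
 */
#if 0
static int
frag_payload_sketch(u_long mtu, int hlen)
{

	return (mtu - hlen - sizeof(struct ip6_frag)) & ~7;
}
#endif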
mnext = &m->m_nextpkt;
/*
* Change the next header field of the last header in the
* unfragmentable part.
*/
if (exthdrs.ip6e_rthdr) {
nextproto = *mtod(exthdrs.ip6e_rthdr, u_char *);
*mtod(exthdrs.ip6e_rthdr, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_dest1) {
nextproto = *mtod(exthdrs.ip6e_dest1, u_char *);
*mtod(exthdrs.ip6e_dest1, u_char *) = IPPROTO_FRAGMENT;
} else if (exthdrs.ip6e_hbh) {
nextproto = *mtod(exthdrs.ip6e_hbh, u_char *);
*mtod(exthdrs.ip6e_hbh, u_char *) = IPPROTO_FRAGMENT;
} else {
nextproto = ip6->ip6_nxt;
ip6->ip6_nxt = IPPROTO_FRAGMENT;
}
if ((m->m_pkthdr.csum_flags & (M_CSUM_UDPv6|M_CSUM_TCPv6))
!= 0) {
if (IN6_NEED_CHECKSUM(ifp,
m->m_pkthdr.csum_flags &
(M_CSUM_UDPv6|M_CSUM_TCPv6))) {
in6_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_UDPv6|M_CSUM_TCPv6);
}
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto
* chain.
*/
m0 = m;
for (off = hlen; off < tlen; off += len) {
struct mbuf *mlast;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (!m) {
error = ENOBUFS;
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
m_reset_rcvif(m);
m->m_flags = m0->m_flags & M_COPYFLAGS;
*mnext = m;
mnext = &m->m_nextpkt;
m->m_data += max_linkhdr;
mhip6 = mtod(m, struct ip6_hdr *);
*mhip6 = *ip6;
m->m_len = sizeof(*mhip6);
ip6f = NULL;
error = ip6_insertfraghdr(m0, m, hlen, &ip6f);
if (error) {
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
/* Fill in the Frag6 Header */
ip6f->ip6f_offlg = htons((u_int16_t)((off - hlen) & ~7));
if (off + len >= tlen)
len = tlen - off;
else
ip6f->ip6f_offlg |= IP6F_MORE_FRAG;
ip6f->ip6f_reserved = 0;
ip6f->ip6f_ident = id;
ip6f->ip6f_nxt = nextproto;
mhip6->ip6_plen = htons((u_int16_t)(len + hlen +
sizeof(*ip6f) - sizeof(struct ip6_hdr)));
if ((m_frgpart = m_copym(m0, off, len, M_DONTWAIT)) == NULL) {
error = ENOBUFS;
IP6_STATINC(IP6_STAT_ODROPPED);
goto sendorfree;
}
for (mlast = m; mlast->m_next; mlast = mlast->m_next)
;
mlast->m_next = m_frgpart;
m->m_pkthdr.len = len + hlen + sizeof(*ip6f);
m_reset_rcvif(m);
IP6_STATINC(IP6_STAT_OFRAGMENTS);
in6_ifstat_inc(ifp, ifs6_out_fragcreat);
}
in6_ifstat_inc(ifp, ifs6_out_fragok);
}
sendorfree:
m = m0->m_nextpkt;
m0->m_nextpkt = 0;
m_freem(m0);
for (m0 = m; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = 0;
if (error == 0) {
struct in6_ifaddr *ia6;
int s;
ip6 = mtod(m, struct ip6_hdr *);
s = pserialize_read_enter();
ia6 = in6_ifawithifp(ifp, &ip6->ip6_src);
if (ia6) {
/*
* Record statistics for this interface
* address.
*/
ia6->ia_ifa.ifa_data.ifad_outbytes +=
m->m_pkthdr.len;
}
pserialize_read_exit(s);
KASSERT(dst != NULL);
error = ip6_if_output(ifp, origifp, m, dst, rt);
} else
m_freem(m);
}
if (error == 0)
	IP6_STATINC(IP6_STAT_FRAGMENTED);
done:
rtcache_unref(rt, ro);
if (ro == &ip6route)
	rtcache_free(&ip6route);
#ifdef IPSEC
if (sp != NULL)
	KEY_SP_UNREF(&sp);
#endif
if_put(ifp, &psref);
if (release_psref_ia)
	if_put(origifp, &psref_ia);
curlwp_bindx(bound);
return error;
freehdrs:
m_freem(exthdrs.ip6e_hbh);
m_freem(exthdrs.ip6e_dest1);
m_freem(exthdrs.ip6e_rthdr);
m_freem(exthdrs.ip6e_dest2);
/* FALLTHROUGH */
bad:
m_freem(m);
goto done;
badscope:
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(origifp, ifs6_out_discard);
if (error == 0)
error = EHOSTUNREACH; /* XXX */
goto bad;
}
static int
ip6_copyexthdr(struct mbuf **mp, void *hdr, int hlen)
{
struct mbuf *m;
if (hlen > MCLBYTES)
return ENOBUFS; /* XXX */
MGET(m, M_DONTWAIT, MT_DATA);
if (!m)
return ENOBUFS;
if (hlen > MLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
	m_free(m);
return ENOBUFS;
}
}
m->m_len = hlen;
if (hdr)
memcpy(mtod(m, void *), hdr, hlen);
*mp = m;
return 0;
}
/*
* Insert jumbo payload option.
*/
static int
ip6_insert_jumboopt(struct ip6_exthdrs *exthdrs, u_int32_t plen)
{
struct mbuf *mopt;
u_int8_t *optbuf;
u_int32_t v;
#define JUMBOOPTLEN 8 /* length of jumbo payload option and padding */
/*
* If there is no hop-by-hop options header, allocate new one.
* If there is one but it doesn't have enough space to store the
* jumbo payload option, allocate a cluster to store the whole options.
* Otherwise, use it to store the options.
*/
if (exthdrs->ip6e_hbh == NULL) {
MGET(mopt, M_DONTWAIT, MT_DATA);
if (mopt == 0)
return (ENOBUFS);
mopt->m_len = JUMBOOPTLEN;
optbuf = mtod(mopt, u_int8_t *);
optbuf[1] = 0; /* = ((JUMBOOPTLEN) >> 3) - 1 */
exthdrs->ip6e_hbh = mopt;
} else {
struct ip6_hbh *hbh;
mopt = exthdrs->ip6e_hbh;
if (M_TRAILINGSPACE(mopt) < JUMBOOPTLEN) {
const int oldoptlen = mopt->m_len;
struct mbuf *n;
/*
* Assumptions:
* - exthdrs->ip6e_hbh is not referenced from places
* other than exthdrs.
* - exthdrs->ip6e_hbh is not an mbuf chain.
*/
KASSERT(mopt->m_next == NULL);
/*
* Give up if the whole (new) hbh header does not fit
* even in an mbuf cluster.
*/
if (oldoptlen + JUMBOOPTLEN > MCLBYTES)
return ENOBUFS;
/*
* At this point, we must always prepare a cluster.
*/
MGET(n, M_DONTWAIT, MT_DATA);
if (n) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n)
return ENOBUFS;
n->m_len = oldoptlen + JUMBOOPTLEN;
bcopy(mtod(mopt, void *), mtod(n, void *),
oldoptlen);
optbuf = mtod(n, u_int8_t *) + oldoptlen;
m_freem(mopt);
mopt = exthdrs->ip6e_hbh = n;
} else {
optbuf = mtod(mopt, u_int8_t *) + mopt->m_len;
mopt->m_len += JUMBOOPTLEN;
}
optbuf[0] = IP6OPT_PADN;
optbuf[1] = 0;
/*
* Adjust the header length according to the pad and
* the jumbo payload option.
*/
hbh = mtod(mopt, struct ip6_hbh *);
hbh->ip6h_len += (JUMBOOPTLEN >> 3);
}
/* fill in the option. */
optbuf[2] = IP6OPT_JUMBO;
optbuf[3] = 4;
v = (u_int32_t)htonl(plen + JUMBOOPTLEN);
memcpy(&optbuf[4], &v, sizeof(u_int32_t));
/* finally, adjust the packet header length */
exthdrs->ip6e_ip6->m_pkthdr.len += JUMBOOPTLEN;
return 0;
#undef JUMBOOPTLEN
}
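/*
 * Illustrative sketch, not compiled: the 8-byte block ip6_insert_jumboopt()
 * appends when a hop-by-hop header already exists -- a 2-byte PadN option
 * for the required 4n+2 alignment, then the jumbo payload option whose
 * 32-bit value is the payload length including this block itself.
 */
#if 0
static void
jumboopt_fill_sketch(uint8_t *optbuf, uint32_t plen)
{
	uint32_t v;

	optbuf[0] = IP6OPT_PADN;	/* 2-byte pad */
	optbuf[1] = 0;
	optbuf[2] = IP6OPT_JUMBO;	/* option type */
	optbuf[3] = 4;			/* option data length */
	v = htonl(plen + 8);		/* payload length incl. this block */
	memcpy(&optbuf[4], &v, sizeof(v));
}
#endif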
/*
* Insert fragment header and copy unfragmentable header portions.
*
* *frghdrp will not be read, and it is guaranteed that either an
* error is returned or that *frghdrp will point to space allocated
* for the fragment header.
*
* On entry, m contains:
* IPv6 Header
* On exit, it contains:
* IPv6 Header -> Unfragmentable Part -> Frag6 Header
*/
static int
ip6_insertfraghdr(struct mbuf *m0, struct mbuf *m, int hlen,
struct ip6_frag **frghdrp)
{
struct mbuf *n, *mlast;
if (hlen > sizeof(struct ip6_hdr)) {
n = m_copym(m0, sizeof(struct ip6_hdr),
hlen - sizeof(struct ip6_hdr), M_DONTWAIT);
if (n == NULL)
return ENOBUFS;
m->m_next = n;
} else
n = m;
/* Search for the last mbuf of unfragmentable part. */
for (mlast = n; mlast->m_next; mlast = mlast->m_next)
;
if ((mlast->m_flags & M_EXT) == 0 &&
M_TRAILINGSPACE(mlast) >= sizeof(struct ip6_frag)) {
/* use the trailing space of the last mbuf for the fragment hdr */
*frghdrp = (struct ip6_frag *)(mtod(mlast, char *) +
mlast->m_len);
mlast->m_len += sizeof(struct ip6_frag);
} else {
/* allocate a new mbuf for the fragment header */
struct mbuf *mfrg;
MGET(mfrg, M_DONTWAIT, MT_DATA);
if (mfrg == NULL)
return ENOBUFS;
mfrg->m_len = sizeof(struct ip6_frag);
*frghdrp = mtod(mfrg, struct ip6_frag *);
mlast->m_next = mfrg;
}
return 0;
}
static int
ip6_getpmtu(struct rtentry *rt, struct ifnet *ifp, u_long *mtup,
int *alwaysfragp)
{
u_int32_t mtu = 0;
int alwaysfrag = 0;
int error = 0;
if (rt != NULL) {
if (ifp == NULL)
ifp = rt->rt_ifp;
mtu = rt->rt_rmx.rmx_mtu;
if (mtu == 0)
mtu = ifp->if_mtu;
else if (mtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph:
* if we record an ICMPv6 too-big message with
* mtu < IPV6_MMTU, transmit packets sized IPV6_MMTU
* or smaller, with a fragment header attached.
* (The fragment header is needed regardless of the
* packet size, for translators to identify packets.)
*/
alwaysfrag = 1;
mtu = IPV6_MMTU;
} else if (mtu > ifp->if_mtu) {
/*
* The MTU on the route is larger than the MTU on
* the interface! This shouldn't happen, unless the
* MTU of the interface has been changed after the
* interface was brought up. Change the MTU in the
* route to match the interface MTU (as long as the
* field isn't locked).
*/
mtu = ifp->if_mtu;
if (!(rt->rt_rmx.rmx_locks & RTV_MTU))
	rt->rt_rmx.rmx_mtu = mtu;
}
} else if (ifp) {
mtu = ifp->if_mtu;
} else
error = EHOSTUNREACH; /* XXX */
*mtup = mtu;
if (alwaysfragp)
*alwaysfragp = alwaysfrag;
return (error);
}
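/*
 * A minimal sketch of the MTU selection ip6_getpmtu() performs; not
 * compiled, helper name illustrative.  A recorded path MTU below
 * IPV6_MMTU does not shrink packets further; it only forces a fragment
 * header (alwaysfrag).
 */
#if 0
static u_long
effective_mtu_sketch(u_long route_mtu, u_long if_mtu, bool *alwaysfragp)
{

	*alwaysfragp = false;
	if (route_mtu == 0)
		return if_mtu;
	if (route_mtu < IPV6_MMTU) {
		*alwaysfragp = true;
		return IPV6_MMTU;
	}
	return (route_mtu > if_mtu) ? if_mtu : route_mtu;
}
#endif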
/*
* IP6 socket option processing.
*/
int
ip6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int optdatalen, uproto;
void *optdata;
struct inpcb *inp = sotoinpcb(so);
struct ip_moptions **mopts;
int error, optval;
int level, optname;
KASSERT(solocked(so));
KASSERT(sopt != NULL);
level = sopt->sopt_level;
optname = sopt->sopt_name;
error = optval = 0;
uproto = (int)so->so_proto->pr_protocol;
switch (level) {
case IPPROTO_IP:
switch (optname) {
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
case IP_MULTICAST_IF:
case IP_MULTICAST_LOOP:
case IP_MULTICAST_TTL:
mopts = &inp->inp_moptions;
switch (op) {
case PRCO_GETOPT:
return ip_getmoptions(*mopts, sopt);
case PRCO_SETOPT:
return ip_setmoptions(mopts, sopt);
default:
return EINVAL;
}
default:
return ENOPROTOOPT;
}
case IPPROTO_IPV6:
break;
default:
return ENOPROTOOPT;
}
switch (op) {
case PRCO_SETOPT:
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTOPTIONS:
error = ip6_pcbopts(&in6p_outputopts(inp), so, sopt);
break;
#endif
/*
* Use of some Hop-by-Hop options or some
* Destination options, might require special
* privilege. That is, normal applications
* (without special privilege) might be forbidden
* from setting certain options in outgoing packets,
* and might never see certain options in received
* packets. [RFC 2292 Section 6]
* KAME specific note:
* KAME prevents non-privileged users from sending or
* receiving ANY hbh/dst options in order to avoid
* overhead of parsing options in the kernel.
*/
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_RECVRTHDRDSTOPTS:
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6, KAUTH_REQ_NETWORK_IPV6_HOPBYHOP,
NULL, NULL, NULL);
if (error)
break;
/* FALLTHROUGH */
case IPV6_UNICAST_HOPS:
case IPV6_HOPLIMIT:
case IPV6_FAITH:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_RECVTCLASS:
case IPV6_V6ONLY:
case IPV6_BINDANY:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optname) {
case IPV6_UNICAST_HOPS:
if (optval < -1 || optval >= 256)
error = EINVAL;
else {
/* -1 = kernel default */
in6p_hops6(inp) = optval;
}
break;
#define OPTSET(bit) \
do { \
if (optval) \
inp->inp_flags |= (bit); \
else \
inp->inp_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#ifdef RFC2292
#define OPTSET2292(bit) \
do { \
inp->inp_flags |= IN6P_RFC2292; \
if (optval) \
inp->inp_flags |= (bit); \
else \
inp->inp_flags &= ~(bit); \
} while (/*CONSTCOND*/ 0)
#endif
#define OPTBIT(bit) (inp->inp_flags & (bit) ? 1 : 0)
case IPV6_RECVPKTINFO:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_PKTINFO);
break;
case IPV6_HOPLIMIT:
{
struct ip6_pktopts **optp;
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(IPV6_HOPLIMIT,
(u_char *)&optval,
sizeof(optval),
optp,
kauth_cred_get(), uproto);
break;
}
case IPV6_RECVHOPLIMIT:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_HOPLIMIT);
break;
case IPV6_RECVHOPOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDRDSTOPTS:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_RTHDRDSTOPTS);
break;
case IPV6_RECVRTHDR:
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_RTHDR);
break;
case IPV6_FAITH:
OPTSET(IN6P_FAITH);
break;
case IPV6_RECVPATHMTU:
/*
* We ignore this option for TCP
* sockets.
* (RFC3542 leaves this case
* unspecified.)
*/
if (uproto != IPPROTO_TCP)
	OPTSET(IN6P_MTU);
break;
case IPV6_V6ONLY:
/*
* make setsockopt(IPV6_V6ONLY)
* available only prior to bind(2).
* see ipng mailing list, Jun 22 2001.
*/
if (inp->inp_lport ||
    !IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
error = EINVAL;
break;
}
#ifdef INET6_BINDV6ONLY
if (!optval)
error = EINVAL;
#else
OPTSET(IN6P_IPV6_V6ONLY);
#endif
break;
case IPV6_RECVTCLASS:
#ifdef RFC2292
/* cannot mix with RFC2292 XXX */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
OPTSET(IN6P_TCLASS);
break;
case IPV6_BINDANY:
error = kauth_authorize_network(
kauth_cred_get(), KAUTH_NETWORK_BIND,
KAUTH_REQ_NETWORK_BIND_ANYADDR, so, NULL,
NULL);
if (error)
break;
OPTSET(IN6P_BINDANY);
break;
}
break;
case IPV6_OTCLASS:
{
struct ip6_pktopts **optp;
u_int8_t tclass;
error = sockopt_get(sopt, &tclass, sizeof(tclass));
if (error)
break;
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname,
(u_char *)&tclass,
sizeof(tclass),
optp,
kauth_cred_get(), uproto);
break;
}
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
case IPV6_PREFER_TEMPADDR:
error = sockopt_getint(sopt, &optval);
if (error)
break;
{
struct ip6_pktopts **optp;
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname,
(u_char *)&optval,
sizeof(optval),
optp,
kauth_cred_get(), uproto);
break;
}
#ifdef RFC2292
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292HOPOPTS:
case IPV6_2292DSTOPTS:
case IPV6_2292RTHDR:
/* RFC 2292 */
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optname) {
case IPV6_2292PKTINFO:
OPTSET2292(IN6P_PKTINFO);
break;
case IPV6_2292HOPLIMIT:
OPTSET2292(IN6P_HOPLIMIT);
break;
case IPV6_2292HOPOPTS:
/*
* Check super-user privilege.
* See comments for IPV6_RECVHOPOPTS.
*/
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL,
NULL, NULL);
if (error)
return (error);
OPTSET2292(IN6P_HOPOPTS);
break;
case IPV6_2292DSTOPTS:
error = kauth_authorize_network(
kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL,
NULL, NULL);
if (error)
return (error);
OPTSET2292(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS); /* XXX */
break;
case IPV6_2292RTHDR:
OPTSET2292(IN6P_RTHDR);
break;
}
break;
#endif
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_NEXTHOP: {
/* new advanced API (RFC3542) */
void *optbuf;
int optbuflen;
struct ip6_pktopts **optp;
#ifdef RFC2292
/* cannot mix with RFC2292 */
if (OPTBIT(IN6P_RFC2292)) {
error = EINVAL;
break;
}
#endif
optbuflen = sopt->sopt_size;
optbuf = malloc(optbuflen, M_IP6OPT, M_NOWAIT);
if (optbuf == NULL) {
error = ENOBUFS;
break;
}
error = sockopt_get(sopt, optbuf, optbuflen);
if (error) {
free(optbuf, M_IP6OPT);
break;
}
optp = &in6p_outputopts(inp);
error = ip6_pcbopt(optname, optbuf, optbuflen,
optp, kauth_cred_get(), uproto);
free(optbuf, M_IP6OPT);
break;
}
#undef OPTSET
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
error = ip6_setmoptions(sopt, inp);
break;
case IPV6_PORTRANGE:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optval) {
case IPV6_PORTRANGE_DEFAULT:
inp->inp_flags &= ~(IN6P_LOWPORT);
inp->inp_flags &= ~(IN6P_HIGHPORT);
break;
case IPV6_PORTRANGE_HIGH:
inp->inp_flags &= ~(IN6P_LOWPORT);
inp->inp_flags |= IN6P_HIGHPORT;
break;
case IPV6_PORTRANGE_LOW:
inp->inp_flags &= ~(IN6P_HIGHPORT);
inp->inp_flags |= IN6P_LOWPORT;
break;
default:
error = EINVAL;
break;
}
break;
case IPV6_PORTALGO:
error = sockopt_getint(sopt, &optval);
if (error)
break;
error = portalgo_algo_index_select(inp, optval);
break;
#if defined(IPSEC)
case IPV6_IPSEC_POLICY:
if (ipsec_enabled) {
error = ipsec_set_policy(inp,
sopt->sopt_data, sopt->sopt_size,
kauth_cred_get());
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTOPTIONS:
/*
* RFC3542 (effectively) deprecated the
* semantics of the 2292-style pktoptions.
* Since it was not reliable in nature (i.e.,
* applications had to expect the lack of some
* information after all), it would make sense
* to simplify this part by always returning
* empty data.
*/
break;
#endif
case IPV6_RECVHOPOPTS:
case IPV6_RECVDSTOPTS:
case IPV6_RECVRTHDRDSTOPTS:
case IPV6_UNICAST_HOPS:
case IPV6_RECVPKTINFO:
case IPV6_RECVHOPLIMIT:
case IPV6_RECVRTHDR:
case IPV6_RECVPATHMTU:
case IPV6_FAITH:
case IPV6_V6ONLY:
case IPV6_PORTRANGE:
case IPV6_RECVTCLASS:
case IPV6_BINDANY:
switch (optname) {
case IPV6_RECVHOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_RECVDSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS);
break;
case IPV6_RECVRTHDRDSTOPTS:
optval = OPTBIT(IN6P_RTHDRDSTOPTS);
break;
case IPV6_UNICAST_HOPS:
optval = in6p_hops6(inp);
break;
case IPV6_RECVPKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_RECVHOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_RECVRTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_RECVPATHMTU:
optval = OPTBIT(IN6P_MTU);
break;
case IPV6_FAITH:
optval = OPTBIT(IN6P_FAITH);
break;
case IPV6_V6ONLY:
optval = OPTBIT(IN6P_IPV6_V6ONLY);
break;
case IPV6_PORTRANGE:
{
int flags;
flags = inp->inp_flags;
if (flags & IN6P_HIGHPORT)
optval = IPV6_PORTRANGE_HIGH;
else if (flags & IN6P_LOWPORT)
optval = IPV6_PORTRANGE_LOW;
else
optval = 0;
break;
}
case IPV6_RECVTCLASS:
optval = OPTBIT(IN6P_TCLASS);
break;
case IPV6_BINDANY:
optval = OPTBIT(IN6P_BINDANY);
break;
}
if (error)
break;
error = sockopt_setint(sopt, optval);
break;
case IPV6_PATHMTU:
{
u_long pmtu = 0;
struct ip6_mtuinfo mtuinfo;
struct route *ro = &inp->inp_route;
struct rtentry *rt;
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
if (!(so->so_state & SS_ISCONNECTED))
return (ENOTCONN);
/*
* XXX: we do not consider the case of source
* routing, or optional information to specify
* the outgoing interface.
*/
sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0);
rt = rtcache_lookup(ro, &u.dst);
error = ip6_getpmtu(rt, NULL, &pmtu, NULL);
rtcache_unref(rt, ro);
if (error)
break;
if (pmtu > IPV6_MAXPACKET)
pmtu = IPV6_MAXPACKET;
memset(&mtuinfo, 0, sizeof(mtuinfo));
mtuinfo.ip6m_mtu = (u_int32_t)pmtu;
optdata = (void *)&mtuinfo;
optdatalen = sizeof(mtuinfo);
if (optdatalen > MCLBYTES)
return (EMSGSIZE); /* XXX */
error = sockopt_set(sopt, optdata, optdatalen);
break;
}
#ifdef RFC2292
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292HOPOPTS:
case IPV6_2292RTHDR:
case IPV6_2292DSTOPTS:
switch (optname) {
case IPV6_2292PKTINFO:
optval = OPTBIT(IN6P_PKTINFO);
break;
case IPV6_2292HOPLIMIT:
optval = OPTBIT(IN6P_HOPLIMIT);
break;
case IPV6_2292HOPOPTS:
optval = OPTBIT(IN6P_HOPOPTS);
break;
case IPV6_2292RTHDR:
optval = OPTBIT(IN6P_RTHDR);
break;
case IPV6_2292DSTOPTS:
optval = OPTBIT(IN6P_DSTOPTS|IN6P_RTHDRDSTOPTS);
break;
}
error = sockopt_setint(sopt, optval);
break;
#endif
case IPV6_PKTINFO:
case IPV6_HOPOPTS:
case IPV6_RTHDR:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_NEXTHOP:
case IPV6_OTCLASS:
case IPV6_TCLASS:
case IPV6_DONTFRAG:
case IPV6_USE_MIN_MTU:
case IPV6_PREFER_TEMPADDR:
error = ip6_getpcbopt(in6p_outputopts(inp),
optname, sopt);
break;
case IPV6_MULTICAST_IF:
case IPV6_MULTICAST_HOPS:
case IPV6_MULTICAST_LOOP:
case IPV6_JOIN_GROUP:
case IPV6_LEAVE_GROUP:
error = ip6_getmoptions(sopt, inp);
break;
case IPV6_PORTALGO:
optval = inp->inp_portalgo;
error = sockopt_setint(sopt, optval);
break;
#if defined(IPSEC)
case IPV6_IPSEC_POLICY:
if (ipsec_used) {
struct mbuf *m = NULL;
/*
* XXX: this will return EINVAL as sopt is
* empty
*/
error = ipsec_get_policy(inp, sopt->sopt_data,
sopt->sopt_size, &m);
if (!error)
	error = sockopt_setmbuf(sopt, m);
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
}
return (error);
}
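/*
 * Minimal userland sketch, not compiled here: how the PRCO_SETOPT and
 * PRCO_GETOPT paths of ip6_ctloutput() are reached through
 * setsockopt(2)/getsockopt(2).  "s" is an assumed AF_INET6 socket
 * descriptor.
 */
#if 0
	int on = 1, v6only;
	socklen_t len = sizeof(v6only);

	(void)setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO, &on, sizeof(on));
	(void)getsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, &len);
#endif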
int
ip6_raw_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int error = 0, optval;
const int icmp6off = offsetof(struct icmp6_hdr, icmp6_cksum);
struct inpcb *inp = sotoinpcb(so);
int level, optname;
KASSERT(sopt != NULL);
level = sopt->sopt_level;
optname = sopt->sopt_name;
if (level != IPPROTO_IPV6) {
return ENOPROTOOPT;
}
switch (optname) {
case IPV6_CHECKSUM:
/*
* For ICMPv6 sockets, no modification allowed for checksum
* offset, permit "no change" values to help existing apps.
*
* XXX RFC3542 says: "An attempt to set IPV6_CHECKSUM
* for an ICMPv6 socket will fail." The current
* behavior does not meet RFC3542.
*/
switch (op) {
case PRCO_SETOPT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
if (optval < -1 || (optval > 0 && (optval % 2) != 0)) {
/*
* The API assumes non-negative even offset
* values or -1 as a special value.
*/
error = EINVAL;
} else if (so->so_proto->pr_protocol ==
IPPROTO_ICMPV6) {
if (optval != icmp6off)
error = EINVAL;
} else
in6p_cksum(inp) = optval;
break;
case PRCO_GETOPT:
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
optval = icmp6off;
else
optval = in6p_cksum(inp);
error = sockopt_setint(sopt, optval);
break;
default:
error = EINVAL;
break;
}
break;
default:
error = ENOPROTOOPT;
break;
}
return (error);
}
#ifdef RFC2292
/*
* Set up IP6 options in pcb for insertion in output packets or
* specifying behavior of outgoing packets.
*/
static int
ip6_pcbopts(struct ip6_pktopts **pktopt, struct socket *so,
struct sockopt *sopt)
{
struct ip6_pktopts *opt = *pktopt;
struct mbuf *m;
int error = 0;
KASSERT(solocked(so));
/* turn off any old options. */
if (opt) {
#ifdef DIAGNOSTIC
if (opt->ip6po_pktinfo || opt->ip6po_nexthop ||
    opt->ip6po_hbh || opt->ip6po_dest1 || opt->ip6po_dest2 ||
opt->ip6po_rhinfo.ip6po_rhi_rthdr)
printf("ip6_pcbopts: all specified options are cleared.\n");
#endif
ip6_clearpktopts(opt, -1);
} else {
opt = malloc(sizeof(*opt), M_IP6OPT, M_NOWAIT);
if (opt == NULL)
return (ENOBUFS);
}
*pktopt = NULL;
if (sopt == NULL || sopt->sopt_size == 0) {
/*
* Only turning off any previous options, regardless of
* whether the opt is just created or given.
*/
free(opt, M_IP6OPT);
return (0);
}
/* set options specified by user. */
m = sockopt_getmbuf(sopt);
if (m == NULL) {
free(opt, M_IP6OPT);
return (ENOBUFS);
}
error = ip6_setpktopts(m, opt, NULL, kauth_cred_get(),
so->so_proto->pr_protocol);
m_freem(m);
if (error != 0) {
ip6_clearpktopts(opt, -1); /* XXX: discard all options */
free(opt, M_IP6OPT);
return (error);
}
*pktopt = opt;
return (0);
}
#endif
/*
* initialize ip6_pktopts. beware that there are non-zero default values in
* the struct.
*/
void
ip6_initpktopts(struct ip6_pktopts *opt)
{
memset(opt, 0, sizeof(*opt));
opt->ip6po_hlim = -1; /* -1 means default hop limit */
opt->ip6po_tclass = -1; /* -1 means default traffic class */
opt->ip6po_minmtu = IP6PO_MINMTU_MCASTONLY;
opt->ip6po_prefer_tempaddr = IP6PO_TEMPADDR_SYSTEM;
}
#define sin6tosa(sin6) ((struct sockaddr *)(sin6)) /* XXX */
static int
ip6_pcbopt(int optname, u_char *buf, int len, struct ip6_pktopts **pktopt,
kauth_cred_t cred, int uproto)
{
struct ip6_pktopts *opt;
if (*pktopt == NULL) {
*pktopt = malloc(sizeof(struct ip6_pktopts), M_IP6OPT,
M_NOWAIT);
if (*pktopt == NULL)
return (ENOBUFS);
ip6_initpktopts(*pktopt);
}
opt = *pktopt;
return (ip6_setpktopt(optname, buf, len, opt, cred, 1, 0, uproto));
}
static int
ip6_getpcbopt(struct ip6_pktopts *pktopt, int optname, struct sockopt *sopt)
{
void *optdata = NULL;
int optdatalen = 0;
struct ip6_ext *ip6e;
int error = 0;
struct in6_pktinfo null_pktinfo;
int deftclass = 0, on;
int defminmtu = IP6PO_MINMTU_MCASTONLY;
int defpreftemp = IP6PO_TEMPADDR_SYSTEM;
switch (optname) {
case IPV6_PKTINFO:
if (pktopt && pktopt->ip6po_pktinfo)
	optdata = (void *)pktopt->ip6po_pktinfo;
else {
/* XXX: we don't have to do this every time... */
memset(&null_pktinfo, 0, sizeof(null_pktinfo));
optdata = (void *)&null_pktinfo;
}
optdatalen = sizeof(struct in6_pktinfo);
break;
case IPV6_OTCLASS:
/* XXX */
return (EINVAL);
case IPV6_TCLASS:
if (pktopt && pktopt->ip6po_tclass >= 0)
optdata = (void *)&pktopt->ip6po_tclass;
else
optdata = (void *)&deftclass;
optdatalen = sizeof(int);
break;
case IPV6_HOPOPTS:
if (pktopt && pktopt->ip6po_hbh) {
	optdata = (void *)pktopt->ip6po_hbh;
ip6e = (struct ip6_ext *)pktopt->ip6po_hbh;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDR:
if (pktopt && pktopt->ip6po_rthdr) {
	optdata = (void *)pktopt->ip6po_rthdr;
ip6e = (struct ip6_ext *)pktopt->ip6po_rthdr;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_RTHDRDSTOPTS:
if (pktopt && pktopt->ip6po_dest1) {
	optdata = (void *)pktopt->ip6po_dest1;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest1;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_DSTOPTS:
if (pktopt && pktopt->ip6po_dest2) {
	optdata = (void *)pktopt->ip6po_dest2;
ip6e = (struct ip6_ext *)pktopt->ip6po_dest2;
optdatalen = (ip6e->ip6e_len + 1) << 3;
}
break;
case IPV6_NEXTHOP:
if (pktopt && pktopt->ip6po_nexthop) {
	optdata = (void *)pktopt->ip6po_nexthop;
optdatalen = pktopt->ip6po_nexthop->sa_len;
}
break;
case IPV6_USE_MIN_MTU:
if (pktopt)
optdata = (void *)&pktopt->ip6po_minmtu;
else
optdata = (void *)&defminmtu;
optdatalen = sizeof(int);
break;
case IPV6_DONTFRAG:
if (pktopt && ((pktopt->ip6po_flags) & IP6PO_DONTFRAG))
on = 1;
else
on = 0;
optdata = (void *)&on;
optdatalen = sizeof(on);
break;
case IPV6_PREFER_TEMPADDR:
if (pktopt)
optdata = (void *)&pktopt->ip6po_prefer_tempaddr;
else
optdata = (void *)&defpreftemp;
optdatalen = sizeof(int);
break;
default: /* should not happen */
#ifdef DIAGNOSTIC
panic("ip6_getpcbopt: unexpected option\n");
#endif
return (ENOPROTOOPT);
}
error = sockopt_set(sopt, optdata, optdatalen);
return (error);
}
void
ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
{
if (optname == -1 || optname == IPV6_PKTINFO) {
	if (pktopt->ip6po_pktinfo)
		free(pktopt->ip6po_pktinfo, M_IP6OPT);
pktopt->ip6po_pktinfo = NULL;
}
if (optname == -1 || optname == IPV6_HOPLIMIT)
	pktopt->ip6po_hlim = -1;
if (optname == -1 || optname == IPV6_TCLASS)
pktopt->ip6po_tclass = -1;
if (optname == -1 || optname == IPV6_NEXTHOP) {
rtcache_free(&pktopt->ip6po_nextroute);
if (pktopt->ip6po_nexthop)
	free(pktopt->ip6po_nexthop, M_IP6OPT);
pktopt->ip6po_nexthop = NULL;
}
if (optname == -1 || optname == IPV6_HOPOPTS) {
	if (pktopt->ip6po_hbh)
		free(pktopt->ip6po_hbh, M_IP6OPT);
pktopt->ip6po_hbh = NULL;
}
if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
	if (pktopt->ip6po_dest1)
		free(pktopt->ip6po_dest1, M_IP6OPT);
pktopt->ip6po_dest1 = NULL;
}
if (optname == -1 || optname == IPV6_RTHDR) {
	if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
		free(pktopt->ip6po_rhinfo.ip6po_rhi_rthdr, M_IP6OPT);
pktopt->ip6po_rhinfo.ip6po_rhi_rthdr = NULL;
rtcache_free(&pktopt->ip6po_route);
}
if (optname == -1 || optname == IPV6_DSTOPTS) {
	if (pktopt->ip6po_dest2)
		free(pktopt->ip6po_dest2, M_IP6OPT);
pktopt->ip6po_dest2 = NULL;
}
}
#define PKTOPT_EXTHDRCPY(type) \
do { \
if (src->type) { \
int hlen = (((struct ip6_ext *)src->type)->ip6e_len + 1) << 3;\
dst->type = malloc(hlen, M_IP6OPT, canwait); \
if (dst->type == NULL) \
goto bad; \
memcpy(dst->type, src->type, hlen); \
} \
} while (/*CONSTCOND*/ 0)
static int
copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
{
dst->ip6po_hlim = src->ip6po_hlim;
dst->ip6po_tclass = src->ip6po_tclass;
dst->ip6po_flags = src->ip6po_flags;
dst->ip6po_minmtu = src->ip6po_minmtu;
dst->ip6po_prefer_tempaddr = src->ip6po_prefer_tempaddr;
if (src->ip6po_pktinfo) {
dst->ip6po_pktinfo = malloc(sizeof(*dst->ip6po_pktinfo),
M_IP6OPT, canwait);
if (dst->ip6po_pktinfo == NULL)
goto bad;
*dst->ip6po_pktinfo = *src->ip6po_pktinfo;
}
if (src->ip6po_nexthop) {
dst->ip6po_nexthop = malloc(src->ip6po_nexthop->sa_len,
M_IP6OPT, canwait);
if (dst->ip6po_nexthop == NULL)
goto bad;
memcpy(dst->ip6po_nexthop, src->ip6po_nexthop,
src->ip6po_nexthop->sa_len);
}
PKTOPT_EXTHDRCPY(ip6po_hbh);
PKTOPT_EXTHDRCPY(ip6po_dest1);
PKTOPT_EXTHDRCPY(ip6po_dest2);
PKTOPT_EXTHDRCPY(ip6po_rthdr); /* do not copy the cached route */
return (0);
bad:
if (dst->ip6po_pktinfo)
	free(dst->ip6po_pktinfo, M_IP6OPT);
if (dst->ip6po_nexthop)
	free(dst->ip6po_nexthop, M_IP6OPT);
if (dst->ip6po_hbh)
	free(dst->ip6po_hbh, M_IP6OPT);
if (dst->ip6po_dest1)
	free(dst->ip6po_dest1, M_IP6OPT);
if (dst->ip6po_dest2)
	free(dst->ip6po_dest2, M_IP6OPT);
if (dst->ip6po_rthdr)
	free(dst->ip6po_rthdr, M_IP6OPT);
return (ENOBUFS);
}
#undef PKTOPT_EXTHDRCPY
struct ip6_pktopts *
ip6_copypktopts(struct ip6_pktopts *src, int canwait)
{
int error;
struct ip6_pktopts *dst;
dst = malloc(sizeof(*dst), M_IP6OPT, canwait);
if (dst == NULL)
return (NULL);
ip6_initpktopts(dst);
if ((error = copypktopts(dst, src, canwait)) != 0) {
free(dst, M_IP6OPT);
return (NULL);
}
return (dst);
}
void
ip6_freepcbopts(struct ip6_pktopts *pktopt)
{
if (pktopt == NULL)
return;
ip6_clearpktopts(pktopt, -1);
free(pktopt, M_IP6OPT);
}
int
ip6_get_membership(const struct sockopt *sopt, struct ifnet **ifp,
struct psref *psref, void *v, size_t l)
{
struct ipv6_mreq mreq;
int error;
struct in6_addr *ia = &mreq.ipv6mr_multiaddr;
struct in_addr *ia4 = (void *)&ia->s6_addr32[3];
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error != 0)
return error;
if (IN6_IS_ADDR_UNSPECIFIED(ia)) {
/*
* We use the unspecified address to specify to accept
* all multicast addresses. Only super user is allowed
* to do this.
*/
if (kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_JOIN_MULTICAST, NULL, NULL, NULL))
return EACCES;
} else if (IN6_IS_ADDR_V4MAPPED(ia)) {
// Don't bother if we are not going to use ifp.
if (l == sizeof(*ia)) {
	memcpy(v, ia, l);
return 0;
}
} else if (!IN6_IS_ADDR_MULTICAST(ia)) {
return EINVAL;
}
/*
* If no interface was explicitly specified, choose an
* appropriate one according to the given multicast address.
*/
if (mreq.ipv6mr_interface == 0) {
struct rtentry *rt;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
struct sockaddr_in6 dst6;
} u;
struct route ro;
/*
* Look up the routing table for the
* address, and choose the outgoing interface.
* XXX: is it a good approach?
*/
memset(&ro, 0, sizeof(ro));
if (IN6_IS_ADDR_V4MAPPED(ia))
	sockaddr_in_init(&u.dst4, ia4, 0);
else
sockaddr_in6_init(&u.dst6, ia, 0, 0, 0);
error = rtcache_setdst(&ro, &u.dst);
if (error != 0)
return error;
rt = rtcache_init(&ro);
*ifp = rt != NULL ?
    if_get_byindex(rt->rt_ifp->if_index, psref) : NULL;
rtcache_unref(rt, &ro);
rtcache_free(&ro);
} else {
/*
* If the interface is specified, validate it.
*/
*ifp = if_get_byindex(mreq.ipv6mr_interface, psref);
if (*ifp == NULL)
return ENXIO; /* XXX EINVAL? */
}
if (sizeof(*ia) == l)
memcpy(v, ia, l);
else
memcpy(v, ia4, l);
return 0;
}
/*
* Set the IP6 multicast options in response to user setsockopt().
*/
static int
ip6_setmoptions(const struct sockopt *sopt, struct inpcb *inp)
{
int error = 0;
u_int loop, ifindex;
struct ipv6_mreq mreq;
struct in6_addr ia;
struct ifnet *ifp;
struct ip6_moptions *im6o = in6p_moptions(inp);
struct in6_multi_mship *imm;
KASSERT(inp_locked(inp));
if (im6o == NULL) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
im6o = malloc(sizeof(*im6o), M_IPMOPTS, M_NOWAIT);
if (im6o == NULL)
return (ENOBUFS);
in6p_moptions(inp) = im6o;
im6o->im6o_multicast_if_index = 0;
im6o->im6o_multicast_hlim = ip6_defmcasthlim;
im6o->im6o_multicast_loop = IPV6_DEFAULT_MULTICAST_LOOP;
LIST_INIT(&im6o->im6o_memberships);
}
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF: {
int s;
/*
* Select the interface for outgoing multicast packets.
*/
error = sockopt_get(sopt, &ifindex, sizeof(ifindex));
if (error != 0)
break;
s = pserialize_read_enter();
if (ifindex != 0) {
if ((ifp = if_byindex(ifindex)) == NULL) {
pserialize_read_exit(s);
error = ENXIO; /* XXX EINVAL? */
break;
}
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
} else
ifp = NULL;
im6o->im6o_multicast_if_index = if_get_index(ifp);
pserialize_read_exit(s);
break;
}
case IPV6_MULTICAST_HOPS:
{
/*
* Set the IP6 hoplimit for outgoing multicast packets.
*/
int optval;
error = sockopt_getint(sopt, &optval);
if (error != 0)
break;
if (optval < -1 || optval >= 256)
error = EINVAL;
else if (optval == -1)
im6o->im6o_multicast_hlim = ip6_defmcasthlim;
else
im6o->im6o_multicast_hlim = optval;
break;
}
case IPV6_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
error = sockopt_get(sopt, &loop, sizeof(loop));
if (error != 0)
break;
if (loop > 1) {
error = EINVAL;
break;
}
im6o->im6o_multicast_loop = loop;
break;
case IPV6_JOIN_GROUP: {
int bound;
struct psref psref;
/*
* Add a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
bound = curlwp_bind();
ifp = NULL;
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
if (error != 0) {
KASSERT(ifp == NULL);
curlwp_bindx(bound);
return error;
}
if (IN6_IS_ADDR_V4MAPPED(&ia)) {
	error = ip_setmoptions(&inp->inp_moptions, sopt);
goto put_break;
}
/*
* See if we found an interface, and confirm that it
* supports multicast
*/
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
goto put_break;
}
if (in6_setscope(&ia, ifp, NULL)) {
error = EADDRNOTAVAIL; /* XXX: should not happen */
goto put_break;
}
/*
* See if the membership already exists.
*/
LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) {
	if (imm->i6mm_maddr->in6m_ifp == ifp &&
IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
&ia))
goto put_break;
}
if (imm != NULL) {
error = EADDRINUSE;
goto put_break;
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
imm = in6_joingroup(ifp, &ia, &error, 0);
if (imm == NULL)
goto put_break;
LIST_INSERT_HEAD(&im6o->im6o_memberships, imm, i6mm_chain);
put_break:
if_put(ifp, &psref);
curlwp_bindx(bound);
break;
}
case IPV6_LEAVE_GROUP: {
/*
* Drop a multicast group membership.
* Group must be a valid IP6 multicast address.
*/
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error != 0)
break;
if (IN6_IS_ADDR_V4MAPPED(&mreq.ipv6mr_multiaddr)) {
	error = ip_setmoptions(&inp->inp_moptions, sopt);
break;
}
/*
* If an interface address was specified, get a pointer
* to its ifnet structure.
*/
if (mreq.ipv6mr_interface != 0) {
if ((ifp = if_byindex(mreq.ipv6mr_interface)) == NULL) {
error = ENXIO; /* XXX EINVAL? */
break;
}
} else
ifp = NULL;
/* Fill in the scope zone ID */
if (ifp) {
if (in6_setscope(&mreq.ipv6mr_multiaddr, ifp, NULL)) {
/* XXX: should not happen */
error = EADDRNOTAVAIL;
break;
}
} else if (mreq.ipv6mr_interface != 0) {
/*
* XXX: This case would happen when the (positive)
* index is in the valid range, but the corresponding
* interface has been detached dynamically. The above
* check probably prevents such a case from happening
* here, but we check it explicitly for safety.
*/
error = EADDRNOTAVAIL;
break;
} else { /* ipv6mr_interface == 0 */
struct sockaddr_in6 sa6_mc;
/*
* The API spec says as follows:
* If the interface index is specified as 0, the
* system may choose a multicast group membership to
* drop by matching the multicast address only.
* On the other hand, we cannot disambiguate the scope
* zone unless an interface is provided. Thus, we
* check if there's ambiguity with the default scope
* zone as the last resort.
*/
sockaddr_in6_init(&sa6_mc, &mreq.ipv6mr_multiaddr,
0, 0, 0);
error = sa6_embedscope(&sa6_mc, ip6_use_defzone);
if (error != 0)
break;
mreq.ipv6mr_multiaddr = sa6_mc.sin6_addr;
}
/*
* Find the membership in the membership list.
*/
LIST_FOREACH(imm, &im6o->im6o_memberships, i6mm_chain) {
	if ((ifp == NULL || imm->i6mm_maddr->in6m_ifp == ifp) &&
IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
&mreq.ipv6mr_multiaddr))
break;
}
if (imm == NULL) {
/* Unable to resolve interface */
error = EADDRNOTAVAIL;
break;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
/* in6m_ifp should not leave thanks to inp_lock */
break;
}
default:
error = EOPNOTSUPP;
break;
}
/*
* If all options have default values, no need to keep the mbuf.
*/
if (im6o->im6o_multicast_if_index == 0 &&
    im6o->im6o_multicast_hlim == ip6_defmcasthlim &&
    im6o->im6o_multicast_loop == IPV6_DEFAULT_MULTICAST_LOOP &&
LIST_EMPTY(&im6o->im6o_memberships)) {
free(in6p_moptions(inp), M_IPMOPTS);
in6p_moptions(inp) = NULL;
}
return (error);
}
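/*
 * Minimal userland sketch, not compiled here: joining a multicast group
 * so that the IPV6_JOIN_GROUP case of ip6_setmoptions() runs.  "s" is an
 * assumed AF_INET6 UDP socket; the group address is only an example.
 */
#if 0
	struct ipv6_mreq mreq;

	memset(&mreq, 0, sizeof(mreq));
	(void)inet_pton(AF_INET6, "ff02::1:3", &mreq.ipv6mr_multiaddr);
	mreq.ipv6mr_interface = 0;	/* let the kernel pick the interface */
	(void)setsockopt(s, IPPROTO_IPV6, IPV6_JOIN_GROUP,
	    &mreq, sizeof(mreq));
#endif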
/*
* Return the IP6 multicast options in response to user getsockopt().
*/
static int
ip6_getmoptions(struct sockopt *sopt, struct inpcb *inp)
{
u_int optval;
int error;
struct ip6_moptions *im6o = in6p_moptions(inp);
switch (sopt->sopt_name) {
case IPV6_MULTICAST_IF:
if (im6o == NULL || im6o->im6o_multicast_if_index == 0)
optval = 0;
else
optval = im6o->im6o_multicast_if_index;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IPV6_MULTICAST_HOPS:
if (im6o == NULL)
optval = ip6_defmcasthlim;
else
optval = im6o->im6o_multicast_hlim;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IPV6_MULTICAST_LOOP:
if (im6o == NULL)
optval = IPV6_DEFAULT_MULTICAST_LOOP;
else
optval = im6o->im6o_multicast_loop;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
default:
error = EOPNOTSUPP;
}
return (error);
}
/*
* Discard the IP6 multicast options.
*/
void
ip6_freemoptions(struct ip6_moptions *im6o)
{
struct in6_multi_mship *imm, *nimm;
if (im6o == NULL)
return;
/* The owner of im6o (inp) should be protected by solock */
LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships, i6mm_chain, nimm) {
	LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
free(im6o, M_IPMOPTS);
}
/*
* Set IPv6 outgoing packet options based on advanced API.
*/
int
ip6_setpktopts(struct mbuf *control, struct ip6_pktopts *opt,
struct ip6_pktopts *stickyopt, kauth_cred_t cred, int uproto)
{
struct cmsghdr *cm = 0;
if (control == NULL || opt == NULL)
return (EINVAL);
ip6_initpktopts(opt);
if (stickyopt) {
int error;
/*
* If stickyopt is provided, make a local copy of the options
* for this particular packet, then override them by ancillary
* objects.
* XXX: copypktopts() does not copy the cached route to a next
* hop (if any). This is not very good in terms of efficiency,
* but we can allow this since this option should be rarely
* used.
*/
if ((error = copypktopts(opt, stickyopt, M_NOWAIT)) != 0)
return (error);
}
/*
* XXX: Currently, we assume all the optional information is stored
* in a single mbuf.
*/
if (control->m_next)
return (EINVAL);
/* XXX if cm->cmsg_len is not aligned, control->m_len can become <0 */
for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len),
control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
int error;
if (control->m_len < CMSG_LEN(0))
return (EINVAL);
cm = mtod(control, struct cmsghdr *);
if (cm->cmsg_len < CMSG_LEN(0) || cm->cmsg_len > control->m_len)
return (EINVAL);
if (cm->cmsg_level != IPPROTO_IPV6)
continue;
error = ip6_setpktopt(cm->cmsg_type, CMSG_DATA(cm),
cm->cmsg_len - CMSG_LEN(0), opt, cred, 0, 1, uproto);
if (error)
return (error);
}
return (0);
}
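/*
 * Minimal userland sketch, not compiled here: how the ancillary data
 * walked by ip6_setpktopts() is typically built for sendmsg(2) with the
 * standard CMSG_*() macros.  "s", "dst" and the iovec are assumed to be
 * set up elsewhere.
 */
#if 0
	struct msghdr msg;
	struct cmsghdr *cm;
	struct in6_pktinfo pi;
	char cmsgbuf[CMSG_SPACE(sizeof(pi))];

	memset(&msg, 0, sizeof(msg));
	msg.msg_control = cmsgbuf;
	msg.msg_controllen = sizeof(cmsgbuf);

	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_level = IPPROTO_IPV6;
	cm->cmsg_type = IPV6_PKTINFO;
	cm->cmsg_len = CMSG_LEN(sizeof(pi));
	memset(&pi, 0, sizeof(pi));	/* unspecified source, any ifindex */
	memcpy(CMSG_DATA(cm), &pi, sizeof(pi));
	/* ... fill msg_name/msg_iov, then sendmsg(s, &msg, 0) ... */
#endif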
/*
* Set a particular packet option, as a sticky option or an ancillary data
* item. "len" can be 0 only when it's a sticky option.
* We have 4 cases of combination of "sticky" and "cmsg":
* "sticky=0, cmsg=0": impossible
* "sticky=0, cmsg=1": RFC2292 or RFC3542 ancillary data
* "sticky=1, cmsg=0": RFC3542 socket option
* "sticky=1, cmsg=1": RFC2292 socket option
*/
static int
ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
kauth_cred_t cred, int sticky, int cmsg, int uproto)
{
int minmtupolicy;
int error;
if (!sticky && !cmsg) {
#ifdef DIAGNOSTIC
printf("ip6_setpktopt: impossible case\n");
#endif
return (EINVAL);
}
/*
* IPV6_2292xxx is for backward compatibility to RFC2292, and should
* not be specified in the context of RFC3542. Conversely,
* RFC3542 types should not be specified in the context of RFC2292.
*/
if (!cmsg) {
switch (optname) {
case IPV6_2292PKTINFO:
case IPV6_2292HOPLIMIT:
case IPV6_2292NEXTHOP:
case IPV6_2292HOPOPTS:
case IPV6_2292DSTOPTS:
case IPV6_2292RTHDR:
case IPV6_2292PKTOPTIONS:
return (ENOPROTOOPT);
}
}
if (sticky && cmsg) {
switch (optname) {
case IPV6_PKTINFO:
case IPV6_HOPLIMIT:
case IPV6_NEXTHOP:
case IPV6_HOPOPTS:
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
case IPV6_RTHDR:
case IPV6_USE_MIN_MTU:
case IPV6_DONTFRAG:
case IPV6_OTCLASS:
case IPV6_TCLASS:
case IPV6_PREFER_TEMPADDR: /* XXX not an RFC3542 option */
return (ENOPROTOOPT);
}
}
switch (optname) {
#ifdef RFC2292
case IPV6_2292PKTINFO:
#endif
case IPV6_PKTINFO:
{
struct in6_pktinfo *pktinfo;
if (len != sizeof(struct in6_pktinfo))
return (EINVAL);
pktinfo = (struct in6_pktinfo *)buf;
/*
* An application can clear any sticky IPV6_PKTINFO option by
* doing a "regular" setsockopt with ipi6_addr being
* in6addr_any and ipi6_ifindex being zero.
* [RFC 3542, Section 6]
*/
if (optname == IPV6_PKTINFO && opt->ip6po_pktinfo &&
    pktinfo->ipi6_ifindex == 0 &&
    IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
	ip6_clearpktopts(opt, optname);
break;
}
if (uproto == IPPROTO_TCP && optname == IPV6_PKTINFO &&
    sticky && !IN6_IS_ADDR_UNSPECIFIED(&pktinfo->ipi6_addr)) {
return (EINVAL);
}
/* Validate the interface index if specified. */
if (pktinfo->ipi6_ifindex) {
struct ifnet *ifp;
int s = pserialize_read_enter();
ifp = if_byindex(pktinfo->ipi6_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
}
/*
* We store the address anyway, and let in6_selectsrc()
* validate the specified address. This is because ipi6_addr
* may not have enough information about its scope zone, and
* we may need additional information (such as outgoing
* interface or the scope zone of a destination address) to
* disambiguate the scope.
* XXX: the delay of the validation may confuse the
* application when it is used as a sticky option.
*/
if (opt->ip6po_pktinfo == NULL) {
opt->ip6po_pktinfo = malloc(sizeof(*pktinfo),
M_IP6OPT, M_NOWAIT);
if (opt->ip6po_pktinfo == NULL)
return (ENOBUFS);
}
memcpy(opt->ip6po_pktinfo, pktinfo, sizeof(*pktinfo));
break;
}
#ifdef RFC2292
case IPV6_2292HOPLIMIT:
#endif
case IPV6_HOPLIMIT:
{
int *hlimp;
/*
* RFC 3542 deprecated the usage of sticky IPV6_HOPLIMIT
* to simplify the ordering among hoplimit options.
*/
if (optname == IPV6_HOPLIMIT && sticky)
return (ENOPROTOOPT);
if (len != sizeof(int))
return (EINVAL);
hlimp = (int *)buf;
if (*hlimp < -1 || *hlimp > 255)
return (EINVAL);
opt->ip6po_hlim = *hlimp;
break;
}
case IPV6_OTCLASS:
if (len != sizeof(u_int8_t))
return (EINVAL);
opt->ip6po_tclass = *(u_int8_t *)buf;
break;
case IPV6_TCLASS:
{
int tclass;
if (len != sizeof(int))
return (EINVAL);
tclass = *(int *)buf;
if (tclass < -1 || tclass > 255)
return (EINVAL);
opt->ip6po_tclass = tclass;
break;
}
#ifdef RFC2292
case IPV6_2292NEXTHOP:
#endif
case IPV6_NEXTHOP:
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
if (len == 0) { /* just remove the option */
ip6_clearpktopts(opt, IPV6_NEXTHOP);
break;
}
/* check if cmsg_len is large enough for sa_len */
if (len < sizeof(struct sockaddr) || len < *buf)
return (EINVAL);
switch (((struct sockaddr *)buf)->sa_family) {
case AF_INET6:
{
struct sockaddr_in6 *sa6 = (struct sockaddr_in6 *)buf;
if (sa6->sin6_len != sizeof(struct sockaddr_in6))
return (EINVAL);
if (IN6_IS_ADDR_UNSPECIFIED(&sa6->sin6_addr) ||
IN6_IS_ADDR_MULTICAST(&sa6->sin6_addr)) {
return (EINVAL);
}
if ((error = sa6_embedscope(sa6, ip6_use_defzone))
!= 0) {
return (error);
}
break;
}
case AF_LINK: /* may eventually be supported */
default:
return (EAFNOSUPPORT);
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_NEXTHOP);
opt->ip6po_nexthop = malloc(*buf, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_nexthop == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_nexthop, buf, *buf);
break;
#ifdef RFC2292
case IPV6_2292HOPOPTS:
#endif
case IPV6_HOPOPTS:
{
struct ip6_hbh *hbh;
int hbhlen;
/*
* XXX: We don't allow a non-privileged user to set ANY HbH
* options, since per-option restriction has too much
* overhead.
*/
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
if (len == 0) {
ip6_clearpktopts(opt, IPV6_HOPOPTS);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_hbh))
return (EINVAL);
hbh = (struct ip6_hbh *)buf;
hbhlen = (hbh->ip6h_len + 1) << 3;
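/*
 * ip6h_len is in units of 8 octets, not counting the first 8
 * octets of the header, so e.g. ip6h_len == 1 means a 16-byte
 * hop-by-hop options header.  The destination options and
 * routing headers below use the same encoding.
 */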
if (len != hbhlen)
return (EINVAL);
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, IPV6_HOPOPTS);
opt->ip6po_hbh = malloc(hbhlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_hbh == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_hbh, hbh, hbhlen);
break;
}
#ifdef RFC2292
case IPV6_2292DSTOPTS:
#endif
case IPV6_DSTOPTS:
case IPV6_RTHDRDSTOPTS:
{
struct ip6_dest *dest, **newdest = NULL;
int destlen;
/* XXX: see the comment for IPV6_HOPOPTS */
error = kauth_authorize_network(cred,
KAUTH_NETWORK_IPV6,
KAUTH_REQ_NETWORK_IPV6_HOPBYHOP, NULL, NULL, NULL);
if (error)
return (error);
if (len == 0) {
ip6_clearpktopts(opt, optname);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_dest))
return (EINVAL);
dest = (struct ip6_dest *)buf;
destlen = (dest->ip6d_len + 1) << 3;
if (len != destlen)
return (EINVAL);
/*
* Determine the position that the destination options header
* should be inserted; before or after the routing header.
*/
switch (optname) {
case IPV6_2292DSTOPTS:
/*
* The old advanced API is ambiguous on this point.
* Our approach is to determine the position according
* to the existence of a routing header.
* Note, however, that this depends on the order of the
* extension headers in the ancillary data; the 1st
* part of the destination options header must appear
* before the routing header in the ancillary data,
* too.
* RFC3542 solved the ambiguity by introducing
* separate ancillary data or option types.
*/
if (opt->ip6po_rthdr == NULL)
newdest = &opt->ip6po_dest1;
else
newdest = &opt->ip6po_dest2;
break;
case IPV6_RTHDRDSTOPTS:
newdest = &opt->ip6po_dest1;
break;
case IPV6_DSTOPTS:
newdest = &opt->ip6po_dest2;
break;
}
/* turn off the previous option, then set the new option. */
ip6_clearpktopts(opt, optname);
*newdest = malloc(destlen, M_IP6OPT, M_NOWAIT);
if (*newdest == NULL)
return (ENOBUFS);
memcpy(*newdest, dest, destlen);
break;
}
#ifdef RFC2292
case IPV6_2292RTHDR:
#endif
case IPV6_RTHDR:
{
struct ip6_rthdr *rth;
int rthlen;
if (len == 0) {
ip6_clearpktopts(opt, IPV6_RTHDR);
break; /* just remove the option */
}
/* message length validation */
if (len < sizeof(struct ip6_rthdr))
return (EINVAL);
rth = (struct ip6_rthdr *)buf;
rthlen = (rth->ip6r_len + 1) << 3;
if (len != rthlen)
return (EINVAL);
switch (rth->ip6r_type) {
case IPV6_RTHDR_TYPE_0:
/* Dropped, RFC5095. */
default:
return (EINVAL); /* not supported */
}
/* turn off the previous option */
ip6_clearpktopts(opt, IPV6_RTHDR);
opt->ip6po_rthdr = malloc(rthlen, M_IP6OPT, M_NOWAIT);
if (opt->ip6po_rthdr == NULL)
return (ENOBUFS);
memcpy(opt->ip6po_rthdr, rth, rthlen);
break;
}
case IPV6_USE_MIN_MTU:
if (len != sizeof(int))
return (EINVAL);
minmtupolicy = *(int *)buf;
if (minmtupolicy != IP6PO_MINMTU_MCASTONLY &&
minmtupolicy != IP6PO_MINMTU_DISABLE &&
minmtupolicy != IP6PO_MINMTU_ALL) {
return (EINVAL);
}
opt->ip6po_minmtu = minmtupolicy;
break;
case IPV6_DONTFRAG:
if (len != sizeof(int))
return (EINVAL);
if (uproto == IPPROTO_TCP || *(int *)buf == 0) {
/*
* we ignore this option for TCP sockets.
* (RFC3542 leaves this case unspecified.)
*/
opt->ip6po_flags &= ~IP6PO_DONTFRAG;
} else
opt->ip6po_flags |= IP6PO_DONTFRAG;
break;
case IPV6_PREFER_TEMPADDR:
{
int preftemp;
if (len != sizeof(int))
return (EINVAL);
preftemp = *(int *)buf;
switch (preftemp) {
case IP6PO_TEMPADDR_SYSTEM:
case IP6PO_TEMPADDR_NOTPREFER:
case IP6PO_TEMPADDR_PREFER:
break;
default:
return (EINVAL);
}
opt->ip6po_prefer_tempaddr = preftemp;
break;
}
default:
return (ENOPROTOOPT);
} /* end of switch */
return (0);
}
/*
* Routine called from ip6_output() to loop back a copy of an IP6 multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be lo0ifp -- easier than replicating that code here.
*/
void
ip6_mloopback(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr_in6 *dst)
{
struct mbuf *copym;
struct ip6_hdr *ip6;
copym = m_copypacket(m, M_DONTWAIT);
if (copym == NULL)
return;
/*
* Make sure to deep-copy IPv6 header portion in case the data
* is in an mbuf cluster, so that we can safely override the IPv6
* header portion later.
*/
if ((copym->m_flags & M_EXT) != 0 ||
copym->m_len < sizeof(struct ip6_hdr)) {
copym = m_pullup(copym, sizeof(struct ip6_hdr));
if (copym == NULL)
return;
}
#ifdef DIAGNOSTIC
if (copym->m_len < sizeof(*ip6)) {
m_freem(copym);
return;
}
#endif
ip6 = mtod(copym, struct ip6_hdr *);
/*
* clear embedded scope identifiers if necessary.
* in6_clearscope will touch the addresses only when necessary.
*/
in6_clearscope(&ip6->ip6_src);
in6_clearscope(&ip6->ip6_dst);
(void)looutput(ifp, copym, (const struct sockaddr *)dst, NULL);
}
/*
* Chop IPv6 header off from the payload.
*/
static int
ip6_splithdr(struct mbuf *m, struct ip6_exthdrs *exthdrs)
{
struct mbuf *mh;
struct ip6_hdr *ip6;
ip6 = mtod(m, struct ip6_hdr *);
if (m->m_len > sizeof(*ip6)) {
MGETHDR(mh, M_DONTWAIT, MT_HEADER);
if (mh == NULL) {
m_freem(m);
return ENOBUFS;
}
m_move_pkthdr(mh, m);
m_align(mh, sizeof(*ip6));
m->m_len -= sizeof(*ip6);
m->m_data += sizeof(*ip6);
mh->m_next = m;
mh->m_len = sizeof(*ip6);
memcpy(mtod(mh, void *), (void *)ip6, sizeof(*ip6));
m = mh;
}
exthdrs->ip6e_ip6 = m;
return 0;
}
/*
* Compute IPv6 extension header length.
*/
int
ip6_optlen(struct inpcb *inp)
{
int len;
if (!in6p_outputopts(inp))
return 0;
len = 0;
#define elen(x) \
(((struct ip6_ext *)(x)) ? (((struct ip6_ext *)(x))->ip6e_len + 1) << 3 : 0)
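/*
 * elen() applies the standard extension header length encoding:
 * ip6e_len counts 8-octet units beyond the first 8 octets, so a
 * header with ip6e_len == 2 contributes (2 + 1) << 3 == 24 bytes,
 * while a NULL pointer contributes 0.
 */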
len += elen(in6p_outputopts(inp)->ip6po_hbh);
len += elen(in6p_outputopts(inp)->ip6po_dest1);
len += elen(in6p_outputopts(inp)->ip6po_rthdr);
len += elen(in6p_outputopts(inp)->ip6po_dest2);
return len;
#undef elen
}
/*
* Ensure sending address is valid.
* Returns 0 on success, -1 if an error should be sent back, or 1
* if the packet could be dropped without error (protocol dependent).
*/
static int
ip6_ifaddrvalid(const struct in6_addr *src, const struct in6_addr *dst)
{
struct sockaddr_in6 sin6;
int s, error;
struct ifaddr *ifa;
struct in6_ifaddr *ia6;
if (IN6_IS_ADDR_UNSPECIFIED(src))
return 0;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = *src;
s = pserialize_read_enter();
ifa = ifa_ifwithaddr(sin6tosa(&sin6));
if ((ia6 = ifatoia6(ifa)) == NULL ||
ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED))
error = -1;
else if (ia6->ia6_flags & IN6_IFF_TENTATIVE)
error = 1;
else if (ia6->ia6_flags & IN6_IFF_DETACHED &&
(sin6.sin6_addr = *dst, ifa_ifwithaddr(sin6tosa(&sin6)) == NULL))
/* Allow internal traffic to DETACHED addresses */
error = 1;
else
error = 0;
pserialize_read_exit(s);
return error;
}
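/*
 * Illustrative sketch (hypothetical caller, not part of this source):
 * how the tri-state result of ip6_ifaddrvalid() is meant to be
 * consumed, following the contract documented above the function.
 * The labels, errno value and variable names are placeholders.
 */
#if 0
	switch (ip6_ifaddrvalid(&src, &dst)) {
	case 0:
		/* Source address is usable; keep going. */
		break;
	case 1:
		/* Drop the packet silently (e.g. tentative source). */
		goto done;
	default:
		/* -1: refuse the packet and report an error. */
		error = EADDRNOTAVAIL;
		goto bad;
	}
#endif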
/* $NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006,
* 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Coyote Point Systems, Inc.
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
*/
/*
* TODO list for SYN cache stuff:
*
* Find room for a "state" field, which is needed to keep a
* compressed state for TIME_WAIT TCBs. It's been noted already
* that this is fairly important for very high-volume web and
* mail servers, which use a large number of short-lived
* connections.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_syncache.c,v 1.6 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lwp.h> /* for lwp0 */
#include <sys/cprng.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_syncache.h>
#ifdef TCP_SIGNATURE
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif /* IPSEC*/
#endif
static void syn_cache_timer(void *);
static struct syn_cache *
syn_cache_lookup(const struct sockaddr *, const struct sockaddr *,
struct syn_cache_head **);
static int syn_cache_respond(struct syn_cache *);
/* syn hash parameters */
#define TCP_SYN_HASH_SIZE 293
#define TCP_SYN_BUCKET_SIZE 35
static int tcp_syn_cache_size = TCP_SYN_HASH_SIZE;
int tcp_syn_cache_limit = TCP_SYN_HASH_SIZE*TCP_SYN_BUCKET_SIZE;
int tcp_syn_bucket_limit = 3*TCP_SYN_BUCKET_SIZE;
static struct syn_cache_head tcp_syn_cache[TCP_SYN_HASH_SIZE];
/*
* TCP compressed state engine. Currently used to hold compressed
* state for SYN_RECEIVED.
*/
u_long syn_cache_count;
static u_int32_t syn_hash1, syn_hash2;
#define SYN_HASH(sa, sp, dp) \
((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
((u_int32_t)(sp)))^syn_hash2)))
#ifndef INET6
#define SYN_HASHALL(hash, src, dst) \
do { \
hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
((const struct sockaddr_in *)(src))->sin_port, \
((const struct sockaddr_in *)(dst))->sin_port); \
} while (/*CONSTCOND*/ 0)
#else
#define SYN_HASH6(sa, sp, dp) \
((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
(((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
& 0x7fffffff)
#define SYN_HASHALL(hash, src, dst) \
do { \
switch ((src)->sa_family) { \
case AF_INET: \
hash = SYN_HASH(&((const struct sockaddr_in *)(src))->sin_addr, \
((const struct sockaddr_in *)(src))->sin_port, \
((const struct sockaddr_in *)(dst))->sin_port); \
break; \
case AF_INET6: \
hash = SYN_HASH6(&((const struct sockaddr_in6 *)(src))->sin6_addr, \
((const struct sockaddr_in6 *)(src))->sin6_port, \
((const struct sockaddr_in6 *)(dst))->sin6_port); \
break; \
default: \
hash = 0; \
} \
} while (/*CONSTCOND*/0)
#endif /* INET6 */
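/*
 * The resulting 32-bit hash is reduced modulo tcp_syn_cache_size to
 * select a bucket; the secrets syn_hash1/syn_hash2 are reseeded
 * whenever the cache drains (see syn_cache_insert()), which keeps
 * bucket placement unpredictable to remote senders.
 */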
static struct pool syn_cache_pool;
/*
* We don't estimate RTT with SYNs, so each packet starts with the default
* RTT and each timer step has a fixed timeout value.
*/
static inline void
syn_cache_timer_arm(struct syn_cache *sc)
{
TCPT_RANGESET(sc->sc_rxtcur,
TCPTV_SRTTDFLT * tcp_backoff[sc->sc_rxtshift], TCPTV_MIN,
TCPTV_REXMTMAX);
callout_reset(&sc->sc_timer,
sc->sc_rxtcur * (hz / PR_SLOWHZ), syn_cache_timer, sc);
}
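/*
 * In other words, the retransmit timeout of a cache entry grows with
 * sc_rxtshift through tcp_backoff[] (a roughly exponential series),
 * but is always clamped to the [TCPTV_MIN, TCPTV_REXMTMAX] range by
 * TCPT_RANGESET() above.
 */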
#define SYN_CACHE_TIMESTAMP(sc) (tcp_now - (sc)->sc_timebase)
static inline void
syn_cache_rm(struct syn_cache *sc)
{
TAILQ_REMOVE(&tcp_syn_cache[sc->sc_bucketidx].sch_bucket,
sc, sc_bucketq);
sc->sc_tp = NULL;
LIST_REMOVE(sc, sc_tpq);
tcp_syn_cache[sc->sc_bucketidx].sch_length--;
callout_stop(&sc->sc_timer);
syn_cache_count--;
}
static inline void
syn_cache_put(struct syn_cache *sc)
{
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
rtcache_free(&sc->sc_route);
sc->sc_flags |= SCF_DEAD;
if (!callout_invoking(&sc->sc_timer))
callout_schedule(&sc->sc_timer, 1);
}
void
syn_cache_init(void)
{
int i;
pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
"synpl", NULL, IPL_SOFTNET);
/* Initialize the hash buckets. */
for (i = 0; i < tcp_syn_cache_size; i++)
TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
}
void
syn_cache_insert(struct syn_cache *sc, struct tcpcb *tp)
{
struct syn_cache_head *scp;
struct syn_cache *sc2;
int s;
/*
* If there are no entries in the hash table, reinitialize
* the hash secrets.
*/
if (syn_cache_count == 0) {
syn_hash1 = cprng_fast32();
syn_hash2 = cprng_fast32();
}
SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
scp = &tcp_syn_cache[sc->sc_bucketidx];
/*
* Make sure that we don't overflow the per-bucket
* limit or the total cache size limit.
*/
s = splsoftnet();
if (scp->sch_length >= tcp_syn_bucket_limit) {
TCP_STATINC(TCP_STAT_SC_BUCKETOVERFLOW);
/*
* The bucket is full. Toss the oldest element in the
* bucket. This will be the first entry in the bucket.
*/
sc2 = TAILQ_FIRST(&scp->sch_bucket);
#ifdef DIAGNOSTIC
/*
* This should never happen; we should always find an
* entry in our bucket.
*/
if (sc2 == NULL)
panic("syn_cache_insert: bucketoverflow: impossible");
#endif
syn_cache_rm(sc2);
syn_cache_put(sc2); /* calls pool_put but see spl above */
} else if (syn_cache_count >= tcp_syn_cache_limit) {
struct syn_cache_head *scp2, *sce;
TCP_STATINC(TCP_STAT_SC_OVERFLOWED);
/*
* The cache is full. Toss the oldest entry in the
* first non-empty bucket we can find.
*
* XXX We would really like to toss the oldest
* entry in the cache, but we hope that this
* condition doesn't happen very often.
*/
scp2 = scp;
if (TAILQ_EMPTY(&scp2->sch_bucket)) {
sce = &tcp_syn_cache[tcp_syn_cache_size];
for (++scp2; scp2 != scp; scp2++) {
if (scp2 >= sce)
scp2 = &tcp_syn_cache[0];
if (! TAILQ_EMPTY(&scp2->sch_bucket))
break;
}
#ifdef DIAGNOSTIC
/*
* This should never happen; we should always find a
* non-empty bucket.
*/
if (scp2 == scp)
panic("syn_cache_insert: cacheoverflow: "
"impossible");
#endif
}
sc2 = TAILQ_FIRST(&scp2->sch_bucket);
syn_cache_rm(sc2);
syn_cache_put(sc2); /* calls pool_put but see spl above */
}
/*
* Initialize the entry's timer.
*/
sc->sc_rxttot = 0;
sc->sc_rxtshift = 0;
syn_cache_timer_arm(sc);
/* Link it from tcpcb entry */
LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
/* Put it into the bucket. */
TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
scp->sch_length++;
syn_cache_count++;
TCP_STATINC(TCP_STAT_SC_ADDED);
splx(s);
}
/*
* Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
* If we have retransmitted an entry the maximum number of times, expire
* that entry.
*/
static void
syn_cache_timer(void *arg)
{
struct syn_cache *sc = arg;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
callout_ack(&sc->sc_timer);
if (__predict_false(sc->sc_flags & SCF_DEAD)) {
TCP_STATINC(TCP_STAT_SC_DELAYED_FREE);
goto free;
}
if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
/* Drop it -- too many retransmissions. */
goto dropit;
}
/*
* Compute the total amount of time this entry has
* been on a queue. If this entry has been on longer
* than the keep alive timer would allow, expire it.
*/
sc->sc_rxttot += sc->sc_rxtcur;
if (sc->sc_rxttot >= MIN(tcp_keepinit, TCP_TIMER_MAXTICKS))
goto dropit;
TCP_STATINC(TCP_STAT_SC_RETRANSMITTED);
(void)syn_cache_respond(sc);
/* Advance the timer back-off. */
sc->sc_rxtshift++;
syn_cache_timer_arm(sc);
goto out;
dropit:
TCP_STATINC(TCP_STAT_SC_TIMED_OUT);
syn_cache_rm(sc);
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
rtcache_free(&sc->sc_route);
free:
callout_destroy(&sc->sc_timer);
pool_put(&syn_cache_pool, sc);
out:
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Remove the syn cache entries created by the specified tcb entry,
* since it makes no sense to keep them: once the tcb entry is gone,
* its syn cache entries can never be used.
*/
void
syn_cache_cleanup(struct tcpcb *tp)
{
struct syn_cache *sc, *nsc;
int s;
s = splsoftnet();
for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
nsc = LIST_NEXT(sc, sc_tpq);
#ifdef DIAGNOSTIC
if (sc->sc_tp != tp)
panic("invalid sc_tp in syn_cache_cleanup");
#endif
syn_cache_rm(sc);
syn_cache_put(sc); /* calls pool_put but see spl above */
}
/* just for safety */
LIST_INIT(&tp->t_sc);
splx(s);
}
/*
* Find an entry in the syn cache.
*/
static struct syn_cache *
syn_cache_lookup(const struct sockaddr *src, const struct sockaddr *dst,
struct syn_cache_head **headp)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
u_int32_t hash;
int s;
SYN_HASHALL(hash, src, dst);
scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
*headp = scp;
s = splsoftnet();
for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
sc = TAILQ_NEXT(sc, sc_bucketq)) {
if (sc->sc_hash != hash)
continue;
if (!memcmp(&sc->sc_src, src, src->sa_len) &&
!memcmp(&sc->sc_dst, dst, dst->sa_len)) {
splx(s);
return (sc);
}
}
splx(s);
return (NULL);
}
/*
* This function gets called when we receive an ACK for a socket in the
* LISTEN state. We look up the connection in the syn cache, and if it's
* there, we pull it out of the cache and turn it into a full-blown
* connection in the SYN-RECEIVED state.
*
* The return values may not be immediately obvious, and their effects
* can be subtle, so here they are:
*
* NULL SYN was not found in cache; caller should drop the
* packet and send an RST.
*
* -1 We were unable to create the new connection, and are
* aborting it. An ACK,RST is being sent to the peer
* (unless we got screwy sequence numbers; see below),
* because the 3-way handshake has been completed. Caller
* should not free the mbuf, since we may be using it. If
* we are not, we will free it.
*
* Otherwise, the return value is a pointer to the new socket
* associated with the connection.
*/
struct socket *
syn_cache_get(struct sockaddr *src, struct sockaddr *dst,
struct tcphdr *th, struct socket *so, struct mbuf *m)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
struct inpcb *inp = NULL;
struct tcpcb *tp;
int s;
struct socket *oso;
s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return NULL;
}
/*
* Verify the sequence and ack numbers. If they do not look
* right, resend the SYN,ACK, keep the cache entry and drop
* this segment.
*/
if ((th->th_ack != sc->sc_iss + 1) ||
SEQ_LEQ(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
m_freem(m);
(void)syn_cache_respond(sc);
splx(s);
return ((struct socket *)(-1));
}
/* Remove this cache entry */
syn_cache_rm(sc);
splx(s);
/*
* Ok, create the full blown connection, and set things up
* as they would have been set up if we had created the
* connection when the SYN arrived. If we can't create
* the connection, abort it.
*/
/*
* The inp still has the OLD in_pcb state; set the
* v6-related flags on the new pcb as well. This matters
* in particular when an AF_INET6 socket is bound only to
* a port and a v4 connection comes in on that port.
* We also copy the flowinfo from the original pcb
* to the new one.
*/
oso = so;
so = sonewconn(so, true);
if (so == NULL)
goto resetandabort;
inp = sotoinpcb(so);
switch (src->sa_family) {
case AF_INET:
if (inp->inp_af == AF_INET) {
in4p_laddr(inp) = ((struct sockaddr_in *)dst)->sin_addr;
inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
inp->inp_options = ip_srcroute(m);
inpcb_set_state(inp, INP_BOUND);
if (inp->inp_options == NULL) {
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
}
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
/* IPv4 packet to AF_INET6 socket */
memset(&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
bcopy(&((struct sockaddr_in *)dst)->sin_addr,
&in6p_laddr(inp).s6_addr32[3],
sizeof(((struct sockaddr_in *)dst)->sin_addr));
inp->inp_lport = ((struct sockaddr_in *)dst)->sin_port;
intotcpcb(inp)->t_family = AF_INET;
if (sotoinpcb(oso)->inp_flags & IN6P_IPV6_V6ONLY)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
else
inp->inp_flags &= ~IN6P_IPV6_V6ONLY;
inpcb_set_state(inp, INP_BOUND);
}
#endif
break;
#ifdef INET6
case AF_INET6:
if (inp->inp_af == AF_INET6) {
in6p_laddr(inp) = ((struct sockaddr_in6 *)dst)->sin6_addr;
inp->inp_lport = ((struct sockaddr_in6 *)dst)->sin6_port;
inpcb_set_state(inp, INP_BOUND);
}
break;
#endif
}
#ifdef INET6
if (inp && intotcpcb(inp)->t_family == AF_INET6 && sotoinpcb(oso)) {
struct inpcb *oinp = sotoinpcb(oso);
/* inherit socket options from the listening socket */
inp->inp_flags |= (oinp->inp_flags & IN6P_CONTROLOPTS);
if (inp->inp_flags & IN6P_CONTROLOPTS) {
m_freem(inp->inp_options);
inp->inp_options = NULL;
}
ip6_savecontrol(inp, &inp->inp_options,
mtod(m, struct ip6_hdr *), m);
}
#endif
/*
* Give the new socket our cached route reference.
*/
rtcache_copy(&inp->inp_route, &sc->sc_route);
rtcache_free(&sc->sc_route);
if (inp->inp_af == AF_INET) {
struct sockaddr_in sin;
memcpy(&sin, src, src->sa_len);
if (inpcb_connect(inp, &sin, &lwp0)) {
goto resetandabort;
}
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
struct sockaddr_in6 sin6;
memcpy(&sin6, src, src->sa_len);
if (src->sa_family == AF_INET) {
/* IPv4 packet to AF_INET6 socket */
in6_sin_2_v4mapsin6((struct sockaddr_in *)src, &sin6);
}
if (in6pcb_connect(inp, &sin6, NULL)) {
goto resetandabort;
}
}
#endif
else {
goto resetandabort;
}
tp = intotcpcb(inp);
tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
if (sc->sc_request_r_scale != 15) {
tp->requested_s_scale = sc->sc_requested_s_scale;
tp->request_r_scale = sc->sc_request_r_scale;
tp->snd_scale = sc->sc_requested_s_scale;
tp->rcv_scale = sc->sc_request_r_scale;
tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
}
if (sc->sc_flags & SCF_TIMESTAMP)
tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
tp->ts_timebase = sc->sc_timebase;
tp->t_template = tcp_template(tp);
if (tp->t_template == 0) {
tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
so = NULL;
m_freem(m);
goto abort;
}
tp->iss = sc->sc_iss;
tp->irs = sc->sc_irs;
tcp_sendseqinit(tp);
tcp_rcvseqinit(tp);
tp->t_state = TCPS_SYN_RECEIVED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepinit);
TCP_STATINC(TCP_STAT_ACCEPTS);
if ((sc->sc_flags & SCF_SACK_PERMIT) && tcp_do_sack)
tp->t_flags |= TF_WILL_SACK;
if ((sc->sc_flags & SCF_ECN_PERMIT) && tcp_do_ecn)
tp->t_flags |= TF_ECN_PERMIT;
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
#endif
/* Initialize tp->t_ourmss before we deal with the peer's! */
tp->t_ourmss = sc->sc_ourmaxseg;
tcp_mss_from_peer(tp, sc->sc_peermaxseg);
/*
* Initialize the initial congestion window. If we
* had to retransmit the SYN,ACK, we must initialize cwnd
* to 1 segment (i.e. the Loss Window).
*/
if (sc->sc_rxtshift)
tp->snd_cwnd = tp->t_peermss;
else {
int ss = tcp_init_win;
if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
ss = tcp_init_win_local;
#ifdef INET6
else if (inp->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(inp)))
ss = tcp_init_win_local;
#endif
tp->snd_cwnd = TCP_INITIAL_WINDOW(ss, tp->t_peermss);
}
tcp_rmx_rtt(tp);
tp->snd_wl1 = sc->sc_irs;
tp->rcv_up = sc->sc_irs + 1;
/*
* This is what would have happened in tcp_output() when
* the SYN,ACK was sent.
*/
tp->snd_up = tp->snd_una;
tp->snd_max = tp->snd_nxt = tp->iss+1;
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
TCP_STATINC(TCP_STAT_SC_COMPLETED);
s = splsoftnet();
syn_cache_put(sc);
splx(s);
return so;
resetandabort:
(void)tcp_respond(NULL, m, m, th, (tcp_seq)0, th->th_ack, TH_RST);
abort:
if (so != NULL) {
(void) soqremque(so, 1);
(void) soabort(so);
mutex_enter(softnet_lock);
}
s = splsoftnet();
syn_cache_put(sc);
splx(s);
TCP_STATINC(TCP_STAT_SC_ABORTED);
return ((struct socket *)(-1));
}
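/*
 * Illustrative sketch (hypothetical caller, not part of this source):
 * how the three-way return contract documented above syn_cache_get()
 * is typically consumed by the TCP input path.  The labels and
 * variable names here are placeholders.
 */
#if 0
	so = syn_cache_get(src, dst, th, so, m);
	if (so == NULL) {
		/* Not in the cache: drop the segment and send an RST. */
		goto dropwithreset;
	}
	if (so == (struct socket *)(-1)) {
		/*
		 * Entry found, but the connection could not be
		 * completed; the mbuf has already been consumed or
		 * freed, so just give up on this segment.
		 */
		return;
	}
	/* Otherwise, "so" is the new socket for the connection. */
#endif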
/*
* This function is called when we get a RST for a
* non-existent connection, so that we can see if the
* connection is in the syn cache. If it is, zap it.
*/
void
syn_cache_reset(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
int s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return;
}
if (SEQ_LT(th->th_seq, sc->sc_irs) ||
SEQ_GT(th->th_seq, sc->sc_irs+1)) {
splx(s);
return;
}
syn_cache_rm(sc);
TCP_STATINC(TCP_STAT_SC_RESET);
syn_cache_put(sc); /* calls pool_put but see spl above */
splx(s);
}
void
syn_cache_unreach(const struct sockaddr *src, const struct sockaddr *dst,
struct tcphdr *th)
{
struct syn_cache *sc;
struct syn_cache_head *scp;
int s;
s = splsoftnet();
if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
splx(s);
return;
}
/* If the sequence number != sc_iss, then it's a bogus ICMP msg */
if (ntohl(th->th_seq) != sc->sc_iss) {
splx(s);
return;
}
/*
* If we've retransmitted 3 times and this is our second error,
* we remove the entry. Otherwise, we allow it to continue on.
* This prevents us from incorrectly nuking an entry during a
* spurious network outage.
*
* See tcp_notify().
*/
if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
sc->sc_flags |= SCF_UNREACH;
splx(s);
return;
}
syn_cache_rm(sc);
TCP_STATINC(TCP_STAT_SC_UNREACH);
syn_cache_put(sc); /* calls pool_put but see spl above */
splx(s);
}
/*
* Given a LISTEN socket and an inbound SYN request, add this to the syn
* cache, and send back a segment:
* <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
* to the source.
*
* IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
* Doing so would require that we hold onto the data and deliver it
* to the application. However, if we are the target of a SYN-flood
* DoS attack, an attacker could send data which would eventually
* consume all available buffer space if it were ACKed. By not ACKing
* the data, we avoid this DoS scenario.
*/
int
syn_cache_add(struct sockaddr *src, struct sockaddr *dst, struct tcphdr *th,
unsigned int toff, struct socket *so, struct mbuf *m, u_char *optp,
int optlen, struct tcp_opt_info *oi)
{
struct tcpcb tb, *tp;
long win;
struct syn_cache *sc;
struct syn_cache_head *scp;
struct mbuf *ipopts;
int s;
tp = sototcpcb(so);
/*
* Initialize some local state.
*/
win = sbspace(&so->so_rcv);
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
#ifdef TCP_SIGNATURE
if (optp || (tp->t_flags & TF_SIGNATURE))
#else
if (optp)
#endif
{
tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
#ifdef TCP_SIGNATURE
tb.t_flags |= (tp->t_flags & TF_SIGNATURE);
#endif
tb.t_state = TCPS_LISTEN;
if (tcp_dooptions(&tb, optp, optlen, th, m, toff, oi) < 0)
return 0;
} else
tb.t_flags = 0;
switch (src->sa_family) {
case AF_INET:
/* Remember the IP options, if any. */
ipopts = ip_srcroute(m);
break;
default:
ipopts = NULL;
}
/*
* See if we already have an entry for this connection.
* If we do, resend the SYN,ACK. We do not count this
* as a retransmission (XXX though maybe we should).
*/
if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
TCP_STATINC(TCP_STAT_SC_DUPESYN);
if (ipopts) {
/*
* If we were remembering a previous source route,
* forget it and use the new one we've been given.
*/
if (sc->sc_ipopts)
(void)m_free(sc->sc_ipopts);
sc->sc_ipopts = ipopts;
}
sc->sc_timestamp = tb.ts_recent;
m_freem(m);
if (syn_cache_respond(sc) == 0) {
uint64_t *tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDACKS]++;
tcps[TCP_STAT_SNDTOTAL]++;
TCP_STAT_PUTREF();
}
return 1;
}
s = splsoftnet();
sc = pool_get(&syn_cache_pool, PR_NOWAIT);
splx(s);
if (sc == NULL) {
if (ipopts)
(void)m_free(ipopts);
return 0;
}
/*
* Fill in the cache, and put the necessary IP and TCP
* options into the reply.
*/
memset(sc, 0, sizeof(struct syn_cache));
callout_init(&sc->sc_timer, CALLOUT_MPSAFE);
memcpy(&sc->sc_src, src, src->sa_len);
memcpy(&sc->sc_dst, dst, dst->sa_len);
sc->sc_flags = 0;
sc->sc_ipopts = ipopts;
sc->sc_irs = th->th_seq;
switch (src->sa_family) {
case AF_INET:
{
struct sockaddr_in *srcin = (void *)src;
struct sockaddr_in *dstin = (void *)dst;
sc->sc_iss = tcp_new_iss1(&dstin->sin_addr,
&srcin->sin_addr, dstin->sin_port,
srcin->sin_port, sizeof(dstin->sin_addr));
break;
}
#ifdef INET6
case AF_INET6:
{
struct sockaddr_in6 *srcin6 = (void *)src;
struct sockaddr_in6 *dstin6 = (void *)dst;
sc->sc_iss = tcp_new_iss1(&dstin6->sin6_addr,
&srcin6->sin6_addr, dstin6->sin6_port,
srcin6->sin6_port, sizeof(dstin6->sin6_addr));
break;
}
#endif
}
sc->sc_peermaxseg = oi->maxseg;
sc->sc_ourmaxseg = tcp_mss_to_advertise(m->m_flags & M_PKTHDR ?
m_get_rcvif_NOMPSAFE(m) : NULL, sc->sc_src.sa.sa_family);
sc->sc_win = win;
sc->sc_timebase = tcp_now - 1; /* see tcp_newtcpcb() */
sc->sc_timestamp = tb.ts_recent;
if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
(TF_REQ_TSTMP|TF_RCVD_TSTMP))
sc->sc_flags |= SCF_TIMESTAMP;
if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
(TF_RCVD_SCALE|TF_REQ_SCALE)) {
sc->sc_requested_s_scale = tb.requested_s_scale;
sc->sc_request_r_scale = 0;
/*
* Pick the smallest possible scaling factor that
* will still allow us to scale up to sb_max.
*
* We do this because there are broken firewalls that
* will corrupt the window scale option, leading to
* the other endpoint believing that our advertised
* window is unscaled. At scale factors larger than
* 5 the unscaled window will drop below 1500 bytes,
* leading to serious problems when traversing these
* broken firewalls.
*
* With the default sbmax of 256K, a scale factor
* of 3 will be chosen by this algorithm. Those who
* choose a larger sbmax should watch out
* for the compatibility problems mentioned above.
*
* RFC1323: The Window field in a SYN (i.e., a <SYN>
* or <SYN,ACK>) segment itself is never scaled.
*/
while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
(TCP_MAXWIN << sc->sc_request_r_scale) < sb_max)
sc->sc_request_r_scale++;
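/*
 * Worked example: with TCP_MAXWIN == 65535 and the default
 * sb_max of 256 KiB (262144), 65535 << 2 == 262140 is still
 * below sb_max but 65535 << 3 is not, so the loop stops with
 * sc_request_r_scale == 3, matching the comment above.
 */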
} else {
sc->sc_requested_s_scale = 15;
sc->sc_request_r_scale = 15;
}
if ((tb.t_flags & TF_SACK_PERMIT) && tcp_do_sack)
sc->sc_flags |= SCF_SACK_PERMIT;
/*
* ECN setup packet received.
*/
if ((th->th_flags & (TH_ECE|TH_CWR)) && tcp_do_ecn)
sc->sc_flags |= SCF_ECN_PERMIT;
#ifdef TCP_SIGNATURE
if (tb.t_flags & TF_SIGNATURE)
sc->sc_flags |= SCF_SIGNATURE;
#endif
sc->sc_tp = tp;
m_freem(m);
if (syn_cache_respond(sc) == 0) {
uint64_t *tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDACKS]++;
tcps[TCP_STAT_SNDTOTAL]++;
TCP_STAT_PUTREF();
syn_cache_insert(sc, tp);
} else {
s = splsoftnet();
/*
* syn_cache_put() will try to schedule the timer, so
* we need to initialize it
*/
syn_cache_timer_arm(sc);
syn_cache_put(sc);
splx(s);
TCP_STATINC(TCP_STAT_SC_DROPPED);
}
return 1;
}
/*
* syn_cache_respond: (re)send SYN+ACK.
*
* Returns 0 on success.
*/
static int
syn_cache_respond(struct syn_cache *sc)
{
#ifdef INET6
struct rtentry *rt = NULL;
#endif
struct route *ro;
u_int8_t *optp;
int optlen, error;
u_int16_t tlen;
struct ip *ip = NULL;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
struct tcpcb *tp;
struct tcphdr *th;
struct mbuf *m;
u_int hlen;
#ifdef TCP_SIGNATURE
struct secasvar *sav = NULL;
u_int8_t *sigp = NULL;
#endif
ro = &sc->sc_route;
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
hlen = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
return EAFNOSUPPORT;
}
/* Worst case scenario, since we don't know the option size yet. */
tlen = hlen + sizeof(struct tcphdr) + MAX_TCPOPTLEN;
KASSERT(max_linkhdr + tlen <= MCLBYTES);
/*
* Create the IP+TCP header from scratch.
*/
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m && (max_linkhdr + tlen) > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL)
return ENOBUFS;
MCLAIM(m, &tcp_tx_mowner);
tp = sc->sc_tp;
/* Fixup the mbuf. */
m->m_data += max_linkhdr;
m_reset_rcvif(m);
memset(mtod(m, void *), 0, tlen);
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip = mtod(m, struct ip *);
ip->ip_v = 4;
ip->ip_dst = sc->sc_src.sin.sin_addr;
ip->ip_src = sc->sc_dst.sin.sin_addr;
ip->ip_p = IPPROTO_TCP;
th = (struct tcphdr *)(ip + 1);
th->th_dport = sc->sc_src.sin.sin_port;
th->th_sport = sc->sc_dst.sin.sin_port;
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_vfc = IPV6_VERSION;
ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
ip6->ip6_nxt = IPPROTO_TCP;
/* ip6_plen will be updated in ip6_output() */
th = (struct tcphdr *)(ip6 + 1);
th->th_dport = sc->sc_src.sin6.sin6_port;
th->th_sport = sc->sc_dst.sin6.sin6_port;
break;
#endif
default:
panic("%s: impossible (1)", __func__);
}
th->th_seq = htonl(sc->sc_iss);
th->th_ack = htonl(sc->sc_irs + 1);
th->th_flags = TH_SYN|TH_ACK;
th->th_win = htons(sc->sc_win);
/* th_x2, th_sum, th_urp already 0 from memset */
/* Tack on the TCP options. */
optp = (u_int8_t *)(th + 1);
optlen = 0;
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
*optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
*optp++ = sc->sc_ourmaxseg & 0xff;
optlen += TCPOLEN_MAXSEG;
if (sc->sc_request_r_scale != 15) {
*((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
sc->sc_request_r_scale);
optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
}
if (sc->sc_flags & SCF_SACK_PERMIT) {
/* Let the peer know that we will SACK. */
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
optlen += TCPOLEN_SACK_PERMITTED;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
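/*
 * Pad with NOPs until optlen % 4 == 2, so that the 10-byte
 * timestamp option that follows ends on a 32-bit boundary.
 */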
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
}
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
u_int32_t *lp = (u_int32_t *)(optp);
/* Form timestamp option as shown in appendix A of RFC 1323. */
*lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
*lp = htonl(sc->sc_timestamp);
optp += TCPOLEN_TIMESTAMP - 2;
optlen += TCPOLEN_TIMESTAMP;
}
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE) {
sav = tcp_signature_getsav(m);
if (sav == NULL) {
m_freem(m);
return EPERM;
}
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
sigp = optp;
memset(optp, 0, TCP_SIGLEN);
optp += TCP_SIGLEN;
optlen += TCPOLEN_SIGNATURE;
}
#endif
/*
* Terminate and pad TCP options to a 4 byte boundary.
*
* According to RFC793: "The content of the header beyond the
* End-of-Option option must be header padding (i.e., zero)."
* And later: "The padding is composed of zeros."
*/
if (optlen % 4) {
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
while (optlen % 4) {
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
/* Compute the actual values now that we've added the options. */
tlen = hlen + sizeof(struct tcphdr) + optlen;
m->m_len = m->m_pkthdr.len = tlen;
th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
#ifdef TCP_SIGNATURE
if (sav) {
(void)tcp_signature(m, th, hlen, sav, sigp);
key_sa_recordxfer(sav, m);
KEY_SA_UNREF(&sav);
}
#endif
/*
* Send ECN SYN-ACK setup packet.
* Routes can be asymmetric, so, even if we receive a packet
* with ECE and CWR set, we must not assume no one will block
* the ECE packet we are about to send.
*/
if ((sc->sc_flags & SCF_ECN_PERMIT) && tp &&
SEQ_GEQ(tp->snd_nxt, tp->snd_max)) {
th->th_flags |= TH_ECE;
TCP_STATINC(TCP_STAT_ECN_SHS);
/*
* draft-ietf-tcpm-ecnsyn-00.txt
*
* "[...] a TCP node MAY respond to an ECN-setup
* SYN packet by setting ECT in the responding
* ECN-setup SYN/ACK packet, indicating to routers
* that the SYN/ACK packet is ECN-Capable.
* This allows a congested router along the path
* to mark the packet instead of dropping the
* packet as an indication of congestion."
*
* "[...] There can be a great benefit in setting
* an ECN-capable codepoint in SYN/ACK packets [...]
* Congestion is most likely to occur in
* the server-to-client direction. As a result,
* setting an ECN-capable codepoint in SYN/ACK
* packets can reduce the occurrence of three-second
* retransmit timeouts resulting from the drop
* of SYN/ACK packets."
*
* Page 4 and 6, January 2006.
*/
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip->ip_tos |= IPTOS_ECN_ECT0;
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
break;
#endif
}
TCP_STATINC(TCP_STAT_ECN_ECT);
}
/*
* Compute the packet's checksum.
*
* Fill in some straggling IP bits. Note the stack expects
* ip_len to be in host order, for convenience.
*/
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
ip->ip_len = htons(tlen - hlen);
th->th_sum = 0;
th->th_sum = in4_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
ip->ip_len = htons(tlen);
ip->ip_ttl = ip_defttl;
/* XXX tos? */
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_plen = htons(tlen - hlen);
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
ip6->ip6_plen = htons(tlen - hlen);
/* ip6_hlim will be initialized afterwards */
/* XXX flowlabel? */
break;
#endif
}
/* XXX use IPsec policy on listening socket, on SYN ACK */
tp = sc->sc_tp;
switch (sc->sc_src.sa.sa_family) {
case AF_INET:
error = ip_output(m, sc->sc_ipopts, ro,
(ip_mtudisc ? IP_MTUDISC : 0),
NULL, tp ? tp->t_inpcb : NULL);
break;
#ifdef INET6
case AF_INET6:
ip6->ip6_hlim = in6pcb_selecthlim(NULL,
(rt = rtcache_validate(ro)) != NULL ? rt->rt_ifp : NULL);
rtcache_unref(rt, ro);
error = ip6_output(m, NULL /*XXX*/, ro, 0, NULL,
tp ? tp->t_inpcb : NULL, NULL);
break;
#endif
default:
panic("%s: impossible (2)", __func__);
}
return error;
}
/* $NetBSD: sys_process.c,v 1.180 2020/05/26 00:50:53 kamil Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
/*-
* Copyright (c) 1993 Jan-Simon Pendry.
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
/*
* References:
* (1) Bach's "The Design of the UNIX Operating System",
* (2) sys/miscfs/procfs from UCB's 4.4BSD-Lite distribution,
* (3) the "4.4BSD Programmer's Reference Manual" published
* by USENIX and O'Reilly & Associates.
* The 4.4BSD PRM does a reasonably good job of documenting what the various
* ptrace() requests should actually do, and its text is quoted several times
* in this file.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_process.c,v 1.180 2020/05/26 00:50:53 kamil Exp $");
#ifdef _KERNEL_OPT
#include "opt_ptrace.h"
#include "opt_ktrace.h"
#include "opt_pax.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/exec.h>
#include <sys/pax.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/ras.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
#if defined(KTRACE) || defined(PTRACE_HOOKS)
int
process_domem(struct lwp *curl /*tracer*/,
struct lwp *l /*traced*/,
struct uio *uio)
{
struct proc *p = l->l_proc; /* traced */
struct vmspace *vm;
int error;
size_t len;
error = 0;
len = uio->uio_resid;
if (len == 0)
return 0;
#ifdef PMAP_NEED_PROCWR
vaddr_t addr = uio->uio_offset;
#endif
vm = p->p_vmspace;
if ((l->l_flag & LW_WEXIT) || vm->vm_refcnt < 1)
error = EFAULT;
if (error == 0)
uvmspace_addref(p->p_vmspace);
if (error != 0)
return error;
error = uvm_io(&vm->vm_map, uio, pax_mprotect_prot(l));
#ifdef PMAP_NEED_PROCWR
if (error == 0 && uio->uio_rw == UIO_WRITE)
pmap_procwr(p, addr, len);
#endif
uvmspace_free(vm);
return error;
}
#endif /* KTRACE || PTRACE_HOOKS */
/*
* Dummy routine referenced by the ptrace_common module, so that the
* module will fail to load if this routine is not defined (i.e. if
* PTRACE_HOOKS is not configured into the kernel).
*/
#if defined(PTRACE_HOOKS)
void
ptrace_hooks(void)
{
}
#endif
/* $NetBSD: uvm_anon.c,v 1.80 2020/10/25 00:05:26 chs Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_anon.c: uvm anon ops
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_anon.c,v 1.80 2020/10/25 00:05:26 chs Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>
#include <uvm/uvm_pdpolicy.h>
static struct pool_cache uvm_anon_cache;
static int uvm_anon_ctor(void *, void *, int);
void
uvm_anon_init(void)
{
pool_cache_bootstrap(&uvm_anon_cache, sizeof(struct vm_anon), 0, 0,
PR_LARGECACHE, "anonpl", NULL, IPL_NONE, uvm_anon_ctor,
NULL, NULL);
}
static int
uvm_anon_ctor(void *arg, void *object, int flags)
{
struct vm_anon *anon = object;
anon->an_ref = 0;
anon->an_lock = NULL;
anon->an_page = NULL;
#if defined(VMSWAP)
anon->an_swslot = 0;
#endif
return 0;
}
/*
* uvm_analloc: allocate a new anon.
*
* => anon will have no lock associated.
*/
struct vm_anon *
uvm_analloc(void)
{
struct vm_anon *anon;
anon = pool_cache_get(&uvm_anon_cache, PR_NOWAIT);
if (anon) {
KASSERT(anon->an_ref == 0);
KASSERT(anon->an_lock == NULL);
KASSERT(anon->an_page == NULL);
#if defined(VMSWAP)
KASSERT(anon->an_swslot == 0);
#endif
anon->an_ref = 1;
}
return anon;
}
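/*
 * Illustrative sketch (hypothetical caller, not part of this source):
 * a fresh anon comes back with an_ref == 1 and no lock, so the caller
 * is expected to share a lock with it (typically the owning amap's
 * lock) before the anon can be used or freed.  "amap" is a
 * placeholder for whatever object will own the anon, and the exact
 * reference counting of the lock object is omitted here.
 */
#if 0
	struct vm_anon *anon;

	anon = uvm_analloc();
	if (anon == NULL)
		return ENOMEM;
	/* Share the owner's lock with the new anon before using it. */
	anon->an_lock = amap->am_lock;
	/* ... install the anon and its page under that lock ... */
#endif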
/*
* uvm_anfree: free a single anon structure
*
* => anon must be removed from the amap (if anon was in an amap).
* => amap must be locked, if anon was owned by amap.
* => we may drop and re-acquire the lock here (to break loans).
*/
void
uvm_anfree(struct vm_anon *anon)
{
struct vm_page *pg = anon->an_page, *pg2 __diagused;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(anon=%#jx)", (uintptr_t)anon, 0,0,0);
KASSERT(anon->an_lock == NULL || rw_write_held(anon->an_lock));
KASSERT(anon->an_ref == 0);
/*
* Dispose of the page, if it is resident.
*/
if (__predict_true(pg != NULL)) {
KASSERT(anon->an_lock != NULL);
/*
* If there is a resident page and it is loaned, then anon
* may not own it. Call out to uvm_anon_lockloanpg() to
* identify and lock the real owner of the page.
*/
if (__predict_false(pg->loan_count != 0)) {
pg2 = uvm_anon_lockloanpg(anon);
KASSERT(pg2 == pg);
}
/*
* If the page is owned by a UVM object (now locked),
* then kill the loan on the page rather than free it,
* and release the object lock.
*/
if (__predict_false(pg->uobject != NULL)) {
mutex_enter(&pg->interlock);
KASSERT(pg->loan_count > 0);
pg->loan_count--;
pg->uanon = NULL;
mutex_exit(&pg->interlock);
rw_exit(pg->uobject->vmobjlock);
} else {
/*
* If page has no UVM object, then anon is the owner,
* and it is already locked.
*/
KASSERT((pg->flags & PG_RELEASED) == 0);
pmap_page_protect(pg, VM_PROT_NONE);
/*
* If the page is busy, mark it as PG_RELEASED, so
* that uvm_anon_release(9) would release it later.
*/
if (__predict_false((pg->flags & PG_BUSY) != 0)) {
pg->flags |= PG_RELEASED;
rw_obj_hold(anon->an_lock);
return;
}
uvm_pagefree(pg);
UVMHIST_LOG(maphist, "anon %#jx, page %#jx: "
"freed now!", (uintptr_t)anon, (uintptr_t)pg,
0, 0);
}
} else {
#if defined(VMSWAP)
if (anon->an_swslot > 0) {
/* This page is no longer only in swap. */
KASSERT(uvmexp.swpgonly > 0);
atomic_dec_uint(&uvmexp.swpgonly);
}
#endif
}
anon->an_lock = NULL;
/*
* Free any swap resources, leave a page replacement hint.
*/
uvm_anon_dropswap(anon);
uvmpdpol_anfree(anon);
UVMHIST_LOG(maphist,"<- done!",0,0,0,0);
pool_cache_put(&uvm_anon_cache, anon);
}
/*
* uvm_anon_lockloanpg: given a locked anon, lock its resident page owner.
*
* => anon is locked by caller
* => on return: anon is locked
* if there is a resident page:
* if it has a uobject, it is locked by us
* if it is ownerless, we take over as owner
* we return the resident page (it can change during
* this function)
* => note that the only time an anon has an ownerless resident page
* is if the page was loaned from a uvm_object and the uvm_object
* disowned it
* => this only needs to be called when you want to do an operation
* on an anon's resident page and that page has a non-zero loan
* count.
*/
struct vm_page *
uvm_anon_lockloanpg(struct vm_anon *anon)
{
struct vm_page *pg;
krw_t op;
KASSERT(rw_lock_held(anon->an_lock));
/*
* loop while we have a resident page that has a non-zero loan count.
* if we successfully get our lock, we will "break" the loop.
* note that the test for pg->loan_count is not protected -- this
* may produce false positive results. note that a false positive
* result may cause us to do more work than we need to, but it will
* not produce an incorrect result.
*/
while (((pg = anon->an_page) != NULL) && pg->loan_count != 0) {
mutex_enter(&pg->interlock);
if (pg->uobject) {
/*
* if we didn't get a lock (try lock failed), then we
* toggle our anon lock and try again
*/
if (!rw_tryenter(pg->uobject->vmobjlock, RW_WRITER)) {
/*
* someone locking the object has a chance to
* lock us right now
*
* XXX Better than yielding but inadequate.
*/
mutex_exit(&pg->interlock);
op = rw_lock_op(anon->an_lock);
rw_exit(anon->an_lock);
kpause("lkloanpg", false, 1, NULL);
rw_enter(anon->an_lock, op);
continue;
}
}
/*
* If page is un-owned i.e. the object dropped its ownership,
* then we have to take the ownership.
*/
if (pg->uobject == NULL && (pg->flags & PG_ANON) == 0) {
pg->flags |= PG_ANON;
pg->loan_count--;
}
mutex_exit(&pg->interlock);
break;
}
return pg;
}
#if defined(VMSWAP)
/*
* uvm_anon_pagein: fetch an anon's page.
*
* => anon must be locked, and is unlocked upon return.
* => returns true if pagein was aborted due to lack of memory.
*/
bool
uvm_anon_pagein(struct vm_amap *amap, struct vm_anon *anon)
{
struct vm_page *pg;
struct uvm_object *uobj;
KASSERT(rw_write_held(anon->an_lock));
KASSERT(anon->an_lock == amap->am_lock);
/*
* Get the page of the anon.
*/
switch (uvmfault_anonget(NULL, amap, anon)) {
case 0:
/* Success - we have the page. */
KASSERT(rw_write_held(anon->an_lock));
break;
case EIO:
case ERESTART:
/*
* Nothing more to do on errors. ERESTART means that the
* anon was freed.
*/
return false;
case ENOLCK:
panic("uvm_anon_pagein");
default:
return true;
}
/*
* Mark the page as dirty and clear its swslot.
*/
pg = anon->an_page;
uobj = pg->uobject;
if (anon->an_swslot > 0) {
uvm_swap_free(anon->an_swslot, 1);
}
anon->an_swslot = 0;
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
/*
* Deactivate the page (to put it on a page queue).
*/
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
rw_exit(anon->an_lock);
if (uobj) {
rw_exit(uobj->vmobjlock);
}
return false;
}
/*
* uvm_anon_dropswap: release any swap resources from this anon.
*
* => anon must be locked or have a reference count of 0.
*/
void
uvm_anon_dropswap(struct vm_anon *anon)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (anon->an_swslot == 0)
return;
UVMHIST_LOG(maphist,"freeing swap for anon %#jx, paged to swslot %#jx",
(uintptr_t)anon, anon->an_swslot, 0, 0);
uvm_swap_free(anon->an_swslot, 1);
anon->an_swslot = 0;
}
#endif
/*
* uvm_anon_release: release an anon and its page.
*
* => anon should not have any references.
* => anon must be locked.
*/
void
uvm_anon_release(struct vm_anon *anon)
{
struct vm_page *pg = anon->an_page;
krwlock_t *lock;
KASSERT(rw_write_held(anon->an_lock));
KASSERT(pg != NULL);
KASSERT((pg->flags & PG_RELEASED) != 0);
KASSERT((pg->flags & PG_BUSY) != 0);
KASSERT(pg->uobject == NULL);
KASSERT(pg->uanon == anon);
KASSERT(pg->loan_count == 0);
KASSERT(anon->an_ref == 0);
if ((pg->flags & PG_PAGEOUT) != 0) {
pg->flags &= ~PG_PAGEOUT;
uvm_pageout_done(1);
}
uvm_pagefree(pg);
KASSERT(anon->an_page == NULL);
lock = anon->an_lock;
uvm_anfree(anon);
rw_exit(lock);
/* Note: extra reference is held for PG_RELEASED case. */
rw_obj_free(lock);
}
/* $NetBSD: if.c,v 1.529 2023/02/24 11:02:45 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by William Studenmund and Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.c 8.5 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if.c,v 1.529 2023/02/24 11:02:45 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_atalk.h"
#include "opt_wlan.h"
#include "opt_net_mpsafe.h"
#include "opt_mrouting.h"
#endif
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/xcall.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/module_hook.h>
#include <sys/compat_stub.h>
#include <sys/msan.h>
#include <sys/hook.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_ether.h>
#include <net/if_media.h>
#include <net80211/ieee80211.h>
#include <net80211/ieee80211_ioctl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <sys/module.h>
#ifdef NETATALK
#include <netatalk/at_extern.h>
#include <netatalk/at.h>
#endif
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip_encap.h>
#include <net/bpf.h>
#ifdef INET6
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#endif
#include "ether.h"
#include "bridge.h"
#if NBRIDGE > 0
#include <net/if_bridgevar.h>
#endif
#include "carp.h"
#if NCARP > 0
#include <netinet/ip_carp.h>
#endif
#include <compat/sys/sockio.h>
MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
/*
* XXX reusing (ifp)->if_snd->ifq_lock rather than having another spin mutex
* for each ifnet. It doesn't matter because:
* - if IFEF_MPSAFE is enabled, if_snd isn't used and lock contentions on
* ifq_lock don't happen
* - if IFEF_MPSAFE is disabled, there is no lock contention on ifq_lock
* because if_snd, if_link_state_change and if_link_state_change_process
* are all called with KERNEL_LOCK
*/
#define IF_LINK_STATE_CHANGE_LOCK(ifp) \
mutex_enter((ifp)->if_snd.ifq_lock)
#define IF_LINK_STATE_CHANGE_UNLOCK(ifp) \
mutex_exit((ifp)->if_snd.ifq_lock)
/*
* Global list of interfaces.
*/
/* DEPRECATED. Remove it once kvm(3) users have disappeared. */
struct ifnet_head ifnet_list;
struct pslist_head ifnet_pslist;
static ifnet_t ** ifindex2ifnet = NULL;
static u_int if_index = 1;
static size_t if_indexlim = 0;
static uint64_t index_gen;
/* Mutex to protect the above objects. */
kmutex_t ifnet_mtx __cacheline_aligned;
static struct psref_class *ifnet_psref_class __read_mostly;
static pserialize_t ifnet_psz;
static struct workqueue *ifnet_link_state_wq __read_mostly;
static struct workqueue *if_slowtimo_wq __read_mostly;
static kmutex_t if_clone_mtx;
struct ifnet *lo0ifp;
int ifqmaxlen = IFQ_MAXLEN;
struct psref_class *ifa_psref_class __read_mostly;
static int if_delroute_matcher(struct rtentry *, void *);
static bool if_is_unit(const char *);
static struct if_clone *if_clone_lookup(const char *, int *);
static LIST_HEAD(, if_clone) if_cloners = LIST_HEAD_INITIALIZER(if_cloners);
static int if_cloners_count;
/* Packet filtering hook for interfaces. */
pfil_head_t * if_pfil __read_mostly;
static kauth_listener_t if_listener;
static int doifioctl(struct socket *, u_long, void *, struct lwp *);
static void sysctl_sndq_setup(struct sysctllog **, const char *,
struct ifaltq *);
static void if_slowtimo_intr(void *);
static void if_slowtimo_work(struct work *, void *);
static int sysctl_if_watchdog(SYSCTLFN_PROTO);
static void sysctl_watchdog_setup(struct ifnet *);
static void if_attachdomain1(struct ifnet *);
static int ifconf(u_long, void *);
static int if_transmit(struct ifnet *, struct mbuf *);
static int if_clone_create(const char *);
static int if_clone_destroy(const char *);
static void if_link_state_change_work(struct work *, void *);
static void if_up_locked(struct ifnet *);
static void _if_down(struct ifnet *);
static void if_down_deactivated(struct ifnet *);
struct if_percpuq {
struct ifnet *ipq_ifp;
void *ipq_si;
struct percpu *ipq_ifqs; /* struct ifqueue */
};
static struct mbuf *if_percpuq_dequeue(struct if_percpuq *);
static void if_percpuq_drops(void *, void *, struct cpu_info *);
static int sysctl_percpuq_drops_handler(SYSCTLFN_PROTO);
static void sysctl_percpuq_setup(struct sysctllog **, const char *,
struct if_percpuq *);
struct if_deferred_start {
struct ifnet *ids_ifp;
void (*ids_if_start)(struct ifnet *);
void *ids_si;
};
static void if_deferred_start_softint(void *);
static void if_deferred_start_common(struct ifnet *);
static void if_deferred_start_destroy(struct ifnet *);
struct if_slowtimo_data {
kmutex_t isd_lock;
struct callout isd_ch;
struct work isd_work;
struct ifnet *isd_ifp;
bool isd_queued;
bool isd_dying;
bool isd_trigger;
};
/*
* Hook for if_vlan - needed by if_agr
*/
struct if_vlan_vlan_input_hook_t if_vlan_vlan_input_hook;
static void if_sysctl_setup(struct sysctllog **);
static int
if_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_network_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg1;
if (action != KAUTH_NETWORK_INTERFACE)
return result;
if ((req == KAUTH_REQ_NETWORK_INTERFACE_GET) ||
(req == KAUTH_REQ_NETWORK_INTERFACE_SET))
result = KAUTH_RESULT_ALLOW;
return result;
}
/*
* Network interface utility routines.
*
* Routines with ifa_ifwith* names take sockaddr *'s as
* parameters.
*/
void
ifinit(void)
{
#if (defined(INET) || defined(INET6))
encapinit();
#endif
if_listener = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
if_listener_cb, NULL);
/* interfaces are available, inform socket code */
ifioctl = doifioctl;
}
/*
* XXX Initialization before configure().
* XXX hack to get pfil_add_hook working in autoconf.
*/
void
ifinit1(void)
{
int error __diagused;
#ifdef NET_MPSAFE
printf("NET_MPSAFE enabled\n");
#endif
mutex_init(&if_clone_mtx, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&ifnet_list);
mutex_init(&ifnet_mtx, MUTEX_DEFAULT, IPL_NONE);
ifnet_psz = pserialize_create();
ifnet_psref_class = psref_class_create("ifnet", IPL_SOFTNET);
ifa_psref_class = psref_class_create("ifa", IPL_SOFTNET);
error = workqueue_create(&ifnet_link_state_wq, "iflnkst",
if_link_state_change_work, NULL, PRI_SOFTNET, IPL_NET,
WQ_MPSAFE);
KASSERT(error == 0);
PSLIST_INIT(&ifnet_pslist);
error = workqueue_create(&if_slowtimo_wq, "ifwdog",
if_slowtimo_work, NULL, PRI_SOFTNET, IPL_SOFTCLOCK, WQ_MPSAFE);
KASSERTMSG(error == 0, "error=%d", error);
if_indexlim = 8;
if_pfil = pfil_head_create(PFIL_TYPE_IFNET, NULL);
KASSERT(if_pfil != NULL);
#if NETHER > 0 || defined(NETATALK) || defined(WLAN)
etherinit();
#endif
}
/* XXX must be after domaininit() */
void
ifinit_post(void)
{
if_sysctl_setup(NULL);
}
ifnet_t *
if_alloc(u_char type)
{
return kmem_zalloc(sizeof(ifnet_t), KM_SLEEP);
}
void
if_free(ifnet_t *ifp)
{
kmem_free(ifp, sizeof(ifnet_t));
}
void
if_initname(struct ifnet *ifp, const char *name, int unit)
{
(void)snprintf(ifp->if_xname, sizeof(ifp->if_xname),
"%s%d", name, unit);
}
/*
* Null routines used while an interface is going away. These routines
* just return an error.
*/
int
if_nulloutput(struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *so, const struct rtentry *rt)
{
return ENXIO;
}
void
if_nullinput(struct ifnet *ifp, struct mbuf *m)
{
/* Nothing. */
}
void
if_nullstart(struct ifnet *ifp)
{
/* Nothing. */
}
int
if_nulltransmit(struct ifnet *ifp, struct mbuf *m)
{
m_freem(m);
return ENXIO;
}
int
if_nullioctl(struct ifnet *ifp, u_long cmd, void *data)
{
return ENXIO;
}
int
if_nullinit(struct ifnet *ifp)
{
return ENXIO;
}
void
if_nullstop(struct ifnet *ifp, int disable)
{
/* Nothing. */
}
void
if_nullslowtimo(struct ifnet *ifp)
{
/* Nothing. */
}
void
if_nulldrain(struct ifnet *ifp)
{
/* Nothing. */
}
void
if_set_sadl(struct ifnet *ifp, const void *lla, u_char addrlen, bool factory)
{
struct ifaddr *ifa;
struct sockaddr_dl *sdl;
ifp->if_addrlen = addrlen;
if_alloc_sadl(ifp);
ifa = ifp->if_dl;
sdl = satosdl(ifa->ifa_addr);
(void)sockaddr_dl_setaddr(sdl, sdl->sdl_len, lla, ifp->if_addrlen);
if (factory) {
KASSERT(ifp->if_hwdl == NULL);
ifp->if_hwdl = ifp->if_dl;
ifaref(ifp->if_hwdl);
}
/* TBD routing socket */
}
struct ifaddr *
if_dl_create(const struct ifnet *ifp, const struct sockaddr_dl **sdlp)
{
unsigned socksize, ifasize;
int addrlen, namelen;
struct sockaddr_dl *mask, *sdl;
struct ifaddr *ifa;
namelen = strlen(ifp->if_xname);
addrlen = ifp->if_addrlen;
socksize = roundup(sockaddr_dl_measure(namelen, addrlen),
sizeof(long));
ifasize = sizeof(*ifa) + 2 * socksize;
ifa = malloc(ifasize, M_IFADDR, M_WAITOK | M_ZERO);
sdl = (struct sockaddr_dl *)(ifa + 1);
mask = (struct sockaddr_dl *)(socksize + (char *)sdl);
sockaddr_dl_init(sdl, socksize, ifp->if_index, ifp->if_type,
ifp->if_xname, namelen, NULL, addrlen);
mask->sdl_family = AF_LINK;
mask->sdl_len = sockaddr_dl_measure(namelen, 0);
memset(&mask->sdl_data[0], 0xff, namelen);
ifa->ifa_rtrequest = link_rtrequest;
ifa->ifa_addr = (struct sockaddr *)sdl;
ifa->ifa_netmask = (struct sockaddr *)mask;
ifa_psref_init(ifa);
*sdlp = sdl;
return ifa;
}
static void
if_sadl_setrefs(struct ifnet *ifp, struct ifaddr *ifa)
{
const struct sockaddr_dl *sdl;
ifp->if_dl = ifa;
ifaref(ifa);
sdl = satosdl(ifa->ifa_addr);
ifp->if_sadl = sdl;
}
/*
* Allocate the link level name for the specified interface. This
* is an attachment helper. It must be called after ifp->if_addrlen
* is initialized, which may not be the case when if_attach() is
* called.
*/
void
if_alloc_sadl(struct ifnet *ifp)
{
struct ifaddr *ifa;
const struct sockaddr_dl *sdl;
/*
* If the interface already has a link name, release it
* now. This is useful for interfaces that can change
* link types, and thus switch link names often.
*/
if (ifp->if_sadl != NULL)
if_free_sadl(ifp, 0);
ifa = if_dl_create(ifp, &sdl);
ifa_insert(ifp, ifa);
if_sadl_setrefs(ifp, ifa);
}
static void
if_deactivate_sadl(struct ifnet *ifp)
{
struct ifaddr *ifa;
KASSERT(ifp->if_dl != NULL);
ifa = ifp->if_dl;
ifp->if_sadl = NULL;
ifp->if_dl = NULL;
ifafree(ifa);
}
static void
if_replace_sadl(struct ifnet *ifp, struct ifaddr *ifa)
{
struct ifaddr *old;
KASSERT(ifp->if_dl != NULL);
old = ifp->if_dl;
ifaref(ifa);
/* XXX Update if_dl and if_sadl atomically */
ifp->if_dl = ifa;
ifp->if_sadl = satosdl(ifa->ifa_addr);
ifafree(old);
}
void
if_activate_sadl(struct ifnet *ifp, struct ifaddr *ifa0,
const struct sockaddr_dl *sdl)
{
struct ifaddr *ifa;
const int bound = curlwp_bind();
KASSERT(ifa_held(ifa0));
const int s = splsoftnet();
if_replace_sadl(ifp, ifa0);
int ss = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
ifa_acquire(ifa, &psref);
pserialize_read_exit(ss);
rtinit(ifa, RTM_LLINFO_UPD, 0);
ss = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(ss);
splx(s);
curlwp_bindx(bound);
}
/*
* Free the link level name for the specified interface. This is
* a detach helper. This is called from if_detach().
*/
void
if_free_sadl(struct ifnet *ifp, int factory)
{
struct ifaddr *ifa;
if (factory && ifp->if_hwdl != NULL) {
ifa = ifp->if_hwdl;
ifp->if_hwdl = NULL;
ifafree(ifa);
}
ifa = ifp->if_dl;
if (ifa == NULL) {
KASSERT(ifp->if_sadl == NULL);
return;
}
KASSERT(ifp->if_sadl != NULL);
const int s = splsoftnet();
KASSERT(ifa->ifa_addr->sa_family == AF_LINK);
ifa_remove(ifp, ifa);
if_deactivate_sadl(ifp);
splx(s);
}
static void
if_getindex(ifnet_t *ifp)
{
bool hitlimit = false;
char xnamebuf[HOOKNAMSIZ];
ifp->if_index_gen = index_gen++;
snprintf(xnamebuf, sizeof(xnamebuf), "%s-lshk", ifp->if_xname);
ifp->if_linkstate_hooks = simplehook_create(IPL_NET,
xnamebuf);
ifp->if_index = if_index;
if (ifindex2ifnet == NULL) {
if_index++;
goto skip;
}
while (if_byindex(ifp->if_index)) {
/*
* If we hit USHRT_MAX, we skip back to 0 since
* there are a number of places where the value
* of ifp->if_index or if_index itself is compared
* to or stored in an unsigned short. By
* jumping back, we won't botch those assignments
* or comparisons.
*/
if (++if_index == 0) {
if_index = 1;
} else if (if_index == USHRT_MAX) {
/*
* However, if we have to jump back to
* zero *twice* without finding an empty
* slot in ifindex2ifnet[], then there
* are too many (>65535) interfaces.
*/
if (hitlimit)
panic("too many interfaces");
hitlimit = true;
if_index = 1;
}
ifp->if_index = if_index;
}
skip:
/*
* ifindex2ifnet is indexed by if_index. Since if_index can
* grow dynamically, the array must grow along with it.
*/
if (ifindex2ifnet == NULL || ifp->if_index >= if_indexlim) {
size_t m, n, oldlim;
void *q;
oldlim = if_indexlim;
while (ifp->if_index >= if_indexlim)
if_indexlim <<= 1;
/* grow ifindex2ifnet */
m = oldlim * sizeof(struct ifnet *);
n = if_indexlim * sizeof(struct ifnet *);
q = malloc(n, M_IFADDR, M_WAITOK | M_ZERO);
if (ifindex2ifnet != NULL) {
memcpy(q, ifindex2ifnet, m);
free(ifindex2ifnet, M_IFADDR);
}
ifindex2ifnet = (struct ifnet **)q;
}
ifindex2ifnet[ifp->if_index] = ifp;
}
/*
* Initialize an interface and assign an index for it.
*
* It must be called prior to a device specific attach routine
* (e.g., ether_ifattach and ieee80211_ifattach) or if_alloc_sadl,
* and be followed by if_register:
*
* if_initialize(ifp);
* ether_ifattach(ifp, enaddr);
* if_register(ifp);
*/
void
if_initialize(ifnet_t *ifp)
{
KASSERT(if_indexlim > 0);
TAILQ_INIT(&ifp->if_addrlist);
/*
* Link level name is allocated later by a separate call to
* if_alloc_sadl().
*/
if (ifp->if_snd.ifq_maxlen == 0)
ifp->if_snd.ifq_maxlen = ifqmaxlen;
ifp->if_broadcastaddr = 0; /* reliably crash if used uninitialized */
ifp->if_link_state = LINK_STATE_UNKNOWN;
ifp->if_link_queue = -1; /* all bits set, see link_state_change() */
ifp->if_link_scheduled = false;
ifp->if_capenable = 0;
ifp->if_csum_flags_tx = 0;
ifp->if_csum_flags_rx = 0;
#ifdef ALTQ
ifp->if_snd.altq_type = 0;
ifp->if_snd.altq_disc = NULL;
ifp->if_snd.altq_flags &= ALTQF_CANTCHANGE;
ifp->if_snd.altq_tbr = NULL;
ifp->if_snd.altq_ifp = ifp;
#endif
IFQ_LOCK_INIT(&ifp->if_snd);
ifp->if_pfil = pfil_head_create(PFIL_TYPE_IFNET, ifp);
pfil_run_ifhooks(if_pfil, PFIL_IFNET_ATTACH, ifp);
IF_AFDATA_LOCK_INIT(ifp);
PSLIST_ENTRY_INIT(ifp, if_pslist_entry);
PSLIST_INIT(&ifp->if_addr_pslist);
psref_target_init(&ifp->if_psref, ifnet_psref_class);
ifp->if_ioctl_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&ifp->if_multiaddrs);
if_stats_init(ifp);
IFNET_GLOBAL_LOCK();
if_getindex(ifp);
IFNET_GLOBAL_UNLOCK();
}
/*
* Register an interface to the list of "active" interfaces.
*/
void
if_register(ifnet_t *ifp)
{
/*
* If the driver has not supplied its own if_ioctl or if_stop,
* then supply the default.
*/
if (ifp->if_ioctl == NULL)
ifp->if_ioctl = ifioctl_common;
if (ifp->if_stop == NULL)
ifp->if_stop = if_nullstop;
sysctl_sndq_setup(&ifp->if_sysctl_log, ifp->if_xname, &ifp->if_snd);
if (!STAILQ_EMPTY(&domains))
if_attachdomain1(ifp);
/* Announce the interface. */
rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
if (ifp->if_slowtimo != NULL) {
struct if_slowtimo_data *isd;
isd = kmem_zalloc(sizeof(*isd), KM_SLEEP);
mutex_init(&isd->isd_lock, MUTEX_DEFAULT, IPL_SOFTCLOCK);
callout_init(&isd->isd_ch, CALLOUT_MPSAFE);
callout_setfunc(&isd->isd_ch, if_slowtimo_intr, ifp);
isd->isd_ifp = ifp;
ifp->if_slowtimo_data = isd;
if_slowtimo_intr(ifp);
sysctl_watchdog_setup(ifp);
}
if (ifp->if_transmit == NULL || ifp->if_transmit == if_nulltransmit)
ifp->if_transmit = if_transmit;
IFNET_GLOBAL_LOCK();
TAILQ_INSERT_TAIL(&ifnet_list, ifp, if_list);
IFNET_WRITER_INSERT_TAIL(ifp);
IFNET_GLOBAL_UNLOCK();
}
/*
* The if_percpuq framework
*
* It allows network device drivers to execute the network stack
* in softint (so called softint-based if_input). It utilizes
* softint and percpu ifqueue. It doesn't distribute any packets
* between CPUs, unlike pktqueue(9).
*
* Currently we support two options for device drivers to apply the framework:
* - Use it implicitly with fewer changes
* - If you use if_attach in driver's _attach function and if_input in
* driver's Rx interrupt handler, a packet is queued and a softint handles
* the packet implicitly
* - Use it explicitly in each driver (recommended)
* - You can use if_percpuq_* directly in your driver
* - In this case, you need to allocate struct if_percpuq in driver's softc
* - See wm(4) as a reference implementation
*/
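/*
 * A rough sketch of the explicit usage described above, using hypothetical
 * driver names (sc, sc_ipq, m); see wm(4) for a real implementation:
 *
 * attach: sc->sc_ipq = if_percpuq_create(ifp);
 * Rx interrupt handler: if_percpuq_enqueue(sc->sc_ipq, m);
 * detach: if_percpuq_destroy(sc->sc_ipq);
 */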
static void
if_percpuq_softint(void *arg)
{
struct if_percpuq *ipq = arg;
struct ifnet *ifp = ipq->ipq_ifp;
struct mbuf *m;
while ((m = if_percpuq_dequeue(ipq)) != NULL) {
if_statinc(ifp, if_ipackets);
bpf_mtap(ifp, m, BPF_D_IN);
ifp->_if_input(ifp, m);
}
}
static void
if_percpuq_init_ifq(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
memset(ifq, 0, sizeof(*ifq));
ifq->ifq_maxlen = IFQ_MAXLEN;
}
struct if_percpuq *
if_percpuq_create(struct ifnet *ifp)
{
struct if_percpuq *ipq;
u_int flags = SOFTINT_NET;
flags |= if_is_mpsafe(ifp) ? SOFTINT_MPSAFE : 0;
ipq = kmem_zalloc(sizeof(*ipq), KM_SLEEP);
ipq->ipq_ifp = ifp;
ipq->ipq_si = softint_establish(flags, if_percpuq_softint, ipq);
ipq->ipq_ifqs = percpu_alloc(sizeof(struct ifqueue));
percpu_foreach(ipq->ipq_ifqs, &if_percpuq_init_ifq, NULL);
sysctl_percpuq_setup(&ifp->if_sysctl_log, ifp->if_xname, ipq);
return ipq;
}
static struct mbuf *
if_percpuq_dequeue(struct if_percpuq *ipq)
{
struct mbuf *m;
struct ifqueue *ifq;
const int s = splnet();
ifq = percpu_getref(ipq->ipq_ifqs);
IF_DEQUEUE(ifq, m);
percpu_putref(ipq->ipq_ifqs);
splx(s);
return m;
}
static void
if_percpuq_purge_ifq(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
IF_PURGE(ifq);
}
void
if_percpuq_destroy(struct if_percpuq *ipq)
{
/* if_detach may already destroy it */
if (ipq == NULL)
return;
softint_disestablish(ipq->ipq_si);
percpu_foreach(ipq->ipq_ifqs, &if_percpuq_purge_ifq, NULL);
percpu_free(ipq->ipq_ifqs, sizeof(struct ifqueue));
kmem_free(ipq, sizeof(*ipq));
}
void
if_percpuq_enqueue(struct if_percpuq *ipq, struct mbuf *m)
{
struct ifqueue *ifq;
KASSERT(ipq != NULL);
const int s = splnet();
ifq = percpu_getref(ipq->ipq_ifqs);
if (IF_QFULL(ifq)) {
IF_DROP(ifq);
percpu_putref(ipq->ipq_ifqs);
m_freem(m);
goto out;
}
IF_ENQUEUE(ifq, m);
percpu_putref(ipq->ipq_ifqs);
softint_schedule(ipq->ipq_si);
out:
splx(s);
}
static void
if_percpuq_drops(void *p, void *arg, struct cpu_info *ci __unused)
{
struct ifqueue *const ifq = p;
uint64_t *sum = arg;
*sum += ifq->ifq_drops;
}
static int
sysctl_percpuq_drops_handler(SYSCTLFN_ARGS)
{
struct sysctlnode node;
struct if_percpuq *ipq;
uint64_t sum = 0;
int error;
node = *rnode;
ipq = node.sysctl_data;
percpu_foreach(ipq->ipq_ifqs, if_percpuq_drops, &sum);
node.sysctl_data = &sum;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error != 0 || newp == NULL)
return error;
return 0;
}
static void
sysctl_percpuq_setup(struct sysctllog **clog, const char* ifname,
struct if_percpuq *ipq)
{
const struct sysctlnode *cnode, *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, ifname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "rcvq",
SYSCTL_DESCR("Interface input queue controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
#ifdef NOTYET
/* XXX Should show each per-CPU queue length? */
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "len",
SYSCTL_DESCR("Current input queue length"),
sysctl_percpuq_len, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed input queue length"),
sysctl_percpuq_maxlen_handler, 0, (void *)ipq, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
#endif
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Total packets dropped due to full input queue"),
sysctl_percpuq_drops_handler, 0, (void *)ipq, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl nodes\n", ifname);
return;
}
/*
* The deferred if_start framework
*
* The common APIs to defer if_start to softint when if_start is requested
* from a device driver running in hardware interrupt context.
*/
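/*
 * A minimal usage sketch (hypothetical driver code, not taken from this
 * file): register the default callback at attach time and trigger it from
 * the hardware interrupt handler:
 *
 * if_deferred_start_init(ifp, NULL);
 * ...
 * if_schedule_deferred_start(ifp);
 */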
/*
* Call ifp->if_start (or equivalent) in a dedicated softint for
* deferred if_start.
*/
static void
if_deferred_start_softint(void *arg)
{
struct if_deferred_start *ids = arg;
struct ifnet *ifp = ids->ids_ifp;
ids->ids_if_start(ifp);
}
/*
* The default callback function for deferred if_start.
*/
static void
if_deferred_start_common(struct ifnet *ifp)
{
const int s = splnet();
if_start_lock(ifp);
splx(s);
}
static inline bool
if_snd_is_used(struct ifnet *ifp)
{
return ALTQ_IS_ENABLED(&ifp->if_snd) ||
ifp->if_transmit == if_transmit ||
ifp->if_transmit == NULL ||
ifp->if_transmit == if_nulltransmit;
}
/*
* Schedule deferred if_start.
*/
void
if_schedule_deferred_start(struct ifnet *ifp)
{
KASSERT(ifp->if_deferred_start != NULL);
if (if_snd_is_used(ifp) && IFQ_IS_EMPTY(&ifp->if_snd))
return;
softint_schedule(ifp->if_deferred_start->ids_si);
}
/*
* Create an instance of deferred if_start. A driver should call the function
* only if the driver needs deferred if_start. Drivers can setup their own
* deferred if_start function via 2nd argument.
*/
void
if_deferred_start_init(struct ifnet *ifp, void (*func)(struct ifnet *))
{
struct if_deferred_start *ids;
u_int flags = SOFTINT_NET;
flags |= if_is_mpsafe(ifp) ? SOFTINT_MPSAFE : 0;
ids = kmem_zalloc(sizeof(*ids), KM_SLEEP);
ids->ids_ifp = ifp;
ids->ids_si = softint_establish(flags, if_deferred_start_softint, ids);
if (func != NULL)
ids->ids_if_start = func;
else
ids->ids_if_start = if_deferred_start_common;
ifp->if_deferred_start = ids;
}
static void
if_deferred_start_destroy(struct ifnet *ifp)
{
if (ifp->if_deferred_start == NULL)
return;
softint_disestablish(ifp->if_deferred_start->ids_si);
kmem_free(ifp->if_deferred_start, sizeof(*ifp->if_deferred_start));
ifp->if_deferred_start = NULL;
}
/*
* The common interface input routine that is called by device drivers,
* which should be used only when the driver's rx handler already runs
* in softint.
*/
void
if_input(struct ifnet *ifp, struct mbuf *m)
{
KASSERT(ifp->if_percpuq == NULL);
KASSERT(!cpu_intr_p());
if_statinc(ifp, if_ipackets);
bpf_mtap(ifp, m, BPF_D_IN);
ifp->_if_input(ifp, m);
}
/*
* DEPRECATED. Use if_initialize and if_register instead.
* See the above comment of if_initialize.
*
* Note that it implicitly enables if_percpuq so that drivers can migrate
* to softint-based if_input without many changes. If you don't
* want to enable it, use if_initialize instead.
*/
void
if_attach(ifnet_t *ifp)
{
if_initialize(ifp);
ifp->if_percpuq = if_percpuq_create(ifp);
if_register(ifp);
}
void
if_attachdomain(void)
{
struct ifnet *ifp;
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
struct psref psref;
psref_acquire(&psref, &ifp->if_psref, ifnet_psref_class);
pserialize_read_exit(s);
if_attachdomain1(ifp);
s = pserialize_read_enter();
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
}
static void
if_attachdomain1(struct ifnet *ifp)
{
struct domain *dp;
const int s = splsoftnet();
/* address family dependent data region */
memset(ifp->if_afdata, 0, sizeof(ifp->if_afdata));
DOMAIN_FOREACH(dp) {
if (dp->dom_ifattach != NULL)
ifp->if_afdata[dp->dom_family] =
(*dp->dom_ifattach)(ifp);
}
splx(s);
}
/*
* Deactivate an interface. This points all of the procedure
* handles at error stubs. May be called from interrupt context.
*/
void
if_deactivate(struct ifnet *ifp)
{
const int s = splsoftnet();
ifp->if_output = if_nulloutput;
ifp->_if_input = if_nullinput;
ifp->if_start = if_nullstart;
ifp->if_transmit = if_nulltransmit;
ifp->if_ioctl = if_nullioctl;
ifp->if_init = if_nullinit;
ifp->if_stop = if_nullstop;
if (ifp->if_slowtimo)
ifp->if_slowtimo = if_nullslowtimo;
ifp->if_drain = if_nulldrain;
/* No more packets may be enqueued. */
ifp->if_snd.ifq_maxlen = 0;
splx(s);
}
bool
if_is_deactivated(const struct ifnet *ifp)
{
return ifp->if_output == if_nulloutput;
}
void
if_purgeaddrs(struct ifnet *ifp, int family,
void (*purgeaddr)(struct ifaddr *))
{
struct ifaddr *ifa, *nifa;
int s;
s = pserialize_read_enter();
for (ifa = IFADDR_READER_FIRST(ifp); ifa; ifa = nifa) {
nifa = IFADDR_READER_NEXT(ifa);
if (ifa->ifa_addr->sa_family != family)
continue;
pserialize_read_exit(s);
(*purgeaddr)(ifa);
s = pserialize_read_enter();
}
pserialize_read_exit(s);
}
#ifdef IFAREF_DEBUG
static struct ifaddr **ifa_list;
static int ifa_list_size;
/* Depends on only one if_attach running at a time */
static void
if_build_ifa_list(struct ifnet *ifp)
{
struct ifaddr *ifa;
int i;
KASSERT(ifa_list == NULL);
KASSERT(ifa_list_size == 0);
IFADDR_READER_FOREACH(ifa, ifp)
ifa_list_size++;
ifa_list = kmem_alloc(sizeof(*ifa) * ifa_list_size, KM_SLEEP);
i = 0;
IFADDR_READER_FOREACH(ifa, ifp) {
ifa_list[i++] = ifa;
ifaref(ifa);
}
}
static void
if_check_and_free_ifa_list(struct ifnet *ifp)
{
int i;
struct ifaddr *ifa;
if (ifa_list == NULL)
return;
for (i = 0; i < ifa_list_size; i++) {
char buf[64];
ifa = ifa_list[i];
sockaddr_format(ifa->ifa_addr, buf, sizeof(buf));
if (ifa->ifa_refcnt > 1) {
log(LOG_WARNING,
"ifa(%s) still referenced (refcnt=%d)\n",
buf, ifa->ifa_refcnt - 1);
} else
log(LOG_DEBUG,
"ifa(%s) not referenced (refcnt=%d)\n",
buf, ifa->ifa_refcnt - 1);
ifafree(ifa);
}
kmem_free(ifa_list, sizeof(*ifa) * ifa_list_size);
ifa_list = NULL;
ifa_list_size = 0;
}
#endif
/*
* Detach an interface from the list of "active" interfaces,
* freeing any resources as we go along.
*
* NOTE: This routine must be called with a valid thread context,
* as it may block.
*/
void
if_detach(struct ifnet *ifp)
{
struct socket so;
struct ifaddr *ifa;
#ifdef IFAREF_DEBUG
struct ifaddr *last_ifa = NULL;
#endif
struct domain *dp;
const struct protosw *pr;
int i, family, purged;
#ifdef IFAREF_DEBUG
if_build_ifa_list(ifp);
#endif
/*
* XXX It's kind of lame that we have to have the
* XXX socket structure...
*/
memset(&so, 0, sizeof(so));
const int s = splnet();
sysctl_teardown(&ifp->if_sysctl_log);
IFNET_LOCK(ifp);
/*
* Unset all queued link states and pretend a
* link state change is scheduled.
* This stops any more link state changes occurring for this
* interface while it's being detached so it's safe
* to drain the workqueue.
*/
IF_LINK_STATE_CHANGE_LOCK(ifp);
ifp->if_link_queue = -1; /* all bits set, see link_state_change() */
ifp->if_link_scheduled = true;
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
workqueue_wait(ifnet_link_state_wq, &ifp->if_link_work);
if_deactivate(ifp);
IFNET_UNLOCK(ifp);
/*
* Unlink from the list and wait for all readers to leave
* from pserialize read sections. Note that we can't do
* psref_target_destroy here. See below.
*/
IFNET_GLOBAL_LOCK();
ifindex2ifnet[ifp->if_index] = NULL;
TAILQ_REMOVE(&ifnet_list, ifp, if_list);
IFNET_WRITER_REMOVE(ifp);
pserialize_perform(ifnet_psz);
IFNET_GLOBAL_UNLOCK();
if (ifp->if_slowtimo != NULL) {
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
mutex_enter(&isd->isd_lock);
isd->isd_dying = true;
mutex_exit(&isd->isd_lock);
callout_halt(&isd->isd_ch, NULL);
workqueue_wait(if_slowtimo_wq, &isd->isd_work);
callout_destroy(&isd->isd_ch);
mutex_destroy(&isd->isd_lock);
kmem_free(isd, sizeof(*isd));
ifp->if_slowtimo_data = NULL; /* paranoia */
ifp->if_slowtimo = NULL; /* paranoia */
}
if_deferred_start_destroy(ifp);
/*
* Do an if_down() to give protocols a chance to do something.
*/
if_down_deactivated(ifp);
#ifdef ALTQ
if (ALTQ_IS_ENABLED(&ifp->if_snd))
altq_disable(&ifp->if_snd);
if (ALTQ_IS_ATTACHED(&ifp->if_snd))
altq_detach(&ifp->if_snd);
#endif
#if NCARP > 0
/* Remove the interface from any carp group it is a part of. */
if (ifp->if_carp != NULL && ifp->if_type != IFT_CARP)
carp_ifdetach(ifp);
#endif
/*
* Ensure that all packets on protocol input pktqueues have been
* processed, or, at least, removed from the queues.
*
* A cross-call will ensure that the interrupts have completed.
* FIXME: not quite..
*/
pktq_ifdetach();
xc_barrier(0);
/*
* Rip all the addresses off the interface. This should make
* all of the routes go away.
*
* pr_usrreq calls can remove an arbitrary number of ifaddrs
* from the list, including our "cursor", ifa. For safety,
* and to honor the TAILQ abstraction, I just restart the
* loop after each removal. Note that the loop will exit
* when all of the remaining ifaddrs belong to the AF_LINK
* family. I am counting on the historical fact that at
* least one pr_usrreq in each address domain removes at
* least one ifaddr.
*/
again:
/*
* At this point, nothing else tries to remove ifas from the list,
* so we don't need to take a lock or psref. Avoid using
* IFADDR_READER_FOREACH so that we don't trip pserialize's
* contract checks.
*/
IFADDR_WRITER_FOREACH(ifa, ifp) {
family = ifa->ifa_addr->sa_family;
#ifdef IFAREF_DEBUG
printf("if_detach: ifaddr %p, family %d, refcnt %d\n",
ifa, family, ifa->ifa_refcnt);
if (last_ifa != NULL && ifa == last_ifa)
panic("if_detach: loop detected");
last_ifa = ifa;
#endif
if (family == AF_LINK)
continue;
dp = pffinddomain(family);
KASSERTMSG(dp != NULL, "no domain for AF %d", family);
/*
* XXX These PURGEIF calls are redundant with the
* purge-all-families calls below, but are left in for
* now both to make a smaller change, and to avoid
* unplanned interactions with clearing of
* ifp->if_addrlist.
*/
purged = 0;
for (pr = dp->dom_protosw;
pr < dp->dom_protoswNPROTOSW; pr++) {
so.so_proto = pr;
if (pr->pr_usrreqs) {
(void) (*pr->pr_usrreqs->pr_purgeif)(&so, ifp);
purged = 1;
}
}
if (purged == 0) {
/*
* XXX What's really the best thing to do
* XXX here? --thorpej@NetBSD.org
*/
printf("if_detach: WARNING: AF %d not purged\n",
family);
ifa_remove(ifp, ifa);
}
goto again;
}
if_free_sadl(ifp, 1);
restart:
IFADDR_WRITER_FOREACH(ifa, ifp) {
family = ifa->ifa_addr->sa_family;
KASSERT(family == AF_LINK);
ifa_remove(ifp, ifa);
goto restart;
}
/* Delete stray routes from the routing table. */
for (i = 0; i <= AF_MAX; i++)
rt_delete_matched_entries(i, if_delroute_matcher, ifp, false);
DOMAIN_FOREACH(dp) {
if (dp->dom_ifdetach != NULL && ifp->if_afdata[dp->dom_family])
{
void *p = ifp->if_afdata[dp->dom_family];
if (p) {
ifp->if_afdata[dp->dom_family] = NULL;
(*dp->dom_ifdetach)(ifp, p);
}
}
/*
* One would expect multicast memberships (INET and
* INET6) on UDP sockets to be purged by the PURGEIF
* calls above, but if all addresses were removed from
* the interface prior to destruction, the calls will
* not be made (e.g. ppp, for which pppd(8) generally
* removes addresses before destroying the interface).
* Because there is no invariant that multicast
* memberships only exist for interfaces with IPv4
* addresses, we must call PURGEIF regardless of
* addresses. (Protocols which might store ifnet
* pointers are marked with PR_PURGEIF.)
*/
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
{
so.so_proto = pr;
if (pr->pr_usrreqs && pr->pr_flags & PR_PURGEIF)
(void)(*pr->pr_usrreqs->pr_purgeif)(&so, ifp);
}
}
/*
* Must be done after the above pr_purgeif because if_psref may be
* still used in pr_purgeif.
*/
psref_target_destroy(&ifp->if_psref, ifnet_psref_class);
PSLIST_ENTRY_DESTROY(ifp, if_pslist_entry);
pfil_run_ifhooks(if_pfil, PFIL_IFNET_DETACH, ifp);
(void)pfil_head_destroy(ifp->if_pfil);
/* Announce that the interface is gone. */
rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
IF_AFDATA_LOCK_DESTROY(ifp);
if (ifp->if_percpuq != NULL) {
if_percpuq_destroy(ifp->if_percpuq);
ifp->if_percpuq = NULL;
}
mutex_obj_free(ifp->if_ioctl_lock);
ifp->if_ioctl_lock = NULL;
mutex_obj_free(ifp->if_snd.ifq_lock);
if_stats_fini(ifp);
KASSERT(!simplehook_has_hooks(ifp->if_linkstate_hooks));
simplehook_destroy(ifp->if_linkstate_hooks);
splx(s);
#ifdef IFAREF_DEBUG
if_check_and_free_ifa_list(ifp);
#endif
}
/*
* Callback for a radix tree walk to delete all references to an
* ifnet.
*/
static int
if_delroute_matcher(struct rtentry *rt, void *v)
{
struct ifnet *ifp = (struct ifnet *)v;
if (rt->rt_ifp == ifp)
return 1;
else
return 0;
}
/*
* Create a clone network interface.
*/
static int
if_clone_create(const char *name)
{
struct if_clone *ifc;
struct ifnet *ifp;
struct psref psref;
int unit;
KASSERT(mutex_owned(&if_clone_mtx));
ifc = if_clone_lookup(name, &unit);
if (ifc == NULL)
return EINVAL;
ifp = if_get(name, &psref);
if (ifp != NULL) {
if_put(ifp, &psref);
return EEXIST;
}
return (*ifc->ifc_create)(ifc, unit);
}
/*
* Destroy a clone network interface.
*/
static int
if_clone_destroy(const char *name)
{
struct if_clone *ifc;
struct ifnet *ifp;
struct psref psref;
int error;
int (*if_ioctlfn)(struct ifnet *, u_long, void *);
KASSERT(mutex_owned(&if_clone_mtx));
ifc = if_clone_lookup(name, NULL);
if (ifc == NULL)
return EINVAL;
if (ifc->ifc_destroy == NULL)
return EOPNOTSUPP;
ifp = if_get(name, &psref);
if (ifp == NULL)
return ENXIO;
/* We have to disable ioctls here */
IFNET_LOCK(ifp);
if_ioctlfn = ifp->if_ioctl;
ifp->if_ioctl = if_nullioctl;
IFNET_UNLOCK(ifp);
/*
* We cannot call ifc_destroy while holding a reference to ifp.
* Releasing ifp here is safe thanks to if_clone_mtx.
*/
if_put(ifp, &psref);
error = (*ifc->ifc_destroy)(ifp);
if (error != 0) {
/* We have to restore if_ioctl on error */
IFNET_LOCK(ifp);
ifp->if_ioctl = if_ioctlfn;
IFNET_UNLOCK(ifp);
}
return error;
}
static bool
if_is_unit(const char *name)
{
while (*name != '\0') {
if (*name < '0' || *name > '9')
return false;
name++;
}
return true;
}
/*
* Look up a network interface cloner.
*/
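/*
 * For example, a hypothetical name "vlan0" splits into the cloner name
 * "vlan" and unit 0. A bare cloner name with no unit digits, or a name
 * that is only digits, is rejected.
 */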
static struct if_clone *
if_clone_lookup(const char *name, int *unitp)
{
struct if_clone *ifc;
const char *cp;
char *dp, ifname[IFNAMSIZ + 3];
int unit;
KASSERT(mutex_owned(&if_clone_mtx));
strcpy(ifname, "if_");
/* separate interface name from unit */
/* TODO: search unit number from backward */
for (dp = ifname + 3, cp = name; cp - name < IFNAMSIZ && *cp && !if_is_unit(cp);)
*dp++ = *cp++;
if (cp == name || cp - name == IFNAMSIZ || !*cp)
return NULL; /* No name or unit number */
*dp++ = '\0';
again:
LIST_FOREACH(ifc, &if_cloners, ifc_list) {
if (strcmp(ifname + 3, ifc->ifc_name) == 0)
break;
}
if (ifc == NULL) {
int error;
if (*ifname == '\0')
return NULL;
mutex_exit(&if_clone_mtx);
error = module_autoload(ifname, MODULE_CLASS_DRIVER);
mutex_enter(&if_clone_mtx);
if (error)
return NULL;
*ifname = '\0';
goto again;
}
unit = 0;
while (cp - name < IFNAMSIZ && *cp) {
if (*cp < '0' || *cp > '9' || unit >= INT_MAX / 10) {
/* Bogus unit number. */
return NULL;
}
unit = (unit * 10) + (*cp++ - '0');
}
if (unitp != NULL)
*unitp = unit;
return ifc;
}
/*
* Register a network interface cloner.
*/
void
if_clone_attach(struct if_clone *ifc)
{
mutex_enter(&if_clone_mtx);
LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list);
if_cloners_count++;
mutex_exit(&if_clone_mtx);
}
/*
* Unregister a network interface cloner.
*/
void
if_clone_detach(struct if_clone *ifc)
{
mutex_enter(&if_clone_mtx);
LIST_REMOVE(ifc, ifc_list);
if_cloners_count--;
mutex_exit(&if_clone_mtx);
}
/*
* Provide list of interface cloners to userspace.
*/
int
if_clone_list(int buf_count, char *buffer, int *total)
{
char outbuf[IFNAMSIZ], *dst;
struct if_clone *ifc;
int count, error = 0;
mutex_enter(&if_clone_mtx);
*total = if_cloners_count;
if ((dst = buffer) == NULL) {
/* Just asking how many there are. */
goto out;
}
if (buf_count < 0) {
error = EINVAL;
goto out;
}
count = (if_cloners_count < buf_count) ? if_cloners_count : buf_count;
for (ifc = LIST_FIRST(&if_cloners); ifc != NULL && count != 0;
ifc = LIST_NEXT(ifc, ifc_list), count--, dst += IFNAMSIZ) {
(void)strncpy(outbuf, ifc->ifc_name, sizeof(outbuf));
if (outbuf[sizeof(outbuf) - 1] != '\0') {
error = ENAMETOOLONG;
goto out;
}
error = copyout(outbuf, dst, sizeof(outbuf));
if (error != 0)
break;
}
out:
mutex_exit(&if_clone_mtx);
return error;
}
void
ifa_psref_init(struct ifaddr *ifa)
{
psref_target_init(&ifa->ifa_psref, ifa_psref_class);
}
void
ifaref(struct ifaddr *ifa)
{
atomic_inc_uint(&ifa->ifa_refcnt);
}
void
ifafree(struct ifaddr *ifa)
{
KASSERT(ifa != NULL);
KASSERTMSG(ifa->ifa_refcnt > 0, "ifa_refcnt=%d", ifa->ifa_refcnt);
membar_release();
if (atomic_dec_uint_nv(&ifa->ifa_refcnt) != 0)
return;
membar_acquire();
free(ifa, M_IFADDR);
}
bool
ifa_is_destroying(struct ifaddr *ifa)
{
return ISSET(ifa->ifa_flags, IFA_DESTROYING);
}
void
ifa_insert(struct ifnet *ifp, struct ifaddr *ifa)
{
ifa->ifa_ifp = ifp;
/*
* Check MP-safety for IFEF_MPSAFE drivers.
* Check !IFF_RUNNING for initialization routines that normally don't
* take IFNET_LOCK but it's safe because there is no competitor.
* XXX there are false positive cases because IFF_RUNNING can be off on
* if_stop.
*/
KASSERT(!if_is_mpsafe(ifp) || !ISSET(ifp->if_flags, IFF_RUNNING) ||
IFNET_LOCKED(ifp));
TAILQ_INSERT_TAIL(&ifp->if_addrlist, ifa, ifa_list);
IFADDR_ENTRY_INIT(ifa);
IFADDR_WRITER_INSERT_TAIL(ifp, ifa);
ifaref(ifa);
}
void
ifa_remove(struct ifnet *ifp, struct ifaddr *ifa)
{
KASSERT(ifa->ifa_ifp == ifp);
/*
* Check MP-safety for IFEF_MPSAFE drivers.
* if_is_deactivated indicates ifa_remove is called from if_detach
* where it is safe even if IFNET_LOCK isn't held.
*/
KASSERT(!if_is_mpsafe(ifp) || if_is_deactivated(ifp) ||
IFNET_LOCKED(ifp));
TAILQ_REMOVE(&ifp->if_addrlist, ifa, ifa_list);
IFADDR_WRITER_REMOVE(ifa);
#ifdef NET_MPSAFE
IFNET_GLOBAL_LOCK();
pserialize_perform(ifnet_psz);
IFNET_GLOBAL_UNLOCK();
#endif
#ifdef NET_MPSAFE
psref_target_destroy(&ifa->ifa_psref, ifa_psref_class);
#endif
IFADDR_ENTRY_DESTROY(ifa);
ifafree(ifa);
}
void
ifa_acquire(struct ifaddr *ifa, struct psref *psref)
{
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifa->ifa_psref, ifa_psref_class);
}
void
ifa_release(struct ifaddr *ifa, struct psref *psref)
{
if (ifa == NULL)
return;
psref_release(psref, &ifa->ifa_psref, ifa_psref_class);
}
bool
ifa_held(struct ifaddr *ifa)
{
return psref_held(&ifa->ifa_psref, ifa_psref_class);
}
static inline int
equal(const struct sockaddr *sa1, const struct sockaddr *sa2)
{
return sockaddr_cmp(sa1, sa2) == 0;
}
/*
* Locate an interface based on a complete address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithaddr(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != addr->sa_family)
continue;
if (equal(addr, ifa->ifa_addr))
return ifa;
if ((ifp->if_flags & IFF_BROADCAST) && ifa->ifa_broadaddr &&
/* IP6 doesn't have broadcast */
ifa->ifa_broadaddr->sa_len != 0 &&
equal(ifa->ifa_broadaddr, addr))
return ifa;
}
}
return NULL;
}
struct ifaddr *
ifa_ifwithaddr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s = pserialize_read_enter();
ifa = ifa_ifwithaddr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Locate the point to point interface with a given destination address.
*/
/*ARGSUSED*/
struct ifaddr *
ifa_ifwithdstaddr(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa;
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if ((ifp->if_flags & IFF_POINTOPOINT) == 0)
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != addr->sa_family ||
ifa->ifa_dstaddr == NULL)
continue;
if (equal(addr, ifa->ifa_dstaddr))
return ifa;
}
}
return NULL;
}
struct ifaddr *
ifa_ifwithdstaddr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithdstaddr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Find an interface on a specific network. If many, choice
* is most specific found.
*/
struct ifaddr *
ifa_ifwithnet(const struct sockaddr *addr)
{
struct ifnet *ifp;
struct ifaddr *ifa, *ifa_maybe = NULL;
const struct sockaddr_dl *sdl;
u_int af = addr->sa_family;
const char *addr_data = addr->sa_data, *cplim;
if (af == AF_LINK) {
sdl = satocsdl(addr);
if (sdl->sdl_index && sdl->sdl_index < if_indexlim && ifindex2ifnet[sdl->sdl_index] &&
!if_is_deactivated(ifindex2ifnet[sdl->sdl_index])) {
return ifindex2ifnet[sdl->sdl_index]->if_dl;
}
}
#ifdef NETATALK
if (af == AF_APPLETALK) {
const struct sockaddr_at *sat, *sat2;
sat = (const struct sockaddr_at *)addr;
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
ifa = at_ifawithnet((const struct sockaddr_at *)addr,
ifp);
if (ifa == NULL)
continue;
sat2 = (struct sockaddr_at *)ifa->ifa_addr;
if (sat2->sat_addr.s_net == sat->sat_addr.s_net)
return ifa; /* exact match */
if (ifa_maybe == NULL) {
/* else keep the if with the right range */
ifa_maybe = ifa;
}
}
return ifa_maybe;
}
#endif
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
const char *cp, *cp2, *cp3;
if (ifa->ifa_addr->sa_family != af ||
ifa->ifa_netmask == NULL)
next: continue;
cp = addr_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = (const char *)ifa->ifa_netmask +
ifa->ifa_netmask->sa_len;
while (cp3 < cplim) {
if ((*cp++ ^ *cp2++) & *cp3++) {
/* want to continue for() loop */
goto next;
}
}
if (ifa_maybe == NULL ||
rt_refines(ifa->ifa_netmask,
ifa_maybe->ifa_netmask))
ifa_maybe = ifa;
}
}
return ifa_maybe;
}
struct ifaddr *
ifa_ifwithnet_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithnet(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Find the interface of the address.
*/
struct ifaddr *
ifa_ifwithladdr(const struct sockaddr *addr)
{
struct ifaddr *ia;
if ((ia = ifa_ifwithaddr(addr)) || (ia = ifa_ifwithdstaddr(addr)) ||
(ia = ifa_ifwithnet(addr)))
return ia;
return NULL;
}
struct ifaddr *
ifa_ifwithladdr_psref(const struct sockaddr *addr, struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifa_ifwithladdr(addr);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Find an interface using a specific address family
*/
struct ifaddr *
ifa_ifwithaf(int af)
{
struct ifnet *ifp;
struct ifaddr *ifa = NULL;
int s;
s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family == af)
goto out;
}
}
out:
pserialize_read_exit(s);
return ifa;
}
/*
* Find an interface address specific to an interface best matching
* a given address.
*/
struct ifaddr *
ifaof_ifpforaddr(const struct sockaddr *addr, struct ifnet *ifp)
{
struct ifaddr *ifa;
const char *cp, *cp2, *cp3;
const char *cplim;
struct ifaddr *ifa_maybe = 0;
u_int af = addr->sa_family;
if (if_is_deactivated(ifp))
return NULL;
if (af >= AF_MAX)
return NULL;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != af)
continue;
ifa_maybe = ifa;
if (ifa->ifa_netmask == NULL) {
if (equal(addr, ifa->ifa_addr) || (ifa->ifa_dstaddr &&
equal(addr, ifa->ifa_dstaddr)))
return ifa;
continue;
}
cp = addr->sa_data;
cp2 = ifa->ifa_addr->sa_data;
cp3 = ifa->ifa_netmask->sa_data;
cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
for (; cp3 < cplim; cp3++) {
if ((*cp++ ^ *cp2++) & *cp3)
break;
}
if (cp3 == cplim)
return ifa;
}
return ifa_maybe;
}
struct ifaddr *
ifaof_ifpforaddr_psref(const struct sockaddr *addr, struct ifnet *ifp,
struct psref *psref)
{
struct ifaddr *ifa;
int s;
s = pserialize_read_enter();
ifa = ifaof_ifpforaddr(addr, ifp);
if (ifa != NULL)
ifa_acquire(ifa, psref);
pserialize_read_exit(s);
return ifa;
}
/*
* Default action when installing a route with a Link Level gateway.
* Lookup an appropriate real ifa to point to.
* This should be moved to /sys/net/link.c eventually.
*/
void
link_rtrequest(int cmd, struct rtentry *rt, const struct rt_addrinfo *info)
{
struct ifaddr *ifa;
const struct sockaddr *dst;
struct ifnet *ifp;
struct psref psref;
if (cmd != RTM_ADD || ISSET(info->rti_flags, RTF_DONTCHANGEIFA))
return;
ifp = rt->rt_ifa->ifa_ifp;
dst = rt_getkey(rt);
if ((ifa = ifaof_ifpforaddr_psref(dst, ifp, &psref)) != NULL) {
rt_replace_ifa(rt, ifa);
if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
ifa->ifa_rtrequest(cmd, rt, info);
ifa_release(ifa, &psref);
}
}
/*
* bitmask macros to manage a densely packed link_state change queue.
* Because we need to store LINK_STATE_UNKNOWN(0), LINK_STATE_DOWN(1) and
* LINK_STATE_UP(2) we need 2 bits for each state change.
* Since a stored state change can be 0 (LINK_STATE_UNKNOWN), an item with
* all bits set is treated as unset.
*/
#define LQ_ITEM_BITS 2
#define LQ_ITEM_MASK ((1 << LQ_ITEM_BITS) - 1)
#define LQ_MASK(i) (LQ_ITEM_MASK << (i) * LQ_ITEM_BITS)
#define LINK_STATE_UNSET LQ_ITEM_MASK
#define LQ_ITEM(q, i) (((q) & LQ_MASK((i))) >> (i) * LQ_ITEM_BITS)
#define LQ_STORE(q, i, v) \
do { \
(q) &= ~LQ_MASK((i)); \
(q) |= (v) << (i) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_MAX(q) ((sizeof((q)) * NBBY) / LQ_ITEM_BITS)
#define LQ_POP(q, v) \
do { \
(v) = LQ_ITEM((q), 0); \
(q) >>= LQ_ITEM_BITS; \
(q) |= LINK_STATE_UNSET << (LQ_MAX((q)) - 1) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_PUSH(q, v) \
do { \
(q) >>= LQ_ITEM_BITS; \
(q) |= (v) << (LQ_MAX((q)) - 1) * LQ_ITEM_BITS; \
} while (0 /* CONSTCOND */)
#define LQ_FIND_UNSET(q, i) \
for ((i) = 0; i < LQ_MAX((q)); (i)++) { \
if (LQ_ITEM((q), (i)) == LINK_STATE_UNSET) \
break; \
}
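/*
 * A worked example of the packing (illustrative sketch, with a hypothetical
 * 16-bit queue variable q standing in for if_link_queue and a uint8_t v):
 *
 * q = 0xffff;				every slot unset
 * LQ_STORE(q, 0, LINK_STATE_UP);	bits 0-1 become binary 10
 * LQ_STORE(q, 1, LINK_STATE_DOWN);	bits 2-3 become binary 01
 * LQ_POP(q, v);			v is now LINK_STATE_UP; the queue
 *					shifts down and the vacated top slot
 *					is refilled with LINK_STATE_UNSET
 */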
/*
* Handle a change in the interface link state and
* queue notifications.
*/
void
if_link_state_change(struct ifnet *ifp, int link_state)
{
int idx;
/* Ensure change is to a valid state */
switch (link_state) {
case LINK_STATE_UNKNOWN: /* FALLTHROUGH */
case LINK_STATE_DOWN: /* FALLTHROUGH */
case LINK_STATE_UP:
break;
default:
#ifdef DEBUG
printf("%s: invalid link state %d\n",
ifp->if_xname, link_state);
#endif
return;
}
IF_LINK_STATE_CHANGE_LOCK(ifp);
/* Find the last unset event in the queue. */
LQ_FIND_UNSET(ifp->if_link_queue, idx);
if (idx == 0) {
/*
* There is no queue of link state changes.
* As we have the lock we can safely compare against the
* current link state and return if the same.
* Otherwise, if scheduled is true then the interface is being
* detached and the queue is being drained so we need
* to avoid queuing more work.
*/
if (ifp->if_link_state == link_state ||
ifp->if_link_scheduled)
goto out;
} else {
/* Ensure link_state doesn't match the last queued state. */
if (LQ_ITEM(ifp->if_link_queue, idx - 1)
== (uint8_t)link_state)
goto out;
}
/* Handle queue overflow. */
if (idx == LQ_MAX(ifp->if_link_queue)) {
uint8_t lost;
/*
* The DOWN state must be protected from being pushed off
* the queue to ensure that userland will always be
* in a sane state.
* Because DOWN is protected, there is no need to protect
* UNKNOWN.
* It should be invalid to change from any other state to
* UNKNOWN anyway ...
*/
lost = LQ_ITEM(ifp->if_link_queue, 0);
LQ_PUSH(ifp->if_link_queue, (uint8_t)link_state);
if (lost == LINK_STATE_DOWN) {
lost = LQ_ITEM(ifp->if_link_queue, 0);
LQ_STORE(ifp->if_link_queue, 0, LINK_STATE_DOWN);
}
printf("%s: lost link state change %s\n",
ifp->if_xname,
lost == LINK_STATE_UP ? "UP" :
lost == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN");
} else
LQ_STORE(ifp->if_link_queue, idx, (uint8_t)link_state);
if (ifp->if_link_scheduled)
goto out;
ifp->if_link_scheduled = true;
workqueue_enqueue(ifnet_link_state_wq, &ifp->if_link_work, NULL);
out:
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
}
/*
* Handle interface link state change notifications.
*/
static void
if_link_state_change_process(struct ifnet *ifp, int link_state)
{
struct domain *dp;
const int s = splnet();
bool notify;
KASSERT(!cpu_intr_p());
IF_LINK_STATE_CHANGE_LOCK(ifp);
/* Ensure the change is still valid. */
if (ifp->if_link_state == link_state) {
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
splx(s);
return;
}
#ifdef DEBUG
log(LOG_DEBUG, "%s: link state %s (was %s)\n", ifp->if_xname,
link_state == LINK_STATE_UP ? "UP" :
link_state == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN",
ifp->if_link_state == LINK_STATE_UP ? "UP" :
ifp->if_link_state == LINK_STATE_DOWN ? "DOWN" :
"UNKNOWN");
#endif
/*
* When going from UNKNOWN to UP, we need to mark existing
* addresses as tentative and restart DAD as we may have
* erroneously not found a duplicate.
*
* This needs to happen before rt_ifmsg to avoid a race where
* listeners would have an address and expect it to work right
* away.
*/
notify = (link_state == LINK_STATE_UP &&
ifp->if_link_state == LINK_STATE_UNKNOWN);
ifp->if_link_state = link_state;
/* The following routines may sleep so release the spin mutex */
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
if (notify) {
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp,
LINK_STATE_DOWN);
}
}
/* Notify that the link state has changed. */
rt_ifmsg(ifp);
simplehook_dohooks(ifp->if_linkstate_hooks);
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp, link_state);
}
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
splx(s);
}
/*
* Process the interface link state change queue.
*/
static void
if_link_state_change_work(struct work *work, void *arg)
{
struct ifnet *ifp = container_of(work, struct ifnet, if_link_work);
uint8_t state;
KERNEL_LOCK_UNLESS_NET_MPSAFE();
const int s = splnet();
/*
* Pop a link state change from the queue and process it.
* If there is nothing to process then if_detach() has been called.
* We keep if_link_scheduled = true so the queue can safely drain
* without more work being queued.
*/
IF_LINK_STATE_CHANGE_LOCK(ifp);
LQ_POP(ifp->if_link_queue, state);
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
if (state == LINK_STATE_UNSET)
goto out;
if_link_state_change_process(ifp, state);
/* If there is a link state change to come, schedule it. */
IF_LINK_STATE_CHANGE_LOCK(ifp);
if (LQ_ITEM(ifp->if_link_queue, 0) != LINK_STATE_UNSET) {
ifp->if_link_scheduled = true;
workqueue_enqueue(ifnet_link_state_wq, &ifp->if_link_work,
NULL);
} else
ifp->if_link_scheduled = false;
IF_LINK_STATE_CHANGE_UNLOCK(ifp);
out:
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
void *
if_linkstate_change_establish(struct ifnet *ifp, void (*fn)(void *), void *arg)
{
khook_t *hk;
hk = simplehook_establish(ifp->if_linkstate_hooks, fn, arg);
return (void *)hk;
}
void
if_linkstate_change_disestablish(struct ifnet *ifp, void *vhook,
kmutex_t *lock)
{
simplehook_disestablish(ifp->if_linkstate_hooks, vhook, lock);
}
/*
* Used to mark addresses on an interface as DETACHED or TENTATIVE
* and thus start Duplicate Address Detection without changing the
* real link state.
*/
void
if_domain_link_state_change(struct ifnet *ifp, int link_state)
{
struct domain *dp;
const int s = splnet();
KERNEL_LOCK_UNLESS_NET_MPSAFE();
DOMAIN_FOREACH(dp) {
if (dp->dom_if_link_state_change != NULL)
dp->dom_if_link_state_change(ifp, link_state);
}
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
/*
* Default action when installing a local route on a point-to-point
* interface.
*/
void
p2p_rtrequest(int req, struct rtentry *rt,
__unused const struct rt_addrinfo *info)
{
struct ifnet *ifp = rt->rt_ifp;
struct ifaddr *ifa, *lo0ifa;
int s = pserialize_read_enter();
switch (req) {
case RTM_ADD:
if ((rt->rt_flags & RTF_LOCAL) == 0)
break;
rt->rt_ifp = lo0ifp;
if (ISSET(info->rti_flags, RTF_DONTCHANGEIFA))
break;
IFADDR_READER_FOREACH(ifa, ifp) {
if (equal(rt_getkey(rt), ifa->ifa_addr))
break;
}
if (ifa == NULL)
break;
/*
* Ensure lo0 has an address of the same family.
*/
IFADDR_READER_FOREACH(lo0ifa, lo0ifp) {
if (lo0ifa->ifa_addr->sa_family ==
ifa->ifa_addr->sa_family)
break;
}
if (lo0ifa == NULL)
break;
/*
* Make sure to set rt->rt_ifa to the interface
* address we are using, otherwise we will have trouble
* with source address selection.
*/
if (ifa != rt->rt_ifa)
rt_replace_ifa(rt, ifa);
break;
case RTM_DELETE:
default:
break;
}
pserialize_read_exit(s);
}
static void
_if_down(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct domain *dp;
struct psref psref;
ifp->if_flags &= ~IFF_UP;
nanotime(&ifp->if_lastchange);
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
pfctlinput(PRC_IFDOWN, ifa->ifa_addr);
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
	curlwp_bindx(bound);
	IFQ_PURGE(&ifp->if_snd);
#if NCARP > 0
	if (ifp->if_carp)
		carp_carpdev_state(ifp);
#endif
rt_ifmsg(ifp);
	DOMAIN_FOREACH(dp) {
		if (dp->dom_if_down)
			dp->dom_if_down(ifp);
	}
}
static void
if_down_deactivated(struct ifnet *ifp)
{
KASSERT(if_is_deactivated(ifp));
_if_down(ifp);
}
void
if_down_locked(struct ifnet *ifp)
{
KASSERT(IFNET_LOCKED(ifp));
_if_down(ifp);
}
/*
* Mark an interface down and notify protocols of
* the transition.
* NOTE: must be called at splsoftnet or equivalent.
*/
void
if_down(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
if_down_locked(ifp);
IFNET_UNLOCK(ifp);
}
/*
* Must be called with holding if_ioctl_lock.
*/
static void
if_up_locked(struct ifnet *ifp)
{
#ifdef notyet
struct ifaddr *ifa;
#endif
struct domain *dp;
	KASSERT(IFNET_LOCKED(ifp));
	KASSERT(!if_is_deactivated(ifp));
ifp->if_flags |= IFF_UP;
nanotime(&ifp->if_lastchange);
#ifdef notyet
/* this has no effect on IP, and will kill all ISO connections XXX */
IFADDR_READER_FOREACH(ifa, ifp)
pfctlinput(PRC_IFUP, ifa->ifa_addr);
#endif
#if NCARP > 0
	if (ifp->if_carp)
		carp_carpdev_state(ifp);
#endif
rt_ifmsg(ifp);
	DOMAIN_FOREACH(dp) {
		if (dp->dom_if_up)
			dp->dom_if_up(ifp);
	}
}
/*
* Handle interface slowtimo timer routine. Called
* from softclock, we decrement timer (if set) and
* call the appropriate interface routine on expiration.
*/
static bool
if_slowtimo_countdown(struct ifnet *ifp)
{
bool fire = false;
const int s = splnet();
KERNEL_LOCK(1, NULL);
if (ifp->if_timer != 0 && --ifp->if_timer == 0)
fire = true;
KERNEL_UNLOCK_ONE(NULL);
splx(s);
return fire;
}
static void
if_slowtimo_intr(void *arg)
{
struct ifnet *ifp = arg;
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
mutex_enter(&isd->isd_lock);
if (!isd->isd_dying) {
if (isd->isd_trigger || if_slowtimo_countdown(ifp)) {
if (!isd->isd_queued) {
isd->isd_queued = true;
workqueue_enqueue(if_slowtimo_wq,
&isd->isd_work, NULL);
}
} else
callout_schedule(&isd->isd_ch, hz / IFNET_SLOWHZ);
}
mutex_exit(&isd->isd_lock);
}
static void
if_slowtimo_work(struct work *work, void *arg)
{
struct if_slowtimo_data *isd =
container_of(work, struct if_slowtimo_data, isd_work);
struct ifnet *ifp = isd->isd_ifp;
const int s = splnet();
KERNEL_LOCK(1, NULL);
(*ifp->if_slowtimo)(ifp);
KERNEL_UNLOCK_ONE(NULL);
splx(s);
mutex_enter(&isd->isd_lock);
if (isd->isd_trigger) {
isd->isd_trigger = false;
printf("%s: watchdog triggered\n", ifp->if_xname);
}
isd->isd_queued = false;
if (!isd->isd_dying)
callout_schedule(&isd->isd_ch, hz / IFNET_SLOWHZ);
mutex_exit(&isd->isd_lock);
}
static int
sysctl_if_watchdog(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
struct ifnet *ifp = node.sysctl_data;
struct if_slowtimo_data *isd = ifp->if_slowtimo_data;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg) {
mutex_enter(&isd->isd_lock);
KASSERT(!isd->isd_dying);
isd->isd_trigger = true;
callout_schedule(&isd->isd_ch, 0);
mutex_exit(&isd->isd_lock);
}
return 0;
}
static void
sysctl_watchdog_setup(struct ifnet *ifp)
{
struct sysctllog **clog = &ifp->if_sysctl_log;
const struct sysctlnode *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, ifp->if_xname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "watchdog",
SYSCTL_DESCR("Interface watchdog controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "trigger",
SYSCTL_DESCR("Trigger watchdog timeout"),
sysctl_if_watchdog, 0, (int *)ifp, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl watchdog nodes\n", ifp->if_xname);
}
/*
* Mark an interface up and notify protocols of
* the transition.
* NOTE: must be called at splsoftnet or equivalent.
*/
void
if_up(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
if_up_locked(ifp);
IFNET_UNLOCK(ifp);
}
/*
* Set/clear promiscuous mode on interface ifp based on the truth value
* of pswitch. The calls are reference counted so that only the first
* "on" request actually has an effect, as does the final "off" request.
* Results are undefined if the "off" and "on" requests are not matched.
*/
int
ifpromisc_locked(struct ifnet *ifp, int pswitch)
{
int pcount, ret = 0;
u_short nflags;
KASSERT(IFNET_LOCKED(ifp));
pcount = ifp->if_pcount;
if (pswitch) {
/*
* Allow the device to be "placed" into promiscuous
* mode even if it is not configured up. It will
* consult IFF_PROMISC when it is brought up.
*/
if (ifp->if_pcount++ != 0)
goto out;
nflags = ifp->if_flags | IFF_PROMISC;
} else {
if (--ifp->if_pcount > 0)
goto out;
nflags = ifp->if_flags & ~IFF_PROMISC;
}
ret = if_flags_set(ifp, nflags);
/* Restore interface state if not successful. */
if (ret != 0)
ifp->if_pcount = pcount;
out:
return ret;
}
int
ifpromisc(struct ifnet *ifp, int pswitch)
{
int e;
IFNET_LOCK(ifp);
e = ifpromisc_locked(ifp, pswitch);
IFNET_UNLOCK(ifp);
return e;
}
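/*
 * A minimal sketch of the reference counting described above: with two
 * independent consumers, only the first "on" and the last "off" reach
 * the device (ifp is assumed to be a valid, referenced interface).
 */
#if 0	/* illustrative sketch only; not compiled */
static void
promisc_refcount_example(struct ifnet *ifp)
{
	(void)ifpromisc(ifp, 1);	/* sets IFF_PROMISC on the device */
	(void)ifpromisc(ifp, 1);	/* only bumps if_pcount */

	(void)ifpromisc(ifp, 0);	/* only drops if_pcount */
	(void)ifpromisc(ifp, 0);	/* clears IFF_PROMISC on the device */
}
#endif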
/*
* if_ioctl(ifp, cmd, data)
*
* Apply an ioctl command to the interface. Returns 0 on success,
* nonzero errno(3) number on failure.
*
* For SIOCADDMULTI/SIOCDELMULTI, caller need not hold locks -- it
* is the driver's responsibility to take any internal locks.
* (Kernel logic should generally invoke these only through
* if_mcast_op.)
*
* For all other ioctls, caller must hold ifp->if_ioctl_lock,
* a.k.a. IFNET_LOCK. May sleep.
*/
int
if_ioctl(struct ifnet *ifp, u_long cmd, void *data)
{
switch (cmd) {
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
default:
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
}
return (*ifp->if_ioctl)(ifp, cmd, data);
}
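/*
 * A minimal sketch of the locking contract above, assuming the caller
 * already holds a psref reference on ifp: anything other than the
 * multicast commands is bracketed by IFNET_LOCK/IFNET_UNLOCK.
 */
#if 0	/* illustrative sketch only; not compiled */
static int
if_ioctl_example(struct ifnet *ifp, struct ifreq *ifr)
{
	int error;

	IFNET_LOCK(ifp);
	error = if_ioctl(ifp, SIOCSIFMTU, ifr);
	IFNET_UNLOCK(ifp);

	return error;
}
#endif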
/*
* if_init(ifp)
*
* Prepare the hardware underlying ifp to process packets
* according to its current configuration. Returns 0 on success,
* nonzero errno(3) number on failure.
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
int
if_init(struct ifnet *ifp)
{
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
return (*ifp->if_init)(ifp);
}
/*
* if_stop(ifp, disable)
*
* Stop the hardware underlying ifp from processing packets.
*
* If disable is true, ... XXX(?)
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
void
if_stop(struct ifnet *ifp, int disable)
{
KASSERTMSG(IFNET_LOCKED(ifp), "%s", ifp->if_xname);
(*ifp->if_stop)(ifp, disable);
}
/*
* Map interface name to
* interface structure pointer.
*/
struct ifnet *
ifunit(const char *name)
{
struct ifnet *ifp;
const char *cp = name;
u_int unit = 0;
u_int i;
/*
* If the entire name is a number, treat it as an ifindex.
*/
for (i = 0; i < IFNAMSIZ && *cp >= '0' && *cp <= '9'; i++, cp++)
unit = unit * 10 + (*cp - '0');
/*
* If the number took all of the name, then it's a valid ifindex.
*/
	if (i == IFNAMSIZ || (cp != name && *cp == '\0'))
		return if_byindex(unit);
ifp = NULL;
const int s = pserialize_read_enter();
	IFNET_READER_FOREACH(ifp) {
		if (if_is_deactivated(ifp))
			continue;
if (strcmp(ifp->if_xname, name) == 0)
goto out;
}
out:
pserialize_read_exit(s);
return ifp;
}
/*
* Get a reference of an ifnet object by an interface name.
* The returned reference is protected by psref(9). The caller
* must release a returned reference by if_put after use.
*/
struct ifnet *
if_get(const char *name, struct psref *psref)
{
struct ifnet *ifp;
const char *cp = name;
u_int unit = 0;
u_int i;
/*
* If the entire name is a number, treat it as an ifindex.
*/
for (i = 0; i < IFNAMSIZ && *cp >= '0' && *cp <= '9'; i++, cp++)
unit = unit * 10 + (*cp - '0');
/*
* If the number took all of the name, then it's a valid ifindex.
*/
	if (i == IFNAMSIZ || (cp != name && *cp == '\0'))
		return if_get_byindex(unit, psref);
ifp = NULL;
const int s = pserialize_read_enter();
	IFNET_READER_FOREACH(ifp) {
		if (if_is_deactivated(ifp))
			continue;
if (strcmp(ifp->if_xname, name) == 0) {
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifp->if_psref,
ifnet_psref_class);
goto out;
}
}
out:
pserialize_read_exit(s);
return ifp;
}
/*
* Release a reference of an ifnet object given by if_get, if_get_byindex
* or if_get_bylla.
*/
void
if_put(const struct ifnet *ifp, struct psref *psref)
{
	if (ifp == NULL)
		return;
psref_release(psref, &ifp->if_psref, ifnet_psref_class);
}
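/*
 * A minimal sketch of the lookup pattern above, assuming thread context:
 * the lwp is bound for psref(9), and "wm0" is just a made-up name.
 */
#if 0	/* illustrative sketch only; not compiled */
static void
if_get_example(void)
{
	struct ifnet *ifp;
	struct psref psref;
	const int bound = curlwp_bind();

	ifp = if_get("wm0", &psref);
	if (ifp != NULL) {
		/* ifp cannot go away while the psref is held */
		if_put(ifp, &psref);
	}
	curlwp_bindx(bound);
}
#endif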
/*
* Return ifp having idx. Return NULL if not found. Normally if_byindex
* should be used.
*/
ifnet_t *
_if_byindex(u_int idx)
{
return (__predict_true(idx < if_indexlim)) ? ifindex2ifnet[idx] : NULL;
}
/*
* Return ifp having idx. Return NULL if not found or the found ifp is
* already deactivated.
*/
ifnet_t *
if_byindex(u_int idx)
{
ifnet_t *ifp;
	ifp = _if_byindex(idx);
	if (ifp != NULL && if_is_deactivated(ifp))
		ifp = NULL;
return ifp;
}
/*
* Get a reference of an ifnet object by an interface index.
* The returned reference is protected by psref(9). The caller
* must release a returned reference by if_put after use.
*/
ifnet_t *
if_get_byindex(u_int idx, struct psref *psref)
{
ifnet_t *ifp;
const int s = pserialize_read_enter();
ifp = if_byindex(idx);
if (__predict_true(ifp != NULL)) {
PSREF_DEBUG_FILL_RETURN_ADDRESS(psref);
psref_acquire(psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
return ifp;
}
ifnet_t *
if_get_bylla(const void *lla, unsigned char lla_len, struct psref *psref)
{
ifnet_t *ifp;
const int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
if (if_is_deactivated(ifp))
continue;
if (ifp->if_addrlen != lla_len)
continue;
if (memcmp(lla, CLLADDR(ifp->if_sadl), lla_len) == 0) {
psref_acquire(psref, &ifp->if_psref,
ifnet_psref_class);
break;
}
}
pserialize_read_exit(s);
return ifp;
}
/*
 * Note that this is safe only if the passed ifp is guaranteed not to be
 * freed, for example because the caller is in a pserialize read section,
 * already holds the ifp, or holds some other object that indirectly keeps
 * the ifp from being freed.
*/
void
if_acquire(struct ifnet *ifp, struct psref *psref)
{
	KASSERT(ifp->if_index != 0);
psref_acquire(psref, &ifp->if_psref, ifnet_psref_class);
}
bool
if_held(struct ifnet *ifp)
{
return psref_held(&ifp->if_psref, ifnet_psref_class);
}
/*
* Some tunnel interfaces can nest, e.g. IPv4 over IPv4 gif(4) tunnel over
* IPv4. Check the tunnel nesting count.
 * Return nonzero (EIO) if the tunnel nesting count exceeds the limit.
 * Return 0 if the tunnel nesting count is at or below the limit.
*/
int
if_tunnel_check_nesting(struct ifnet *ifp, struct mbuf *m, int limit)
{
struct m_tag *mtag;
int *count;
mtag = m_tag_find(m, PACKET_TAG_TUNNEL_INFO);
if (mtag != NULL) {
count = (int *)(mtag + 1);
if (++(*count) > limit) {
log(LOG_NOTICE,
"%s: recursively called too many times(%d)\n",
ifp->if_xname, *count);
return EIO;
}
} else {
mtag = m_tag_get(PACKET_TAG_TUNNEL_INFO, sizeof(*count),
M_NOWAIT);
if (mtag != NULL) {
m_tag_prepend(m, mtag);
count = (int *)(mtag + 1);
*count = 0;
} else {
log(LOG_DEBUG, "%s: m_tag_get() failed, "
"recursion calls are not prevented.\n",
ifp->if_xname);
}
}
return 0;
}
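/*
 * A minimal sketch of how a tunnel output path might use the check
 * above; the limit and the surrounding driver are hypothetical, only
 * the call pattern comes from this function.
 */
#if 0	/* illustrative sketch only; not compiled */
static int
tunnel_output_example(struct ifnet *ifp, struct mbuf *m, int max_nesting)
{
	int error;

	error = if_tunnel_check_nesting(ifp, m, max_nesting);
	if (error != 0) {
		m_freem(m);
		return error;
	}
	/* ... encapsulate m and transmit it on the outer interface ... */
	return 0;
}
#endif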
static void
if_tunnel_ro_init_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
tro->tr_ro = kmem_zalloc(sizeof(*tro->tr_ro), KM_SLEEP);
tro->tr_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}
static void
if_tunnel_ro_fini_pc(void *p, void *arg __unused, struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
rtcache_free(tro->tr_ro);
kmem_free(tro->tr_ro, sizeof(*tro->tr_ro));
mutex_obj_free(tro->tr_lock);
}
percpu_t *
if_tunnel_alloc_ro_percpu(void)
{
return percpu_create(sizeof(struct tunnel_ro),
if_tunnel_ro_init_pc, if_tunnel_ro_fini_pc, NULL);
}
void
if_tunnel_free_ro_percpu(percpu_t *ro_percpu)
{
percpu_free(ro_percpu, sizeof(struct tunnel_ro));
}
static void
if_tunnel_rtcache_free_pc(void *p, void *arg __unused,
struct cpu_info *ci __unused)
{
struct tunnel_ro *tro = p;
mutex_enter(tro->tr_lock);
rtcache_free(tro->tr_ro);
mutex_exit(tro->tr_lock);
}
void
if_tunnel_ro_percpu_rtcache_free(percpu_t *ro_percpu)
{
percpu_foreach(ro_percpu, if_tunnel_rtcache_free_pc, NULL);
}
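/*
 * A minimal sketch of how a tunnel driver is expected to use the
 * per-CPU route cache allocated above: take the per-CPU reference,
 * serialize on tr_lock while touching tr_ro, then drop both.  The
 * route lookup itself is elided.
 */
#if 0	/* illustrative sketch only; not compiled */
static void
tunnel_ro_example(percpu_t *ro_percpu)
{
	struct tunnel_ro *tro;

	tro = percpu_getref(ro_percpu);
	mutex_enter(tro->tr_lock);
	/* ... use the cached route in tro->tr_ro ... */
	mutex_exit(tro->tr_lock);
	percpu_putref(ro_percpu);
}
#endif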
void
if_export_if_data(ifnet_t * const ifp, struct if_data *ifi, bool zero_stats)
{
/* Collect the volatile stats first; this zeros *ifi. */
if_stats_to_if_data(ifp, ifi, zero_stats);
ifi->ifi_type = ifp->if_type;
ifi->ifi_addrlen = ifp->if_addrlen;
ifi->ifi_hdrlen = ifp->if_hdrlen;
ifi->ifi_link_state = ifp->if_link_state;
ifi->ifi_mtu = ifp->if_mtu;
ifi->ifi_metric = ifp->if_metric;
ifi->ifi_baudrate = ifp->if_baudrate;
ifi->ifi_lastchange = ifp->if_lastchange;
}
/* common */
int
ifioctl_common(struct ifnet *ifp, u_long cmd, void *data)
{
struct ifreq *ifr;
struct ifcapreq *ifcr;
struct ifdatareq *ifdr;
unsigned short flags;
char *descr;
int error;
switch (cmd) {
case SIOCSIFCAP:
ifcr = data;
if ((ifcr->ifcr_capenable & ~ifp->if_capabilities) != 0)
return EINVAL;
if (ifcr->ifcr_capenable == ifp->if_capenable)
return 0;
ifp->if_capenable = ifcr->ifcr_capenable;
/* Pre-compute the checksum flags mask. */
ifp->if_csum_flags_tx = 0;
ifp->if_csum_flags_rx = 0;
		if (ifp->if_capenable & IFCAP_CSUM_IPv4_Tx)
			ifp->if_csum_flags_tx |= M_CSUM_IPv4;
		if (ifp->if_capenable & IFCAP_CSUM_IPv4_Rx)
			ifp->if_csum_flags_rx |= M_CSUM_IPv4;

		if (ifp->if_capenable & IFCAP_CSUM_TCPv4_Tx)
			ifp->if_csum_flags_tx |= M_CSUM_TCPv4;
		if (ifp->if_capenable & IFCAP_CSUM_TCPv4_Rx)
			ifp->if_csum_flags_rx |= M_CSUM_TCPv4;

		if (ifp->if_capenable & IFCAP_CSUM_UDPv4_Tx)
			ifp->if_csum_flags_tx |= M_CSUM_UDPv4;
		if (ifp->if_capenable & IFCAP_CSUM_UDPv4_Rx)
			ifp->if_csum_flags_rx |= M_CSUM_UDPv4;

		if (ifp->if_capenable & IFCAP_CSUM_TCPv6_Tx)
			ifp->if_csum_flags_tx |= M_CSUM_TCPv6;
		if (ifp->if_capenable & IFCAP_CSUM_TCPv6_Rx)
			ifp->if_csum_flags_rx |= M_CSUM_TCPv6;

		if (ifp->if_capenable & IFCAP_CSUM_UDPv6_Tx)
			ifp->if_csum_flags_tx |= M_CSUM_UDPv6;
		if (ifp->if_capenable & IFCAP_CSUM_UDPv6_Rx)
			ifp->if_csum_flags_rx |= M_CSUM_UDPv6;

		if (ifp->if_capenable & IFCAP_TSOv4)
ifp->if_csum_flags_tx |= M_CSUM_TSOv4;
if (ifp->if_capenable & IFCAP_TSOv6)
ifp->if_csum_flags_tx |= M_CSUM_TSOv6;
#if NBRIDGE > 0
		if (ifp->if_bridge != NULL)
			bridge_calc_csum_flags(ifp->if_bridge);
#endif
if (ifp->if_flags & IFF_UP)
return ENETRESET;
return 0;
case SIOCSIFFLAGS:
ifr = data;
/*
* If if_is_mpsafe(ifp), KERNEL_LOCK isn't held here, but if_up
* and if_down aren't MP-safe yet, so we must hold the lock.
*/
		KERNEL_LOCK_IF_IFP_MPSAFE(ifp);
		if (ifp->if_flags & IFF_UP &&
		    (ifr->ifr_flags & IFF_UP) == 0) {
const int s = splsoftnet();
if_down_locked(ifp);
splx(s);
}
		if (ifr->ifr_flags & IFF_UP &&
		    (ifp->if_flags & IFF_UP) == 0) {
			const int s = splsoftnet();
if_up_locked(ifp);
splx(s);
}
KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp);
flags = (ifp->if_flags & IFF_CANTCHANGE) |
(ifr->ifr_flags &~ IFF_CANTCHANGE);
		if (ifp->if_flags != flags) {
			ifp->if_flags = flags;
/* Notify that the flags have changed. */
rt_ifmsg(ifp);
}
break;
case SIOCGIFFLAGS:
ifr = data;
ifr->ifr_flags = ifp->if_flags;
break;
case SIOCGIFMETRIC:
ifr = data;
ifr->ifr_metric = ifp->if_metric;
break;
case SIOCGIFMTU:
ifr = data;
ifr->ifr_mtu = ifp->if_mtu;
break;
case SIOCGIFDLT:
ifr = data;
ifr->ifr_dlt = ifp->if_dlt;
break;
case SIOCGIFCAP:
ifcr = data;
ifcr->ifcr_capabilities = ifp->if_capabilities;
ifcr->ifcr_capenable = ifp->if_capenable;
break;
case SIOCSIFMETRIC:
ifr = data;
ifp->if_metric = ifr->ifr_metric;
break;
case SIOCGIFDATA:
ifdr = data;
if_export_if_data(ifp, &ifdr->ifdr_data, false);
break;
case SIOCGIFINDEX:
ifr = data;
ifr->ifr_index = ifp->if_index;
break;
case SIOCZIFDATA:
ifdr = data;
if_export_if_data(ifp, &ifdr->ifdr_data, true);
getnanotime(&ifp->if_lastchange);
break;
case SIOCSIFMTU:
ifr = data;
if (ifp->if_mtu == ifr->ifr_mtu)
break;
ifp->if_mtu = ifr->ifr_mtu;
return ENETRESET;
case SIOCSIFDESCR:
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd),
NULL);
if (error)
return error;
ifr = data;
if (ifr->ifr_buflen > IFDESCRSIZE)
return ENAMETOOLONG;
if (ifr->ifr_buf == NULL || ifr->ifr_buflen == 0) {
/* unset description */
descr = NULL;
} else {
descr = kmem_zalloc(IFDESCRSIZE, KM_SLEEP);
/*
* copy (IFDESCRSIZE - 1) bytes to ensure
* terminating nul
*/
error = copyin(ifr->ifr_buf, descr, IFDESCRSIZE - 1);
			if (error) {
				kmem_free(descr, IFDESCRSIZE);
return error;
}
}
		if (ifp->if_description != NULL)
			kmem_free(ifp->if_description, IFDESCRSIZE);
ifp->if_description = descr;
break;
case SIOCGIFDESCR:
ifr = data;
descr = ifp->if_description;
if (descr == NULL)
return ENOMSG;
if (ifr->ifr_buflen < IFDESCRSIZE)
return EINVAL;
error = copyout(descr, ifr->ifr_buf, IFDESCRSIZE);
if (error)
return error;
break;
default:
return ENOTTY;
}
return 0;
}
int
ifaddrpref_ioctl(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
struct if_addrprefreq *ifap = (struct if_addrprefreq *)data;
struct ifaddr *ifa;
const struct sockaddr *any, *sa;
union {
struct sockaddr sa;
struct sockaddr_storage ss;
} u, v;
int s, error = 0;
switch (cmd) {
case SIOCSIFADDRPREF:
error = kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, KAUTH_ARG(cmd),
NULL);
if (error)
return error;
break;
case SIOCGIFADDRPREF:
break;
default:
return EOPNOTSUPP;
}
/* sanity checks */
if (data == NULL || ifp == NULL) {
panic("invalid argument to %s", __func__);
/*NOTREACHED*/
}
/* address must be specified on ADD and DELETE */
sa = sstocsa(&ifap->ifap_addr);
if (sa->sa_family != sofamily(so))
return EINVAL;
if ((any = sockaddr_any(sa)) == NULL || sa->sa_len != any->sa_len)
return EINVAL;
sockaddr_externalize(&v.sa, sizeof(v.ss), sa);
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != sa->sa_family)
continue;
sockaddr_externalize(&u.sa, sizeof(u.ss), ifa->ifa_addr);
if (sockaddr_cmp(&u.sa, &v.sa) == 0)
break;
}
if (ifa == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
switch (cmd) {
case SIOCSIFADDRPREF:
ifa->ifa_preference = ifap->ifap_preference;
goto out;
case SIOCGIFADDRPREF:
/* fill in the if_laddrreq structure */
(void)sockaddr_copy(sstosa(&ifap->ifap_addr),
sizeof(ifap->ifap_addr), ifa->ifa_addr);
ifap->ifap_preference = ifa->ifa_preference;
goto out;
default:
error = EOPNOTSUPP;
}
out:
pserialize_read_exit(s);
return error;
}
/*
* Interface ioctls.
*/
static int
doifioctl(struct socket *so, u_long cmd, void *data, struct lwp *l)
{
struct ifnet *ifp;
struct ifreq *ifr;
int error = 0;
u_long ocmd = cmd;
u_short oif_flags;
struct ifreq ifrb;
struct oifreq *oifr = NULL;
int r;
struct psref psref;
bool do_if43_post = false;
bool do_ifm80_post = false;
switch (cmd) {
case SIOCGIFCONF:
return ifconf(cmd, data);
case SIOCINITIFADDR:
return EPERM;
default:
MODULE_HOOK_CALL(uipc_syscalls_40_hook, (cmd, data), enosys(),
error);
if (error != ENOSYS)
return error;
MODULE_HOOK_CALL(uipc_syscalls_50_hook, (l, cmd, data),
enosys(), error);
if (error != ENOSYS)
return error;
error = 0;
break;
}
ifr = data;
/* Pre-conversion */
	MODULE_HOOK_CALL(if_cvtcmd_43_hook, (&cmd, ocmd), enosys(), error);
	if (cmd != ocmd) {
		oifr = data;
data = ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
do_if43_post = true;
}
MODULE_HOOK_CALL(ifmedia_80_pre_hook, (ifr, &cmd, &do_ifm80_post),
enosys(), error);
switch (cmd) {
case SIOCIFCREATE:
case SIOCIFDESTROY: {
const int bound = curlwp_bind();
if (l != NULL) {
ifp = if_get(ifr->ifr_name, &psref);
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
KAUTH_ARG(cmd), NULL);
			if (ifp != NULL)
				if_put(ifp, &psref);
			if (error != 0) {
				curlwp_bindx(bound);
return error;
}
}
KERNEL_LOCK_UNLESS_NET_MPSAFE();
mutex_enter(&if_clone_mtx);
r = (cmd == SIOCIFCREATE) ?
if_clone_create(ifr->ifr_name) : if_clone_destroy(ifr->ifr_name);
mutex_exit(&if_clone_mtx);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
curlwp_bindx(bound);
return r;
}
case SIOCIFGCLONERS: {
struct if_clonereq *req = (struct if_clonereq *)data;
return if_clone_list(req->ifcr_count, req->ifcr_buffer,
&req->ifcr_total);
}
}
if ((cmd & IOC_IN) == 0 || IOCPARM_LEN(cmd) < sizeof(ifr->ifr_name))
return EINVAL;
const int bound = curlwp_bind();
ifp = if_get(ifr->ifr_name, &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
return ENXIO;
}
switch (cmd) {
case SIOCALIFADDR:
case SIOCDLIFADDR:
case SIOCSIFADDRPREF:
case SIOCSIFFLAGS:
case SIOCSIFCAP:
case SIOCSIFMETRIC:
case SIOCZIFDATA:
case SIOCSIFMTU:
case SIOCSIFPHYADDR:
case SIOCDIFPHYADDR:
#ifdef INET6
case SIOCSIFPHYADDR_IN6:
#endif
case SIOCSLIFPHYADDR:
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCSETHERCAP:
case SIOCSIFMEDIA:
case SIOCSDRVSPEC:
case SIOCG80211:
case SIOCS80211:
case SIOCS80211NWID:
case SIOCS80211NWKEY:
case SIOCS80211POWER:
case SIOCS80211BSSID:
case SIOCS80211CHANNEL:
case SIOCSLINKSTR:
if (l != NULL) {
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
KAUTH_ARG(cmd), NULL);
if (error != 0)
goto out;
}
}
oif_flags = ifp->if_flags;
KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp);
IFNET_LOCK(ifp);
	error = if_ioctl(ifp, cmd, data);
	if (error != ENOTTY)
		;
else if (so->so_proto == NULL)
error = EOPNOTSUPP;
else {
		KERNEL_LOCK_IF_IFP_MPSAFE(ifp);
		MODULE_HOOK_CALL(if_ifioctl_43_hook,
(so, ocmd, cmd, data, l), enosys(), error);
		if (error == ENOSYS)
			error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so,
cmd, data, ifp);
KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp);
}
if (((oif_flags ^ ifp->if_flags) & IFF_UP) != 0) {
if ((ifp->if_flags & IFF_UP) != 0) {
const int s = splsoftnet();
if_up_locked(ifp);
splx(s);
}
}
/* Post-conversion */
	if (do_ifm80_post && (error == 0))
		MODULE_HOOK_CALL(ifmedia_80_post_hook, (ifr, cmd),
enosys(), error);
	if (do_if43_post)
		IFREQN2O_43(oifr, ifr);
IFNET_UNLOCK(ifp);
KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp);
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Return interface configuration
* of system. List may be used
* in later ioctl's (above) to get
* other information.
*
* Each record is a struct ifreq. Before the addition of
* sockaddr_storage, the API rule was that sockaddr flavors that did
* not fit would extend beyond the struct ifreq, with the next struct
* ifreq starting sa_len beyond the struct sockaddr. Because the
* union in struct ifreq includes struct sockaddr_storage, every kind
* of sockaddr must fit. Thus, there are no longer any overlength
* records.
*
* Records are added to the user buffer if they fit, and ifc_len is
* adjusted to the length that was written. Thus, the user is only
* assured of getting the complete list if ifc_len on return is at
* least sizeof(struct ifreq) less than it was on entry.
*
* If the user buffer pointer is NULL, this routine copies no data and
* returns the amount of space that would be needed.
*
* Invariants:
* ifrp points to the next part of the user's buffer to be used. If
* ifrp != NULL, space holds the number of bytes remaining that we may
* write at ifrp. Otherwise, space holds the number of bytes that
* would have been written had there been adequate space.
*/
/*ARGSUSED*/
static int
ifconf(u_long cmd, void *data)
{
struct ifconf *ifc = (struct ifconf *)data;
struct ifnet *ifp;
struct ifaddr *ifa;
struct ifreq ifr, *ifrp = NULL;
int space = 0, error = 0;
const int sz = (int)sizeof(struct ifreq);
const bool docopy = ifc->ifc_req != NULL;
struct psref psref;
	if (docopy) {
		if (ifc->ifc_len < 0)
return EINVAL;
space = ifc->ifc_len;
ifrp = ifc->ifc_req;
}
memset(&ifr, 0, sizeof(ifr));
const int bound = curlwp_bind();
int s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
psref_acquire(&psref, &ifp->if_psref, ifnet_psref_class);
pserialize_read_exit(s);
(void)strncpy(ifr.ifr_name, ifp->if_xname,
sizeof(ifr.ifr_name));
if (ifr.ifr_name[sizeof(ifr.ifr_name) - 1] != '\0') {
error = ENAMETOOLONG;
goto release_exit;
}
if (IFADDR_READER_EMPTY(ifp)) {
/* Interface with no addresses - send zero sockaddr. */
memset(&ifr.ifr_addr, 0, sizeof(ifr.ifr_addr));
if (!docopy) {
space += sz;
goto next;
}
if (space >= sz) {
error = copyout(&ifr, ifrp, sz);
if (error != 0)
goto release_exit;
ifrp++;
space -= sz;
}
}
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct sockaddr *sa = ifa->ifa_addr;
/* all sockaddrs must fit in sockaddr_storage */
KASSERT(sa->sa_len <= sizeof(ifr.ifr_ifru));
if (!docopy) {
space += sz;
continue;
}
memcpy(&ifr.ifr_space, sa, sa->sa_len);
pserialize_read_exit(s);
if (space >= sz) {
error = copyout(&ifr, ifrp, sz);
if (error != 0)
goto release_exit;
				ifrp++;
				space -= sz;
}
s = pserialize_read_enter();
}
pserialize_read_exit(s);
next:
s = pserialize_read_enter();
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
if (docopy) {
KASSERT(0 <= space && space <= ifc->ifc_len);
ifc->ifc_len -= space;
} else {
KASSERT(space >= 0);
ifc->ifc_len = space;
}
return 0;
release_exit:
psref_release(&psref, &ifp->if_psref, ifnet_psref_class);
curlwp_bindx(bound);
return error;
}
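/*
 * A minimal userland sketch of the sizing behaviour documented above:
 * a NULL ifc_req reports the space needed, a second call copies the
 * records out.  (Hypothetical standalone program; error handling and
 * cleanup are trimmed.)
 */
#if 0	/* illustrative userland sketch; not compiled here */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdlib.h>
#include <unistd.h>

static int
ifconf_example(void)
{
	struct ifconf ifc;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s == -1)
		return -1;
	ifc.ifc_len = 0;
	ifc.ifc_req = NULL;
	if (ioctl(s, SIOCGIFCONF, &ifc) == -1)	/* query size only */
		return -1;
	ifc.ifc_req = malloc(ifc.ifc_len);
	if (ifc.ifc_req == NULL || ioctl(s, SIOCGIFCONF, &ifc) == -1)
		return -1;
	/* ... walk ifc.ifc_len / sizeof(struct ifreq) records ... */
	free(ifc.ifc_req);
	close(s);
	return 0;
}
#endif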
int
ifreq_setaddr(u_long cmd, struct ifreq *ifr, const struct sockaddr *sa)
{
uint8_t len = sizeof(ifr->ifr_ifru.ifru_space);
struct ifreq ifrb;
struct oifreq *oifr = NULL;
u_long ocmd = cmd;
int hook;
	MODULE_HOOK_CALL(if_cvtcmd_43_hook, (&cmd, ocmd), enosys(), hook);
	if (hook != ENOSYS) {
if (cmd != ocmd) {
oifr = (struct oifreq *)(void *)ifr;
ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
len = sizeof(oifr->ifr_addr);
}
}
if (len < sa->sa_len)
return EFBIG;
memset(&ifr->ifr_addr, 0, len);
sockaddr_copy(&ifr->ifr_addr, len, sa);
	if (cmd != ocmd)
		IFREQN2O_43(oifr, ifr);
return 0;
}
/*
 * Wrapper function for drivers that do not provide if_transmit().
*/
static int
if_transmit(struct ifnet *ifp, struct mbuf *m)
{
int error;
size_t pktlen = m->m_pkthdr.len;
bool mcast = (m->m_flags & M_MCAST) != 0;
const int s = splnet();
IFQ_ENQUEUE(&ifp->if_snd, m, error);
if (error != 0) {
/* mbuf is already freed */
goto out;
}
net_stat_ref_t nsr = IF_STAT_GETREF(ifp);
if_statadd_ref(nsr, if_obytes, pktlen);
	if (mcast)
		if_statinc_ref(nsr, if_omcasts);
IF_STAT_PUTREF(ifp);
	if ((ifp->if_flags & IFF_OACTIVE) == 0)
		if_start_lock(ifp);
out:
splx(s);
return error;
}
int
if_transmit_lock(struct ifnet *ifp, struct mbuf *m)
{
int error;
kmsan_check_mbuf(m);
#ifdef ALTQ
KERNEL_LOCK(1, NULL);
if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
error = if_transmit(ifp, m);
KERNEL_UNLOCK_ONE(NULL);
} else {
KERNEL_UNLOCK_ONE(NULL);
error = (*ifp->if_transmit)(ifp, m);
/* mbuf is already freed */
}
#else /* !ALTQ */
error = (*ifp->if_transmit)(ifp, m);
/* mbuf is already freed */
#endif /* !ALTQ */
return error;
}
/*
* Queue message on interface, and start output if interface
* not yet active.
*/
int
ifq_enqueue(struct ifnet *ifp, struct mbuf *m)
{
return if_transmit_lock(ifp, m);
}
/*
* Queue message on interface, possibly using a second fast queue
*/
int
ifq_enqueue2(struct ifnet *ifp, struct ifqueue *ifq, struct mbuf *m)
{
int error = 0;
if (ifq != NULL
#ifdef ALTQ
&& ALTQ_IS_ENABLED(&ifp->if_snd) == 0
#endif
) {
if (IF_QFULL(ifq)) {
IF_DROP(&ifp->if_snd);
m_freem(m);
if (error == 0)
error = ENOBUFS;
} else
IF_ENQUEUE(ifq, m);
} else
IFQ_ENQUEUE(&ifp->if_snd, m, error);
if (error != 0) {
if_statinc(ifp, if_oerrors);
return error;
}
return 0;
}
int
if_addr_init(ifnet_t *ifp, struct ifaddr *ifa, const bool src)
{
int rc;
KASSERT(IFNET_LOCKED(ifp));
if (ifp->if_initaddr != NULL)
		rc = (*ifp->if_initaddr)(ifp, ifa, src);
	else if (src ||
	    (rc = if_ioctl(ifp, SIOCSIFDSTADDR, ifa)) == ENOTTY)
		rc = if_ioctl(ifp, SIOCINITIFADDR, ifa);
return rc;
}
int
if_do_dad(struct ifnet *ifp)
{
	if ((ifp->if_flags & IFF_LOOPBACK) != 0)
return 0;
switch (ifp->if_type) {
case IFT_FAITH:
/*
* These interfaces do not have the IFF_LOOPBACK flag,
* but loop packets back. We do not have to do DAD on such
* interfaces. We should even omit it, because loop-backed
* responses would confuse the DAD procedure.
*/
return 0;
default:
/*
* Our DAD routine requires the interface up and running.
* However, some interfaces can be up before the RUNNING
* status. Additionally, users may try to assign addresses
* before the interface becomes up (or running).
		 * We simply skip DAD in such cases as a workaround.
* XXX: we should rather mark "tentative" on such addresses,
* and do DAD after the interface becomes ready.
*/
if ((ifp->if_flags & (IFF_UP | IFF_RUNNING)) !=
(IFF_UP | IFF_RUNNING))
return 0;
return 1;
}
}
/*
* if_flags_set(ifp, flags)
*
* Ask ifp to change ifp->if_flags to flags, as if with the
* SIOCSIFFLAGS ioctl command.
*
* May sleep. Caller must hold ifp->if_ioctl_lock, a.k.a
* IFNET_LOCK.
*/
int
if_flags_set(ifnet_t *ifp, const u_short flags)
{
int rc;
KASSERT(IFNET_LOCKED(ifp));
if (ifp->if_setflags != NULL)
rc = (*ifp->if_setflags)(ifp, flags);
else {
u_short cantflags, chgdflags;
struct ifreq ifr;
chgdflags = ifp->if_flags ^ flags;
cantflags = chgdflags & IFF_CANTCHANGE;
if (cantflags != 0)
ifp->if_flags ^= cantflags;
/*
* Traditionally, we do not call if_ioctl after
* setting/clearing only IFF_PROMISC if the interface
* isn't IFF_UP. Uphold that tradition.
*/
if (chgdflags == IFF_PROMISC && (ifp->if_flags & IFF_UP) == 0)
return 0;
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_flags = flags & ~IFF_CANTCHANGE;
rc = if_ioctl(ifp, SIOCSIFFLAGS, &ifr);
if (rc != 0 && cantflags != 0)
ifp->if_flags ^= cantflags;
}
return rc;
}
/*
* if_mcast_op(ifp, cmd, sa)
*
* Apply a multicast command, SIOCADDMULTI/SIOCDELMULTI, to the
* interface. Returns 0 on success, nonzero errno(3) number on
* failure.
*
* May sleep.
*
* Use this, not if_ioctl, for the multicast commands.
*/
int
if_mcast_op(ifnet_t *ifp, const unsigned long cmd, const struct sockaddr *sa)
{
int rc;
struct ifreq ifr;
switch (cmd) {
case SIOCADDMULTI:
case SIOCDELMULTI:
break;
default:
panic("invalid ifnet multicast command: 0x%lx", cmd);
}
ifreq_setaddr(cmd, &ifr, sa);
rc = if_ioctl(ifp, cmd, &ifr);
return rc;
}
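/*
 * A minimal sketch of the intended use: sa is assumed to be the
 * multicast address the caller wants the interface to start (and
 * later stop) receiving.
 */
#if 0	/* illustrative sketch only; not compiled */
static int
mcast_example(struct ifnet *ifp, const struct sockaddr *sa)
{
	int error;

	error = if_mcast_op(ifp, SIOCADDMULTI, sa);	/* join */
	if (error != 0)
		return error;
	/* ... receive traffic ... */
	return if_mcast_op(ifp, SIOCDELMULTI, sa);	/* leave */
}
#endif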
static void
sysctl_sndq_setup(struct sysctllog **clog, const char *ifname,
struct ifaltq *ifq)
{
const struct sysctlnode *cnode, *rnode;
if (sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "interfaces",
SYSCTL_DESCR("Per-interface controls"),
NULL, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, ifname,
SYSCTL_DESCR("Interface controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sndq",
SYSCTL_DESCR("Interface output queue controls"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "len",
SYSCTL_DESCR("Current output queue length"),
NULL, 0, &ifq->ifq_len, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed output queue length"),
NULL, 0, &ifq->ifq_maxlen, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
if (sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Packets dropped due to full output queue"),
NULL, 0, &ifq->ifq_drops, 0,
CTL_CREATE, CTL_EOL) != 0)
goto bad;
return;
bad:
printf("%s: could not attach sysctl nodes\n", ifname);
return;
}
static int
if_sdl_sysctl(SYSCTLFN_ARGS)
{
struct ifnet *ifp;
const struct sockaddr_dl *sdl;
struct psref psref;
int error = 0;
if (namelen != 1)
return EINVAL;
const int bound = curlwp_bind();
ifp = if_get_byindex(name[0], &psref);
if (ifp == NULL) {
error = ENODEV;
goto out0;
}
sdl = ifp->if_sadl;
if (sdl == NULL) {
*oldlenp = 0;
goto out1;
}
if (oldp == NULL) {
*oldlenp = sdl->sdl_alen;
goto out1;
}
if (*oldlenp >= sdl->sdl_alen)
*oldlenp = sdl->sdl_alen;
error = sysctl_copyout(l, &sdl->sdl_data[sdl->sdl_nlen],
oldp, *oldlenp);
out1:
if_put(ifp, &psref);
out0:
curlwp_bindx(bound);
return error;
}
static void
if_sysctl_setup(struct sysctllog **clog)
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sdl",
SYSCTL_DESCR("Get active link-layer address"),
if_sdl_sysctl, 0, NULL, 0,
CTL_NET, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: cpu.h,v 1.72 2023/09/04 20:58:52 mrg Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)cpu.h 5.4 (Berkeley) 5/9/91
*/
#ifndef _AMD64_CPU_H_
#define _AMD64_CPU_H_
#ifdef __x86_64__
#include <x86/cpu.h>
#ifdef _KERNEL
#if defined(__GNUC__) && !defined(_MODULE)
static struct cpu_info *x86_curcpu(void);
static lwp_t *x86_curlwp(void);
/*
* XXXGCC12 has:
* ./machine/cpu.h:57:9: error: array subscript 0 is outside array bounds of 'struct cpu_info * const[0]' [-Werror=array-bounds]
* 56 | __asm("movq %%gs:%1, %0" :
*/
#pragma GCC push_options
#pragma GCC diagnostic ignored "-Warray-bounds"
__inline __always_inline static struct cpu_info * __unused __nomsan
x86_curcpu(void)
{
struct cpu_info *ci;
__asm("movq %%gs:%1, %0" :
"=r" (ci) :
"m"
(*(struct cpu_info * const *)offsetof(struct cpu_info, ci_self)));
return ci;
}
__inline static lwp_t * __unused __nomsan __attribute__ ((const))
x86_curlwp(void)
{
lwp_t *l;
__asm("movq %%gs:%1, %0" :
"=r" (l) :
"m"
(*(struct cpu_info * const *)offsetof(struct cpu_info, ci_curlwp)));
return l;
}
#pragma GCC pop_options
#endif /* __GNUC__ && !_MODULE */
#ifdef XENPV
#define CLKF_USERMODE(frame) (curcpu()->ci_xen_clockf_usermode)
#define CLKF_PC(frame) (curcpu()->ci_xen_clockf_pc)
#else /* XENPV */
#define CLKF_USERMODE(frame) USERMODE((frame)->cf_if.if_tf.tf_cs)
#define CLKF_PC(frame) ((frame)->cf_if.if_tf.tf_rip)
#endif /* XENPV */
#define CLKF_INTR(frame) (curcpu()->ci_idepth > 0)
#define LWP_PC(l) ((l)->l_md.md_regs->tf_rip)
void *cpu_uarea_alloc(bool);
bool cpu_uarea_free(void *);
#endif /* _KERNEL */
#else /* __x86_64__ */
#include <i386/cpu.h>
#endif /* __x86_64__ */
#endif /* !_AMD64_CPU_H_ */
/* $NetBSD: rtsock.c,v 1.256 2022/08/27 08:36:41 skrll Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtsock.c,v 1.256 2022/08/27 08:36:41 skrll Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/intr.h>
#include <sys/condvar.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <netinet/in_var.h>
#include <netinet/if_inarp.h>
#include <netmpls/mpls.h>
#include <compat/net/if.h>
#include <compat/net/route.h>
#ifdef COMPAT_RTSOCK
#undef COMPAT_RTSOCK
#endif
static int if_addrflags(struct ifaddr *);
#include <net/rtsock_shared.c>
/*
* XXX avoid using void * once msghdr compat disappears.
*/
void
rt_setmetrics(void *in, struct rtentry *out)
{
const struct rt_xmsghdr *rtm = in;
_rt_setmetrics(rtm->rtm_inits, rtm, out);
}
int
rt_msg3(int type, struct rt_addrinfo *rtinfo, void *cpv, struct rt_walkarg *w,
int *lenp)
{
return rt_msg2(type, rtinfo, cpv, w, lenp);
}
static int
if_addrflags(struct ifaddr *ifa)
{
switch (ifa->ifa_addr->sa_family) {
#ifdef INET
case AF_INET:
return ifatoia(ifa)->ia4_flags;
#endif
#ifdef INET6
case AF_INET6:
return ifatoia6(ifa)->ia6_flags;
#endif
default:
return 0;
}
}
/*
 * Send a routing message that mimics the addition of a cloned route.
*/
void
rt_clonedmsg(int type, const struct sockaddr *src, const struct sockaddr *dst,
const uint8_t *lladdr, const struct ifnet *ifp)
{
struct rt_addrinfo info;
/* Mimic flags exactly */
#define RTF_LLINFO 0x400
#define RTF_CLONED 0x2000
int flags = RTF_DONE;
union {
struct sockaddr sa;
struct sockaddr_storage ss;
struct sockaddr_dl sdl;
} u;
if (type != RTM_MISS)
flags |= RTF_HOST | RTF_CLONED | RTF_LLINFO;
if (type == RTM_ADD || type == RTM_CHANGE)
flags |= RTF_UP;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_AUTHOR] = src;
info.rti_info[RTAX_DST] = dst;
sockaddr_dl_init(&u.sdl, sizeof(u.ss), ifp->if_index, ifp->if_type,
NULL, 0, lladdr, ifp->if_addrlen);
info.rti_info[RTAX_GATEWAY] = &u.sa;
rt_missmsg(type, &info, flags, 0);
#undef RTF_LLINFO
#undef RTF_CLONED
}
/*
* The remaining code implements the routing-table sysctl node. It is
* compiled only for the non-COMPAT case.
*/
/*
* This is used in dumping the kernel table via sysctl().
*/
static int
sysctl_dumpentry(struct rtentry *rt, void *v)
{
struct rt_walkarg *w = v;
int error = 0, size;
struct rt_addrinfo info;
if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
return 0;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = rt_getkey(rt);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
info.rti_info[RTAX_TAG] = rt_gettag(rt);
if (rt->rt_ifp) {
const struct ifaddr *rtifa;
info.rti_info[RTAX_IFP] = rt->rt_ifp->if_dl->ifa_addr;
/* rtifa used to be simply rt->rt_ifa. If rt->rt_ifa != NULL,
* then rt_get_ifa() != NULL. So this ought to still be safe.
* --dyoung
*/
rtifa = rt_get_ifa(rt);
info.rti_info[RTAX_IFA] = rtifa->ifa_addr;
if (rt->rt_ifp->if_flags & IFF_POINTOPOINT)
info.rti_info[RTAX_BRD] = rtifa->ifa_dstaddr;
}
if ((error = rt_msg2(RTM_GET, &info, 0, w, &size)))
return error;
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
struct rt_xmsghdr *rtm = (struct rt_xmsghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
rtm->rtm_use = rt->rt_use;
rtm_setmetrics(rt, rtm);
KASSERT(rt->rt_ifp != NULL);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;
if ((error = copyout(rtm, w->w_where, size)) != 0)
w->w_where = NULL;
else
w->w_where = (char *)w->w_where + size;
}
return error;
}
static int
sysctl_iflist_if(struct ifnet *ifp, struct rt_walkarg *w,
struct rt_addrinfo *info, size_t len)
{
struct if_xmsghdr *ifm;
int error;
ifm = (struct if_xmsghdr *)w->w_tmem;
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags;
if_export_if_data(ifp, &ifm->ifm_data, false);
ifm->ifm_addrs = info->rti_addrs;
if ((error = copyout(ifm, w->w_where, len)) == 0)
w->w_where = (char *)w->w_where + len;
return error;
}
static int
sysctl_iflist_addr(struct rt_walkarg *w, struct ifaddr *ifa,
struct rt_addrinfo *info)
{
int len, error;
if ((error = rt_msg2(RTM_XNEWADDR, info, 0, w, &len)))
return error;
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
struct ifa_xmsghdr *ifam;
ifam = (struct ifa_xmsghdr *)w->w_tmem;
ifam->ifam_index = ifa->ifa_ifp->if_index;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_addrs = info->rti_addrs;
ifam->ifam_pid = 0;
ifam->ifam_addrflags = if_addrflags(ifa);
if ((error = copyout(w->w_tmem, w->w_where, len)) == 0)
w->w_where = (char *)w->w_where + len;
}
return error;
}
static int
sysctl_iflist(int af, struct rt_walkarg *w, int type)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct rt_addrinfo info;
int cmd, len, error = 0;
int s;
struct psref psref;
int bound;
switch (type) {
case NET_RT_IFLIST:
cmd = RTM_IFINFO;
break;
case NET_RT_OOOIFLIST:
cmd = RTM_OOIFINFO;
break;
case NET_RT_OOIFLIST:
cmd = RTM_OIFINFO;
break;
case NET_RT_OIFLIST:
cmd = RTM_IFINFO;
break;
default:
#ifdef RTSOCK_DEBUG
printf("%s: unsupported IFLIST type %d\n", __func__, type);
#endif
return EINVAL;
}
memset(&info, 0, sizeof(info));
bound = curlwp_bind();
s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
int _s;
if (w->w_arg && w->w_arg != ifp->if_index)
continue;
if (IFADDR_READER_EMPTY(ifp))
continue;
if_acquire(ifp, &psref);
pserialize_read_exit(s);
info.rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr;
if ((error = rt_msg2(cmd, &info, NULL, w, &len)) != 0)
goto release_exit;
info.rti_info[RTAX_IFP] = NULL;
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
switch (type) {
case NET_RT_OIFLIST: /* old _70 */
if (!rtsock_iflist_70_hook.hooked) {
error = EINVAL;
break;
}
/* FALLTHROUGH */
case NET_RT_IFLIST: /* current */
error = sysctl_iflist_if(ifp, w, &info, len);
break;
case NET_RT_OOIFLIST: /* old _50 */
MODULE_HOOK_CALL(rtsock_iflist_50_hook,
(ifp, w, &info, len), enosys(), error);
break;
case NET_RT_OOOIFLIST: /* old _14 */
MODULE_HOOK_CALL(rtsock_iflist_14_hook,
(ifp, w, &info, len), enosys(), error);
break;
default:
error = EINVAL;
}
if (error != 0) {
if (error == ENOSYS)
error = EINVAL;
goto release_exit;
}
}
_s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref _psref;
if (af && af != ifa->ifa_addr->sa_family)
continue;
ifa_acquire(ifa, &_psref);
pserialize_read_exit(_s);
info.rti_info[RTAX_IFA] = ifa->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
switch (type) {
case NET_RT_IFLIST:
error = sysctl_iflist_addr(w, ifa, &info);
break;
case NET_RT_OIFLIST:
case NET_RT_OOIFLIST:
case NET_RT_OOOIFLIST:
MODULE_HOOK_CALL(rtsock_iflist_70_hook,
(w, ifa, &info), enosys(), error);
break;
default:
error = EINVAL;
}
_s = pserialize_read_enter();
ifa_release(ifa, &_psref);
if (error != 0) {
pserialize_read_exit(_s);
goto release_exit;
}
}
pserialize_read_exit(_s);
info.rti_info[RTAX_IFA] = info.rti_info[RTAX_NETMASK] =
info.rti_info[RTAX_BRD] = NULL;
s = pserialize_read_enter();
if_release(ifp, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
return 0;
release_exit:
if_release(ifp, &psref);
curlwp_bindx(bound);
return error;
}
static int
sysctl_rtable(SYSCTLFN_ARGS)
{
void *where = oldp;
size_t *given = oldlenp;
int i, error = EINVAL;
u_char af;
struct rt_walkarg w;
if (namelen == 1 && name[0] == CTL_QUERY)
return sysctl_query(SYSCTLFN_CALL(rnode));
if (newp)
return EPERM;
if (namelen != 3)
return EINVAL;
af = name[0];
w.w_tmemneeded = 0;
w.w_tmemsize = 0;
w.w_tmem = NULL;
again:
/* we may return here if a later [re]alloc of the t_mem buffer fails */
if (w.w_tmemneeded) {
w.w_tmem = kmem_zalloc(w.w_tmemneeded, KM_SLEEP);
w.w_tmemsize = w.w_tmemneeded;
w.w_tmemneeded = 0;
}
w.w_op = name[1];
w.w_arg = name[2];
w.w_given = *given;
w.w_needed = 0 - w.w_given;
w.w_where = where;
KERNEL_LOCK_UNLESS_NET_MPSAFE();
const int s = splsoftnet();
switch (w.w_op) {
case NET_RT_DUMP:
case NET_RT_FLAGS:
#if defined(INET) || defined(INET6)
/*
* take care of llinfo entries, the caller must
* specify an AF
*/
if (w.w_op == NET_RT_FLAGS &&
(w.w_arg == 0 || w.w_arg & RTF_LLDATA)) {
if (af != 0)
error = lltable_sysctl_dump(af, &w);
else
error = EINVAL;
break;
}
#endif
for (i = 1; i <= AF_MAX; i++) {
if (af == 0 || af == i) {
error = rt_walktree(i, sysctl_dumpentry, &w);
if (error != 0)
break;
#if defined(INET) || defined(INET6)
/*
* Return ARP/NDP entries too for
* backward compatibility.
*/
error = lltable_sysctl_dump(i, &w);
if (error != 0)
break;
#endif
}
}
break;
case NET_RT_OOOIFLIST: /* compat_14 */
case NET_RT_OOIFLIST: /* compat_50 */
case NET_RT_OIFLIST: /* compat_70 */
case NET_RT_IFLIST: /* current */
error = sysctl_iflist(af, &w, w.w_op);
break;
}
splx(s);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
/* check to see if we couldn't allocate memory with NOWAIT */
if (error == ENOBUFS && w.w_tmem == 0 && w.w_tmemneeded)
goto again;
if (w.w_tmem)
kmem_free(w.w_tmem, w.w_tmemsize);
w.w_needed += w.w_given;
if (where) {
*given = (char *)w.w_where - (char *)where;
if (*given < w.w_needed)
return ENOMEM;
} else {
*given = (11 * w.w_needed) / 10;
}
return error;
}
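/*
 * A minimal userland sketch of the usual interaction with this handler:
 * a NULL oldp yields the (slightly padded) size estimate computed
 * above, and a second call copies the rt_msghdr records out.
 * (Hypothetical standalone program; error handling is trimmed.)
 */
#if 0	/* illustrative userland sketch; not compiled here */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <net/route.h>
#include <stdlib.h>

static int
rtable_dump_example(void)
{
	int mib[6] = { CTL_NET, PF_ROUTE, 0, 0 /* any AF */,
	    NET_RT_DUMP, 0 };
	size_t needed;
	char *buf;

	if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1)
		return -1;
	if ((buf = malloc(needed)) == NULL)
		return -1;
	if (sysctl(mib, 6, buf, &needed, NULL, 0) == -1) {
		free(buf);
		return -1;
	}
	/* ... walk the rt_msghdr records in buf[0 .. needed) ... */
	free(buf);
	return 0;
}
#endif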
void
sysctl_net_route_setup(struct sysctllog **clog, int pf, const char *name)
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, name,
SYSCTL_DESCR("PF_ROUTE information"),
NULL, 0, NULL, 0,
CTL_NET, pf, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "rtable",
SYSCTL_DESCR("Routing table information"),
sysctl_rtable, 0, NULL, 0,
CTL_NET, pf, 0 /* any protocol */, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("Routing statistics"),
NULL, 0, &rtstat, sizeof(rtstat),
CTL_CREATE, CTL_EOL);
}
/* $NetBSD: cprng_fast.c,v 1.19 2023/08/05 11:39:18 riastradh Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cprng_fast.c,v 1.19 2023/08/05 11:39:18 riastradh Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/cprng.h>
#include <sys/cpu.h>
#include <sys/entropy.h>
#include <sys/evcnt.h>
#include <sys/kmem.h>
#include <sys/percpu.h>
#include <sys/pserialize.h>
#include <crypto/chacha/chacha.h>
#define CPRNG_FAST_SEED_BYTES CHACHA_STREAM_KEYBYTES
struct cprng_fast {
/* 128-bit vector unit generates 256 bytes at once */
uint8_t buf[256];
uint8_t key[CPRNG_FAST_SEED_BYTES];
uint8_t nonce[CHACHA_STREAM_NONCEBYTES];
unsigned i;
struct evcnt *reseed_evcnt;
unsigned epoch;
};
static void cprng_fast_init_cpu(void *, void *, struct cpu_info *);
static void cprng_fast_reseed(struct cprng_fast **, unsigned);
static void cprng_fast_seed(struct cprng_fast *, const void *);
static void cprng_fast_buf(struct cprng_fast *, void *, unsigned);
static void cprng_fast_buf_short(void *, size_t);
static void cprng_fast_buf_long(void *, size_t);
static percpu_t *cprng_fast_percpu __read_mostly;
void
cprng_fast_init(void)
{
cprng_fast_percpu = percpu_create(sizeof(struct cprng_fast),
cprng_fast_init_cpu, NULL, NULL);
}
static void
cprng_fast_init_cpu(void *p, void *arg __unused, struct cpu_info *ci)
{
struct cprng_fast *const cprng = p;
cprng->epoch = 0;
cprng->reseed_evcnt = kmem_alloc(sizeof(*cprng->reseed_evcnt),
KM_SLEEP);
evcnt_attach_dynamic(cprng->reseed_evcnt, EVCNT_TYPE_MISC, NULL,
ci->ci_cpuname, "cprng_fast reseed");
}
static int
cprng_fast_get(struct cprng_fast **cprngp)
{
struct cprng_fast *cprng;
unsigned epoch;
int s;
	KASSERT(!cpu_intr_p());
	KASSERT(pserialize_not_in_read_section());
*cprngp = cprng = percpu_getref(cprng_fast_percpu);
s = splsoftserial();
epoch = entropy_epoch();
	if (__predict_false(cprng->epoch != epoch)) {
		splx(s);
cprng_fast_reseed(cprngp, epoch);
s = splsoftserial();
}
return s;
}
static void
cprng_fast_put(struct cprng_fast *cprng, int s)
{
KASSERT((cprng == percpu_getref(cprng_fast_percpu)) &&
(percpu_putref(cprng_fast_percpu), true));
splx(s);
percpu_putref(cprng_fast_percpu);
}
static void
cprng_fast_reseed(struct cprng_fast **cprngp, unsigned epoch)
{
struct cprng_fast *cprng;
uint8_t seed[CPRNG_FAST_SEED_BYTES];
int s;
/*
* Drop the percpu(9) reference to extract a fresh seed from
* the entropy pool. cprng_strong may sleep on an adaptive
* lock, which invalidates our percpu(9) reference.
*
* This may race with reseeding in another thread, which is no
* big deal -- worst case, we rewind the entropy epoch here and
* cause the next caller to reseed again, and in the end we
* just reseed a couple more times than necessary.
*/
percpu_putref(cprng_fast_percpu);
cprng_strong(kern_cprng, seed, sizeof(seed), 0);
*cprngp = cprng = percpu_getref(cprng_fast_percpu);
s = splsoftserial();
cprng_fast_seed(cprng, seed);
cprng->epoch = epoch;
cprng->reseed_evcnt->ev_count++;
splx(s);
explicit_memset(seed, 0, sizeof(seed));
}
/* CPRNG algorithm */
static void
cprng_fast_seed(struct cprng_fast *cprng, const void *seed)
{
(void)memset(cprng->buf, 0, sizeof cprng->buf);
(void)memcpy(cprng->key, seed, sizeof cprng->key);
(void)memset(cprng->nonce, 0, sizeof cprng->nonce);
cprng->i = sizeof cprng->buf;
}
static void
cprng_fast_buf(struct cprng_fast *cprng, void *buf, unsigned len)
{
uint8_t *p = buf;
unsigned n = len, n0;
KASSERT(cprng->i <= sizeof(cprng->buf));
KASSERT(len <= sizeof(cprng->buf));
n0 = MIN(n, sizeof(cprng->buf) - cprng->i);
memcpy(p, &cprng->buf[cprng->i], n0);
if ((n -= n0) == 0) {
cprng->i += n0;
KASSERT(cprng->i <= sizeof(cprng->buf));
return;
}
p += n0;
le64enc(cprng->nonce, 1 + le64dec(cprng->nonce));
chacha_stream(cprng->buf, sizeof(cprng->buf), 0, cprng->nonce,
cprng->key, 8);
memcpy(p, cprng->buf, n);
cprng->i = n;
}
/* Public API */
static void
cprng_fast_buf_short(void *buf, size_t len)
{
struct cprng_fast *cprng;
int s;
KASSERT(len <= sizeof(cprng->buf));
	s = cprng_fast_get(&cprng);
	cprng_fast_buf(cprng, buf, len);
	cprng_fast_put(cprng, s);
}
static void
cprng_fast_buf_long(void *buf, size_t len)
{
uint8_t seed[CHACHA_STREAM_KEYBYTES];
uint8_t nonce[CHACHA_STREAM_NONCEBYTES] = {0};
CTASSERT(sizeof(seed) <= sizeof(((struct cprng_fast *)0)->buf));
#if SIZE_MAX >= 0x3fffffffff
/* >=256 GB is not reasonable */
KASSERT(len <= 0x3fffffffff);
#endif
cprng_fast_buf_short(seed, sizeof seed);
chacha_stream(buf, len, 0, nonce, seed, 8);
(void)explicit_memset(seed, 0, sizeof seed);
}
uint32_t
cprng_fast32(void)
{
uint32_t v;
cprng_fast_buf_short(&v, sizeof v);
return v;
}
uint64_t
cprng_fast64(void)
{
uint64_t v;
cprng_fast_buf_short(&v, sizeof v);
return v;
}
size_t
cprng_fast(void *buf, size_t len)
{
/*
* We don't want to hog the CPU, so we use the short version,
* to generate output without preemption, only if we can do it
* with at most one ChaCha call.
*/
if (len <= sizeof(((struct cprng_fast *)0)->buf))
cprng_fast_buf_short(buf, len);
else
cprng_fast_buf_long(buf, len);
return len; /* hysterical raisins */
}
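/*
 * Usage sketch for the public API above; the helper name and its body
 * are illustrative assumptions.  Small fixed-size requests go through
 * cprng_fast32()/cprng_fast64(), while cprng_fast() accepts arbitrary
 * lengths and picks the short or long path internally.
 */
static void __unused
cprng_fast_example(void)
{
	uint8_t cookie[32];
	uint32_t tag;

	tag = cprng_fast32();			  /* one 32-bit value */
	(void)cprng_fast(cookie, sizeof(cookie)); /* arbitrary-length fill */
	(void)tag;
}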
/* $NetBSD: ffs_subr.c,v 1.54 2023/01/07 19:41:30 chs Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_subr.c 8.5 (Berkeley) 3/21/95
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_subr.c,v 1.54 2023/01/07 19:41:30 chs Exp $");
#include <sys/param.h>
/* in ffs_tables.c */
extern const int inside[], around[];
extern const u_char * const fragtbl[];
#ifndef _KERNEL
#define FFS_EI /* always include byteswapped filesystems support */
#endif
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#ifndef _KERNEL
#include <ufs/ufs/dinode.h>
void panic(const char *, ...)
__attribute__((__noreturn__,__format__(__printf__,1,2)));
#else /* _KERNEL */
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/buf.h>
#include <sys/inttypes.h>
#include <sys/pool.h>
#include <sys/fstrans.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
/*
* Load up the contents of an inode and copy the appropriate pieces
* to the incore copy.
*/
void
ffs_load_inode(struct buf *bp, struct inode *ip, struct fs *fs, ino_t ino)
{
struct ufs1_dinode *dp1;
struct ufs2_dinode *dp2;
if (ip->i_ump->um_fstype == UFS1) {
dp1 = (struct ufs1_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode1_swap(dp1, ip->i_din.ffs1_din);
else
#endif
*ip->i_din.ffs1_din = *dp1;
ip->i_mode = ip->i_ffs1_mode;
ip->i_nlink = ip->i_ffs1_nlink;
ip->i_size = ip->i_ffs1_size;
ip->i_flags = ip->i_ffs1_flags;
ip->i_gen = ip->i_ffs1_gen;
ip->i_uid = ip->i_ffs1_uid;
ip->i_gid = ip->i_ffs1_gid;
} else {
dp2 = (struct ufs2_dinode *)bp->b_data + ino_to_fsbo(fs, ino);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode2_swap(dp2, ip->i_din.ffs2_din);
else
#endif
*ip->i_din.ffs2_din = *dp2;
ip->i_mode = ip->i_ffs2_mode;
ip->i_nlink = ip->i_ffs2_nlink;
ip->i_size = ip->i_ffs2_size;
ip->i_flags = ip->i_ffs2_flags;
ip->i_gen = ip->i_ffs2_gen;
ip->i_uid = ip->i_ffs2_uid;
ip->i_gid = ip->i_ffs2_gid;
}
}
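/*
 * Condensed caller-side sketch; the helper name is hypothetical and the
 * usual locking and vnode setup are omitted.  Read the filesystem block
 * holding the on-disk inode, then let ffs_load_inode() byteswap if
 * needed and copy the fields into the in-core inode.
 */
static int __unused
ffs_load_inode_example(struct vnode *devvp, struct inode *ip,
    struct fs *fs, ino_t ino)
{
	struct buf *bp;
	int error;

	error = bread(devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ino)),
	    (int)fs->fs_bsize, 0, &bp);
	if (error)
		return error;
	ffs_load_inode(bp, ip, fs, ino);
	brelse(bp, 0);
	return 0;
}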
int
ffs_getblk(struct vnode *vp, daddr_t lblkno, daddr_t blkno, int size,
bool clearbuf, buf_t **bpp)
{
int error = 0;
	KASSERT(blkno >= 0 || blkno == FFS_NOBLK);

	if ((*bpp = getblk(vp, lblkno, size, 0, 0)) == NULL)
		return ENOMEM;
	if (blkno != FFS_NOBLK)
		(*bpp)->b_blkno = blkno;
	if (clearbuf)
		clrbuf(*bpp);
	if ((*bpp)->b_blkno >= 0 && (error = fscow_run(*bpp, false)) != 0) {
		brelse(*bpp, BC_INVAL);
		*bpp = NULL;
	}
return error;
}
#endif /* _KERNEL */
/*
* Update the frsum fields to reflect addition or deletion
* of some frags.
*/
void
ffs_fragacct(struct fs *fs, int fragmap, uint32_t fraglist[], int cnt,
int needswap)
{
int inblk;
int field, subfield;
int siz, pos;
inblk = (int)(fragtbl[fs->fs_frag][fragmap]) << 1;
fragmap <<= 1;
	for (siz = 1; siz < fs->fs_frag; siz++) {
		if ((inblk & (1 << (siz + (fs->fs_frag & (NBBY - 1))))) == 0)
continue;
field = around[siz];
subfield = inside[siz];
		for (pos = siz; pos <= fs->fs_frag; pos++) {
			if ((fragmap & field) == subfield) {
fraglist[siz] = ufs_rw32(
ufs_rw32(fraglist[siz], needswap) + cnt,
needswap);
pos += siz;
field <<= siz;
subfield <<= siz;
}
field <<= 1;
subfield <<= 1;
}
}
}
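/*
 * Minimal round-trip sketch of ffs_fragacct(); the function below is an
 * illustrative assumption and is not used elsewhere.  Accounting a
 * fragment map in with cnt == 1 and back out with cnt == -1 leaves the
 * summary counts unchanged.
 */
static void __unused
ffs_fragacct_example(struct fs *fs, int fragmap)
{
	uint32_t frsum[MAXFRAG] = { 0 };

	ffs_fragacct(fs, fragmap, frsum, 1, 0);		/* add frags */
	ffs_fragacct(fs, fragmap, frsum, -1, 0);	/* delete frags */
	/* frsum[] is back to all zeroes here. */
}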
/*
* block operations
*
* check if a block is available
* returns true if all the corresponding bits in the free map are 1
* returns false if any corresponding bit in the free map is 0
*/
int
ffs_isblock(struct fs *fs, u_char *cp, int32_t h)
{
u_char mask;
switch ((int)fs->fs_fragshift) {
case 3:
return (cp[h] == 0xff);
case 2:
mask = 0x0f << ((h & 0x1) << 2);
return ((cp[h >> 1] & mask) == mask);
case 1:
mask = 0x03 << ((h & 0x3) << 1);
return ((cp[h >> 2] & mask) == mask);
case 0:
mask = 0x01 << (h & 0x7);
return ((cp[h >> 3] & mask) == mask);
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* check if a block is completely allocated
* returns true if all the corresponding bits in the free map are 0
* returns false if any corresponding bit in the free map is 1
*/
int
ffs_isfreeblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
return (cp[h] == 0);
case 2:
return ((cp[h >> 1] & (0x0f << ((h & 0x1) << 2))) == 0);
case 1:
return ((cp[h >> 2] & (0x03 << ((h & 0x3) << 1))) == 0);
case 0:
return ((cp[h >> 3] & (0x01 << (h & 0x7))) == 0);
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* take a block out of the map
*/
void
ffs_clrblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
cp[h] = 0;
return;
case 2:
cp[h >> 1] &= ~(0x0f << ((h & 0x1) << 2));
return;
case 1:
cp[h >> 2] &= ~(0x03 << ((h & 0x3) << 1));
return;
case 0:
cp[h >> 3] &= ~(0x01 << (h & 0x7));
return;
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
/*
* put a block into the map
*/
void
ffs_setblock(struct fs *fs, u_char *cp, int32_t h)
{
switch ((int)fs->fs_fragshift) {
case 3:
cp[h] = 0xff;
return;
case 2:
cp[h >> 1] |= (0x0f << ((h & 0x1) << 2));
return;
case 1:
cp[h >> 2] |= (0x03 << ((h & 0x3) << 1));
return;
case 0:
cp[h >> 3] |= (0x01 << (h & 0x7));
return;
default:
panic("%s: unknown fs_fragshift %d", __func__,
(int)fs->fs_fragshift);
}
}
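/*
 * Round-trip sketch for the free-map helpers above; the function name
 * and the assumption that cp/h come from a cylinder group's free map
 * are illustrative.  Setting a block marks every fragment bit free,
 * clearing it marks the block fully allocated.
 */
static void __unused
ffs_blockmap_example(struct fs *fs, u_char *cp, int32_t h)
{
	ffs_setblock(fs, cp, h);	/* block goes into the map: free */
	if (!ffs_isblock(fs, cp, h))
		panic("%s: block %d not marked free", __func__, (int)h);
	ffs_clrblock(fs, cp, h);	/* block leaves the map: allocated */
	if (!ffs_isfreeblock(fs, cp, h))
		panic("%s: block %d not marked allocated", __func__, (int)h);
}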
/*
* Update the cluster map because of an allocation or free.
*
* Cnt == 1 means free; cnt == -1 means allocating.
*/
void
ffs_clusteracct(struct fs *fs, struct cg *cgp, int32_t blkno, int cnt)
{
int32_t *sump;
int32_t *lp;
u_char *freemapp, *mapp;
int i, start, end, forw, back, map;
unsigned int bit;
const int needswap = UFS_FSNEEDSWAP(fs);
/* KASSERT(mutex_owned(&ump->um_lock)); */
if (fs->fs_contigsumsize <= 0)
return;
freemapp = cg_clustersfree(cgp, needswap);
sump = cg_clustersum(cgp, needswap);
/*
* Allocate or clear the actual block.
*/
if (cnt > 0)
setbit(freemapp, blkno);
else
clrbit(freemapp, blkno);
/*
* Find the size of the cluster going forward.
*/
start = blkno + 1;
end = start + fs->fs_contigsumsize;
	if ((uint32_t)end >= ufs_rw32(cgp->cg_nclusterblks, needswap))
		end = ufs_rw32(cgp->cg_nclusterblks, needswap);
mapp = &freemapp[start / NBBY];
map = *mapp++;
bit = 1U << ((unsigned int)start % NBBY);
for (i = start; i < end; i++) {
if ((map & bit) == 0)
break;
if ((i & (NBBY - 1)) != (NBBY - 1)) {
bit <<= 1;
} else {
map = *mapp++;
bit = 1;
}
}
forw = i - start;
/*
* Find the size of the cluster going backward.
*/
start = blkno - 1;
end = start - fs->fs_contigsumsize;
if (end < 0)
end = -1;
mapp = &freemapp[start / NBBY];
map = *mapp--;
bit = 1U << ((unsigned int)start % NBBY);
for (i = start; i > end; i--) {
if ((map & bit) == 0)
break;
if ((i & (NBBY - 1)) != 0) {
bit >>= 1;
} else {
map = *mapp--;
bit = 1U << (NBBY - 1);
}
}
back = start - i;
/*
* Account for old cluster and the possibly new forward and
* back clusters.
*/
i = back + forw + 1;
if (i > fs->fs_contigsumsize)
i = fs->fs_contigsumsize;
ufs_add32(sump[i], cnt, needswap);
if (back > 0)
ufs_add32(sump[back], -cnt, needswap);
	if (forw > 0)
		ufs_add32(sump[forw], -cnt, needswap);
/*
* Update cluster summary information.
*/
lp = &sump[fs->fs_contigsumsize];
	for (i = fs->fs_contigsumsize; i > 0; i--)
		if (ufs_rw32(*lp--, needswap) > 0)
break;
#if defined(_KERNEL)
fs->fs_maxcluster[ufs_rw32(cgp->cg_cgx, needswap)] = i;
#endif
}
/* $NetBSD: lfs_vfsops.c,v 1.382 2022/03/19 13:53:33 hannken Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003, 2007, 2007
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Konrad E. Schroder <perseant@hhhh.org>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1989, 1991, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)lfs_vfsops.c 8.20 (Berkeley) 6/10/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_vfsops.c,v 1.382 2022/03/19 13:53:33 hannken Exp $");
#if defined(_KERNEL_OPT)
#include "opt_lfs.h"
#include "opt_quota.h"
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kthread.h>
#include <sys/buf.h>
#include <sys/device.h>
#include <sys/file.h>
#include <sys/disklabel.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/socket.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/syscallvar.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/lfs/ulfs_quotacommon.h>
#include <ufs/lfs/ulfs_inode.h>
#include <ufs/lfs/ulfsmount.h>
#include <ufs/lfs/ulfs_bswap.h>
#include <ufs/lfs/ulfs_extern.h>
#ifdef UVMHIST
#include <uvm/uvm.h>
#endif
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_stat.h>
#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_accessors.h>
#include <ufs/lfs/lfs_kernel.h>
#include <ufs/lfs/lfs_extern.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
MODULE(MODULE_CLASS_VFS, lfs, NULL);
static int lfs_gop_write(struct vnode *, struct vm_page **, int, int);
static int lfs_mountfs(struct vnode *, struct mount *, struct lwp *);
static int lfs_flushfiles(struct mount *, int);
extern const struct vnodeopv_desc lfs_vnodeop_opv_desc;
extern const struct vnodeopv_desc lfs_specop_opv_desc;
extern const struct vnodeopv_desc lfs_fifoop_opv_desc;
struct lwp * lfs_writer_daemon = NULL;
kcondvar_t lfs_writerd_cv;
int lfs_do_flush = 0;
#ifdef LFS_KERNEL_RFW
int lfs_do_rfw = 0;
#endif
const struct vnodeopv_desc * const lfs_vnodeopv_descs[] = {
&lfs_vnodeop_opv_desc,
&lfs_specop_opv_desc,
&lfs_fifoop_opv_desc,
NULL,
};
struct vfsops lfs_vfsops = {
.vfs_name = MOUNT_LFS,
.vfs_min_mount_data = sizeof (struct ulfs_args),
.vfs_mount = lfs_mount,
.vfs_start = ulfs_start,
.vfs_unmount = lfs_unmount,
.vfs_root = ulfs_root,
.vfs_quotactl = ulfs_quotactl,
.vfs_statvfs = lfs_statvfs,
.vfs_sync = lfs_sync,
.vfs_vget = lfs_vget,
.vfs_loadvnode = lfs_loadvnode,
.vfs_newvnode = lfs_newvnode,
.vfs_fhtovp = lfs_fhtovp,
.vfs_vptofh = lfs_vptofh,
.vfs_init = lfs_init,
.vfs_reinit = lfs_reinit,
.vfs_done = lfs_done,
.vfs_mountroot = lfs_mountroot,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = lfs_extattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = lfs_vnodeopv_descs
};
const struct genfs_ops lfs_genfsops = {
.gop_size = lfs_gop_size,
.gop_alloc = ulfs_gop_alloc,
.gop_write = lfs_gop_write,
.gop_markupdate = ulfs_gop_markupdate,
.gop_putrange = genfs_gop_putrange,
};
struct shortlong {
const char *sname;
const char *lname;
};
static int
sysctl_lfs_dostats(SYSCTLFN_ARGS)
{
extern struct lfs_stats lfs_stats;
extern int lfs_dostats;
int error;
error = sysctl_lookup(SYSCTLFN_CALL(rnode));
if (error || newp == NULL)
return (error);
if (lfs_dostats == 0)
memset(&lfs_stats, 0, sizeof(lfs_stats));
return (0);
}
SYSCTL_SETUP(lfs_sysctl_setup, "lfs sysctl")
{
int i;
extern int lfs_writeindir, lfs_dostats, lfs_clean_vnhead,
lfs_fs_pagetrip, lfs_ignore_lazy_sync;
#ifdef DEBUG
extern int lfs_debug_log_subsys[DLOG_MAX];
struct shortlong dlog_names[DLOG_MAX] = { /* Must match lfs.h ! */
{ "rollforward", "Debug roll-forward code" },
{ "alloc", "Debug inode allocation and free list" },
{ "avail", "Debug space-available-now accounting" },
{ "flush", "Debug flush triggers" },
{ "lockedlist", "Debug locked list accounting" },
{ "vnode_verbose", "Verbose per-vnode-written debugging" },
{ "vnode", "Debug vnode use during segment write" },
{ "segment", "Debug segment writing" },
{ "seguse", "Debug segment used-bytes accounting" },
{ "cleaner", "Debug cleaning routines" },
{ "mount", "Debug mount/unmount routines" },
{ "pagecache", "Debug UBC interactions" },
{ "dirop", "Debug directory-operation accounting" },
{ "malloc", "Debug private malloc accounting" },
};
#endif /* DEBUG */
struct shortlong stat_names[] = { /* Must match lfs.h! */
{ "segsused", "Number of new segments allocated" },
{ "psegwrites", "Number of partial-segment writes" },
{ "psyncwrites", "Number of synchronous partial-segment"
" writes" },
{ "pcleanwrites", "Number of partial-segment writes by the"
" cleaner" },
{ "blocktot", "Number of blocks written" },
{ "cleanblocks", "Number of blocks written by the cleaner" },
{ "ncheckpoints", "Number of checkpoints made" },
{ "nwrites", "Number of whole writes" },
{ "nsync_writes", "Number of synchronous writes" },
{ "wait_exceeded", "Number of times writer waited for"
" cleaner" },
{ "write_exceeded", "Number of times writer invoked flush" },
{ "flush_invoked", "Number of times flush was invoked" },
{ "vflush_invoked", "Number of time vflush was called" },
{ "clean_inlocked", "Number of vnodes skipped for being dead" },
{ "clean_vnlocked", "Number of vnodes skipped for vget failure" },
{ "segs_reclaimed", "Number of segments reclaimed" },
};
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "lfs",
SYSCTL_DESCR("Log-structured file system"),
NULL, 0, NULL, 0,
CTL_VFS, 5, CTL_EOL);
/*
* XXX the "5" above could be dynamic, thereby eliminating one
* more instance of the "number to vfs" mapping problem, but
* "5" is the order as taken from sys/mount.h
*/
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "flushindir", NULL,
NULL, 0, &lfs_writeindir, 0,
CTL_VFS, 5, LFS_WRITEINDIR, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "clean_vnhead", NULL,
NULL, 0, &lfs_clean_vnhead, 0,
CTL_VFS, 5, LFS_CLEAN_VNHEAD, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "dostats",
SYSCTL_DESCR("Maintain statistics on LFS operations"),
sysctl_lfs_dostats, 0, &lfs_dostats, 0,
CTL_VFS, 5, LFS_DOSTATS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "pagetrip",
SYSCTL_DESCR("How many dirty pages in fs triggers"
" a flush"),
NULL, 0, &lfs_fs_pagetrip, 0,
CTL_VFS, 5, LFS_FS_PAGETRIP, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "ignore_lazy_sync",
SYSCTL_DESCR("Lazy Sync is ignored entirely"),
NULL, 0, &lfs_ignore_lazy_sync, 0,
CTL_VFS, 5, LFS_IGNORE_LAZY_SYNC, CTL_EOL);
#ifdef LFS_KERNEL_RFW
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "rfw",
SYSCTL_DESCR("Use in-kernel roll-forward on mount"),
NULL, 0, &lfs_do_rfw, 0,
CTL_VFS, 5, LFS_DO_RFW, CTL_EOL);
#endif
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "stats",
SYSCTL_DESCR("Debugging options"),
NULL, 0, NULL, 0,
CTL_VFS, 5, LFS_STATS, CTL_EOL);
for (i = 0; i < sizeof(struct lfs_stats) / sizeof(u_int); i++) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, stat_names[i].sname,
SYSCTL_DESCR(stat_names[i].lname),
NULL, 0, &(((u_int *)&lfs_stats.segsused)[i]),
0, CTL_VFS, 5, LFS_STATS, i, CTL_EOL);
}
#ifdef DEBUG
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "debug",
SYSCTL_DESCR("Debugging options"),
NULL, 0, NULL, 0,
CTL_VFS, 5, LFS_DEBUGLOG, CTL_EOL);
for (i = 0; i < DLOG_MAX; i++) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, dlog_names[i].sname,
SYSCTL_DESCR(dlog_names[i].lname),
NULL, 0, &(lfs_debug_log_subsys[i]), 0,
CTL_VFS, 5, LFS_DEBUGLOG, i, CTL_EOL);
}
#endif
}
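/*
 * Userland-side sketch of reading one of the nodes created above with
 * sysctlbyname(3); the variable names are illustrative and the snippet
 * is not compiled here.
 */
#if 0
	int dostats;
	size_t len = sizeof(dostats);

	if (sysctlbyname("vfs.lfs.dostats", &dostats, &len, NULL, 0) == 0)
		printf("LFS statistics collection is %s\n",
		    dostats ? "enabled" : "disabled");
#endif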
/* old cleaner syscall interface. see VOP_FCNTL() */
static const struct syscall_package lfs_syscalls[] = {
{ SYS_lfs_bmapv, 0, (sy_call_t *)sys_lfs_bmapv },
{ SYS_lfs_markv, 0, (sy_call_t *)sys_lfs_markv },
{ SYS___lfs_segwait50, 0, (sy_call_t *)sys___lfs_segwait50 },
{ SYS_lfs_segclean, 0, (sy_call_t *)sys_lfs_segclean },
{ 0, 0, NULL },
};
static int
lfs_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = syscall_establish(NULL, lfs_syscalls);
if (error)
return error;
error = vfs_attach(&lfs_vfsops);
if (error != 0) {
syscall_disestablish(NULL, lfs_syscalls);
break;
}
cv_init(&lfs_allclean_wakeup, "segment");
break;
case MODULE_CMD_FINI:
error = vfs_detach(&lfs_vfsops);
if (error != 0)
break;
syscall_disestablish(NULL, lfs_syscalls);
cv_destroy(&lfs_allclean_wakeup);
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*
* XXX Same structure as FFS inodes? Should we share a common pool?
*/
struct pool lfs_inode_pool;
struct pool lfs_dinode_pool;
struct pool lfs_inoext_pool;
struct pool lfs_lbnentry_pool;
/*
* The writer daemon. UVM keeps track of how many dirty pages we are holding
* in lfs_subsys_pages; the daemon flushes the filesystem when this value
* crosses the (user-defined) threshold LFS_MAX_PAGES.
*/
static void
lfs_writerd(void *arg)
{
mount_iterator_t *iter;
struct mount *mp;
struct lfs *fs;
struct vfsops *vfs = NULL;
int fsflags;
int lfsc;
int wrote_something = 0;
mutex_enter(&lfs_lock);
KASSERTMSG(lfs_writer_daemon == NULL, "more than one LFS writer daemon");
lfs_writer_daemon = curlwp;
mutex_exit(&lfs_lock);
/* Take an extra reference to the LFS vfsops. */
vfs = vfs_getopsbyname(MOUNT_LFS);
mutex_enter(&lfs_lock);
for (;;) {
KASSERT(mutex_owned(&lfs_lock));
if (wrote_something == 0)
cv_timedwait(&lfs_writerd_cv, &lfs_lock, hz/10 + 1);
KASSERT(mutex_owned(&lfs_lock));
wrote_something = 0;
/*
* If global state wants a flush, flush everything.
*/
if (lfs_do_flush || locked_queue_count > LFS_MAX_BUFS ||
locked_queue_bytes > LFS_MAX_BYTES ||
lfs_subsys_pages > LFS_MAX_PAGES) {
if (lfs_do_flush) {
DLOG((DLOG_FLUSH, "lfs_writerd: lfs_do_flush\n"));
}
if (locked_queue_count > LFS_MAX_BUFS) {
DLOG((DLOG_FLUSH, "lfs_writerd: lqc = %d, max %d\n",
locked_queue_count, LFS_MAX_BUFS));
}
if (locked_queue_bytes > LFS_MAX_BYTES) {
DLOG((DLOG_FLUSH, "lfs_writerd: lqb = %ld, max %ld\n",
locked_queue_bytes, LFS_MAX_BYTES));
}
if (lfs_subsys_pages > LFS_MAX_PAGES) {
DLOG((DLOG_FLUSH, "lfs_writerd: lssp = %d, max %d\n",
lfs_subsys_pages, LFS_MAX_PAGES));
}
lfs_flush(NULL, SEGM_WRITERD, 0);
lfs_do_flush = 0;
KASSERT(mutex_owned(&lfs_lock));
continue;
}
KASSERT(mutex_owned(&lfs_lock));
mutex_exit(&lfs_lock);
/*
* Look through the list of LFSs to see if any of them
* have requested pageouts.
*/
mountlist_iterator_init(&iter);
lfsc = 0;
while ((mp = mountlist_iterator_next(iter)) != NULL) {
KASSERT(!mutex_owned(&lfs_lock));
if (strncmp(mp->mnt_stat.f_fstypename, MOUNT_LFS,
sizeof(mp->mnt_stat.f_fstypename)) == 0) {
++lfsc;
fs = VFSTOULFS(mp)->um_lfs;
daddr_t ooffset = 0;
fsflags = SEGM_SINGLE;
mutex_enter(&lfs_lock);
ooffset = lfs_sb_getoffset(fs);
if (lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs) && fs->lfs_nowrap) {
/* Don't try to write if we're suspended */
mutex_exit(&lfs_lock);
continue;
}
if (LFS_STARVED_FOR_SEGS(fs)) {
mutex_exit(&lfs_lock);
DLOG((DLOG_FLUSH, "lfs_writerd: need cleaning before writing possible\n"));
lfs_wakeup_cleaner(fs);
continue;
}
if ((fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
lfs_dirvcount > LFS_MAX_DIROP) &&
fs->lfs_dirops == 0) {
fsflags &= ~SEGM_SINGLE;
fsflags |= SEGM_CKP;
DLOG((DLOG_FLUSH, "lfs_writerd: checkpoint\n"));
lfs_flush_fs(fs, fsflags);
} else if (fs->lfs_pdflush) {
DLOG((DLOG_FLUSH, "lfs_writerd: pdflush set\n"));
lfs_flush_fs(fs, fsflags);
} else if (!TAILQ_EMPTY(&fs->lfs_pchainhd)) {
DLOG((DLOG_FLUSH, "lfs_writerd: pchain non-empty\n"));
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "wrdirop");
lfs_flush_pchain(fs);
lfs_writer_leave(fs);
mutex_enter(&lfs_lock);
}
if (lfs_sb_getoffset(fs) != ooffset)
++wrote_something;
mutex_exit(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
}
if (lfsc == 0) {
mutex_enter(&lfs_lock);
lfs_writer_daemon = NULL;
mutex_exit(&lfs_lock);
mountlist_iterator_destroy(iter);
break;
}
mountlist_iterator_destroy(iter);
mutex_enter(&lfs_lock);
}
KASSERT(!mutex_owned(&lfs_lock));
/* Give up our extra reference so the module can be unloaded. */
mutex_enter(&vfs_list_lock);
if (vfs != NULL)
vfs->vfs_refcount--;
mutex_exit(&vfs_list_lock);
/* Done! */
kthread_exit(0);
}
/*
* Initialize the filesystem, most work done by ulfs_init.
*/
void
lfs_init(void)
{
/*
* XXX: should we use separate pools for 32-bit and 64-bit
* dinodes?
*/
malloc_type_attach(M_SEGMENT);
pool_init(&lfs_inode_pool, sizeof(struct inode), 0, 0, 0,
"lfsinopl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_dinode_pool, sizeof(union lfs_dinode), 0, 0, 0,
"lfsdinopl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_inoext_pool, sizeof(struct lfs_inode_ext), 8, 0, 0,
"lfsinoextpl", &pool_allocator_nointr, IPL_NONE);
pool_init(&lfs_lbnentry_pool, sizeof(struct lbnentry), 0, 0, 0,
"lfslbnpool", &pool_allocator_nointr, IPL_NONE);
ulfs_init();
#ifdef DEBUG
memset(lfs_log, 0, sizeof(lfs_log));
#endif
mutex_init(&lfs_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&lfs_writerd_cv, "lfswrite");
cv_init(&locked_queue_cv, "lfsbuf");
cv_init(&lfs_writing_cv, "lfsflush");
}
void
lfs_reinit(void)
{
ulfs_reinit();
}
void
lfs_done(void)
{
ulfs_done();
mutex_destroy(&lfs_lock);
cv_destroy(&lfs_writerd_cv);
cv_destroy(&locked_queue_cv);
cv_destroy(&lfs_writing_cv);
pool_destroy(&lfs_inode_pool);
pool_destroy(&lfs_dinode_pool);
pool_destroy(&lfs_inoext_pool);
pool_destroy(&lfs_lbnentry_pool);
malloc_type_detach(M_SEGMENT);
}
/*
* Called by main() when ulfs is going to be mounted as root.
*/
int
lfs_mountroot(void)
{
extern struct vnode *rootvp;
struct lfs *fs = NULL; /* LFS */
struct mount *mp;
struct lwp *l = curlwp;
struct ulfsmount *ump;
int error;
if (device_class(root_device) != DV_DISK)
return (ENODEV);
if (rootdev == NODEV)
return (ENODEV);
if ((error = vfs_rootmountalloc(MOUNT_LFS, "root_device", &mp))) {
vrele(rootvp);
return (error);
}
if ((error = lfs_mountfs(rootvp, mp, l))) {
vfs_unbusy(mp);
vfs_rele(mp);
return (error);
}
mountlist_append(mp);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname);
(void)lfs_statvfs(mp, &mp->mnt_stat);
vfs_unbusy(mp);
setrootfstime((time_t)lfs_sb_gettstamp(VFSTOULFS(mp)->um_lfs));
return (0);
}
/*
* VFS Operations.
*
* mount system call
*/
int
lfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
struct vnode *devvp;
struct ulfs_args *args = data;
struct ulfsmount *ump = NULL;
struct lfs *fs = NULL; /* LFS */
int error = 0, update;
mode_t accessmode;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
ump = VFSTOULFS(mp);
if (ump == NULL)
return EIO;
args->fspec = NULL;
*data_len = sizeof *args;
return 0;
}
update = mp->mnt_flag & MNT_UPDATE;
/* Check arguments */
if (args->fspec != NULL) {
/*
* Look up the name and verify that it's sane.
*/
error = namei_simple_user(args->fspec,
NSM_FOLLOW_NOEMULROOT, &devvp);
if (error != 0)
return (error);
if (!update) {
/*
* Be sure this is a valid block device
*/
if (devvp->v_type != VBLK)
error = ENOTBLK;
else if (bdevsw_lookup(devvp->v_rdev) == NULL)
error = ENXIO;
} else {
/*
* Be sure we're still naming the same device
* used for our initial mount
*
* XXX dholland 20151010: if namei gives us a
* different vnode for the same device,
* wouldn't it be better to use it going
* forward rather than ignore it in favor of
* the old one?
*/
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
			if (devvp != fs->lfs_devvp) {
				if (devvp->v_rdev != fs->lfs_devvp->v_rdev)
error = EINVAL;
else {
vrele(devvp);
devvp = fs->lfs_devvp;
vref(devvp);
}
}
}
} else {
if (!update) {
/* New mounts must have a filename for the device */
return (EINVAL);
} else {
/* Use the extant mount */
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
devvp = fs->lfs_devvp;
vref(devvp);
}
}
/*
* If mount by non-root, then verify that user has necessary
* permissions on the device.
*/
if (error == 0) {
accessmode = VREAD;
if (update ?
(mp->mnt_iflag & IMNT_WANTRDWR) != 0 :
(mp->mnt_flag & MNT_RDONLY) == 0)
accessmode |= VWRITE;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_DEVICE, mp, devvp,
KAUTH_ARG(accessmode));
VOP_UNLOCK(devvp);
}
if (error) {
vrele(devvp);
return (error);
}
if (!update) {
int flags;
if (mp->mnt_flag & MNT_RDONLY)
flags = FREAD;
else
flags = FREAD|FWRITE;
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(devvp, flags, FSCRED);
VOP_UNLOCK(devvp);
if (error)
goto fail;
error = lfs_mountfs(devvp, mp, l); /* LFS */
if (error) {
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
(void)VOP_CLOSE(devvp, flags, NOCRED);
VOP_UNLOCK(devvp);
goto fail;
}
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
} else {
/*
* Update the mount.
*/
/*
* The initial mount got a reference on this
* device, so drop the one obtained via
* namei(), above.
*/
vrele(devvp);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
if (!fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDONLY)) {
/*
* Changing from read/write to read-only.
*/
int flags = WRITECLOSE;
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
error = lfs_flushfiles(mp, flags);
if (error)
return error;
			fs->lfs_ronly = 1;
		} else if (fs->lfs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR)) {
/*
* Changing from read-only to read/write.
* Note in the superblocks that we're writing.
*/
/* XXX: quotas should have been on even if readonly */
if (fs->lfs_use_quota2) {
#ifdef LFS_QUOTA2
error = lfs_quota2_mount(mp);
#else
uprintf("%s: no kernel support for this "
"filesystem's quotas\n",
mp->mnt_stat.f_mntonname);
			if (mp->mnt_flag & MNT_FORCE) {
				uprintf("%s: mounting anyway; "
"fsck afterwards\n",
mp->mnt_stat.f_mntonname);
} else {
error = EINVAL;
}
#endif
if (error) {
return error;
}
}
fs->lfs_ronly = 0;
if (lfs_sb_getpflags(fs) & LFS_PF_CLEAN) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN);
				lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
				lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
}
if (args->fspec == NULL)
return 0;
}
error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
	if (error == 0)
		lfs_sb_setfsmnt(fs, mp->mnt_stat.f_mntonname);
return error;
fail:
vrele(devvp);
return (error);
}
/*
* Helper for mountfs. Note that the fs pointer may be a dummy one
* pointing into a superblock buffer. (Which is gross; see below.)
*/
static int
lfs_checkmagic(struct lfs *fs)
{
switch (fs->lfs_dlfs_u.u_32.dlfs_magic) {
case LFS_MAGIC:
fs->lfs_is64 = false;
fs->lfs_dobyteswap = false;
break;
case LFS64_MAGIC:
fs->lfs_is64 = true;
fs->lfs_dobyteswap = false;
break;
#ifdef LFS_EI
case LFS_MAGIC_SWAPPED:
fs->lfs_is64 = false;
fs->lfs_dobyteswap = true;
break;
case LFS64_MAGIC_SWAPPED:
fs->lfs_is64 = true;
fs->lfs_dobyteswap = true;
break;
#endif
default:
/* XXX needs translation */
return EINVAL;
}
return 0;
}
/*
* Common code for mount and mountroot
* LFS specific
*/
int
lfs_mountfs(struct vnode *devvp, struct mount *mp, struct lwp *l)
{
struct lfs *primarysb, *altsb, *thesb;
struct buf *primarybuf, *altbuf;
struct lfs *fs;
struct ulfsmount *ump;
struct vnode *vp;
dev_t dev;
int error, i, ronly, fsbsize;
kauth_cred_t cred;
CLEANERINFO *cip;
SEGUSE *sup;
daddr_t sb_addr;
ino_t *orphan;
size_t norphan;
cred = l ? l->l_cred : NOCRED;
/* The superblock is supposed to be 512 bytes. */
__CTASSERT(sizeof(struct dlfs) == DEV_BSIZE);
/*
* Flush out any old buffers remaining from a previous use.
*/
vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
error = vinvalbuf(devvp, V_SAVE, cred, l, 0, 0);
VOP_UNLOCK(devvp);
if (error)
return (error);
ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
/* Don't free random space on error. */
primarybuf = NULL;
altbuf = NULL;
ump = NULL;
sb_addr = LFS_LABELPAD / DEV_BSIZE;
while (1) {
/*
* Read in the superblock.
*
* Note that because LFS_SBPAD is substantially larger
* (8K) than the actual on-disk superblock (512 bytes)
* the buffer contains enough space to be used as a
* whole struct lfs (in-memory superblock) - we do this
* only so we can set and use the is64 and dobyteswap
* members. XXX this is gross and the logic here should
* be reworked.
*/
error = bread(devvp, sb_addr, LFS_SBPAD, 0, &primarybuf);
if (error)
goto out;
primarysb = (struct lfs *)primarybuf->b_data;
/* Check the basics. */
error = lfs_checkmagic(primarysb);
if (error) {
DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock wrong magic\n"));
goto out;
}
if (lfs_sb_getbsize(primarysb) > MAXBSIZE ||
lfs_sb_getversion(primarysb) > LFS_VERSION ||
lfs_sb_getbsize(primarysb) < sizeof(struct dlfs)) {
DLOG((DLOG_MOUNT, "lfs_mountfs: primary superblock sanity failed\n"));
/* XXX needs translation */
error = EINVAL;
goto out;
}
if (lfs_sb_getinodefmt(primarysb) > LFS_MAXINODEFMT) {
DLOG((DLOG_MOUNT, "lfs_mountfs: unknown inode format %d\n",
lfs_sb_getinodefmt(primarysb)));
error = EINVAL;
goto out;
}
if (lfs_sb_getversion(primarysb) == 1)
fsbsize = DEV_BSIZE;
else {
fsbsize = 1 << lfs_sb_getffshift(primarysb);
/*
* Could be, if the frag size is large enough, that we
* don't have the "real" primary superblock. If that's
* the case, get the real one, and try again.
*/
if (sb_addr != (lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT))) {
DLOG((DLOG_MOUNT, "lfs_mountfs: sb daddr"
" 0x%llx is not right, trying 0x%llx\n",
(long long)sb_addr,
(long long)(lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT))));
sb_addr = lfs_sb_getsboff(primarysb, 0) << (lfs_sb_getffshift(primarysb) - DEV_BSHIFT);
brelse(primarybuf, BC_INVAL);
continue;
}
}
break;
}
/*
* Check the second superblock to see which is newer; then mount
* using the older of the two. This is necessary to ensure that
* the filesystem is valid if it was not unmounted cleanly.
*/
if (lfs_sb_getsboff(primarysb, 1) &&
lfs_sb_getsboff(primarysb, 1) - LFS_LABELPAD / fsbsize > LFS_SBPAD / fsbsize)
{
error = bread(devvp, lfs_sb_getsboff(primarysb, 1) * (fsbsize / DEV_BSIZE),
LFS_SBPAD, 0, &altbuf);
if (error)
goto out;
altsb = (struct lfs *)altbuf->b_data;
/*
* Note: this used to do the sanity check only if the
* timestamp/serial comparison required use of altsb;
* this way is less tolerant, but if altsb is corrupted
* enough that the magic number, version, and blocksize
* are bogus, why would the timestamp or serial fields
* mean anything either? If this kind of thing happens,
* you need to fsck anyway.
*/
error = lfs_checkmagic(altsb);
if (error)
goto out;
/* Check the basics. */
if (lfs_sb_getbsize(altsb) > MAXBSIZE ||
lfs_sb_getversion(altsb) > LFS_VERSION ||
lfs_sb_getbsize(altsb) < sizeof(struct dlfs)) {
DLOG((DLOG_MOUNT, "lfs_mountfs: alt superblock"
" sanity failed\n"));
error = EINVAL; /* XXX needs translation */
goto out;
}
if (lfs_sb_getversion(primarysb) == 1) {
/* 1s resolution comparison */
if (lfs_sb_gettstamp(altsb) < lfs_sb_gettstamp(primarysb))
thesb = altsb;
else
thesb = primarysb;
} else {
/* monotonic infinite-resolution comparison */
if (lfs_sb_getserial(altsb) < lfs_sb_getserial(primarysb))
thesb = altsb;
else
thesb = primarysb;
}
} else {
DLOG((DLOG_MOUNT, "lfs_mountfs: invalid alt superblock location"
" daddr=0x%x\n", lfs_sb_getsboff(primarysb, 1)));
error = EINVAL;
goto out;
}
/*
* Allocate the mount structure, copy the superblock into it.
* Note that the 32-bit and 64-bit superblocks are the same size.
*/
fs = kmem_zalloc(sizeof(struct lfs), KM_SLEEP);
memcpy(&fs->lfs_dlfs_u.u_32, &thesb->lfs_dlfs_u.u_32,
sizeof(struct dlfs));
fs->lfs_is64 = thesb->lfs_is64;
fs->lfs_dobyteswap = thesb->lfs_dobyteswap;
fs->lfs_hasolddirfmt = false; /* set for real below */
/* Compatibility */
if (lfs_sb_getversion(fs) < 2) {
lfs_sb_setsumsize(fs, LFS_V1_SUMMARY_SIZE);
lfs_sb_setibsize(fs, lfs_sb_getbsize(fs));
lfs_sb_sets0addr(fs, lfs_sb_getsboff(fs, 0));
lfs_sb_settstamp(fs, lfs_sb_getotstamp(fs));
lfs_sb_setfsbtodb(fs, 0);
}
if (lfs_sb_getresvseg(fs) == 0)
lfs_sb_setresvseg(fs, MIN(lfs_sb_getminfreeseg(fs) - 1, \
MAX(MIN_RESV_SEGS, lfs_sb_getminfreeseg(fs) / 2 + 1)));
/*
* If we aren't going to be able to write meaningfully to this
* filesystem, and were not mounted readonly, bomb out now.
*/
if (lfs_fsbtob(fs, LFS_NRESERVE(fs)) > LFS_MAX_BYTES && !ronly) {
DLOG((DLOG_MOUNT, "lfs_mount: to mount this filesystem read/write,"
" we need BUFPAGES >= %lld\n",
(long long)((bufmem_hiwater / bufmem_lowater) *
LFS_INVERSE_MAX_BYTES(
lfs_fsbtob(fs, LFS_NRESERVE(fs))) >> PAGE_SHIFT)));
kmem_free(fs, sizeof(struct lfs));
error = EFBIG; /* XXX needs translation */
goto out;
}
/* Before rolling forward, lock so vget will sleep for other procs */
if (l != NULL) {
fs->lfs_flags = LFS_NOTYET;
fs->lfs_rfpid = l->l_proc->p_pid;
}
ump = kmem_zalloc(sizeof(*ump), KM_SLEEP);
ump->um_lfs = fs;
ump->um_fstype = fs->lfs_is64 ? ULFS2 : ULFS1;
/* ump->um_cleaner_thread = NULL; */
brelse(primarybuf, BC_INVAL);
brelse(altbuf, BC_INVAL);
primarybuf = NULL;
altbuf = NULL;
/* Set up the I/O information */
fs->lfs_devbsize = DEV_BSIZE;
fs->lfs_iocount = 0;
fs->lfs_diropwait = 0;
fs->lfs_activesb = 0;
lfs_sb_setuinodes(fs, 0);
fs->lfs_ravail = 0;
fs->lfs_favail = 0;
fs->lfs_sbactive = 0;
/* Set up the ifile and lock aflags */
fs->lfs_doifile = 0;
fs->lfs_writer = 0;
fs->lfs_dirops = 0;
fs->lfs_nadirop = 0;
fs->lfs_seglock = 0;
fs->lfs_pdflush = 0;
fs->lfs_sleepers = 0;
fs->lfs_pages = 0;
rw_init(&fs->lfs_fraglock);
rw_init(&fs->lfs_iflock);
cv_init(&fs->lfs_sleeperscv, "lfs_slp");
cv_init(&fs->lfs_diropscv, "lfs_dirop");
cv_init(&fs->lfs_stopcv, "lfsstop");
cv_init(&fs->lfs_nextsegsleep, "segment");
/* Set the file system readonly/modify bits. */
fs->lfs_ronly = ronly;
if (ronly == 0)
fs->lfs_fmod = 1;
/* Device we're using */
dev = devvp->v_rdev;
fs->lfs_dev = dev;
fs->lfs_devvp = devvp;
/* ulfs-level information */
fs->um_flags = 0;
fs->um_bptrtodb = lfs_sb_getffshift(fs) - DEV_BSHIFT;
fs->um_seqinc = lfs_sb_getfrag(fs);
fs->um_nindir = lfs_sb_getnindir(fs);
fs->um_lognindir = ffs(lfs_sb_getnindir(fs)) - 1;
fs->um_maxsymlinklen = lfs_sb_getmaxsymlinklen(fs);
fs->um_dirblksiz = LFS_DIRBLKSIZ;
fs->um_maxfilesize = lfs_sb_getmaxfilesize(fs);
/* quota stuff */
/* XXX: these need to come from the on-disk superblock to be used */
fs->lfs_use_quota2 = 0;
fs->lfs_quota_magic = 0;
fs->lfs_quota_flags = 0;
fs->lfs_quotaino[0] = 0;
fs->lfs_quotaino[1] = 0;
/* Initialize the mount structure. */
mp->mnt_data = ump;
mp->mnt_stat.f_fsidx.__fsid_val[0] = (long)dev;
mp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_LFS);
mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
mp->mnt_stat.f_namemax = LFS_MAXNAMLEN;
mp->mnt_stat.f_iosize = lfs_sb_getbsize(fs);
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_iflag |= IMNT_SHRLOOKUP;
mp->mnt_fs_bshift = lfs_sb_getbshift(fs);
mp->mnt_iflag |= IMNT_CAN_RWTORO;
if (fs->um_maxsymlinklen > 0)
mp->mnt_iflag |= IMNT_DTYPE;
else
fs->lfs_hasolddirfmt = true;
ump->um_mountp = mp;
for (i = 0; i < ULFS_MAXQUOTAS; i++)
ump->um_quotas[i] = NULLVP;
spec_node_setmountedfs(devvp, mp);
/* Set up reserved memory for pageout */
lfs_setup_resblks(fs);
/* Set up vdirop tailq */
TAILQ_INIT(&fs->lfs_dchainhd);
/* and paging tailq */
TAILQ_INIT(&fs->lfs_pchainhd);
/* and delayed segment accounting for truncation list */
LIST_INIT(&fs->lfs_segdhd);
/*
* We use the ifile vnode for almost every operation. Instead of
* retrieving it from the hash table each time we retrieve it here,
* artificially increment the reference count and keep a pointer
* to it in the incore copy of the superblock.
*/
if ((error = VFS_VGET(mp, LFS_IFILE_INUM, LK_EXCLUSIVE, &vp)) != 0) {
DLOG((DLOG_MOUNT, "lfs_mountfs: ifile vget failed, error=%d\n", error));
goto out;
}
fs->lfs_ivnode = vp;
vref(vp);
/* Set up inode bitmap, order free list, and gather orphans. */
lfs_order_freelist(fs, &orphan, &norphan);
/* Set up segment usage flags for the autocleaner. */
fs->lfs_nactive = 0;
fs->lfs_suflags = malloc(2 * sizeof(u_int32_t *),
M_SEGMENT, M_WAITOK);
fs->lfs_suflags[0] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t),
M_SEGMENT, M_WAITOK);
fs->lfs_suflags[1] = malloc(lfs_sb_getnseg(fs) * sizeof(u_int32_t),
M_SEGMENT, M_WAITOK);
memset(fs->lfs_suflags[1], 0, lfs_sb_getnseg(fs) * sizeof(u_int32_t));
for (i = 0; i < lfs_sb_getnseg(fs); i++) {
int changed;
struct buf *bp;
LFS_SEGENTRY(sup, fs, i, bp);
changed = 0;
if (!ronly) {
if (sup->su_nbytes == 0 &&
!(sup->su_flags & SEGUSE_EMPTY)) {
sup->su_flags |= SEGUSE_EMPTY;
++changed;
} else if (!(sup->su_nbytes == 0) &&
(sup->su_flags & SEGUSE_EMPTY)) {
sup->su_flags &= ~SEGUSE_EMPTY;
++changed;
}
if (sup->su_flags & (SEGUSE_ACTIVE|SEGUSE_INVAL)) {
sup->su_flags &= ~(SEGUSE_ACTIVE|SEGUSE_INVAL);
++changed;
}
}
fs->lfs_suflags[0][i] = sup->su_flags;
if (changed)
LFS_WRITESEGENTRY(sup, fs, i, bp);
else
brelse(bp, 0);
}
/* Free the orphans we discovered while ordering the freelist. */
lfs_free_orphans(fs, orphan, norphan);
/*
* XXX: if the fs has quotas, quotas should be on even if
* readonly. Otherwise you can't query the quota info!
* However, that's not how the quota2 code got written and I
* don't know if it'll behave itself if enabled while
* readonly, so for now use the same enable logic as ffs.
*
* XXX: also, if you use the -f behavior allowed here (and
* equivalently above for remount) it will corrupt the fs. It
* ought not to allow that. It should allow mounting readonly
* if there are quotas and the kernel doesn't have the quota
* code, but only readonly.
*
* XXX: and if you use the -f behavior allowed here it will
* likely crash at unmount time (or remount time) because we
* think quotas are active.
*
* Although none of this applies until there's a way to set
* lfs_use_quota2 and have quotas in the fs at all.
*/
if (!ronly && fs->lfs_use_quota2) {
#ifdef LFS_QUOTA2
error = lfs_quota2_mount(mp);
#else
uprintf("%s: no kernel support for this filesystem's quotas\n",
mp->mnt_stat.f_mntonname);
if (mp->mnt_flag & MNT_FORCE) {
uprintf("%s: mounting anyway; fsck afterwards\n",
mp->mnt_stat.f_mntonname);
} else {
error = EINVAL;
}
#endif
if (error) {
/* XXX XXX must clean up the stuff immediately above */
printf("lfs_mountfs: sorry, leaking some memory\n");
goto out;
}
}
#ifdef LFS_KERNEL_RFW
lfs_roll_forward(fs, mp, l);
#endif
/* If writing, sb is not clean; record in case of immediate crash */
if (!fs->lfs_ronly) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) & ~LFS_PF_CLEAN);
lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
/* Allow vget now that roll-forward is complete */
fs->lfs_flags &= ~(LFS_NOTYET);
wakeup(&fs->lfs_flags);
/*
* Initialize the ifile cleaner info with information from
* the superblock.
*/
{
struct buf *bp;
LFS_CLEANERINFO(cip, fs, bp);
lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs));
lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs));
lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs));
lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs));
(void) LFS_BWRITE_LOG(bp); /* Ifile */
}
/*
* Mark the current segment as ACTIVE, since we're going to
* be writing to it.
*/
{
struct buf *bp;
LFS_SEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp);
sup->su_flags |= SEGUSE_DIRTY | SEGUSE_ACTIVE;
fs->lfs_nactive++;
LFS_WRITESEGENTRY(sup, fs, lfs_dtosn(fs, lfs_sb_getoffset(fs)), bp); /* Ifile */
}
/* Now that roll-forward is done, unlock the Ifile */
vput(vp);
/* Start the pagedaemon-anticipating daemon */
mutex_enter(&lfs_lock);
if (lfs_writer_daemon == NULL &&
kthread_create(PRI_BIO, 0, NULL,
lfs_writerd, NULL, NULL, "lfs_writer") != 0)
panic("fork lfs_writer");
mutex_exit(&lfs_lock);
printf("WARNING: the log-structured file system is experimental\n"
"WARNING: it may cause system crashes and/or corrupt data\n");
return (0);
out:
if (primarybuf)
brelse(primarybuf, BC_INVAL);
if (altbuf)
brelse(altbuf, BC_INVAL);
if (ump) {
kmem_free(ump->um_lfs, sizeof(struct lfs));
kmem_free(ump, sizeof(*ump));
mp->mnt_data = NULL;
}
return (error);
}
/*
* unmount system call
*/
int
lfs_unmount(struct mount *mp, int mntflags)
{
struct ulfsmount *ump;
struct lfs *fs;
int error, ronly;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
error = lfs_flushfiles(mp, mntflags & MNT_FORCE ? FORCECLOSE : 0);
if (error)
return error;
/* Finish with the Ifile, now that we're done with it */
vgone(fs->lfs_ivnode);
ronly = !fs->lfs_ronly;
if (fs->lfs_devvp->v_type != VBAD)
spec_node_setmountedfs(fs->lfs_devvp, NULL);
vn_lock(fs->lfs_devvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_CLOSE(fs->lfs_devvp,
ronly ? FREAD : FREAD|FWRITE, NOCRED);
vput(fs->lfs_devvp);
/* Complain about page leakage */
if (fs->lfs_pages > 0)
printf("lfs_unmount: still claim %d pages (%d in subsystem)\n",
fs->lfs_pages, lfs_subsys_pages);
/* Free per-mount data structures */
free(fs->lfs_ino_bitmap, M_SEGMENT);
free(fs->lfs_suflags[0], M_SEGMENT);
free(fs->lfs_suflags[1], M_SEGMENT);
free(fs->lfs_suflags, M_SEGMENT);
lfs_free_resblks(fs);
cv_destroy(&fs->lfs_sleeperscv);
cv_destroy(&fs->lfs_diropscv);
cv_destroy(&fs->lfs_stopcv);
cv_destroy(&fs->lfs_nextsegsleep);
rw_destroy(&fs->lfs_fraglock);
rw_destroy(&fs->lfs_iflock);
kmem_free(fs, sizeof(struct lfs));
kmem_free(ump, sizeof(*ump));
mp->mnt_data = NULL;
mp->mnt_flag &= ~MNT_LOCAL;
return (error);
}
static int
lfs_flushfiles(struct mount *mp, int flags)
{
struct lwp *l = curlwp;
struct ulfsmount *ump;
struct lfs *fs;
struct vnode *vp;
int error;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
/* Two checkpoints */
if (!fs->lfs_ronly) {
lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
lfs_segwrite(mp, SEGM_CKP | SEGM_SYNC);
}
/* wake up the cleaner so it can die */
/* XXX: shouldn't this be *after* the error cases below? */
lfs_wakeup_cleaner(fs);
mutex_enter(&lfs_lock);
while (fs->lfs_sleepers)
cv_wait(&fs->lfs_sleeperscv, &lfs_lock);
mutex_exit(&lfs_lock);
#ifdef LFS_EXTATTR
if (ump->um_fstype == ULFS1) {
if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_STARTED) {
ulfs_extattr_stop(mp, curlwp);
}
if (ump->um_extattr.uepm_flags & ULFS_EXTATTR_UEPM_INITIALIZED) {
ulfs_extattr_uepm_destroy(&ump->um_extattr);
mp->mnt_flag &= ~MNT_EXTATTR;
}
}
#endif
#ifdef LFS_QUOTA
if ((error = lfsquota1_umount(mp, flags)) != 0)
return (error);
#endif
#ifdef LFS_QUOTA2
if ((error = lfsquota2_umount(mp, flags)) != 0)
return (error);
#endif
if ((error = vflush(mp, fs->lfs_ivnode, flags)) != 0)
return (error);
if ((error = VFS_SYNC(mp, 1, l->l_cred)) != 0)
return (error);
vp = fs->lfs_ivnode;
mutex_enter(vp->v_interlock);
if (LIST_FIRST(&vp->v_dirtyblkhd))
panic("lfs_unmount: still dirty blocks on ifile vnode");
mutex_exit(vp->v_interlock);
/* Explicitly write the superblock, to update serial and pflags */
if (!fs->lfs_ronly) {
lfs_sb_setpflags(fs, lfs_sb_getpflags(fs) | LFS_PF_CLEAN);
lfs_writesuper(fs, lfs_sb_getsboff(fs, 0));
lfs_writesuper(fs, lfs_sb_getsboff(fs, 1));
}
mutex_enter(&lfs_lock);
while (fs->lfs_iocount)
mtsleep(&fs->lfs_iocount, PRIBIO + 1, "lfs_umount", 0,
&lfs_lock);
mutex_exit(&lfs_lock);
return 0;
}
/*
* Get file system statistics.
*
* NB: We don't lock to access the superblock here, because it's not
* really that important if we get it wrong.
*/
int
lfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
struct lfs *fs;
struct ulfsmount *ump;
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
sbp->f_bsize = lfs_sb_getbsize(fs);
sbp->f_frsize = lfs_sb_getfsize(fs);
sbp->f_iosize = lfs_sb_getbsize(fs);
sbp->f_blocks = LFS_EST_NONMETA(fs) - VTOI(fs->lfs_ivnode)->i_lfs_effnblks;
sbp->f_bfree = LFS_EST_BFREE(fs);
/*
* XXX this should be lfs_sb_getsize (measured in frags)
* rather than dsize (measured in diskblocks). However,
* getsize needs a format version check (for version 1 it
* needs to be blockstofrags'd) so for the moment I'm going to
* leave this... it won't fire wrongly as frags are at least
* as big as diskblocks.
*/
KASSERT(sbp->f_bfree <= lfs_sb_getdsize(fs));
#if 0
if (sbp->f_bfree < 0)
sbp->f_bfree = 0;
#endif
sbp->f_bresvd = LFS_EST_RSVD(fs);
if (sbp->f_bfree > sbp->f_bresvd)
sbp->f_bavail = sbp->f_bfree - sbp->f_bresvd;
else
sbp->f_bavail = 0;
/* XXX: huh? - dholland 20150728 */
sbp->f_files = lfs_sb_getbfree(fs) / lfs_btofsb(fs, lfs_sb_getibsize(fs))
* LFS_INOPB(fs);
sbp->f_ffree = sbp->f_files - lfs_sb_getnfiles(fs);
sbp->f_favail = sbp->f_ffree;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, mp);
return (0);
}
/*
* Go through the disk queues to initiate sandbagged IO;
* go through the inodes to write those that have been modified;
* initiate the writing of the super block if it has been modified.
*
* Note: we are always called with the filesystem marked `MPBUSY'.
*/
int
lfs_sync(struct mount *mp, int waitfor, kauth_cred_t cred)
{
int error;
struct lfs *fs;
fs = VFSTOULFS(mp)->um_lfs;
if (fs->lfs_ronly)
return 0;
/* Snapshots should not hose the syncer */
/*
* XXX Sync can block here anyway, since we don't have a very
* XXX good idea of how much data is pending. If it's more
* XXX than a segment and lfs_nextseg is close to the end of
* XXX the log, we'll likely block.
*/
mutex_enter(&lfs_lock);
if (fs->lfs_nowrap && lfs_sb_getnextseg(fs) < lfs_sb_getcurseg(fs)) {
mutex_exit(&lfs_lock);
return 0;
}
mutex_exit(&lfs_lock);
lfs_writer_enter(fs, "lfs_dirops");
/* All syncs must be checkpoints until roll-forward is implemented. */
DLOG((DLOG_FLUSH, "lfs_sync at 0x%jx\n",
(uintmax_t)lfs_sb_getoffset(fs)));
error = lfs_segwrite(mp, SEGM_CKP | (waitfor ? SEGM_SYNC : 0));
lfs_writer_leave(fs);
#ifdef LFS_QUOTA
lfs_qsync(mp);
#endif
return (error);
}
/*
* Look up an LFS dinode number to find its incore vnode. If not already
* in core, read it in from the specified device. Return the inode locked.
* Detection and handling of mount points must be done by the calling routine.
*/
int
lfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
int error;
error = vcache_get(mp, &ino, sizeof(ino), vpp);
if (error)
return error;
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
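/*
 * Usage sketch for lfs_vget(); the helper name is an illustrative
 * assumption.  Fetch a vnode by inode number, locked as requested,
 * then unlock and release it with vput() when done.
 */
static int __unused
lfs_vget_example(struct mount *mp, ino_t ino)
{
	struct vnode *vp;
	int error;

	error = lfs_vget(mp, ino, LK_SHARED, &vp);
	if (error)
		return error;
	/* ... inspect VTOI(vp) while the vnode is locked ... */
	vput(vp);		/* drops both the lock and the reference */
	return 0;
}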
/*
* Create a new vnode/inode pair and initialize what fields we can.
*/
static void
lfs_init_vnode(struct ulfsmount *ump, ino_t ino, struct vnode *vp)
{
struct lfs *fs = ump->um_lfs;
struct inode *ip;
union lfs_dinode *dp;
ASSERT_NO_SEGLOCK(fs);
/* Initialize the inode. */
ip = pool_get(&lfs_inode_pool, PR_WAITOK);
memset(ip, 0, sizeof(*ip));
dp = pool_get(&lfs_dinode_pool, PR_WAITOK);
memset(dp, 0, sizeof(*dp));
ip->inode_ext.lfs = pool_get(&lfs_inoext_pool, PR_WAITOK);
memset(ip->inode_ext.lfs, 0, sizeof(*ip->inode_ext.lfs));
ip->i_din = dp;
ip->i_ump = ump;
ip->i_vnode = vp;
ip->i_dev = fs->lfs_dev;
lfs_dino_setinumber(fs, dp, ino);
ip->i_number = ino;
ip->i_lfs = fs;
ip->i_lfs_effnblks = 0;
SPLAY_INIT(&ip->i_lfs_lbtree);
ip->i_lfs_nbtree = 0;
LIST_INIT(&ip->i_lfs_segdhd);
vp->v_tag = VT_LFS;
vp->v_op = lfs_vnodeop_p;
vp->v_data = ip;
}
/*
* Undo lfs_init_vnode().
*/
static void
lfs_deinit_vnode(struct ulfsmount *ump, struct vnode *vp)
{
struct inode *ip = VTOI(vp);
pool_put(&lfs_inoext_pool, ip->inode_ext.lfs);
pool_put(&lfs_dinode_pool, ip->i_din);
pool_put(&lfs_inode_pool, ip);
vp->v_data = NULL;
}
/*
* Read an inode from disk and initialize this vnode / inode pair.
* Caller assures no other thread will try to load this inode.
*/
int
lfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct lfs *fs;
union lfs_dinode *dip;
struct inode *ip;
struct buf *bp;
IFILE *ifp;
struct ulfsmount *ump;
ino_t ino;
daddr_t daddr;
int error, retries;
struct timespec ts;
KASSERT(key_len == sizeof(ino));
memcpy(&ino, key, key_len);
memset(&ts, 0, sizeof ts); /* XXX gcc */
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
/*
* If the filesystem is not completely mounted yet, suspend
* any access requests (wait for roll-forward to complete).
*/
mutex_enter(&lfs_lock);
while ((fs->lfs_flags & LFS_NOTYET) && curproc->p_pid != fs->lfs_rfpid)
mtsleep(&fs->lfs_flags, PRIBIO+1, "lfs_notyet", 0,
&lfs_lock);
mutex_exit(&lfs_lock);
/* Translate the inode number to a disk address. */
if (ino == LFS_IFILE_INUM)
daddr = lfs_sb_getidaddr(fs);
else {
/* XXX bounds-check this too */
LFS_IENTRY(ifp, fs, ino, bp);
daddr = lfs_if_getdaddr(fs, ifp);
if (lfs_sb_getversion(fs) > 1) {
ts.tv_sec = lfs_if_getatime_sec(fs, ifp);
ts.tv_nsec = lfs_if_getatime_nsec(fs, ifp);
}
brelse(bp, 0);
if (daddr == LFS_UNUSED_DADDR)
return (ENOENT);
}
/* Allocate/init new vnode/inode. */
lfs_init_vnode(ump, ino, vp);
ip = VTOI(vp);
/* If the cleaner supplied the inode, use it. */
if (curlwp == fs->lfs_cleaner_thread && fs->lfs_cleaner_hint != NULL &&
fs->lfs_cleaner_hint->bi_lbn == LFS_UNUSED_LBN) {
dip = fs->lfs_cleaner_hint->bi_bp;
if (fs->lfs_is64) {
error = copyin(dip, &ip->i_din->u_64,
sizeof(struct lfs64_dinode));
} else {
error = copyin(dip, &ip->i_din->u_32,
sizeof(struct lfs32_dinode));
}
if (error) {
lfs_deinit_vnode(ump, vp);
return error;
}
KASSERT(ip->i_number == ino);
goto out;
}
/* Read in the disk contents for the inode, copy into the inode. */
retries = 0;
again:
error = bread(fs->lfs_devvp, LFS_FSBTODB(fs, daddr),
(lfs_sb_getversion(fs) == 1 ? lfs_sb_getbsize(fs) : lfs_sb_getibsize(fs)),
0, &bp);
if (error) {
lfs_deinit_vnode(ump, vp);
return error;
}
dip = lfs_ifind(fs, ino, bp);
if (dip == NULL) {
/* Assume write has not completed yet; try again */
brelse(bp, BC_INVAL);
++retries;
if (retries <= LFS_IFIND_RETRIES) {
mutex_enter(&lfs_lock);
if (fs->lfs_iocount) {
DLOG((DLOG_VNODE,
"%s: dinode %d not found, retrying...\n",
__func__, ino));
(void)mtsleep(&fs->lfs_iocount, PRIBIO + 1,
"lfs ifind", 1, &lfs_lock);
} else
retries = LFS_IFIND_RETRIES;
mutex_exit(&lfs_lock);
goto again;
}
#ifdef DEBUG
/* If the seglock is held look at the bpp to see
what is there anyway */
mutex_enter(&lfs_lock);
if (fs->lfs_seglock > 0) {
struct buf **bpp;
union lfs_dinode *dp;
int i;
for (bpp = fs->lfs_sp->bpp;
bpp != fs->lfs_sp->cbpp; ++bpp) {
if ((*bpp)->b_vp == fs->lfs_ivnode &&
bpp != fs->lfs_sp->bpp) {
/* Inode block */
printf("%s: block 0x%" PRIx64 ": ",
__func__, (*bpp)->b_blkno);
for (i = 0; i < LFS_INOPB(fs); i++) {
dp = DINO_IN_BLOCK(fs,
(*bpp)->b_data, i);
if (lfs_dino_getinumber(fs, dp))
printf("%ju ",
(uintmax_t)lfs_dino_getinumber(fs, dp));
}
printf("\n");
}
}
}
mutex_exit(&lfs_lock);
#endif /* DEBUG */
panic("lfs_loadvnode: dinode not found");
}
lfs_copy_dinode(fs, ip->i_din, dip);
brelse(bp, 0);
out:
if (lfs_sb_getversion(fs) > 1) {
lfs_dino_setatime(fs, ip->i_din, ts.tv_sec);
lfs_dino_setatimensec(fs, ip->i_din, ts.tv_nsec);
}
lfs_vinit(mp, &vp);
*new_key = &ip->i_number;
return 0;
}
/*
* Create a new inode and initialize this vnode / inode pair.
*/
int
lfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
struct vattr *vap, kauth_cred_t cred, void *extra,
size_t *key_len, const void **new_key)
{
ino_t ino;
struct inode *ip;
struct ulfsmount *ump;
struct lfs *fs;
int error, mode, gen;
KASSERT(dvp != NULL || vap->va_fileid > 0);
KASSERT(dvp != NULL && dvp->v_mount == mp);
KASSERT(vap->va_type != VNON);
*key_len = sizeof(ino);
ump = VFSTOULFS(mp);
fs = ump->um_lfs;
mode = MAKEIMODE(vap->va_type, vap->va_mode);
/*
* Allocate fresh inode. With "dvp == NULL" take the inode number
* and version from "vap".
*/
if (dvp == NULL) {
ino = vap->va_fileid;
gen = vap->va_gen;
error = lfs_valloc_fixed(fs, ino, gen);
} else {
error = lfs_valloc(dvp, mode, cred, &ino, &gen);
}
if (error)
return error;
/* Attach inode to vnode. */
lfs_init_vnode(ump, ino, vp);
ip = VTOI(vp);
mutex_enter(&lfs_lock);
LFS_SET_UINO(ip, IN_CHANGE);
mutex_exit(&lfs_lock);
/* Note no blocks yet */
ip->i_lfs_hiblk = -1;
/* Set a new generation number for this inode. */
ip->i_gen = gen;
lfs_dino_setgen(fs, ip->i_din, gen);
memset(ip->i_lfs_fragsize, 0,
ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize));
/* Set uid / gid. */
if (cred == NOCRED || cred == FSCRED) {
ip->i_gid = 0;
ip->i_uid = 0;
} else {
ip->i_gid = VTOI(dvp)->i_gid;
ip->i_uid = kauth_cred_geteuid(cred);
}
DIP_ASSIGN(ip, gid, ip->i_gid);
DIP_ASSIGN(ip, uid, ip->i_uid);
#if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
error = lfs_chkiq(ip, 1, cred, 0);
if (error) {
lfs_vfree(dvp, ino, mode);
lfs_deinit_vnode(ump, vp);
return error;
}
#endif
/* Set type and finalize. */
ip->i_flags = 0;
DIP_ASSIGN(ip, flags, 0);
ip->i_mode = mode;
DIP_ASSIGN(ip, mode, mode);
if (vap->va_rdev != VNOVAL) {
/*
* Want to be able to use this to make badblock
* inodes, so don't truncate the dev number.
*/
// XXX clean this up
if (ump->um_fstype == ULFS1)
ip->i_din->u_32.di_rdev = ulfs_rw32(vap->va_rdev,
ULFS_MPNEEDSWAP(fs));
else
ip->i_din->u_64.di_rdev = ulfs_rw64(vap->va_rdev,
ULFS_MPNEEDSWAP(fs));
}
lfs_vinit(mp, &vp);
*new_key = &ip->i_number;
return 0;
}
/*
* File handle to vnode
*/
int
lfs_fhtovp(struct mount *mp, struct fid *fhp, int lktype, struct vnode **vpp)
{
struct lfid lfh;
struct lfs *fs;
if (fhp->fid_len != sizeof(struct lfid))
return EINVAL;
memcpy(&lfh, fhp, sizeof(lfh));
if (lfh.lfid_ino < LFS_IFILE_INUM)
return ESTALE;
fs = VFSTOULFS(mp)->um_lfs;
if (lfh.lfid_ident != lfs_sb_getident(fs))
return ESTALE;
if (lfh.lfid_ino >
((lfs_dino_getsize(fs, VTOI(fs->lfs_ivnode)->i_din) >> lfs_sb_getbshift(fs)) -
lfs_sb_getcleansz(fs) - lfs_sb_getsegtabsz(fs)) * lfs_sb_getifpb(fs))
return ESTALE;
return (ulfs_fhtovp(mp, &lfh.lfid_ufid, lktype, vpp));
}
/*
* Vnode pointer to File handle
*/
/* ARGSUSED */
int
lfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
{
struct inode *ip;
struct lfid lfh;
if (*fh_size < sizeof(struct lfid)) {
*fh_size = sizeof(struct lfid);
return E2BIG;
}
*fh_size = sizeof(struct lfid);
ip = VTOI(vp);
memset(&lfh, 0, sizeof(lfh));
lfh.lfid_len = sizeof(struct lfid);
lfh.lfid_ino = ip->i_number;
lfh.lfid_gen = ip->i_gen;
lfh.lfid_ident = lfs_sb_getident(ip->i_lfs);
memcpy(fhp, &lfh, sizeof(lfh));
return (0);
}
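/*
 * Note on the size handshake above: a caller that does not know the
 * handle size in advance can call with a too-small *fh_size, get E2BIG
 * together with the required size written back, and then retry with a
 * buffer of sizeof(struct lfid); the second call fills in the handle
 * and returns 0.
 */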
/*
* ulfs_bmaparray callback function for writing.
*
* Since blocks will be written to the new segment anyway,
* we don't care about their current disk addresses.
*/
static bool
lfs_issequential_hole(const struct lfs *fs,
daddr_t daddr0, daddr_t daddr1)
{
(void)fs; /* not used */
KASSERT(daddr0 == UNWRITTEN ||
(0 <= daddr0 && daddr0 <= LFS_MAX_DADDR(fs)));
KASSERT(daddr1 == UNWRITTEN ||
(0 <= daddr1 && daddr1 <= LFS_MAX_DADDR(fs)));
/* NOTE: all we want to know here is 'hole or not'. */
/* NOTE: UNASSIGNED is converted to 0 by ulfs_bmaparray. */
/*
* treat UNWRITTENs and all resident blocks as 'contiguous'
*/
if (daddr0 != 0 && daddr1 != 0)
return true;
/*
* are both in a hole?
*/
if (daddr0 == 0 && daddr1 == 0)
return true; /* all holes are 'contiguous' for us. */
return false;
}
/*
* lfs_gop_write functions exactly like genfs_gop_write, except that
* (1) it requires the seglock to be held by its caller, and sp->fip
* to be properly initialized (it will return without re-initializing
* sp->fip, and without calling lfs_writeseg).
* (2) it uses the remaining space in the segment, rather than VOP_BMAP,
* to determine how large a block it can write at once (though it does
* still use VOP_BMAP to find holes in the file);
* (3) it calls lfs_gatherblock instead of VOP_STRATEGY on its blocks
* (leaving lfs_writeseg to deal with the cluster blocks, so we might
* now have clusters of clusters, ick.)
*/
static int
lfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
int i, error, run, haveeof = 0;
int fs_bshift;
vaddr_t kva;
off_t eof, offset, startoffset = 0;
size_t bytes, iobytes, skipbytes;
bool async = (flags & PGO_SYNCIO) == 0;
daddr_t lbn, blkno;
struct vm_page *pg;
struct buf *mbp, *bp;
struct vnode *devvp = VTOI(vp)->i_devvp;
struct inode *ip = VTOI(vp);
struct lfs *fs = ip->i_lfs;
struct segment *sp = fs->lfs_sp;
SEGSUM *ssp;
UVMHIST_FUNC("lfs_gop_write"); UVMHIST_CALLED(ubchist);
const char * failreason = NULL;
ASSERT_SEGLOCK(fs);
/* The Ifile lives in the buffer cache */
KASSERT(vp != fs->lfs_ivnode);
/*
* We don't want to fill the disk before the cleaner has a chance
* to make room for us. If we're in danger of doing that, fail
* with EAGAIN. The caller will have to notice this, unlock
* so the cleaner can run, relock and try again.
*
* We must write everything, however, if our vnode is being
* reclaimed.
*/
mutex_enter(vp->v_interlock);
if (LFS_STARVED_FOR_SEGS(fs) && vdead_check(vp, VDEAD_NOWAIT) == 0) {
mutex_exit(vp->v_interlock);
failreason = "Starved for segs and not flushing vp";
goto tryagain;
}
mutex_exit(vp->v_interlock);
/*
* Sometimes things slip past the filters in lfs_putpages,
* and the pagedaemon tries to write pages---problem is
* that the pagedaemon never acquires the segment lock.
*
* Alternatively, pages that were clean when we called
* genfs_putpages may have become dirty in the meantime. In this
* case the segment header is not properly set up for blocks
* to be added to it.
*
* Unbusy and unclean the pages, and put them on the ACTIVE
* queue under the hypothesis that they couldn't have got here
* unless they were modified *quite* recently.
*
* XXXUBC that last statement is an oversimplification of course.
*/
if (!LFS_SEGLOCK_HELD(fs)) {
failreason = "Seglock not held";
goto tryagain;
}
if (ip->i_lfs_iflags & LFSI_NO_GOP_WRITE) {
failreason = "Inode with no_gop_write";
goto tryagain;
}
if ((pgs[0]->offset & lfs_sb_getbmask(fs)) != 0) {
failreason = "Bad page offset";
goto tryagain;
}
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
GOP_SIZE(vp, vp->v_size, &eof, 0);
haveeof = 1;
if (vp->v_type == VREG)
fs_bshift = vp->v_mount->mnt_fs_bshift;
else
fs_bshift = DEV_BSHIFT;
error = 0;
pg = pgs[0];
startoffset = pg->offset;
KASSERT(eof >= 0);
if (startoffset >= eof) {
failreason = "Offset beyond EOF";
goto tryagain;
} else
bytes = MIN(npages << PAGE_SHIFT, eof - startoffset);
skipbytes = 0;
KASSERT(bytes != 0);
/* Swap PG_DELWRI for PG_PAGEOUT */
for (i = 0; i < npages; i++) {
if (pgs[i]->flags & PG_DELWRI) {
KASSERT(!(pgs[i]->flags & PG_PAGEOUT));
pgs[i]->flags &= ~PG_DELWRI;
pgs[i]->flags |= PG_PAGEOUT;
uvm_pageout_start(1);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
uvm_pagelock(pgs[i]);
uvm_pageunwire(pgs[i]);
uvm_pageunlock(pgs[i]);
rw_exit(vp->v_uobj.vmobjlock);
}
}
/*
* Check to make sure we're starting on a block boundary.
* We'll check later to make sure we always write entire
* blocks (or fragments).
*/
if (startoffset & lfs_sb_getbmask(fs))
printf("%" PRId64 " & %" PRIu64 " = %" PRId64 "\n",
startoffset, lfs_sb_getbmask(fs),
startoffset & lfs_sb_getbmask(fs));
KASSERT((startoffset & lfs_sb_getbmask(fs)) == 0);
if (bytes & lfs_sb_getffmask(fs)) {
printf("lfs_gop_write: asked to write %ld bytes\n", (long)bytes);
panic("lfs_gop_write: non-integer blocks");
}
/*
* We could deadlock here on pager_map with UVMPAGER_MAPIN_WAITOK.
* If we would, write what we have and try again. If we don't
* have anything to write, we'll have to sleep.
*/
ssp = (SEGSUM *)sp->segsum;
if ((kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
(lfs_ss_getnfinfo(fs, ssp) < 1 ?
UVMPAGER_MAPIN_WAITOK : 0))) == 0x0) {
DLOG((DLOG_PAGE, "lfs_gop_write: forcing write\n"));
#if 0
" with nfinfo=%d at offset 0x%jx\n",
(int)lfs_ss_getnfinfo(fs, ssp),
(uintmax_t)lfs_sb_getoffset(fs)));
#endif
lfs_updatemeta(sp);
lfs_release_finfo(fs);
(void) lfs_writeseg(fs, sp);
lfs_acquire_finfo(fs, ip->i_number, ip->i_gen);
/*
* Having given up all of the pager_map we were holding,
* we can now wait for aiodoned to reclaim it for us
* without fear of deadlock.
*/
kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WRITE |
UVMPAGER_MAPIN_WAITOK);
}
mbp = getiobuf(NULL, true);
UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
(uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
mbp->b_bufsize = npages << PAGE_SHIFT;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY|BC_AGE;
mbp->b_iodone = uvm_aio_aiodone;
bp = NULL;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
lbn = offset >> fs_bshift;
error = ulfs_bmaparray(vp, lbn, &blkno, NULL, NULL, &run,
lfs_issequential_hole);
if (error) {
UVMHIST_LOG(ubchist, "ulfs_bmaparray() -> %jd",
error,0,0,0);
skipbytes += bytes;
bytes = 0;
break;
}
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
if (blkno == (daddr_t)-1) {
skipbytes += iobytes;
continue;
}
/*
* Discover how much we can really pack into this buffer.
*/
/* If no room in the current segment, finish it up */
if (sp->sum_bytes_left < sizeof(int32_t) ||
sp->seg_bytes_left < (1 << lfs_sb_getbshift(fs))) {
int vers;
lfs_updatemeta(sp);
vers = lfs_fi_getversion(fs, sp->fip);
lfs_release_finfo(fs);
(void) lfs_writeseg(fs, sp);
lfs_acquire_finfo(fs, ip->i_number, vers);
}
/* Check both for space in segment and space in segsum */
iobytes = MIN(iobytes, (sp->seg_bytes_left >> fs_bshift)
<< fs_bshift);
iobytes = MIN(iobytes, (sp->sum_bytes_left / sizeof(int32_t))
<< fs_bshift);
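/*
 * Example of the clamping above (hypothetical numbers): with 8 KB
 * blocks (fs_bshift = 13), seg_bytes_left = 20000 and
 * sum_bytes_left = 10, both expressions evaluate to two blocks'
 * worth, 16384 bytes, so iobytes can be at most 16384 here.
 */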
KASSERT(iobytes > 0);
/* if it's really one i/o, don't make a second buf */
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
/*
* All the LFS output is done by the segwriter. It
* will increment numoutput by one for all the bufs it
* receives. However this buffer needs one extra to
* account for aiodone.
*/
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
} else {
bp = getiobuf(NULL, true);
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
nestiobuf_setup(mbp, bp, offset - pg->offset, iobytes);
/*
* LFS doesn't like async I/O here, dies with
* an assert in lfs_bwrite(). Is that assert
* valid?  I retained the non-async behaviour when I
* converted this to use nestiobuf --pooka
*/
bp->b_flags &= ~B_ASYNC;
}
/* XXX This is silly ... is this necessary? */
mutex_enter(&bufcache_lock);
mutex_enter(vp->v_interlock);
bgetvp(vp, bp);
mutex_exit(vp->v_interlock);
mutex_exit(&bufcache_lock);
bp->b_lblkno = lfs_lblkno(fs, offset);
bp->b_private = mbp;
if (devvp->v_type == VBLK) {
bp->b_dev = devvp->v_rdev;
}
VOP_BWRITE(bp->b_vp, bp);
while (lfs_gatherblock(sp, bp, NULL))
continue;
}
nestiobuf_done(mbp, skipbytes, error);
if (skipbytes) {
UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
}
UVMHIST_LOG(ubchist, "returning 0", 0,0,0,0);
if (!async) {
/* Start a segment write. */
UVMHIST_LOG(ubchist, "flushing", 0,0,0,0);
mutex_enter(&lfs_lock);
lfs_flush(fs, 0, 1);
mutex_exit(&lfs_lock);
}
if ((sp->seg_flags & SEGM_SINGLE) && lfs_sb_getcurseg(fs) != fs->lfs_startseg)
return EAGAIN;
return (0);
tryagain:
/*
* We can't write the pages, for whatever reason.
* Clean up after ourselves, and make the caller try again.
*/
mutex_enter(vp->v_interlock);
/* Tell why we're here, if we know */
if (failreason != NULL) {
DLOG((DLOG_PAGE, "lfs_gop_write: %s\n", failreason));
}
if (haveeof && startoffset >= eof) {
DLOG((DLOG_PAGE, "lfs_gop_write: ino %d start 0x%" PRIx64
" eof 0x%" PRIx64 " npages=%d\n", VTOI(vp)->i_number,
pgs[0]->offset, eof, npages));
}
for (i = 0; i < npages; i++) {
pg = pgs[i];
if (pg->flags & PG_PAGEOUT)
uvm_pageout_done(1);
uvm_pagelock(pg);
if (pg->flags & PG_DELWRI) {
uvm_pageunwire(pg);
}
uvm_pageactivate(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_DELWRI|PG_PAGEOUT|PG_RELEASED);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
DLOG((DLOG_PAGE, "pg[%d] = %p (vp %p off %" PRIx64 ")\n", i, pg,
vp, pg->offset));
DLOG((DLOG_PAGE, "pg[%d]->flags = %x\n", i, pg->flags));
DLOG((DLOG_PAGE, "pg[%d]->pqflags = %x\n", i, pg->pqflags));
DLOG((DLOG_PAGE, "pg[%d]->uanon = %p\n", i, pg->uanon));
DLOG((DLOG_PAGE, "pg[%d]->uobject = %p\n", i, pg->uobject));
DLOG((DLOG_PAGE, "pg[%d]->wire_count = %d\n", i,
pg->wire_count));
DLOG((DLOG_PAGE, "pg[%d]->loan_count = %d\n", i,
pg->loan_count));
}
uvm_page_unbusy(pgs, npages);
mutex_exit(vp->v_interlock);
return EAGAIN;
}
/*
* finish vnode/inode initialization.
* used by lfs_vget.
*/
void
lfs_vinit(struct mount *mp, struct vnode **vpp)
{
struct vnode *vp = *vpp;
struct inode *ip = VTOI(vp);
struct ulfsmount *ump = VFSTOULFS(mp);
struct lfs *fs = ump->um_lfs;
int i;
ip->i_mode = lfs_dino_getmode(fs, ip->i_din);
ip->i_nlink = lfs_dino_getnlink(fs, ip->i_din);
ip->i_lfs_osize = ip->i_size = lfs_dino_getsize(fs, ip->i_din);
ip->i_flags = lfs_dino_getflags(fs, ip->i_din);
ip->i_gen = lfs_dino_getgen(fs, ip->i_din);
ip->i_uid = lfs_dino_getuid(fs, ip->i_din);
ip->i_gid = lfs_dino_getgid(fs, ip->i_din);
ip->i_lfs_effnblks = lfs_dino_getblocks(fs, ip->i_din);
ip->i_lfs_odnlink = lfs_dino_getnlink(fs, ip->i_din);
/*
* Initialize the vnode from the inode, check for aliases. In all
* cases re-init ip, the underlying vnode/inode may have changed.
*/
ulfs_vinit(mp, lfs_specop_p, lfs_fifoop_p, &vp);
ip = VTOI(vp);
memset(ip->i_lfs_fragsize, 0, ULFS_NDADDR * sizeof(*ip->i_lfs_fragsize));
if (vp->v_type != VLNK || ip->i_size >= ip->i_lfs->um_maxsymlinklen) {
#ifdef DEBUG
for (i = (ip->i_size + lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs);
i < ULFS_NDADDR; i++) {
if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
i == 0)
continue;
if (lfs_dino_getdb(fs, ip->i_din, i) != 0) {
lfs_dump_dinode(fs, ip->i_din);
panic("inconsistent inode (direct)");
}
}
for ( ; i < ULFS_NDADDR + ULFS_NIADDR; i++) {
if (lfs_dino_getib(fs, ip->i_din, i - ULFS_NDADDR) != 0) {
lfs_dump_dinode(fs, ip->i_din);
panic("inconsistent inode (indirect)");
}
}
#endif /* DEBUG */
for (i = 0; i < ULFS_NDADDR; i++)
if (lfs_dino_getdb(fs, ip->i_din, i) != 0)
ip->i_lfs_fragsize[i] = lfs_blksize(fs, ip, i);
}
KASSERTMSG((vp->v_type != VNON),
"lfs_vinit: ino %llu is type VNON! (ifmt=%o)\n",
(unsigned long long)ip->i_number,
(ip->i_mode & LFS_IFMT) >> 12);
/*
* Finish inode initialization now that aliasing has been resolved.
*/
ip->i_devvp = fs->lfs_devvp;
vref(ip->i_devvp);
#if defined(LFS_QUOTA) || defined(LFS_QUOTA2)
ulfsquota_init(ip);
#endif
genfs_node_init(vp, &lfs_genfsops);
uvm_vnp_setsize(vp, ip->i_size);
/* Initialize hiblk from file size */
ip->i_lfs_hiblk = lfs_lblkno(ip->i_lfs, ip->i_size + lfs_sb_getbsize(ip->i_lfs) - 1) - 1;
*vpp = vp;
}
/*
* Resize the filesystem to contain the specified number of segments.
*/
int
lfs_resize_fs(struct lfs *fs, int newnsegs)
{
SEGUSE *sup;
CLEANERINFO *cip;
struct buf *bp, *obp;
daddr_t olast, nlast, ilast, noff, start, end;
struct vnode *ivp;
struct inode *ip;
int error, badnews, inc, oldnsegs;
int sbbytes, csbbytes, gain, cgain;
int i;
/* Only support v2 and up */
if (lfs_sb_getversion(fs) < 2)
return EOPNOTSUPP;
/* If we're doing nothing, do it fast */
oldnsegs = lfs_sb_getnseg(fs);
if (newnsegs == oldnsegs)
return 0;
/* We always have to have two superblocks */
if (newnsegs <= lfs_dtosn(fs, lfs_sb_getsboff(fs, 1)))
/* XXX this error code is rather nonsense */
return EFBIG;
ivp = fs->lfs_ivnode;
ip = VTOI(ivp);
error = 0;
/* Take the segment lock so no one else calls lfs_newseg() */
lfs_seglock(fs, SEGM_PROT);
/*
* Make sure the segments we're going to be losing, if any,
* are in fact empty. We hold the seglock, so their status
* cannot change underneath us. Count the superblocks we lose,
* while we're at it.
*/
sbbytes = csbbytes = 0;
cgain = 0;
for (i = newnsegs; i < oldnsegs; i++) {
LFS_SEGENTRY(sup, fs, i, bp);
badnews = sup->su_nbytes || !(sup->su_flags & SEGUSE_INVAL);
if (sup->su_flags & SEGUSE_SUPERBLOCK)
sbbytes += LFS_SBPAD;
if (!(sup->su_flags & SEGUSE_DIRTY)) {
++cgain;
if (sup->su_flags & SEGUSE_SUPERBLOCK)
csbbytes += LFS_SBPAD;
}
brelse(bp, 0);
if (badnews) {
error = EBUSY;
goto out;
}
}
/* Note old and new segment table endpoints, and old ifile size */
olast = lfs_sb_getcleansz(fs) + lfs_sb_getsegtabsz(fs);
nlast = howmany(newnsegs, lfs_sb_getsepb(fs)) + lfs_sb_getcleansz(fs);
ilast = ivp->v_size >> lfs_sb_getbshift(fs);
noff = nlast - olast;
/*
* Make sure no one can use the Ifile while we change it around.
* Even after taking the iflock we need to make sure no one still
* is holding Ifile buffers, so we get each one, to drain them.
* (XXX this could be done better.)
*/
rw_enter(&fs->lfs_iflock, RW_WRITER);
for (i = 0; i < ilast; i++) {
/* XXX what to do if bread fails? */
bread(ivp, i, lfs_sb_getbsize(fs), 0, &bp);
brelse(bp, 0);
}
/* Allocate new Ifile blocks */
for (i = ilast; i < ilast + noff; i++) {
if (lfs_balloc(ivp, i * lfs_sb_getbsize(fs), lfs_sb_getbsize(fs), NOCRED, 0,
&bp) != 0)
panic("balloc extending ifile");
memset(bp->b_data, 0, lfs_sb_getbsize(fs));
VOP_BWRITE(bp->b_vp, bp);
}
/* Register new ifile size */
ip->i_size += noff * lfs_sb_getbsize(fs);
lfs_dino_setsize(fs, ip->i_din, ip->i_size);
uvm_vnp_setsize(ivp, ip->i_size);
/* Copy the inode table to its new position */
if (noff != 0) {
if (noff < 0) {
start = nlast;
end = ilast + noff;
inc = 1;
} else {
start = ilast + noff - 1;
end = nlast - 1;
inc = -1;
}
for (i = start; i != end; i += inc) {
if (bread(ivp, i, lfs_sb_getbsize(fs),
B_MODIFY, &bp) != 0)
panic("resize: bread dst blk failed");
if (bread(ivp, i - noff, lfs_sb_getbsize(fs),
0, &obp))
panic("resize: bread src blk failed");
memcpy(bp->b_data, obp->b_data, lfs_sb_getbsize(fs));
VOP_BWRITE(bp->b_vp, bp);
brelse(obp, 0);
}
}
/* If we are expanding, write the new empty SEGUSE entries */
if (newnsegs > oldnsegs) {
for (i = oldnsegs; i < newnsegs; i++) {
if ((error = bread(ivp, i / lfs_sb_getsepb(fs) +
lfs_sb_getcleansz(fs), lfs_sb_getbsize(fs),
B_MODIFY, &bp)) != 0)
panic("lfs: ifile read: %d", error);
while ((i + 1) % lfs_sb_getsepb(fs) && i < newnsegs) {
sup = &((SEGUSE *)bp->b_data)[i % lfs_sb_getsepb(fs)];
memset(sup, 0, sizeof(*sup));
i++;
}
VOP_BWRITE(bp->b_vp, bp);
}
}
/* Zero out unused superblock offsets */
for (i = 2; i < LFS_MAXNUMSB; i++)
if (lfs_dtosn(fs, lfs_sb_getsboff(fs, i)) >= newnsegs)
lfs_sb_setsboff(fs, i, 0x0);
/*
* Correct superblock entries that depend on fs size.
* The computations of these are as follows:
*
* size = lfs_segtod(fs, nseg)
* dsize = lfs_segtod(fs, nseg - minfreeseg) - lfs_btofsb(#super * LFS_SBPAD)
* bfree = dsize - lfs_btofsb(fs, bsize * nseg / 2) - blocks_actually_used
* avail = lfs_segtod(fs, nclean) - lfs_btofsb(#clean_super * LFS_SBPAD)
* + (lfs_segtod(fs, 1) - (offset - curseg))
* - lfs_segtod(fs, minfreeseg - (minfreeseg / 2))
*
* XXX - we should probably adjust minfreeseg as well.
*/
gain = (newnsegs - oldnsegs);
lfs_sb_setnseg(fs, newnsegs);
lfs_sb_setsegtabsz(fs, nlast - lfs_sb_getcleansz(fs));
lfs_sb_addsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)));
lfs_sb_adddsize(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes));
lfs_sb_addbfree(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)) - lfs_btofsb(fs, sbbytes)
- gain * lfs_btofsb(fs, lfs_sb_getbsize(fs) / 2));
if (gain > 0) {
lfs_sb_addnclean(fs, gain);
lfs_sb_addavail(fs, gain * lfs_btofsb(fs, lfs_sb_getssize(fs)));
} else {
lfs_sb_subnclean(fs, cgain);
lfs_sb_subavail(fs, cgain * lfs_btofsb(fs, lfs_sb_getssize(fs)) -
lfs_btofsb(fs, csbbytes));
}
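/*
 * Worked example for the adjustments above (hypothetical numbers):
 * with 1 MB segments (ssize), 8 KB blocks (bsize), gain = +4 and no
 * superblocks among the affected segments (sbbytes = csbbytes = 0),
 * size and dsize each grow by 4 * lfs_btofsb(fs, 1 MB), bfree grows
 * by the same amount minus 4 * lfs_btofsb(fs, 4 KB), and nclean and
 * avail gain four clean segments' worth of space.
 */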
/* Resize segment flag cache */
fs->lfs_suflags[0] = realloc(fs->lfs_suflags[0],
lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK);
fs->lfs_suflags[1] = realloc(fs->lfs_suflags[1],
lfs_sb_getnseg(fs) * sizeof(u_int32_t), M_SEGMENT, M_WAITOK);
for (i = oldnsegs; i < newnsegs; i++)
fs->lfs_suflags[0][i] = fs->lfs_suflags[1][i] = 0x0;
/* Truncate Ifile if necessary */
if (noff < 0)
lfs_truncate(ivp, ivp->v_size + (noff << lfs_sb_getbshift(fs)), 0,
NOCRED);
/* Update cleaner info so the cleaner can die */
/* XXX what to do if bread fails? */
bread(ivp, 0, lfs_sb_getbsize(fs), B_MODIFY, &bp);
cip = bp->b_data;
lfs_ci_setclean(fs, cip, lfs_sb_getnclean(fs));
lfs_ci_setdirty(fs, cip, lfs_sb_getnseg(fs) - lfs_sb_getnclean(fs));
VOP_BWRITE(bp->b_vp, bp);
/* Let Ifile accesses proceed */
rw_exit(&fs->lfs_iflock);
out:
lfs_segunlock(fs);
return error;
}
/*
* Extended attribute dispatch
*/
int
lfs_extattrctl(struct mount *mp, int cmd, struct vnode *vp,
int attrnamespace, const char *attrname)
{
#ifdef LFS_EXTATTR
struct ulfsmount *ump;
ump = VFSTOULFS(mp);
if (ump->um_fstype == ULFS1) {
return ulfs_extattrctl(mp, cmd, vp, attrnamespace, attrname);
}
#endif
return vfs_stdextattrctl(mp, cmd, vp, attrnamespace, attrname);
}
/* $NetBSD: exec_elf.c,v 1.105 2023/08/17 06:58:26 rin Exp $ */
/*-
* Copyright (c) 1994, 2000, 2005, 2015, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas and Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: exec_elf.c,v 1.105 2023/08/17 06:58:26 rin Exp $");
#ifdef _KERNEL_OPT
#include "opt_pax.h"
#endif /* _KERNEL_OPT */
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_elf.h>
#include <sys/syscall.h>
#include <sys/signalvar.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <machine/reg.h>
#include <compat/common/compat_util.h>
#include <sys/pax.h>
#include <uvm/uvm_param.h>
#define elf_check_header ELFNAME(check_header)
#define elf_copyargs ELFNAME(copyargs)
#define elf_populate_auxv ELFNAME(populate_auxv)
#define elf_load_interp ELFNAME(load_interp)
#define elf_load_psection ELFNAME(load_psection)
#define exec_elf_makecmds ELFNAME2(exec,makecmds)
#define netbsd_elf_signature ELFNAME2(netbsd,signature)
#define netbsd_elf_note ELFNAME2(netbsd,note)
#define netbsd_elf_probe ELFNAME2(netbsd,probe)
#define coredump ELFNAMEEND(coredump)
#define elf_free_emul_arg ELFNAME(free_emul_arg)
static int
elf_load_interp(struct lwp *, struct exec_package *, char *,
struct exec_vmcmd_set *, u_long *, Elf_Addr *);
static int
elf_load_psection(struct exec_vmcmd_set *, struct vnode *, const Elf_Phdr *,
Elf_Addr *, u_long *, int);
int netbsd_elf_signature(struct lwp *, struct exec_package *, Elf_Ehdr *);
int netbsd_elf_note(struct exec_package *, const Elf_Nhdr *, const char *,
const char *);
int netbsd_elf_probe(struct lwp *, struct exec_package *, void *, char *,
vaddr_t *);
static void elf_free_emul_arg(void *);
#ifdef DEBUG_ELF
#define DPRINTF(a, ...) printf("%s: " a "\n", __func__, ##__VA_ARGS__)
#else
#define DPRINTF(a, ...)
#endif
/* round up and down to the given alignment (assumed to be a power of two). */
#define ELF_ROUND(a, b) (((a) + (b) - 1) & ~((b) - 1))
#define ELF_TRUNC(a, b) ((a) & ~((b) - 1))
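/*
 * For illustration (power-of-two alignment assumed):
 * ELF_ROUND(0x1234, 0x1000) == 0x2000, ELF_TRUNC(0x1234, 0x1000) == 0x1000,
 * and already-aligned values are unchanged, e.g.
 * ELF_ROUND(0x2000, 0x1000) == 0x2000.
 */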
static int
elf_placedynexec(struct exec_package *epp, Elf_Ehdr *eh, Elf_Phdr *ph)
{
Elf_Addr align, offset;
int i;
for (align = 1, i = 0; i < eh->e_phnum; i++)
if (ph[i].p_type == PT_LOAD && ph[i].p_align > align)
align = ph[i].p_align;
offset = (Elf_Addr)pax_aslr_exec_offset(epp, align);
if (offset < epp->ep_vm_minaddr)
offset = roundup(epp->ep_vm_minaddr, align);
if ((offset & (align - 1)) != 0) {
DPRINTF("bad offset=%#jx align=%#jx",
(uintmax_t)offset, (uintmax_t)align);
return EINVAL;
}
for (i = 0; i < eh->e_phnum; i++)
ph[i].p_vaddr += offset;
epp->ep_entryoffset = offset;
eh->e_entry += offset;
return 0;
}
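/*
 * Example of the placement above (hypothetical values): if the largest
 * PT_LOAD alignment is 0x1000, ep_vm_minaddr is 0x200000 and the PaX
 * ASLR hook returns 0x153000, the offset is rounded up to 0x200000;
 * every p_vaddr and e_entry is then shifted by that amount.
 */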
int
elf_populate_auxv(struct lwp *l, struct exec_package *pack, char **stackp)
{
size_t len, vlen;
AuxInfo ai[ELF_AUX_ENTRIES], *a, *execname;
struct elf_args *ap;
char *path = l->l_proc->p_path;
int error;
execname = NULL;
a = ai;
memset(ai, 0, sizeof(ai));
/*
* Push extra arguments on the stack needed by dynamically
* linked binaries
*/
if ((ap = (struct elf_args *)pack->ep_emul_arg)) {
struct vattr *vap = pack->ep_vap;
a->a_type = AT_PHDR;
a->a_v = ap->arg_phaddr;
a++;
a->a_type = AT_PHENT;
a->a_v = ap->arg_phentsize;
a++;
a->a_type = AT_PHNUM;
a->a_v = ap->arg_phnum;
a++;
a->a_type = AT_PAGESZ;
a->a_v = PAGE_SIZE;
a++;
a->a_type = AT_BASE;
a->a_v = ap->arg_interp;
a++;
a->a_type = AT_FLAGS;
a->a_v = 0;
a++;
a->a_type = AT_ENTRY;
a->a_v = ap->arg_entry;
a++;
a->a_type = AT_STACKBASE;
a->a_v = l->l_proc->p_stackbase;
a++;
a->a_type = AT_EUID;
if (vap->va_mode & S_ISUID)
a->a_v = vap->va_uid;
else
a->a_v = kauth_cred_geteuid(l->l_cred);
a++;
a->a_type = AT_RUID;
a->a_v = kauth_cred_getuid(l->l_cred);
a++;
a->a_type = AT_EGID;
if (vap->va_mode & S_ISGID)
a->a_v = vap->va_gid;
else
a->a_v = kauth_cred_getegid(l->l_cred);
a++;
a->a_type = AT_RGID;
a->a_v = kauth_cred_getgid(l->l_cred);
a++;
/* "/" means fexecve(2) could not resolve the pathname */
if (path[0] == '/' && path[1] != '\0') {
execname = a;
a->a_type = AT_SUN_EXECNAME;
a++;
}
exec_free_emul_arg(pack);
}
a->a_type = AT_NULL;
a->a_v = 0;
a++;
vlen = (a - ai) * sizeof(ai[0]);
KASSERT(vlen <= sizeof(ai));
if (execname) {
execname->a_v = (uintptr_t)(*stackp + vlen);
len = strlen(path) + 1;
if ((error = copyout(path, (*stackp + vlen), len)) != 0)
return error;
len = ALIGN(len);
} else {
len = 0;
}
if ((error = copyout(ai, *stackp, vlen)) != 0)
return error;
*stackp += vlen + len;
return 0;
}
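#if 0
/*
 * Illustrative sketch only (not compiled): how a consumer such as a
 * dynamic linker might walk the vector written above.  It assumes the
 * same AuxInfo layout (a_type/a_v) used in this file; the helper name
 * is hypothetical.
 */
static Elf_Addr
elf_aux_lookup(const AuxInfo *aux, unsigned long type)
{
/* Scan entries until the AT_NULL terminator appended above. */
for (; aux->a_type != AT_NULL; aux++)
if (aux->a_type == type)
return aux->a_v;
return 0;
}
#endif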
/*
* Copy arguments onto the stack in the normal way, but add some
* extra information in case of dynamic binding.
*/
int
elf_copyargs(struct lwp *l, struct exec_package *pack,
struct ps_strings *arginfo, char **stackp, void *argp)
{
int error;
if ((error = copyargs(l, pack, arginfo, stackp, argp)) != 0)
return error;
return elf_populate_auxv(l, pack, stackp);
}
/*
* elf_check_header():
*
* Check header for validity; return 0 if ok, ENOEXEC if error
*/
int
elf_check_header(Elf_Ehdr *eh)
{
if (memcmp(eh->e_ident, ELFMAG, SELFMAG) != 0 ||
eh->e_ident[EI_CLASS] != ELFCLASS) {
DPRINTF("bad magic e_ident[EI_MAG0,EI_MAG3] %#x%x%x%x, "
"e_ident[EI_CLASS] %#x", eh->e_ident[EI_MAG0],
eh->e_ident[EI_MAG1], eh->e_ident[EI_MAG2],
eh->e_ident[EI_MAG3], eh->e_ident[EI_CLASS]);
return ENOEXEC;
}
switch (eh->e_machine) {
ELFDEFNNAME(MACHDEP_ID_CASES)
default:
DPRINTF("bad machine %#x", eh->e_machine);
return ENOEXEC;
}
if (ELF_EHDR_FLAGS_OK(eh) == 0) {
DPRINTF("bad flags %#x", eh->e_flags);
return ENOEXEC;
}
if (eh->e_shnum > ELF_MAXSHNUM || eh->e_phnum > ELF_MAXPHNUM) {
DPRINTF("bad shnum/phnum %#x/%#x", eh->e_shnum, eh->e_phnum);
return ENOEXEC;
}
return 0;
}
/*
* elf_load_psection():
*
* Load a psection at the appropriate address
*/
static int
elf_load_psection(struct exec_vmcmd_set *vcset, struct vnode *vp,
const Elf_Phdr *ph, Elf_Addr *addr, u_long *size, int flags)
{
u_long msize, psize, rm, rf;
long diff, offset;
int vmprot = 0;
KASSERT(VOP_ISLOCKED(vp) != LK_NONE);
/*
* If the user specified an address, then we load there.
*/
if (*addr == ELFDEFNNAME(NO_ADDR))
*addr = ph->p_vaddr;
if (ph->p_align > 1) {
/*
* Make sure we are virtually aligned as we are supposed to be.
*/
diff = ph->p_vaddr - ELF_TRUNC(ph->p_vaddr, ph->p_align);
if (*addr - diff != ELF_TRUNC(*addr, ph->p_align)) {
DPRINTF("bad alignment %#jx != %#jx\n",
(uintptr_t)(*addr - diff),
(uintptr_t)ELF_TRUNC(*addr, ph->p_align));
return EINVAL;
}
/*
* But make sure to not map any pages before the start of the
* psection by limiting the difference to within a page.
*/
diff &= PAGE_MASK;
} else
diff = 0;
vmprot |= (ph->p_flags & PF_R) ? VM_PROT_READ : 0;
vmprot |= (ph->p_flags & PF_W) ? VM_PROT_WRITE : 0;
vmprot |= (ph->p_flags & PF_X) ? VM_PROT_EXECUTE : 0;
/*
* Adjust everything so it all starts on a page boundary.
*/
*addr -= diff;
offset = ph->p_offset - diff;
*size = ph->p_filesz + diff;
msize = ph->p_memsz + diff;
if (ph->p_align >= PAGE_SIZE) {
if ((ph->p_flags & PF_W) != 0) {
/*
* Because the pagedvn pager can't handle zero fill
* of the last data page if it's not page aligned, we
* map the last page readvn.
*/
psize = trunc_page(*size);
} else {
psize = round_page(*size);
}
} else {
psize = *size;
}
if (psize > 0) {
NEW_VMCMD2(vcset, ph->p_align < PAGE_SIZE ?
vmcmd_map_readvn : vmcmd_map_pagedvn, psize, *addr, vp,
offset, vmprot, flags);
flags &= VMCMD_RELATIVE;
}
if (psize < *size) {
NEW_VMCMD2(vcset, vmcmd_map_readvn, *size - psize,
*addr + psize, vp, offset + psize, vmprot, flags);
}
/*
* Check if we need to extend the size of the segment: does
* the bss extend past the next page boundary?
*/
rm = round_page(*addr + msize);
rf = round_page(*addr + *size);
if (rm != rf) {
NEW_VMCMD2(vcset, vmcmd_map_zero, rm - rf, rf, NULLVP,
0, vmprot, flags & VMCMD_RELATIVE);
*size = msize;
}
return 0;
}
/*
* elf_load_interp():
*
* Load an interpreter pointed to by path.
*/
static int
elf_load_interp(struct lwp *l, struct exec_package *epp, char *path,
struct exec_vmcmd_set *vcset, u_long *entryoff, Elf_Addr *last)
{
int error, i;
struct vnode *vp;
Elf_Ehdr eh;
Elf_Phdr *ph = NULL;
const Elf_Phdr *base_ph;
const Elf_Phdr *last_ph;
u_long phsize;
Elf_Addr addr = *last;
struct proc *p;
bool use_topdown;
p = l->l_proc;
KASSERT(p->p_vmspace);
KASSERT(p->p_vmspace != proc0.p_vmspace);
#ifdef __USE_TOPDOWN_VM
use_topdown = epp->ep_flags & EXEC_TOPDOWN_VM;
#else
use_topdown = false;
#endif
/*
* 1. open file
* 2. read filehdr
* 3. map text, data, and bss out of it using VM_*
*/
vp = epp->ep_interp;
if (vp == NULL) {
error = emul_find_interp(l, epp, path);
if (error != 0)
return error;
vp = epp->ep_interp;
}
/* We'll tidy this ourselves - otherwise we have locking issues */
epp->ep_interp = NULL;
vn_lock(vp, LK_SHARED | LK_RETRY);
/*
* Similarly, if it's not marked as executable, or it's not a regular
* file, we don't allow it to be used.
*/
if (vp->v_type != VREG) {
error = EACCES;
goto bad;
}
if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
goto bad;
/*
* Check mount point. Though we're not trying to exec this binary,
* we will be executing code from it, so if the mount point
* disallows execution or set-id-ness, we punt or kill the set-id.
*/
if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
error = EACCES;
goto bad;
}
if (vp->v_mount->mnt_flag & MNT_NOSUID)
epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
error = vn_marktext(vp);
if (error)
goto bad;
error = exec_read(l, vp, 0, &eh, sizeof(eh), IO_NODELOCKED);
if (error != 0)
goto bad;
if ((error = elf_check_header(&eh)) != 0)
goto bad;
if (eh.e_type != ET_DYN || eh.e_phnum == 0) {
DPRINTF("bad interpreter type %#x", eh.e_type);
error = ENOEXEC;
goto bad;
}
phsize = eh.e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, vp, eh.e_phoff, ph, phsize, IO_NODELOCKED);
if (error != 0)
goto bad;
#ifdef ELF_INTERP_NON_RELOCATABLE
/*
* Evil hack: Only MIPS should be non-relocatable, and the
* psections should have a high address (typically 0x5ffe0000).
* If it's now relocatable, it should be linked at 0 and the
* psections should have zeros in the upper part of the address.
* Otherwise, force the load at the linked address.
*/
if (*last == ELF_LINK_ADDR && (ph->p_vaddr & 0xffff0000) == 0)
*last = ELFDEFNNAME(NO_ADDR);
#endif
/*
* If no position to load the interpreter was set by a probe
* function, pick the same address that a non-fixed mmap(0, ..)
* would (i.e. something safely out of the way).
*/
if (*last == ELFDEFNNAME(NO_ADDR)) {
u_long limit = 0;
/*
* Find the start and ending addresses of the psections to
* be loaded. This will give us the size.
*/
for (i = 0, base_ph = NULL; i < eh.e_phnum; i++) {
if (ph[i].p_type == PT_LOAD) {
u_long psize = ph[i].p_vaddr + ph[i].p_memsz;
if (base_ph == NULL)
base_ph = &ph[i];
if (psize > limit)
limit = psize;
}
}
if (base_ph == NULL) {
DPRINTF("no interpreter loadable sections");
error = ENOEXEC;
goto bad;
}
/*
* Now compute the size and load address.
*/
addr = (*epp->ep_esch->es_emul->e_vm_default_addr)(p,
epp->ep_daddr,
round_page(limit) - trunc_page(base_ph->p_vaddr),
use_topdown);
addr += (Elf_Addr)pax_aslr_rtld_offset(epp, base_ph->p_align,
use_topdown);
} else {
addr = *last; /* may be ELF_LINK_ADDR */
}
/*
* Load all the necessary sections
*/
for (i = 0, base_ph = NULL, last_ph = NULL; i < eh.e_phnum; i++) {
switch (ph[i].p_type) {
case PT_LOAD: {
u_long size;
int flags;
if (base_ph == NULL) {
/*
* First encountered psection is always the
* base psection. Make sure it's aligned
* properly (align down for topdown and align
* upwards for not topdown).
*/
base_ph = &ph[i];
flags = VMCMD_BASE;
if (addr == ELF_LINK_ADDR)
addr = ph[i].p_vaddr;
if (use_topdown)
addr = ELF_TRUNC(addr, ph[i].p_align);
else
addr = ELF_ROUND(addr, ph[i].p_align);
} else {
u_long limit = round_page(last_ph->p_vaddr
+ last_ph->p_memsz);
u_long base = trunc_page(ph[i].p_vaddr);
/*
* If there is a gap in between the psections,
* map it as inaccessible so nothing else
* mmap'ed will be placed there.
*/
if (limit != base) {
NEW_VMCMD2(vcset, vmcmd_map_zero,
base - limit,
limit - base_ph->p_vaddr, NULLVP,
0, VM_PROT_NONE, VMCMD_RELATIVE);
}
addr = ph[i].p_vaddr - base_ph->p_vaddr;
flags = VMCMD_RELATIVE;
}
last_ph = &ph[i];
if ((error = elf_load_psection(vcset, vp, &ph[i], &addr,
&size, flags)) != 0)
goto bad;
/*
* If entry is within this psection then this
* must contain the .text section. *entryoff is
* relative to the base psection.
*/
if (eh.e_entry >= ph[i].p_vaddr &&
eh.e_entry < (ph[i].p_vaddr + size)) {
*entryoff = eh.e_entry - base_ph->p_vaddr;
}
addr += size;
break;
}
default:
break;
}
}
kmem_free(ph, phsize);
/*
* This value is ignored if TOPDOWN.
*/
*last = addr;
vput(vp);
return 0;
bad:
if (ph != NULL)
kmem_free(ph, phsize);
vput(vp);
return error;
}
/*
* exec_elf_makecmds(): Prepare an Elf binary's exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
int
exec_elf_makecmds(struct lwp *l, struct exec_package *epp)
{
Elf_Ehdr *eh = epp->ep_hdr;
Elf_Phdr *ph, *pp;
Elf_Addr phdr = 0, computed_phdr = 0, pos = 0, end_text = 0;
int error, i;
char *interp = NULL;
u_long phsize;
struct elf_args *ap;
bool is_dyn = false;
if (epp->ep_hdrvalid < sizeof(Elf_Ehdr)) {
DPRINTF("small header %#x", epp->ep_hdrvalid);
return ENOEXEC;
}
if ((error = elf_check_header(eh)) != 0)
return error;
if (eh->e_type == ET_DYN)
/* PIE, and some libs have an entry point */
is_dyn = true;
else if (eh->e_type != ET_EXEC) {
DPRINTF("bad type %#x", eh->e_type);
return ENOEXEC;
}
if (eh->e_phnum == 0) {
DPRINTF("no program headers");
return ENOEXEC;
}
/* XXX only LK_EXCLUSIVE to match all others - allow spinning */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
error = vn_marktext(epp->ep_vp);
if (error) {
VOP_UNLOCK(epp->ep_vp);
return error;
}
/*
* Allocate space to hold all the program headers, and read them
* from the file
*/
phsize = eh->e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, epp->ep_vp, eh->e_phoff, ph, phsize,
IO_NODELOCKED);
if (error != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
epp->ep_taddr = epp->ep_tsize = ELFDEFNNAME(NO_ADDR);
epp->ep_daddr = epp->ep_dsize = ELFDEFNNAME(NO_ADDR);
for (i = 0; i < eh->e_phnum; i++) {
pp = &ph[i];
if (pp->p_type == PT_INTERP) {
if (pp->p_filesz < 2 || pp->p_filesz > MAXPATHLEN) {
DPRINTF("bad interpreter namelen %#jx",
(uintmax_t)pp->p_filesz);
error = ENOEXEC;
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
interp = PNBUF_GET();
error = exec_read(l, epp->ep_vp, pp->p_offset, interp,
pp->p_filesz, IO_NODELOCKED);
if (error != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/* Ensure interp is NUL-terminated and of the expected length */
if (strnlen(interp, pp->p_filesz) != pp->p_filesz - 1) {
DPRINTF("bad interpreter name");
error = ENOEXEC;
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
break;
}
}
/*
* On the same architecture, we may be emulating different systems.
* See which one will accept this executable.
*
* Probe functions would normally see if the interpreter (if any)
* exists. Emulation packages may possibly replace the interpreter in
* interp with a changed path (/emul/xxx/<path>).
*/
pos = ELFDEFNNAME(NO_ADDR);
if (epp->ep_esch->u.elf_probe_func) {
vaddr_t startp = (vaddr_t)pos;
error = (*epp->ep_esch->u.elf_probe_func)(l, epp, eh, interp,
&startp);
if (error) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
pos = (Elf_Addr)startp;
}
if (is_dyn && (error = elf_placedynexec(epp, eh, ph)) != 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/*
* Load all the necessary sections
*/
for (i = 0; i < eh->e_phnum; i++) {
Elf_Addr addr = ELFDEFNNAME(NO_ADDR);
u_long size = 0;
switch (ph[i].p_type) {
case PT_LOAD:
if ((error = elf_load_psection(&epp->ep_vmcmds,
epp->ep_vp, &ph[i], &addr, &size, VMCMD_FIXED))
!= 0) {
VOP_UNLOCK(epp->ep_vp);
goto bad;
}
/*
* Consider this as text segment, if it is executable.
* If there is more than one text segment, pick the
* largest.
*/
if (ph[i].p_flags & PF_X) {
if (epp->ep_taddr == ELFDEFNNAME(NO_ADDR) ||
size > epp->ep_tsize) {
epp->ep_taddr = addr;
epp->ep_tsize = size;
}
end_text = addr + size;
} else {
epp->ep_daddr = addr;
epp->ep_dsize = size;
}
if (ph[i].p_offset == 0) {
computed_phdr = ph[i].p_vaddr + eh->e_phoff;
}
break;
case PT_SHLIB:
/* SCO has these sections. */
case PT_INTERP:
/* Already did this one. */
case PT_DYNAMIC:
case PT_NOTE:
break;
case PT_PHDR:
/* Note address of program headers (in text segment) */
phdr = ph[i].p_vaddr;
break;
default:
/*
* Not fatal; we don't need to understand everything.
*/
break;
}
}
/* Now done with the vnode. */
VOP_UNLOCK(epp->ep_vp);
if (epp->ep_vmcmds.evs_used == 0) {
/* No VMCMD; there was no PT_LOAD section, or those
* sections were empty */
DPRINTF("no vmcommands");
error = ENOEXEC;
goto bad;
}
if (epp->ep_daddr == ELFDEFNNAME(NO_ADDR)) {
epp->ep_daddr = round_page(end_text);
epp->ep_dsize = 0;
}
/*
* Check if we found a dynamically linked binary and arrange to load
* its interpreter
*/
if (interp) {
u_int nused = epp->ep_vmcmds.evs_used;
u_long interp_offset = 0;
if ((error = elf_load_interp(l, epp, interp,
&epp->ep_vmcmds, &interp_offset, &pos)) != 0) {
goto bad;
}
if (epp->ep_vmcmds.evs_used == nused) {
/* elf_load_interp() has not set up any new VMCMD */
DPRINTF("no vmcommands for interpreter");
error = ENOEXEC;
goto bad;
}
ap = kmem_alloc(sizeof(*ap), KM_SLEEP);
ap->arg_interp = epp->ep_vmcmds.evs_cmds[nused].ev_addr;
epp->ep_entryoffset = interp_offset;
epp->ep_entry = ap->arg_interp + interp_offset;
PNBUF_PUT(interp);
interp = NULL;
} else {
epp->ep_entry = eh->e_entry;
if (epp->ep_flags & EXEC_FORCEAUX) {
ap = kmem_zalloc(sizeof(*ap), KM_SLEEP);
ap->arg_interp = (vaddr_t)NULL;
} else {
ap = NULL;
}
}
if (ap) {
ap->arg_phaddr = phdr ? phdr : computed_phdr;
ap->arg_phentsize = eh->e_phentsize;
ap->arg_phnum = eh->e_phnum;
ap->arg_entry = eh->e_entry;
epp->ep_emul_arg = ap;
epp->ep_emul_arg_free = elf_free_emul_arg;
}
#ifdef ELF_MAP_PAGE_ZERO
/* Dell SVR4 maps page zero, yeuch! */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, PAGE_SIZE, 0,
epp->ep_vp, 0, VM_PROT_READ);
#endif
error = (*epp->ep_esch->es_setup_stack)(l, epp);
if (error)
goto bad;
kmem_free(ph, phsize);
return 0;
bad:
if (interp)
PNBUF_PUT(interp);
exec_free_emul_arg(epp);
kmem_free(ph, phsize);
kill_vmcmds(&epp->ep_vmcmds);
return error;
}
int
netbsd_elf_signature(struct lwp *l, struct exec_package *epp,
Elf_Ehdr *eh)
{
size_t i;
Elf_Phdr *ph;
size_t phsize;
char *nbuf;
int error;
int isnetbsd = 0;
epp->ep_pax_flags = 0;
if (eh->e_phnum > ELF_MAXPHNUM || eh->e_phnum == 0) {
DPRINTF("no signature %#x", eh->e_phnum);
return ENOEXEC;
}
phsize = eh->e_phnum * sizeof(Elf_Phdr);
ph = kmem_alloc(phsize, KM_SLEEP);
error = exec_read(l, epp->ep_vp, eh->e_phoff, ph, phsize,
IO_NODELOCKED);
if (error)
goto out;
nbuf = kmem_alloc(ELF_MAXNOTESIZE, KM_SLEEP);
for (i = 0; i < eh->e_phnum; i++) {
const char *nptr;
size_t nlen;
if (ph[i].p_type != PT_NOTE ||
ph[i].p_filesz > ELF_MAXNOTESIZE)
continue;
nlen = ph[i].p_filesz;
error = exec_read(l, epp->ep_vp, ph[i].p_offset, nbuf, nlen,
IO_NODELOCKED);
if (error)
continue;
nptr = nbuf;
while (nlen > 0) {
const Elf_Nhdr *np;
const char *ndata, *ndesc;
/* note header */
np = (const Elf_Nhdr *)nptr;
if (nlen < sizeof(*np)) {
break;
}
nptr += sizeof(*np);
nlen -= sizeof(*np);
/* note name */
ndata = nptr;
if (nlen < roundup(np->n_namesz, 4)) {
break;
}
nptr += roundup(np->n_namesz, 4);
nlen -= roundup(np->n_namesz, 4);
/* note description */
ndesc = nptr;
if (nlen < roundup(np->n_descsz, 4)) {
break;
}
nptr += roundup(np->n_descsz, 4);
nlen -= roundup(np->n_descsz, 4);
isnetbsd |= netbsd_elf_note(epp, np, ndata, ndesc);
}
}
kmem_free(nbuf, ELF_MAXNOTESIZE);
error = isnetbsd ? 0 : ENOEXEC;
#ifdef DEBUG_ELF
if (error)
DPRINTF("not netbsd");
#endif
out:
kmem_free(ph, phsize);
return error;
}
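/*
 * For reference, the record format parsed above, taking the NetBSD tag
 * note as an example: an Elf_Nhdr with n_namesz = 7, n_descsz = 4 and
 * n_type = ELF_NOTE_TYPE_NETBSD_TAG, followed by the name "NetBSD\0"
 * padded to 8 bytes and then a 4-byte version descriptor, which
 * netbsd_elf_note() below copies into ep_osversion.
 */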
int
netbsd_elf_note(struct exec_package *epp,
const Elf_Nhdr *np, const char *ndata, const char *ndesc)
{
int isnetbsd = 0;
#ifdef DIAGNOSTIC
const char *badnote;
#define BADNOTE(n) badnote = (n)
#else
#define BADNOTE(n)
#endif
switch (np->n_type) {
case ELF_NOTE_TYPE_NETBSD_TAG:
/* It is us */
if (np->n_namesz == ELF_NOTE_NETBSD_NAMESZ &&
np->n_descsz == ELF_NOTE_NETBSD_DESCSZ &&
memcmp(ndata, ELF_NOTE_NETBSD_NAME,
ELF_NOTE_NETBSD_NAMESZ) == 0) {
memcpy(&epp->ep_osversion, ndesc,
ELF_NOTE_NETBSD_DESCSZ);
isnetbsd = 1;
break;
}
/*
* Ignore SuSE tags; SuSE's n_type is the same as the
* NetBSD one.
*/
if (np->n_namesz == ELF_NOTE_SUSE_NAMESZ &&
memcmp(ndata, ELF_NOTE_SUSE_NAME,
ELF_NOTE_SUSE_NAMESZ) == 0)
break;
/*
* Ignore old GCC
*/
if (np->n_namesz == ELF_NOTE_OGCC_NAMESZ &&
memcmp(ndata, ELF_NOTE_OGCC_NAME,
ELF_NOTE_OGCC_NAMESZ) == 0)
break;
BADNOTE("NetBSD tag");
goto bad;
case ELF_NOTE_TYPE_PAX_TAG:
if (np->n_namesz == ELF_NOTE_PAX_NAMESZ &&
np->n_descsz == ELF_NOTE_PAX_DESCSZ &&
memcmp(ndata, ELF_NOTE_PAX_NAME,
ELF_NOTE_PAX_NAMESZ) == 0) {
uint32_t flags;
memcpy(&flags, ndesc, sizeof(flags));
/* Convert the flags and insert them into
* the exec package. */
pax_setup_elf_flags(epp, flags);
break;
}
BADNOTE("PaX tag");
goto bad;
case ELF_NOTE_TYPE_MARCH_TAG:
/* Copy the machine arch into the package. */
if (np->n_namesz == ELF_NOTE_MARCH_NAMESZ
&& memcmp(ndata, ELF_NOTE_MARCH_NAME,
ELF_NOTE_MARCH_NAMESZ) == 0) {
/* Do not truncate the buffer */
if (np->n_descsz > sizeof(epp->ep_machine_arch)) {
BADNOTE("description size limit");
goto bad;
}
/*
* Ensure ndesc is NUL-terminated and of the
* expected length.
*/
if (strnlen(ndesc, np->n_descsz) + 1 !=
np->n_descsz) {
BADNOTE("description size");
goto bad;
}
strlcpy(epp->ep_machine_arch, ndesc,
sizeof(epp->ep_machine_arch));
break;
}
BADNOTE("march tag");
goto bad;
case ELF_NOTE_TYPE_MCMODEL_TAG:
/* arch specific check for code model */
#ifdef ELF_MD_MCMODEL_CHECK
if (np->n_namesz == ELF_NOTE_MCMODEL_NAMESZ
&& memcmp(ndata, ELF_NOTE_MCMODEL_NAME,
ELF_NOTE_MCMODEL_NAMESZ) == 0) {
ELF_MD_MCMODEL_CHECK(epp, ndesc, np->n_descsz);
break;
}
BADNOTE("mcmodel tag");
goto bad;
#endif
break;
case ELF_NOTE_TYPE_SUSE_VERSION_TAG:
break;
case ELF_NOTE_TYPE_GO_BUILDID_TAG:
break;
case ELF_NOTE_TYPE_FDO_PACKAGING_METADATA:
break;
case ELF_NOTE_TYPE_NETBSD_EMUL_TAG:
/* Ancient NetBSD version tag */
break;
default:
BADNOTE("unknown tag");
bad:
#ifdef DIAGNOSTIC
/* Ignore GNU tags */
if (np->n_namesz == ELF_NOTE_GNU_NAMESZ &&
memcmp(ndata, ELF_NOTE_GNU_NAME,
ELF_NOTE_GNU_NAMESZ) == 0)
break;
int ns = (int)np->n_namesz;
printf("%s: Unknown elf note type %d (%s): "
"[namesz=%d, descsz=%d name=%-*.*s]\n",
epp->ep_kname, np->n_type, badnote, np->n_namesz,
np->n_descsz, ns, ns, ndata);
#endif
break;
}
return isnetbsd;
}
int
netbsd_elf_probe(struct lwp *l, struct exec_package *epp, void *eh, char *itp,
vaddr_t *pos)
{
int error;
if ((error = netbsd_elf_signature(l, epp, eh)) != 0)
return error;
#ifdef ELF_MD_PROBE_FUNC
if ((error = ELF_MD_PROBE_FUNC(l, epp, eh, itp, pos)) != 0)
return error;
#elif defined(ELF_INTERP_NON_RELOCATABLE)
*pos = ELF_LINK_ADDR;
#endif
epp->ep_flags |= EXEC_FORCEAUX;
return 0;
}
void
elf_free_emul_arg(void *arg)
{
struct elf_args *ap = arg;
KASSERT(ap != NULL);
kmem_free(ap, sizeof(*ap));
}
/* $NetBSD: userret.h,v 1.35 2024/01/28 10:06:19 skrll Exp $ */
/*-
* Copyright (c) 1998, 2000, 2003, 2006, 2008, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum, and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_USERRET_H_
#define _SYS_USERRET_H_
#include <sys/lockdebug.h>
#include <sys/intr.h>
#include <sys/psref.h>
/*
* Define the MI code needed before returning to user mode, for trap and
* syscall.
*
* We handle "exceptional" events: pending signals, stop/exit actions, etc.
* Note that the event must be flagged BEFORE any AST is posted as we are
* reading unlocked.
*/
static __inline void
mi_userret(struct lwp *l)
{
int exception;
KPREEMPT_DISABLE(l);
KASSERTMSG(l->l_cpu->ci_biglock_count == 0, "kernel_lock leaked");
KASSERT(l->l_blcnt == 0);
exception = l->l_cpu->ci_want_resched | (l->l_flag & LW_USERRET);
KPREEMPT_ENABLE(l);
if (__predict_false(exception)) {
lwp_userret(l);
}
LOCKDEBUG_BARRIER(NULL, 0);
KASSERT(l->l_nopreempt == 0);
PSREF_DEBUG_BARRIER();
KASSERT(l->l_psrefs == 0);
}
#endif /* !_SYS_USERRET_H_ */
/* $NetBSD: bufq_disksort.c,v 1.14 2017/05/04 11:03:27 kamil Exp $ */
/* NetBSD: subr_disk.c,v 1.61 2004/09/25 03:30:44 thorpej Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_disksort.c,v 1.14 2017/05/04 11:03:27 kamil Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/module.h>
/*
* Seek sort for disks.
*
* There are actually two queues, sorted in ascending order. The first
* queue holds those requests which are positioned after the current block;
* the second holds requests which came in after their position was passed.
* Thus we implement a one-way scan, retracting after reaching the end of
* the drive to the first request on the second queue, at which time it
* becomes the first queue.
*
* A one-way scan is natural because of the way UNIX read-ahead blocks are
* allocated.
*/
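/*
 * Example of the resulting ordering (hypothetical block numbers): if
 * the queue currently holds 600, 900 | 100, 300 (the "|" marks the
 * inversion that starts the second list), a new request for block 700
 * is inserted between 600 and 900, while a request for block 200,
 * whose position the head has already passed, goes into the second
 * list between 100 and 300.
 */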
struct bufq_disksort {
TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */
};
static void bufq_disksort_init(struct bufq_state *);
static void bufq_disksort_put(struct bufq_state *, struct buf *);
static struct buf *bufq_disksort_get(struct bufq_state *, int);
BUFQ_DEFINE(disksort, 20, bufq_disksort_init);
static void
bufq_disksort_put(struct bufq_state *bufq, struct buf *bp)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bq, *nbq;
int sortby;
sortby = bufq->bq_flags & BUFQ_SORT_MASK;
bq = TAILQ_FIRST(&disksort->bq_head);
/*
* If the queue is empty it's easy; we just go on the end.
*/
if (bq == NULL) {
TAILQ_INSERT_TAIL(&disksort->bq_head, bp, b_actq);
return;
}
/*
* If we lie before the currently active request, then we
* must locate the second request list and add ourselves to it.
*/
if (buf_inorder(bp, bq, sortby)) {
while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
/*
* Check for an ``inversion'' in the normally ascending
* block numbers, indicating the start of the second
* request list.
*/
if (buf_inorder(nbq, bq, sortby)) {
/*
* Search the second request list for the first
* request at a larger block number. We go
* after that; if there is no such request, we
* go at the end.
*/
do {
if (buf_inorder(bp, nbq, sortby))
goto insert;
bq = nbq;
} while ((nbq =
TAILQ_NEXT(bq, b_actq)) != NULL);
goto insert; /* after last */
}
bq = nbq;
}
/*
* No inversions... we will go after the last, and
* be the first request in the second request list.
*/
goto insert;
}
/*
* Request is at/after the current request...
* sort in the first request list.
*/
while ((nbq = TAILQ_NEXT(bq, b_actq)) != NULL) {
/*
* We want to go after the current request if there is an
* inversion after it (i.e. it is the end of the first
* request list), or if the next request is a larger cylinder
* than our request.
*/
if (buf_inorder(nbq, bq, sortby) ||
buf_inorder(bp, nbq, sortby))
goto insert;
bq = nbq;
}
/*
* Neither a second list nor a larger request... we go at the end of
	 * the first list, which is the same as the end of the whole shebang.
*/
insert: TAILQ_INSERT_AFTER(&disksort->bq_head, bq, bp, b_actq);
}
static struct buf *
bufq_disksort_get(struct bufq_state *bufq, int remove)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bp;
bp = TAILQ_FIRST(&disksort->bq_head);
	if (bp != NULL && remove)
		TAILQ_REMOVE(&disksort->bq_head, bp, b_actq);
return (bp);
}
static struct buf *
bufq_disksort_cancel(struct bufq_state *bufq, struct buf *buf)
{
struct bufq_disksort *disksort = bufq_private(bufq);
struct buf *bq;
TAILQ_FOREACH(bq, &disksort->bq_head, b_actq) {
if (bq == buf) {
TAILQ_REMOVE(&disksort->bq_head, bq, b_actq);
return buf;
}
}
return NULL;
}
static void
bufq_disksort_fini(struct bufq_state *bufq)
{

	KASSERT(bufq->bq_private != NULL);
kmem_free(bufq->bq_private, sizeof(struct bufq_disksort));
}
static void
bufq_disksort_init(struct bufq_state *bufq)
{
struct bufq_disksort *disksort;
disksort = kmem_zalloc(sizeof(*disksort), KM_SLEEP);
bufq->bq_private = disksort;
bufq->bq_get = bufq_disksort_get;
bufq->bq_put = bufq_disksort_put;
bufq->bq_cancel = bufq_disksort_cancel;
bufq->bq_fini = bufq_disksort_fini;
TAILQ_INIT(&disksort->bq_head);
}
MODULE(MODULE_CLASS_BUFQ, bufq_disksort, NULL);
static int
bufq_disksort_modcmd(modcmd_t cmd, void *opaque)
{
switch (cmd) {
case MODULE_CMD_INIT:
return bufq_register(&bufq_strat_disksort);
case MODULE_CMD_FINI:
return bufq_unregister(&bufq_strat_disksort);
default:
return ENOTTY;
}
}
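#if 0
/*
 * A minimal usage sketch (hypothetical, not part of the original
 * source): a disk driver selects this strategy by name through the
 * generic bufq(9) interface; "bufq" and "bp" here are assumed local
 * driver state.
 */
	struct bufq_state *bufq;
	struct buf *bp;

	/* At attach time: sort requests by raw block number. */
	bufq_alloc(&bufq, "disksort", BUFQ_SORT_RAWBLOCK);

	/* In the driver's strategy routine: queue the request. */
	bufq_put(bufq, bp);

	/* In the start routine: pull requests off in sorted order. */
	while ((bp = bufq_get(bufq)) != NULL) {
		/* ... program the transfer described by bp ... */
	}

	/* At detach time. */
	bufq_free(bufq);
#endif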
/* $NetBSD: vfs_hooks.c,v 1.6 2009/03/15 17:14:40 cegger Exp $ */
/*-
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* VFS hooks.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_hooks.c,v 1.6 2009/03/15 17:14:40 cegger Exp $");
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/mutex.h>
LIST_HEAD(vfs_hooks_head, vfs_hooks) vfs_hooks_head =
LIST_HEAD_INITIALIZER(vfs_hooks_head);
kmutex_t vfs_hooks_lock;
void
vfs_hooks_init(void)
{
mutex_init(&vfs_hooks_lock, MUTEX_DEFAULT, IPL_NONE);
}
int
vfs_hooks_attach(struct vfs_hooks *vfs_hooks)
{
mutex_enter(&vfs_hooks_lock);
LIST_INSERT_HEAD(&vfs_hooks_head, vfs_hooks, vfs_hooks_list);
mutex_exit(&vfs_hooks_lock);
return (0);
}
int
vfs_hooks_detach(struct vfs_hooks *vfs_hooks)
{
struct vfs_hooks *hp;
int ret = 0;
mutex_enter(&vfs_hooks_lock);
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) {
if (hp == vfs_hooks) {
LIST_REMOVE(hp, vfs_hooks_list);
break;
}
}
if (hp == NULL)
ret = ESRCH;
mutex_exit(&vfs_hooks_lock);
return (ret);
}
/*
 * Macro used to define one of the vfs_hooks_* functions for hooks that
 * return an error code.  Calls stop as soon as one of the hooks
 * fails.
*/
#define VFS_HOOKS_W_ERROR(func, fargs, hook, hargs) \
int \
func fargs \
{ \
int error; \
struct vfs_hooks *hp; \
\
error = EJUSTRETURN; \
\
mutex_enter(&vfs_hooks_lock); \
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { \
if (hp-> hook != NULL) { \
error = hp-> hook hargs; \
if (error != 0) \
break; \
} \
} \
mutex_exit(&vfs_hooks_lock); \
\
return error; \
}
/*
 * Macro used to define one of the vfs_hooks_* functions for hooks that
 * do not return any error code.  All hooks will be executed
* unconditionally.
*/
#define VFS_HOOKS_WO_ERROR(func, fargs, hook, hargs) \
void \
func fargs \
{ \
struct vfs_hooks *hp; \
\
mutex_enter(&vfs_hooks_lock); \
LIST_FOREACH(hp, &vfs_hooks_head, vfs_hooks_list) { \
if (hp-> hook != NULL) \
hp-> hook hargs; \
} \
mutex_exit(&vfs_hooks_lock); \
}
/*
* Routines to iterate over VFS hooks lists and execute them.
*/
VFS_HOOKS_WO_ERROR(vfs_hooks_unmount, (struct mount *mp), vh_unmount, (mp));
VFS_HOOKS_W_ERROR(vfs_hooks_reexport,
    (struct mount *mp, const char *path, void *data),
    vh_reexport, (mp, path, data));
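#if 0
/*
 * A minimal registration sketch (hypothetical, not part of the original
 * source): a file system that wants to be told about unmounts fills in
 * the vh_unmount member of struct vfs_hooks and attaches the structure;
 * the "examplefs" names are assumptions for illustration.
 */
static void
examplefs_unmount_hook(struct mount *mp)
{

	/* Drop any state examplefs caches for this mount. */
}

static struct vfs_hooks examplefs_vfs_hooks = {
	.vh_unmount = examplefs_unmount_hook,
};

static void
examplefs_init(void)
{

	vfs_hooks_attach(&examplefs_vfs_hooks);
}

static void
examplefs_done(void)
{

	(void)vfs_hooks_detach(&examplefs_vfs_hooks);
}
#endif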
/* $NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $ */
/*-
* Copyright (c) 2000, 2004, 2005, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christopher G. Demetriou.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rate.c,v 1.2 2012/12/12 11:10:56 pooka Exp $");
#include <sys/param.h>
#include <sys/time.h>
/*
* ratecheck(): simple time-based rate-limit checking. see ratecheck(9)
* for usage and rationale.
*/
int
ratecheck(struct timeval *lasttime, const struct timeval *mininterval)
{
struct timeval tv, delta;
int rv = 0;
getmicrouptime(&tv);
timersub(&tv, lasttime, &delta);
/*
	 * The check for 0,0 is so that the message will be seen at least
	 * once, even if the interval is huge.
*/
if (timercmp(&delta, mininterval, >=) ||
(lasttime->tv_sec == 0 && lasttime->tv_usec == 0)) {
*lasttime = tv;
rv = 1;
}
return (rv);
}
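#if 0
/*
 * A typical ratecheck(9) usage sketch (hypothetical, not part of the
 * original source): emit a diagnostic at most once every 10 seconds.
 */
	static struct timeval lastmsg;
	static const struct timeval msginterval = { 10, 0 };

	if (ratecheck(&lastmsg, &msginterval))
		printf("example: receive overrun, dropping packets\n");
#endif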
/*
* ppsratecheck(): packets (or events) per second limitation.
*/
int
ppsratecheck(struct timeval *lasttime, int *curpps, int maxpps)
{
struct timeval tv, delta;
int rv;
getmicrouptime(&tv);
timersub(&tv, lasttime, &delta);
/*
	 * The check for 0,0 is so that the message will be seen at least
	 * once.  If more than one second has passed since the last update
	 * of lasttime, reset the counter.
	 *
	 * We increment *curpps even in the *curpps < maxpps case, as some
	 * callers may use *curpps for statistics as well.
*/
if ((lasttime->tv_sec == 0 && lasttime->tv_usec == 0) ||
delta.tv_sec >= 1) {
*lasttime = tv;
*curpps = 0;
}
if (maxpps < 0)
rv = 1;
else if (*curpps < maxpps)
rv = 1;
else
rv = 0;
#if 1 /*DIAGNOSTIC?*/
/* be careful about wrap-around */
	if (__predict_true(*curpps != INT_MAX))
		*curpps = *curpps + 1;
#else
/*
	 * Assume that there are not too many calls to this function.
	 * It is not clear that the assumption holds, as it depends on the
	 * *caller's* behavior, not the behavior of this function.
	 * It is wrong to make assumptions about the caller's behavior,
	 * so the above #if is #if 1, not #ifdef DIAGNOSTIC.
*/
*curpps = *curpps + 1;
#endif
return (rv);
}
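#if 0
/*
 * A typical ppsratecheck(9) usage sketch (hypothetical, not part of the
 * original source): log at most 5 messages per second.  A negative
 * maximum disables the limit; a maximum of 0 suppresses the messages
 * entirely while still counting the events.
 */
	static struct timeval lastlog;
	static int curlogs;

	if (ppsratecheck(&lastlog, &curlogs, 5))
		printf("example: bad packet received\n");
#endif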
/* $NetBSD: strncmp.c,v 1.3 2018/02/04 20:22:17 mrg Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)strncmp.c 8.1 (Berkeley) 6/4/93";
#else
__RCSID("$NetBSD: strncmp.c,v 1.3 2018/02/04 20:22:17 mrg Exp $");
#endif
#endif /* LIBC_SCCS and not lint */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
int
strncmp(const char *s1, const char *s2, size_t n)
{

	if (n == 0)
return (0);
do {
if (*s1 != *s2++)
return (*(const unsigned char *)s1 -
*(const unsigned char *)--s2);
if (*s1++ == 0)
break;
} while (--n != 0);
return (0);
}
/* $NetBSD: sys_ptrace_common.c,v 1.92 2021/08/09 20:49:10 andvar Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
/*-
* Copyright (c) 1993 Jan-Simon Pendry.
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_ptrace_common.c,v 1.92 2021/08/09 20:49:10 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ptrace.h"
#include "opt_ktrace.h"
#include "opt_pax.h"
#include "opt_compat_netbsd32.h"
#endif
#if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \
&& !defined(_RUMPKERNEL)
#define COMPAT_NETBSD32
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/exec.h>
#include <sys/pax.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/ras.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/module.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
# ifdef PTRACE_DEBUG
# define DPRINTF(a) uprintf a
# else
# define DPRINTF(a)
# endif
static kauth_listener_t ptrace_listener;
static int process_auxv_offset(struct proc *, struct uio *);
extern int user_va0_disable;
#if 0
static int ptrace_cbref;
static kmutex_t ptrace_mtx;
static kcondvar_t ptrace_cv;
#endif
#ifdef PT_GETREGS
# define case_PT_GETREGS case PT_GETREGS:
#else
# define case_PT_GETREGS
#endif
#ifdef PT_SETREGS
# define case_PT_SETREGS case PT_SETREGS:
#else
# define case_PT_SETREGS
#endif
#ifdef PT_GETFPREGS
# define case_PT_GETFPREGS case PT_GETFPREGS:
#else
# define case_PT_GETFPREGS
#endif
#ifdef PT_SETFPREGS
# define case_PT_SETFPREGS case PT_SETFPREGS:
#else
# define case_PT_SETFPREGS
#endif
#ifdef PT_GETDBREGS
# define case_PT_GETDBREGS case PT_GETDBREGS:
#else
# define case_PT_GETDBREGS
#endif
#ifdef PT_SETDBREGS
# define case_PT_SETDBREGS case PT_SETDBREGS:
#else
# define case_PT_SETDBREGS
#endif
static int
ptrace_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
#ifdef PT_SETDBREGS
extern int user_set_dbregs;
#endif
result = KAUTH_RESULT_DEFER;
p = arg0;
#if 0
mutex_enter(&ptrace_mtx);
ptrace_cbref++;
mutex_exit(&ptrace_mtx);
#endif
if (action != KAUTH_PROCESS_PTRACE)
goto out;
switch ((u_long)arg1) {
#ifdef PT_SETDBREGS
case_PT_SETDBREGS
if (kauth_cred_getuid(cred) != 0 && user_set_dbregs == 0) {
result = KAUTH_RESULT_DENY;
break;
}
#endif
/* FALLTHROUGH */
case PT_TRACE_ME:
case PT_ATTACH:
case PT_WRITE_I:
case PT_WRITE_D:
case PT_READ_I:
case PT_READ_D:
case PT_IO:
case_PT_GETREGS
case_PT_SETREGS
case_PT_GETFPREGS
case_PT_SETFPREGS
case_PT_GETDBREGS
case PT_SET_EVENT_MASK:
case PT_GET_EVENT_MASK:
case PT_GET_PROCESS_STATE:
case PT_SET_SIGINFO:
case PT_GET_SIGINFO:
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
#endif
		if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) ||
		    ISSET(p->p_flag, PK_SUGID)) {
break;
}
result = KAUTH_RESULT_ALLOW;
break;
#ifdef PT_STEP
case PT_STEP:
case PT_SETSTEP:
case PT_CLEARSTEP:
#endif
case PT_CONTINUE:
case PT_KILL:
case PT_DETACH:
case PT_LWPINFO:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_DUMPCORE:
case PT_RESUME:
case PT_SUSPEND:
case PT_STOP:
case PT_LWPSTATUS:
case PT_LWPNEXT:
case PT_SET_SIGPASS:
case PT_GET_SIGPASS:
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
out:
#if 0
mutex_enter(&ptrace_mtx);
if (--ptrace_cbref == 0)
cv_broadcast(&ptrace_cv);
mutex_exit(&ptrace_mtx);
#endif
return result;
}
static struct proc *
ptrace_find(struct lwp *l, int req, pid_t pid)
{
struct proc *t;
/* "A foolish consistency..." XXX */
if (req == PT_TRACE_ME) {
t = l->l_proc;
mutex_enter(t->p_lock);
return t;
}
/* Find the process we're supposed to be operating on. */
t = proc_find(pid);
if (t == NULL)
return NULL;
/* XXX-elad */
mutex_enter(t->p_lock);
int error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
t, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
	if (error) {
		mutex_exit(t->p_lock);
return NULL;
}
return t;
}
static int
ptrace_allowed(struct lwp *l, int req, struct proc *t, struct proc *p,
bool *locked)
{
*locked = false;
/*
* Grab a reference on the process to prevent it from execing or
* exiting.
*/
if (!rw_tryenter(&t->p_reflock, RW_READER))
return EBUSY;
*locked = true;
/* Make sure we can operate on it. */
switch (req) {
case PT_TRACE_ME:
/*
		 * A process can't ask its parent to start tracing it if:
* (1) the parent is initproc,
*/
if (p->p_pptr == initproc)
return EPERM;
/*
* (2) the process is initproc, or
*/
if (p == initproc)
return EPERM;
/*
* (3) the child is already traced.
*/
if (ISSET(p->p_slflag, PSL_TRACED))
return EBUSY;
return 0;
case PT_ATTACH:
/*
* You can't attach to a process if:
* (1) it's the process that's doing the attaching,
*/
if (t == p)
return EINVAL;
/*
* (2) it's a system process,
*/
if (t->p_flag & PK_SYSTEM)
return EPERM;
/*
* (3) the tracer is initproc,
*/
if (p == initproc)
return EPERM;
/*
* (4) it's already being traced,
*/
if (ISSET(t->p_slflag, PSL_TRACED))
return EBUSY;
/*
* (5) it's a vfork(2)ed parent of the current process, or
*/
if (ISSET(p->p_lflag, PL_PPWAIT) && p->p_pptr == t)
return EPERM;
/*
* (6) the tracer is chrooted, and its root directory is
* not at or above the root directory of the tracee
*/
mutex_exit(t->p_lock); /* XXXSMP */
int tmp = proc_isunder(t, l);
mutex_enter(t->p_lock); /* XXXSMP */
if (!tmp)
return EPERM;
return 0;
case PT_READ_I:
case PT_READ_D:
case PT_WRITE_I:
case PT_WRITE_D:
case PT_IO:
case PT_SET_SIGINFO:
case PT_GET_SIGINFO:
case_PT_GETREGS
case_PT_SETREGS
case_PT_GETFPREGS
case_PT_SETFPREGS
case_PT_GETDBREGS
case_PT_SETDBREGS
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
#endif
/*
* You can't read/write the memory or registers of a process
* if the tracer is chrooted, and its root directory is not at
* or above the root directory of the tracee.
*/
mutex_exit(t->p_lock); /* XXXSMP */
tmp = proc_isunder(t, l);
mutex_enter(t->p_lock); /* XXXSMP */
if (!tmp)
return EPERM;
/*FALLTHROUGH*/
case PT_CONTINUE:
case PT_KILL:
case PT_DETACH:
case PT_LWPINFO:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_DUMPCORE:
#ifdef PT_STEP
case PT_STEP:
case PT_SETSTEP:
case PT_CLEARSTEP:
#endif
case PT_SET_EVENT_MASK:
case PT_GET_EVENT_MASK:
case PT_GET_PROCESS_STATE:
case PT_RESUME:
case PT_SUSPEND:
case PT_STOP:
case PT_LWPSTATUS:
case PT_LWPNEXT:
case PT_SET_SIGPASS:
case PT_GET_SIGPASS:
/*
* You can't do what you want to the process if:
* (1) It's not being traced at all,
*/
if (!ISSET(t->p_slflag, PSL_TRACED))
return EPERM;
/*
* (2) it's not being traced by _you_, or
*/
if (t->p_pptr != p) {
DPRINTF(("parent %d != %d\n", t->p_pptr->p_pid,
p->p_pid));
return EBUSY;
}
/*
* (3) it's not currently stopped.
*
* As an exception allow PT_KILL and PT_STOP here.
*/
		if (req != PT_KILL && req != PT_STOP &&
		    (t->p_stat != SSTOP || !t->p_waited /* XXXSMP */)) {
DPRINTF(("stat %d flag %d\n", t->p_stat,
!t->p_waited));
return EBUSY;
}
return 0;
default: /* It was not a legal request. */
return EINVAL;
}
}
static int
ptrace_needs_hold(int req)
{
switch (req) {
#ifdef PT_STEP
case PT_STEP:
#endif
case PT_CONTINUE:
case PT_DETACH:
case PT_KILL:
case PT_SYSCALL:
case PT_SYSCALLEMU:
case PT_ATTACH:
case PT_TRACE_ME:
case PT_GET_SIGINFO:
case PT_SET_SIGINFO:
case PT_STOP:
return 1;
default:
return 0;
}
}
static int
ptrace_get_siginfo(struct proc *t, struct ptrace_methods *ptm, void *addr,
size_t data)
{
struct ptrace_siginfo psi;
memset(&psi, 0, sizeof(psi));
psi.psi_siginfo._info = t->p_sigctx.ps_info;
psi.psi_lwpid = t->p_sigctx.ps_lwp;
DPRINTF(("%s: lwp=%d signal=%d\n", __func__, psi.psi_lwpid,
psi.psi_siginfo.si_signo));
return ptm->ptm_copyout_siginfo(&psi, addr, data);
}
static int
ptrace_set_siginfo(struct proc *t, struct lwp **lt, struct ptrace_methods *ptm,
void *addr, size_t data)
{
struct ptrace_siginfo psi;
int error = ptm->ptm_copyin_siginfo(&psi, addr, data);
if (error)
return error;
/* Check that the data is a valid signal number or zero. */
if (psi.psi_siginfo.si_signo < 0 || psi.psi_siginfo.si_signo >= NSIG)
return EINVAL;
t->p_sigctx.ps_faked = true;
t->p_sigctx.ps_info = psi.psi_siginfo._info;
t->p_sigctx.ps_lwp = psi.psi_lwpid;
DPRINTF(("%s: lwp=%d signal=%d\n", __func__, psi.psi_lwpid,
psi.psi_siginfo.si_signo));
return 0;
}
static int
ptrace_get_sigpass(struct proc *t, void *addr, size_t data)
{
sigset_t set;
if (data > sizeof(set) || data <= 0) {
DPRINTF(("%s: invalid data: %zu < %zu <= 0\n",
__func__, sizeof(set), data));
return EINVAL;
}
set = t->p_sigctx.ps_sigpass;
return copyout(&set, addr, data);
}
static int
ptrace_set_sigpass(struct proc *t, void *addr, size_t data)
{
sigset_t set;
int error;
if (data > sizeof(set) || data <= 0) {
DPRINTF(("%s: invalid data: %zu < %zu <= 0\n",
__func__, sizeof(set), data));
return EINVAL;
}
memset(&set, 0, sizeof(set));
if ((error = copyin(addr, &set, data)))
return error;
/* We catch SIGSTOP and cannot intercept SIGKILL. */
sigminusset(&sigcantmask, &set);
t->p_sigctx.ps_sigpass = set;
return 0;
}
static int
ptrace_get_event_mask(struct proc *t, void *addr, size_t data)
{
struct ptrace_event pe;
if (data != sizeof(pe)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pe)));
return EINVAL;
}
memset(&pe, 0, sizeof(pe));
pe.pe_set_event = ISSET(t->p_slflag, PSL_TRACEFORK) ?
PTRACE_FORK : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEVFORK) ?
PTRACE_VFORK : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEVFORK_DONE) ?
PTRACE_VFORK_DONE : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACELWP_CREATE) ?
PTRACE_LWP_CREATE : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACELWP_EXIT) ?
PTRACE_LWP_EXIT : 0;
pe.pe_set_event |= ISSET(t->p_slflag, PSL_TRACEPOSIX_SPAWN) ?
PTRACE_POSIX_SPAWN : 0;
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
t->p_sigctx.ps_lwp, pe.pe_set_event));
return copyout(&pe, addr, sizeof(pe));
}
static int
ptrace_set_event_mask(struct proc *t, void *addr, size_t data)
{
struct ptrace_event pe;
int error;
if (data != sizeof(pe)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pe)));
return EINVAL;
}
if ((error = copyin(addr, &pe, sizeof(pe))) != 0)
return error;
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
t->p_sigctx.ps_lwp, pe.pe_set_event));
if (pe.pe_set_event & PTRACE_FORK)
SET(t->p_slflag, PSL_TRACEFORK);
else
CLR(t->p_slflag, PSL_TRACEFORK);
if (pe.pe_set_event & PTRACE_VFORK)
SET(t->p_slflag, PSL_TRACEVFORK);
else
CLR(t->p_slflag, PSL_TRACEVFORK);
if (pe.pe_set_event & PTRACE_VFORK_DONE)
SET(t->p_slflag, PSL_TRACEVFORK_DONE);
else
CLR(t->p_slflag, PSL_TRACEVFORK_DONE);
if (pe.pe_set_event & PTRACE_LWP_CREATE)
SET(t->p_slflag, PSL_TRACELWP_CREATE);
else
CLR(t->p_slflag, PSL_TRACELWP_CREATE);
if (pe.pe_set_event & PTRACE_LWP_EXIT)
SET(t->p_slflag, PSL_TRACELWP_EXIT);
else
CLR(t->p_slflag, PSL_TRACELWP_EXIT);
if (pe.pe_set_event & PTRACE_POSIX_SPAWN)
SET(t->p_slflag, PSL_TRACEPOSIX_SPAWN);
else
CLR(t->p_slflag, PSL_TRACEPOSIX_SPAWN);
return 0;
}
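#if 0
/*
 * A debugger-side sketch (hypothetical, not part of the original
 * source): ask to be notified about fork(2) and LWP creation in the
 * traced process "pid".
 */
	struct ptrace_event pe;

	memset(&pe, 0, sizeof(pe));
	pe.pe_set_event = PTRACE_FORK | PTRACE_LWP_CREATE;
	if (ptrace(PT_SET_EVENT_MASK, pid, &pe, sizeof(pe)) == -1)
		err(EXIT_FAILURE, "PT_SET_EVENT_MASK");
#endif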
static int
ptrace_get_process_state(struct proc *t, void *addr, size_t data)
{
struct _ksiginfo *si;
struct ptrace_state ps;
if (data != sizeof(ps)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(ps)));
return EINVAL;
}
	if (t->p_sigctx.ps_info._signo != SIGTRAP ||
	    (t->p_sigctx.ps_info._code != TRAP_CHLD &&
	    t->p_sigctx.ps_info._code != TRAP_LWP)) {
memset(&ps, 0, sizeof(ps));
} else {
si = &t->p_sigctx.ps_info;
		KASSERT(si->_reason._ptrace_state._pe_report_event > 0);
		KASSERT(si->_reason._ptrace_state._option._pe_other_pid > 0);
ps.pe_report_event = si->_reason._ptrace_state._pe_report_event;
CTASSERT(sizeof(ps.pe_other_pid) == sizeof(ps.pe_lwp));
ps.pe_other_pid =
si->_reason._ptrace_state._option._pe_other_pid;
}
DPRINTF(("%s: lwp=%d event=%#x pid=%d lwp=%d\n", __func__,
t->p_sigctx.ps_lwp, ps.pe_report_event,
ps.pe_other_pid, ps.pe_lwp));
return copyout(&ps, addr, sizeof(ps));
}
static int
ptrace_lwpinfo(struct proc *t, struct lwp **lt, void *addr, size_t data)
{
struct ptrace_lwpinfo pl;
if (data != sizeof(pl)) {
DPRINTF(("%s: %zu != %zu\n", __func__, data, sizeof(pl)));
return EINVAL;
}
int error = copyin(addr, &pl, sizeof(pl));
if (error)
return error;
lwpid_t tmp = pl.pl_lwpid;
lwp_delref(*lt);
mutex_enter(t->p_lock);
if (tmp == 0)
*lt = lwp_find_first(t);
else {
*lt = lwp_find(t, tmp);
if (*lt == NULL) {
mutex_exit(t->p_lock);
return ESRCH;
}
*lt = LIST_NEXT(*lt, l_sibling);
}
while (*lt != NULL && (!lwp_alive(*lt) ||
((*lt)->l_flag & LW_SYSTEM) != 0))
*lt = LIST_NEXT(*lt, l_sibling);
pl.pl_lwpid = 0;
pl.pl_event = 0;
if (*lt) {
lwp_addref(*lt);
pl.pl_lwpid = (*lt)->l_lid;
if ((*lt)->l_flag & LW_WSUSPEND)
pl.pl_event = PL_EVENT_SUSPENDED;
/*
* If we match the lwp, or it was sent to every lwp,
* we set PL_EVENT_SIGNAL.
		 * XXX: ps_lwp == 0 means everyone and no one, so
		 * check ps_signo too.
		 */
		else if ((*lt)->l_lid == t->p_sigctx.ps_lwp ||
		    (t->p_sigctx.ps_lwp == 0 &&
		    t->p_sigctx.ps_info._signo)) {
DPRINTF(("%s: lwp=%d siglwp=%d signo %d\n", __func__,
pl.pl_lwpid, t->p_sigctx.ps_lwp,
t->p_sigctx.ps_info._signo));
pl.pl_event = PL_EVENT_SIGNAL;
}
}
mutex_exit(t->p_lock);
DPRINTF(("%s: lwp=%d event=%#x\n", __func__,
pl.pl_lwpid, pl.pl_event));
return copyout(&pl, addr, sizeof(pl));
}
static int
ptrace_lwpstatus(struct proc *t, struct ptrace_methods *ptm, struct lwp **lt,
void *addr, size_t data, bool next)
{
struct ptrace_lwpstatus pls;
struct lwp *l;
int error;
if (data > sizeof(pls) || data < sizeof(lwpid_t)) {
DPRINTF(("%s: invalid data: %zu < %zu < %zu\n",
__func__, sizeof(lwpid_t), data, sizeof(pls)));
return EINVAL;
}
error = copyin(addr, &pls.pl_lwpid, sizeof(lwpid_t));
if (error)
return error;
if (next) {
lwp_delref(*lt);
lwpid_t tmp = pls.pl_lwpid;
mutex_enter(t->p_lock);
if (tmp == 0)
*lt = lwp_find_first(t);
else {
*lt = lwp_find(t, tmp);
if (*lt == NULL) {
mutex_exit(t->p_lock);
return ESRCH;
}
*lt = LIST_NEXT(*lt, l_sibling);
}
while (*lt != NULL && (!lwp_alive(*lt) ||
((*lt)->l_flag & LW_SYSTEM) != 0))
*lt = LIST_NEXT(*lt, l_sibling);
if (*lt == NULL) {
memset(&pls, 0, sizeof(pls));
mutex_exit(t->p_lock);
goto out;
}
lwp_addref(*lt);
mutex_exit(t->p_lock);
pls.pl_lwpid = (*lt)->l_lid;
} else {
if ((error = ptrace_update_lwp(t, lt, pls.pl_lwpid)) != 0)
return error;
}
l = *lt;
ptrace_read_lwpstatus(l, &pls);
out:
DPRINTF(("%s: lwp=%d sigpend=%02x%02x%02x%02x sigmask=%02x%02x%02x%02x "
"name='%s' private=%p\n", __func__, pls.pl_lwpid,
pls.pl_sigpend.__bits[0], pls.pl_sigpend.__bits[1],
pls.pl_sigpend.__bits[2], pls.pl_sigpend.__bits[3],
pls.pl_sigmask.__bits[0], pls.pl_sigmask.__bits[1],
pls.pl_sigmask.__bits[2], pls.pl_sigmask.__bits[3],
pls.pl_name, pls.pl_private));
return ptm->ptm_copyout_lwpstatus(&pls, addr, data);
}
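#if 0
/*
 * A debugger-side sketch (hypothetical, not part of the original
 * source): walk all LWPs of the stopped tracee with PT_LWPNEXT.
 * Starting from pl_lwpid == 0 returns the first LWP; a returned
 * pl_lwpid of 0 marks the end of the list.
 */
	struct ptrace_lwpstatus pls;

	pls.pl_lwpid = 0;
	do {
		if (ptrace(PT_LWPNEXT, pid, &pls, sizeof(pls)) == -1)
			err(EXIT_FAILURE, "PT_LWPNEXT");
		if (pls.pl_lwpid != 0)
			printf("lwp %d name '%s'\n", pls.pl_lwpid,
			    pls.pl_name);
	} while (pls.pl_lwpid != 0);
#endif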
static int
ptrace_startstop(struct proc *t, struct lwp **lt, int rq, void *addr,
size_t data)
{
int error;
if ((error = ptrace_update_lwp(t, lt, data)) != 0)
return error;
DPRINTF(("%s: lwp=%d request=%d\n", __func__, (*lt)->l_lid, rq));
lwp_lock(*lt);
if (rq == PT_SUSPEND)
(*lt)->l_flag |= LW_DBGSUSPEND;
else {
(*lt)->l_flag &= ~LW_DBGSUSPEND;
		if ((*lt)->l_stat == LSSUSPENDED)
			(*lt)->l_stat = LSSTOP;
}
lwp_unlock(*lt);
return 0;
}
#ifdef PT_REGISTERS
static int
ptrace_uio_dir(int req)
{
switch (req) {
case_PT_GETREGS
case_PT_GETFPREGS
case_PT_GETDBREGS
return UIO_READ;
case_PT_SETREGS
case_PT_SETFPREGS
case_PT_SETDBREGS
return UIO_WRITE;
default:
return -1;
}
}
static int
ptrace_regs(struct lwp *l, struct lwp **lt, int rq, struct ptrace_methods *ptm,
void *addr, size_t data)
{
int error;
struct proc *p, *t;
struct vmspace *vm;
p = l->l_proc; /* tracer */
t = (*lt)->l_proc; /* traced */
if ((error = ptrace_update_lwp(t, lt, data)) != 0)
return error;
int dir = ptrace_uio_dir(rq);
size_t size;
int (*func)(struct lwp *, struct lwp *, struct uio *);
DPRINTF(("%s: lwp=%d request=%d\n", __func__, l->l_lid, rq));
switch (rq) {
#if defined(PT_SETREGS) || defined(PT_GETREGS)
case_PT_GETREGS
case_PT_SETREGS
if (!process_validregs(*lt))
return EINVAL;
size = PROC_REGSZ(p);
func = ptm->ptm_doregs;
break;
#endif
#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS)
case_PT_GETFPREGS
case_PT_SETFPREGS
if (!process_validfpregs(*lt))
return EINVAL;
size = PROC_FPREGSZ(p);
func = ptm->ptm_dofpregs;
break;
#endif
#if defined(PT_SETDBREGS) || defined(PT_GETDBREGS)
case_PT_GETDBREGS
case_PT_SETDBREGS
if (!process_validdbregs(*lt))
return EINVAL;
size = PROC_DBREGSZ(p);
func = ptm->ptm_dodbregs;
break;
#endif
default:
return EINVAL;
}
error = proc_vmspace_getref(l->l_proc, &vm);
if (error)
return error;
struct uio uio;
struct iovec iov;
iov.iov_base = addr;
iov.iov_len = size;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_resid = iov.iov_len;
uio.uio_rw = dir;
uio.uio_vmspace = vm;
error = (*func)(l, *lt, &uio);
uvmspace_free(vm);
return error;
}
#endif
static int
ptrace_sendsig(struct lwp *l, int req, struct proc *t, struct lwp *lt,
    int signo, int resume_all)
{
ksiginfo_t ksi;
/* Finally, deliver the requested signal (or none). */
if (t->p_stat == SSTOP) {
/*
* Unstop the process. If it needs to take a
		 * signal, make all efforts to ensure that at
		 * least one LWP runs to see it.
*/
t->p_xsig = signo;
/*
		 * The signo > 0 check prevents a potential panic:
		 * sigismember(&..., 0) is not a valid check, and signo
		 * can be 0 as the special no-signal case.
*/
		if (signo > 0 && sigismember(&stopsigmask, signo)) {
			t->p_waited = 0;
child_psignal(t, 0);
} else if (resume_all)
proc_unstop(t);
else
lwp_unstop(lt);
return 0;
}
KASSERT(req == PT_KILL || req == PT_STOP || req == PT_ATTACH);
KSI_INIT(&ksi);
ksi.ksi_signo = signo;
ksi.ksi_code = SI_USER;
ksi.ksi_pid = l->l_proc->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(l->l_cred);
t->p_sigctx.ps_faked = false;
DPRINTF(("%s: pid=%d.%d signal=%d resume_all=%d\n", __func__, t->p_pid,
lt->l_lid, signo, resume_all));
return kpsignal2(t, &ksi);
}
static int
ptrace_dumpcore(struct lwp *lt, char *path, size_t len)
{
int error;
	if (path != NULL) {
		if (len >= MAXPATHLEN)
return EINVAL;
char *src = path;
path = kmem_alloc(len + 1, KM_SLEEP);
error = copyin(src, path, len);
if (error)
goto out;
path[len] = '\0';
}
DPRINTF(("%s: lwp=%d\n", __func__, lt->l_lid));
MODULE_HOOK_CALL(coredump_hook, (lt, path), 0, error);
out:
	if (path)
		kmem_free(path, len + 1);
return error;
}
static int
ptrace_doio(struct lwp *l, struct proc *t, struct lwp *lt,
struct ptrace_io_desc *piod, void *addr, bool sysspace)
{
struct uio uio;
struct iovec iov;
int error, tmp;
error = 0;
iov.iov_base = piod->piod_addr;
iov.iov_len = piod->piod_len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(unsigned long)piod->piod_offs;
uio.uio_resid = piod->piod_len;
DPRINTF(("%s: lwp=%d request=%d\n", __func__, l->l_lid, piod->piod_op));
switch (piod->piod_op) {
case PIOD_READ_D:
case PIOD_READ_I:
uio.uio_rw = UIO_READ;
break;
case PIOD_WRITE_D:
case PIOD_WRITE_I:
/*
* Can't write to a RAS
*/
if (ras_lookup(t, addr) != (void *)-1) {
return EACCES;
}
uio.uio_rw = UIO_WRITE;
break;
case PIOD_READ_AUXV:
uio.uio_rw = UIO_READ;
tmp = t->p_execsw->es_arglen;
if (uio.uio_offset > tmp)
return EIO;
		if (uio.uio_resid > tmp - uio.uio_offset)
			uio.uio_resid = tmp - uio.uio_offset;
piod->piod_len = iov.iov_len = uio.uio_resid;
error = process_auxv_offset(t, &uio);
break;
default:
error = EINVAL;
break;
}
if (error)
return error;
if (sysspace) {
uio.uio_vmspace = vmspace_kernel();
} else {
error = proc_vmspace_getref(l->l_proc, &uio.uio_vmspace);
if (error)
return error;
}
error = process_domem(l, lt, &uio);
if (!sysspace)
uvmspace_free(uio.uio_vmspace);
if (error)
return error;
piod->piod_len -= uio.uio_resid;
return 0;
}
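#if 0
/*
 * A debugger-side sketch (hypothetical, not part of the original
 * source): read a chunk of the stopped tracee's memory with PT_IO,
 * which is serviced by ptrace_doio() above.  "pid", "target_addr" and
 * "buf" are assumptions for illustration.
 */
	struct ptrace_io_desc pio;

	pio.piod_op = PIOD_READ_D;
	pio.piod_offs = (void *)target_addr;	/* address in the tracee */
	pio.piod_addr = buf;			/* buffer in the tracer */
	pio.piod_len = sizeof(buf);
	if (ptrace(PT_IO, pid, &pio, 0) == -1)
		err(EXIT_FAILURE, "PT_IO");
	/* pio.piod_len now holds the number of bytes actually copied. */
#endif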
int
do_ptrace(struct ptrace_methods *ptm, struct lwp *l, int req, pid_t pid,
void *addr, int data, register_t *retval)
{
struct proc *p = l->l_proc;
struct lwp *lt = NULL;
struct lwp *lt2;
struct proc *t; /* target process */
struct ptrace_io_desc piod;
int error, write, tmp, pheld;
int signo = 0;
int resume_all;
bool locked;
error = 0;
/*
* If attaching or detaching, we need to get a write hold on the
* proclist lock so that we can re-parent the target process.
*/
mutex_enter(&proc_lock);
t = ptrace_find(l, req, pid);
if (t == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
pheld = 1;
if ((error = ptrace_allowed(l, req, t, p, &locked)) != 0)
goto out;
if ((error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_PTRACE, t, KAUTH_ARG(req), NULL, NULL)) != 0)
goto out;
if ((lt = lwp_find_first(t)) == NULL) {
error = ESRCH;
goto out;
}
/* Do single-step fixup if needed. */
FIX_SSTEP(t);
KASSERT(lt != NULL);
lwp_addref(lt);
/*
* Which locks do we need held? XXX Ugly.
*/
if ((pheld = ptrace_needs_hold(req)) == 0) {
mutex_exit(t->p_lock);
mutex_exit(&proc_lock);
}
/* Now do the operation. */
write = 0;
*retval = 0;
tmp = 0;
resume_all = 1;
switch (req) {
case PT_TRACE_ME:
/* Just set the trace flag. */
SET(t->p_slflag, PSL_TRACED);
t->p_opptr = t->p_pptr;
break;
/*
	 * The separate I and D address spaces were inherited from the PDP-11.
	 * 16-bit UNIX started with a single address space per program,
	 * but was later extended to two 16-bit (2 x 64kB) address spaces.
	 *
	 * No supported architecture maintains this separation any more, but
	 * we keep the API for backward compatibility.  Currently the I and D
	 * operations are exactly the same and not distinguished in debuggers.
*/
case PT_WRITE_I:
case PT_WRITE_D:
write = 1;
tmp = data;
/* FALLTHROUGH */
case PT_READ_I:
case PT_READ_D:
piod.piod_addr = &tmp;
piod.piod_len = sizeof(tmp);
piod.piod_offs = addr;
piod.piod_op = write ? PIOD_WRITE_D : PIOD_READ_D;
if ((error = ptrace_doio(l, t, lt, &piod, addr, true)) != 0)
break;
/*
		 * For legacy reasons we treat two results here as success:
		 *  - incomplete transfer	piod.piod_len < sizeof(tmp)
		 *  - no transfer		piod.piod_len == 0
		 *
		 * This means that there is no way to determine whether the
		 * transfer operation was performed in PT_WRITE and PT_READ
		 * calls.
*/
if (!write)
*retval = tmp;
break;
case PT_IO:
if ((error = ptm->ptm_copyin_piod(&piod, addr, data)) != 0)
break;
if (piod.piod_len < 1) {
error = EINVAL;
break;
}
if ((error = ptrace_doio(l, t, lt, &piod, addr, false)) != 0)
break;
/*
		 * For legacy reasons we treat two results here as success:
		 *  - incomplete transfer	piod.piod_len < sizeof(tmp)
		 *  - no transfer		piod.piod_len == 0
*/
error = ptm->ptm_copyout_piod(&piod, addr, data);
break;
case PT_DUMPCORE:
error = ptrace_dumpcore(lt, addr, data);
break;
#ifdef PT_STEP
case PT_STEP:
/*
* From the 4.4BSD PRM:
* "Execution continues as in request PT_CONTINUE; however
* as soon as possible after execution of at least one
* instruction, execution stops again. [ ... ]"
*/
#endif
case PT_CONTINUE:
case PT_SYSCALL:
case PT_DETACH:
if (req == PT_SYSCALL) {
			if (!ISSET(t->p_slflag, PSL_SYSCALL)) {
				SET(t->p_slflag, PSL_SYSCALL);
#ifdef __HAVE_SYSCALL_INTERN
(*t->p_emul->e_syscall_intern)(t);
#endif
}
} else {
			if (ISSET(t->p_slflag, PSL_SYSCALL)) {
				CLR(t->p_slflag, PSL_SYSCALL);
#ifdef __HAVE_SYSCALL_INTERN
(*t->p_emul->e_syscall_intern)(t);
#endif
}
}
t->p_trace_enabled = trace_is_enabled(t);
/*
* Pick up the LWPID, if supplied. There are two cases:
* data < 0 : step or continue single thread, lwp = -data
* data > 0 in PT_STEP : step this thread, continue others
* For operations other than PT_STEP, data > 0 means
* data is the signo to deliver to the process.
*/
tmp = data;
if (tmp >= 0) {
#ifdef PT_STEP
if (req == PT_STEP)
signo = 0;
else
#endif
{
signo = tmp;
tmp = 0; /* don't search for LWP */
}
} else if (tmp == INT_MIN) {
error = ESRCH;
break;
} else {
tmp = -tmp;
}
		if (tmp > 0) {
			if (req == PT_DETACH) {
error = EINVAL;
break;
}
lwp_delref2 (lt);
lt = lwp_find(t, tmp);
if (lt == NULL) {
error = ESRCH;
break;
}
lwp_addref(lt);
resume_all = 0;
signo = 0;
}
/*
* From the 4.4BSD PRM:
* "The data argument is taken as a signal number and the
* child's execution continues at location addr as if it
* incurred that signal. Normally the signal number will
* be either 0 to indicate that the signal that caused the
* stop should be ignored, or that value fetched out of
* the process's image indicating which signal caused
* the stop. If addr is (int *)1 then execution continues
* from where it stopped."
*/
/* Check that the data is a valid signal number or zero. */
if (signo < 0 || signo >= NSIG) {
error = EINVAL;
break;
}
/* Prevent process deadlock */
if (resume_all) {
#ifdef PT_STEP
if (req == PT_STEP) {
if (lt->l_flag &
(LW_WSUSPEND | LW_DBGSUSPEND)) {
error = EDEADLK;
break;
}
} else
#endif
{
error = EDEADLK;
				LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
					if ((lt2->l_flag &
					    (LW_WSUSPEND | LW_DBGSUSPEND)) == 0
					    ) {
error = 0;
break;
}
}
if (error != 0)
break;
}
} else {
if (lt->l_flag & (LW_WSUSPEND | LW_DBGSUSPEND)) {
error = EDEADLK;
break;
}
}
/*
* Reject setting program counter to 0x0 if VA0 is disabled.
*
		 * Not all kernels allow setting the program counter in
		 * one go in PT_CONTINUE and similar operations.  This
		 * causes portability issues, as passing address 0x0 is a
		 * no-op on those kernels but will usually cause a failure
		 * on NetBSD.
*/
if (user_va0_disable && addr == 0) {
error = EINVAL;
break;
}
/* If the address parameter is not (int *)1, set the pc. */
if ((int *)addr != (int *)1) {
error = process_set_pc(lt, addr);
if (error != 0)
break;
}
#ifdef PT_STEP
/*
* Arrange for a single-step, if that's requested and possible.
* More precisely, set the single step status as requested for
* the requested thread, and clear it for other threads.
*/
LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
error = process_sstep(lt2,
ISSET(lt2->l_pflag, LP_SINGLESTEP));
if (error)
break;
}
if (error)
break;
error = process_sstep(lt,
ISSET(lt->l_pflag, LP_SINGLESTEP) || req == PT_STEP);
if (error)
break;
#endif
if (req == PT_DETACH) {
CLR(t->p_slflag,
PSL_TRACED|PSL_TRACEDCHILD|PSL_SYSCALL);
/* clear sigpass mask */
sigemptyset(&t->p_sigctx.ps_sigpass);
/* give process back to original parent or init */
if (t->p_opptr != t->p_pptr) {
struct proc *pp = t->p_opptr;
proc_reparent(t, pp ? pp : initproc);
}
/* not being traced any more */
t->p_opptr = NULL;
/* clear single step */
LIST_FOREACH(lt2, &t->p_lwps, l_sibling) {
CLR(lt2->l_pflag, LP_SINGLESTEP);
}
CLR(lt->l_pflag, LP_SINGLESTEP);
}
sendsig:
error = ptrace_sendsig(l, req, t, lt, signo, resume_all);
break;
case PT_SYSCALLEMU:
if (!ISSET(t->p_slflag, PSL_SYSCALL) || t->p_stat != SSTOP) {
error = EINVAL;
break;
}
SET(t->p_slflag, PSL_SYSCALLEMU);
break;
#ifdef PT_STEP
case PT_SETSTEP:
write = 1;
/* FALLTHROUGH */
case PT_CLEARSTEP:
/* write = 0 done above. */
if ((error = ptrace_update_lwp(t, <, data)) != 0)
break;
if (write)
SET(lt->l_pflag, LP_SINGLESTEP);
else
CLR(lt->l_pflag, LP_SINGLESTEP);
break;
#endif
case PT_KILL:
/* just send the process a KILL signal. */
signo = SIGKILL;
goto sendsig; /* in PT_CONTINUE, above. */
case PT_STOP:
/* just send the process a STOP signal. */
signo = SIGSTOP;
goto sendsig; /* in PT_CONTINUE, above. */
case PT_ATTACH:
/*
* Go ahead and set the trace flag.
* Save the old parent (it's reset in
		 *   _DETACH, and also in kern_exit.c:wait4()).
* Reparent the process so that the tracing
* proc gets to see all the action.
* Stop the target.
*/
proc_changeparent(t, p);
signo = SIGSTOP;
goto sendsig;
case PT_GET_EVENT_MASK:
error = ptrace_get_event_mask(t, addr, data);
break;
case PT_SET_EVENT_MASK:
error = ptrace_set_event_mask(t, addr, data);
break;
case PT_GET_PROCESS_STATE:
error = ptrace_get_process_state(t, addr, data);
break;
case PT_LWPINFO:
error = ptrace_lwpinfo(t, <, addr, data);
break;
case PT_SET_SIGINFO:
error = ptrace_set_siginfo(t, <, ptm, addr, data);
break;
case PT_GET_SIGINFO:
error = ptrace_get_siginfo(t, ptm, addr, data);
break;
case PT_RESUME:
case PT_SUSPEND:
error = ptrace_startstop(t, <, req, addr, data);
break;
case PT_LWPSTATUS:
error = ptrace_lwpstatus(t, ptm, <, addr, data, false);
break;
case PT_LWPNEXT:
error = ptrace_lwpstatus(t, ptm, <, addr, data, true);
break;
case PT_SET_SIGPASS:
error = ptrace_set_sigpass(t, addr, data);
break;
case PT_GET_SIGPASS:
error = ptrace_get_sigpass(t, addr, data);
break;
#ifdef PT_REGISTERS
case_PT_SETREGS
case_PT_GETREGS
case_PT_SETFPREGS
case_PT_GETFPREGS
case_PT_SETDBREGS
case_PT_GETDBREGS
error = ptrace_regs(l, <, req, ptm, addr, data);
break;
#endif
#ifdef __HAVE_PTRACE_MACHDEP
PTRACE_MACHDEP_REQUEST_CASES
error = ptrace_machdep_dorequest(l, <, req, addr, data);
break;
#endif
}
out:
if (pheld) {
mutex_exit(t->p_lock);
mutex_exit(&proc_lock);
}
	if (lt != NULL)
		lwp_delref(lt);
	if (locked)
		rw_exit(&t->p_reflock);
return error;
}
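#if 0
/*
 * A debugger-side sketch of the overall flow handled by do_ptrace()
 * (hypothetical, not part of the original source): attach, wait for
 * the SIGSTOP delivered by PT_ATTACH, inspect the tracee, then resume
 * and detach.  addr == (void *)1 means "continue where it stopped".
 */
	int status;

	if (ptrace(PT_ATTACH, pid, NULL, 0) == -1)
		err(EXIT_FAILURE, "PT_ATTACH");
	if (waitpid(pid, &status, 0) == -1 || !WIFSTOPPED(status))
		errx(EXIT_FAILURE, "tracee did not stop");

	/* ... PT_IO, PT_GETREGS, PT_LWPSTATUS, etc. go here ... */

	if (ptrace(PT_DETACH, pid, (void *)1, 0) == -1)
		err(EXIT_FAILURE, "PT_DETACH");
#endif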
static int
process_auxv_offset(struct proc *p, struct uio *uio)
{
struct ps_strings pss;
int error;
off_t off = (off_t)p->p_psstrp;
if ((error = copyin_psstrings(p, &pss)) != 0)
return error;
if (pss.ps_envstr == NULL)
return EIO;
#ifdef COMPAT_NETBSD32
if (p->p_flag & PK_32)
uio->uio_offset += (off_t)((vaddr_t)pss.ps_envstr +
sizeof(uint32_t) * (pss.ps_nenvstr + 1));
else
#endif
uio->uio_offset += (off_t)(vaddr_t)(pss.ps_envstr +
pss.ps_nenvstr + 1);
#ifdef __MACHINE_STACK_GROWS_UP
if (uio->uio_offset < off)
return EIO;
#else
if (uio->uio_offset > off)
return EIO;
	if ((uio->uio_offset + uio->uio_resid) > off)
		uio->uio_resid = off - uio->uio_offset;
#endif
return 0;
}
MODULE(MODULE_CLASS_EXEC, ptrace_common, NULL);
static int
ptrace_common_init(void)
{
#if 0
mutex_init(&ptrace_mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ptrace_cv, "ptracecb");
ptrace_cbref = 0;
#endif
ptrace_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
ptrace_listener_cb, NULL);
return 0;
}
static int
ptrace_common_fini(void)
{
kauth_unlisten_scope(ptrace_listener);
#if 0
/* Make sure no-one is executing our kauth listener */
mutex_enter(&ptrace_mtx);
while (ptrace_cbref != 0)
cv_wait(&ptrace_cv, &ptrace_mtx);
mutex_exit(&ptrace_mtx);
mutex_destroy(&ptrace_mtx);
cv_destroy(&ptrace_cv);
#endif
return 0;
}
static int
ptrace_common_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = ptrace_common_init();
break;
case MODULE_CMD_FINI:
error = ptrace_common_fini();
break;
default:
ptrace_hooks();
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: pktqueue.c,v 1.22 2023/05/28 08:09:34 andvar Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* The packet queue (pktqueue) interface is a lockless IP input queue
* which also abstracts and handles network ISR scheduling. It provides
* a mechanism to enable receiver-side packet steering (RPS).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pktqueue.c,v 1.22 2023/05/28 08:09:34 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/pcq.h>
#include <sys/intr.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/xcall.h>
#include <sys/once.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <net/pktqueue.h>
#include <net/rss_config.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
struct pktqueue {
/*
* The lock used for a barrier mechanism. The barrier counter,
* as well as the drop counter, are managed atomically though.
* Ensure this group is in a separate cache line.
*/
union {
struct {
kmutex_t pq_lock;
volatile u_int pq_barrier;
};
uint8_t _pad[COHERENCY_UNIT];
};
/* The size of the queue, counters and the interrupt handler. */
u_int pq_maxlen;
percpu_t * pq_counters;
void * pq_sih;
/* The per-CPU queues. */
struct percpu * pq_pcq; /* struct pcq * */
/* The linkage on the list of all pktqueues. */
LIST_ENTRY(pktqueue) pq_list;
};
/* The counters of the packet queue. */
#define PQCNT_ENQUEUE 0
#define PQCNT_DEQUEUE 1
#define PQCNT_DROP 2
#define PQCNT_NCOUNTERS 3
typedef struct {
uint64_t count[PQCNT_NCOUNTERS];
} pktq_counters_t;
/* Special marker value used by pktq_barrier() mechanism. */
#define PKTQ_MARKER ((void *)(~0ULL))
/*
* This is a list of all pktqueues. This list is used by
* pktq_ifdetach() to issue a barrier on every pktqueue.
*
* The r/w lock is acquired for writing in pktq_create() and
* pktq_destroy(), and for reading in pktq_ifdetach().
*
* This list is not performance critical, and will seldom be
* accessed.
*/
static LIST_HEAD(, pktqueue) pktqueue_list __read_mostly;
static krwlock_t pktqueue_list_lock __read_mostly;
static once_t pktqueue_list_init_once __read_mostly;
static int
pktqueue_list_init(void)
{
LIST_INIT(&pktqueue_list);
rw_init(&pktqueue_list_lock);
return 0;
}
static void
pktq_init_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
struct pcq **qp = vqp;
struct pktqueue *pq = vpq;
*qp = pcq_create(pq->pq_maxlen, KM_SLEEP);
}
static void
pktq_fini_cpu(void *vqp, void *vpq, struct cpu_info *ci)
{
struct pcq **qp = vqp, *q = *qp;
KASSERT(pcq_peek(q) == NULL);
pcq_destroy(q);
*qp = NULL; /* paranoia */
}
static struct pcq *
pktq_pcq(struct pktqueue *pq, struct cpu_info *ci)
{
struct pcq **qp, *q;
/*
* As long as preemption is disabled, the xcall to swap percpu
* buffers can't complete, so it is safe to read the pointer.
*/
KASSERT(kpreempt_disabled());
qp = percpu_getptr_remote(pq->pq_pcq, ci);
q = *qp;
return q;
}
pktqueue_t *
pktq_create(size_t maxlen, void (*intrh)(void *), void *sc)
{
const u_int sflags = SOFTINT_NET | SOFTINT_MPSAFE | SOFTINT_RCPU;
pktqueue_t *pq;
percpu_t *pc;
void *sih;
RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init);
pc = percpu_alloc(sizeof(pktq_counters_t));
if ((sih = softint_establish(sflags, intrh, sc)) == NULL) {
percpu_free(pc, sizeof(pktq_counters_t));
return NULL;
}
pq = kmem_zalloc(sizeof(*pq), KM_SLEEP);
mutex_init(&pq->pq_lock, MUTEX_DEFAULT, IPL_NONE);
pq->pq_maxlen = maxlen;
pq->pq_counters = pc;
pq->pq_sih = sih;
pq->pq_pcq = percpu_create(sizeof(struct pcq *),
pktq_init_cpu, pktq_fini_cpu, pq);
rw_enter(&pktqueue_list_lock, RW_WRITER);
LIST_INSERT_HEAD(&pktqueue_list, pq, pq_list);
rw_exit(&pktqueue_list_lock);
return pq;
}
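#if 0
/*
 * A usage sketch (hypothetical, not part of the original source): a
 * protocol input path creates one pktqueue and a matching softint
 * handler that drains it with pktq_dequeue(); "example_pktq",
 * "exampleintr" and the use of IFQ_MAXLEN are assumptions for
 * illustration.
 */
static pktqueue_t *example_pktq __read_mostly;

static void
exampleintr(void *arg)
{
	struct mbuf *m;

	while ((m = pktq_dequeue(example_pktq)) != NULL) {
		/* ... process one packet ... */
		m_freem(m);
	}
}

void
example_init(void)
{

	example_pktq = pktq_create(IFQ_MAXLEN, exampleintr, NULL);
	KASSERT(example_pktq != NULL);
}
#endif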
void
pktq_destroy(pktqueue_t *pq)
{
KASSERT(pktqueue_list_init_once.o_status == ONCE_DONE);
rw_enter(&pktqueue_list_lock, RW_WRITER);
LIST_REMOVE(pq, pq_list);
rw_exit(&pktqueue_list_lock);
percpu_free(pq->pq_pcq, sizeof(struct pcq *));
percpu_free(pq->pq_counters, sizeof(pktq_counters_t));
softint_disestablish(pq->pq_sih);
mutex_destroy(&pq->pq_lock);
kmem_free(pq, sizeof(*pq));
}
/*
 * - pktq_inc_count: increment the counter given an ID.
 * - pktq_collect_counts: handler to sum up the counts from each CPU.
 * - pktq_get_count: return the effective count given an ID.
*/
static inline void
pktq_inc_count(pktqueue_t *pq, u_int i)
{
percpu_t *pc = pq->pq_counters;
pktq_counters_t *c;
c = percpu_getref(pc);
c->count[i]++;
percpu_putref(pc);
}
static void
pktq_collect_counts(void *mem, void *arg, struct cpu_info *ci)
{
const pktq_counters_t *c = mem;
pktq_counters_t *sum = arg;
int s = splnet();
for (u_int i = 0; i < PQCNT_NCOUNTERS; i++) {
sum->count[i] += c->count[i];
}
splx(s);
}
static uint64_t
pktq_get_count(pktqueue_t *pq, pktq_count_t c)
{
pktq_counters_t sum;
if (c != PKTQ_MAXLEN) {
memset(&sum, 0, sizeof(sum));
percpu_foreach_xcall(pq->pq_counters,
XC_HIGHPRI_IPL(IPL_SOFTNET), pktq_collect_counts, &sum);
}
switch (c) {
case PKTQ_NITEMS:
return sum.count[PQCNT_ENQUEUE] - sum.count[PQCNT_DEQUEUE];
case PKTQ_DROPS:
return sum.count[PQCNT_DROP];
case PKTQ_MAXLEN:
return pq->pq_maxlen;
}
return 0;
}
uint32_t
pktq_rps_hash(const pktq_rps_hash_func_t *funcp, const struct mbuf *m)
{
pktq_rps_hash_func_t func = atomic_load_relaxed(funcp);
KASSERT(func != NULL);
return (*func)(m);
}
static uint32_t
pktq_rps_hash_zero(const struct mbuf *m __unused)
{
return 0;
}
static uint32_t
pktq_rps_hash_curcpu(const struct mbuf *m __unused)
{
return cpu_index(curcpu());
}
static uint32_t
pktq_rps_hash_toeplitz(const struct mbuf *m)
{
struct ip *ip;
/*
* Disable UDP port - IP fragments aren't currently being handled
* and so we end up with a mix of 2-tuple and 4-tuple
* traffic.
*/
const u_int flag = RSS_TOEPLITZ_USE_TCP_PORT;
/* glance IP version */
if ((m->m_flags & M_PKTHDR) == 0)
return 0;
ip = mtod(m, struct ip *);
if (ip->ip_v == IPVERSION) {
if (__predict_false(m->m_len < sizeof(struct ip)))
return 0;
return rss_toeplitz_hash_from_mbuf_ipv4(m, flag);
} else if (ip->ip_v == 6) {
if (__predict_false(m->m_len < sizeof(struct ip6_hdr)))
return 0;
return rss_toeplitz_hash_from_mbuf_ipv6(m, flag);
}
return 0;
}
/*
* toeplitz without curcpu.
* Generally, this has better performance than toeplitz.
*/
static uint32_t
pktq_rps_hash_toeplitz_othercpus(const struct mbuf *m)
{
uint32_t hash;
if (ncpu == 1)
return 0;
hash = pktq_rps_hash_toeplitz(m);
hash %= ncpu - 1;
if (hash >= cpu_index(curcpu()))
return hash + 1;
else
return hash;
}
static struct pktq_rps_hash_table {
const char* prh_type;
pktq_rps_hash_func_t prh_func;
} const pktq_rps_hash_tab[] = {
{ "zero", pktq_rps_hash_zero },
{ "curcpu", pktq_rps_hash_curcpu },
{ "toeplitz", pktq_rps_hash_toeplitz },
{ "toeplitz-othercpus", pktq_rps_hash_toeplitz_othercpus },
};
const pktq_rps_hash_func_t pktq_rps_hash_default =
#ifdef NET_MPSAFE
pktq_rps_hash_curcpu;
#else
pktq_rps_hash_zero;
#endif
static const char *
pktq_get_rps_hash_type(pktq_rps_hash_func_t func)
{
for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) {
if (func == pktq_rps_hash_tab[i].prh_func) {
return pktq_rps_hash_tab[i].prh_type;
}
}
return NULL;
}
static int
pktq_set_rps_hash_type(pktq_rps_hash_func_t *func, const char *type)
{
if (strcmp(type, pktq_get_rps_hash_type(*func)) == 0)
return 0;
for (int i = 0; i < __arraycount(pktq_rps_hash_tab); i++) {
if (strcmp(type, pktq_rps_hash_tab[i].prh_type) == 0) {
atomic_store_relaxed(func, pktq_rps_hash_tab[i].prh_func);
return 0;
}
}
return ENOENT;
}
int
sysctl_pktq_rps_hash_handler(SYSCTLFN_ARGS)
{
struct sysctlnode node;
pktq_rps_hash_func_t *func;
int error;
char type[PKTQ_RPS_HASH_NAME_LEN];
node = *rnode;
func = node.sysctl_data;
strlcpy(type, pktq_get_rps_hash_type(*func), PKTQ_RPS_HASH_NAME_LEN);
node.sysctl_data = &type;
node.sysctl_size = sizeof(type);
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
error = pktq_set_rps_hash_type(func, type);
return error;
}
/*
* pktq_enqueue: inject the packet into the end of the queue.
*
 * => Must be called from interrupt context or with preemption disabled.
 * => Consumes the packet and returns true on success.
 * => Returns false on failure; the caller is responsible for freeing
 *    the packet.
*/
bool
pktq_enqueue(pktqueue_t *pq, struct mbuf *m, const u_int hash __unused)
{
#if defined(_RUMPKERNEL) || defined(_RUMP_NATIVE_ABI)
struct cpu_info *ci = curcpu();
#else
struct cpu_info *ci = cpu_lookup(hash % ncpu);
#endif
	KASSERT(kpreempt_disabled());

	if (__predict_false(!pcq_put(pktq_pcq(pq, ci), m))) {
		pktq_inc_count(pq, PQCNT_DROP);
return false;
}
softint_schedule_cpu(pq->pq_sih, ci);
pktq_inc_count(pq, PQCNT_ENQUEUE);
return true;
}
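#if 0
/*
 * An enqueue sketch (hypothetical, not part of the original source):
 * the receive path computes an RPS hash, hands the packet over, and
 * frees it itself if the per-CPU queue is full.  "example_pktq" and
 * "example_pktq_rps_hash" are assumed names.
 */
	const uint32_t hash = pktq_rps_hash(&example_pktq_rps_hash, m);

	kpreempt_disable();
	if (__predict_false(!pktq_enqueue(example_pktq, m, hash)))
		m_freem(m);
	kpreempt_enable();
#endif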
/*
* pktq_dequeue: take a packet from the queue.
*
* => Must be called with preemption disabled.
 * => The caller must ensure there are no concurrent dequeue calls.
*/
struct mbuf *
pktq_dequeue(pktqueue_t *pq)
{
struct cpu_info *ci = curcpu();
struct mbuf *m;
KASSERT(kpreempt_disabled());
m = pcq_get(pktq_pcq(pq, ci));
if (__predict_false(m == PKTQ_MARKER)) {
/* Note the marker entry. */
atomic_inc_uint(&pq->pq_barrier);
/* Get the next queue entry. */
m = pcq_get(pktq_pcq(pq, ci));
/*
* There can only be one barrier operation pending
* on a pktqueue at any given time, so we can assert
* that the next item is not a marker.
*/
KASSERT(m != PKTQ_MARKER);
}
if (__predict_true(m != NULL)) {
pktq_inc_count(pq, PQCNT_DEQUEUE);
}
return m;
}
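/*
 * Illustrative sketch (not part of the original file): a softint handler
 * bound to the pktqueue typically drains the per-CPU queue like this.
 * "example_softint", "example_pktq" and "example_deliver" are hypothetical
 * names; softint context satisfies the kpreempt_disabled() assertion above.
 */
#if 0
static void
example_softint(void *arg __unused)
{
	struct mbuf *m;

	while ((m = pktq_dequeue(example_pktq)) != NULL) {
		example_deliver(m);	/* hypothetical protocol input routine */
	}
}
#endif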
/*
 * pktq_barrier: wait for a grace period during which all packets that were
 * enqueued at the moment of calling this routine will have been processed.
 * This is used to ensure that e.g. packets referencing some interface have
 * been drained.
*/
void
pktq_barrier(pktqueue_t *pq)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
u_int pending = 0;
mutex_enter(&pq->pq_lock);
KASSERT(pq->pq_barrier == 0);
for (CPU_INFO_FOREACH(cii, ci)) {
struct pcq *q;
kpreempt_disable();
q = pktq_pcq(pq, ci);
kpreempt_enable();
/* If the queue is empty - nothing to do. */
if (pcq_peek(q) == NULL) {
continue;
}
		/* Otherwise, insert the marker and count this queue as pending. */
while (!pcq_put(q, PKTQ_MARKER)) {
kpause("pktqsync", false, 1, NULL);
}
kpreempt_disable();
softint_schedule_cpu(pq->pq_sih, ci);
kpreempt_enable();
pending++;
}
/* Wait for each queue to process the markers. */
while (pq->pq_barrier != pending) {
kpause("pktqsync", false, 1, NULL);
}
pq->pq_barrier = 0;
mutex_exit(&pq->pq_lock);
}
/*
* pktq_ifdetach: issue a barrier on all pktqueues when a network
* interface is detached.
*/
void
pktq_ifdetach(void)
{
pktqueue_t *pq;
/* Just in case no pktqueues have been created yet... */
RUN_ONCE(&pktqueue_list_init_once, pktqueue_list_init);
rw_enter(&pktqueue_list_lock, RW_READER);
LIST_FOREACH(pq, &pktqueue_list, pq_list) {
pktq_barrier(pq);
}
rw_exit(&pktqueue_list_lock);
}
/*
* pktq_flush: free mbufs in all queues.
*
* => The caller must ensure there are no concurrent writers or flush calls.
*/
void
pktq_flush(pktqueue_t *pq)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct mbuf *m, *m0 = NULL;
ASSERT_SLEEPABLE();
/*
* Run a dummy softint at IPL_SOFTNET on all CPUs to ensure that any
* already running handler for this pktqueue is no longer running.
*/
xc_barrier(XC_HIGHPRI_IPL(IPL_SOFTNET));
/*
* Acquire the barrier lock. While the caller ensures that
* no explicit pktq_barrier() calls will be issued, this holds
* off any implicit pktq_barrier() calls that would happen
* as the result of pktq_ifdetach().
*/
mutex_enter(&pq->pq_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
struct pcq *q;
kpreempt_disable();
q = pktq_pcq(pq, ci);
kpreempt_enable();
/*
* Pull the packets off the pcq and chain them into
* a list to be freed later.
*/
while ((m = pcq_get(q)) != NULL) {
pktq_inc_count(pq, PQCNT_DEQUEUE);
m->m_nextpkt = m0;
m0 = m;
}
}
mutex_exit(&pq->pq_lock);
/* Free the packets now that the critical section is over. */
while ((m = m0) != NULL) {
m0 = m->m_nextpkt;
m_freem(m);
}
}
static void
pktq_set_maxlen_cpu(void *vpq, void *vqs)
{
struct pktqueue *pq = vpq;
struct pcq **qp, *q, **qs = vqs;
unsigned i = cpu_index(curcpu());
int s;
s = splnet();
qp = percpu_getref(pq->pq_pcq);
q = *qp;
*qp = qs[i];
qs[i] = q;
percpu_putref(pq->pq_pcq);
splx(s);
}
/*
* pktq_set_maxlen: create per-CPU queues using a new size and replace
* the existing queues without losing any packets.
*
* XXX ncpu must remain stable throughout.
*/
int
pktq_set_maxlen(pktqueue_t *pq, size_t maxlen)
{
const u_int slotbytes = ncpu * sizeof(pcq_t *);
pcq_t **qs;
if (!maxlen || maxlen > PCQ_MAXLEN)
return EINVAL;
if (pq->pq_maxlen == maxlen)
return 0;
/* First, allocate the new queues. */
qs = kmem_zalloc(slotbytes, KM_SLEEP);
for (u_int i = 0; i < ncpu; i++) {
qs[i] = pcq_create(maxlen, KM_SLEEP);
}
/*
* Issue an xcall to replace the queue pointers on each CPU.
* This implies all the necessary memory barriers.
*/
mutex_enter(&pq->pq_lock);
xc_wait(xc_broadcast(XC_HIGHPRI, pktq_set_maxlen_cpu, pq, qs));
pq->pq_maxlen = maxlen;
mutex_exit(&pq->pq_lock);
/*
* At this point, the new packets are flowing into the new
* queues. However, the old queues may have some packets
* present which are no longer being processed. We are going
* to re-enqueue them. This may change the order of packet
* arrival, but it is not considered an issue.
*
* There may be in-flight interrupts calling pktq_dequeue()
* which reference the old queues. Issue a barrier to ensure
* that we are going to be the only pcq_get() callers on the
* old queues.
*/
pktq_barrier(pq);
for (u_int i = 0; i < ncpu; i++) {
struct pcq *q;
struct mbuf *m;
kpreempt_disable();
q = pktq_pcq(pq, cpu_lookup(i));
kpreempt_enable();
while ((m = pcq_get(qs[i])) != NULL) {
while (!pcq_put(q, m)) {
kpause("pktqrenq", false, 1, NULL);
}
}
pcq_destroy(qs[i]);
}
/* Well, that was fun. */
kmem_free(qs, slotbytes);
return 0;
}
static int
sysctl_pktq_maxlen(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
pktqueue_t * const pq = node.sysctl_data;
u_int nmaxlen = pktq_get_count(pq, PKTQ_MAXLEN);
int error;
node.sysctl_data = &nmaxlen;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
return pktq_set_maxlen(pq, nmaxlen);
}
static int
sysctl_pktq_count(SYSCTLFN_ARGS, u_int count_id)
{
struct sysctlnode node = *rnode;
pktqueue_t * const pq = node.sysctl_data;
uint64_t count = pktq_get_count(pq, count_id);
node.sysctl_data = &count;
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
static int
sysctl_pktq_nitems(SYSCTLFN_ARGS)
{
return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_NITEMS);
}
static int
sysctl_pktq_drops(SYSCTLFN_ARGS)
{
return sysctl_pktq_count(SYSCTLFN_CALL(rnode), PKTQ_DROPS);
}
/*
 * pktq_sysctl_setup: set up the sysctl nodes for a pktqueue
* using standardized names at the specified parent node and
* node ID (or CTL_CREATE).
*/
void
pktq_sysctl_setup(pktqueue_t * const pq, struct sysctllog ** const clog,
const struct sysctlnode * const parent_node, const int qid)
{
const struct sysctlnode *rnode = parent_node, *cnode;
KASSERT(pq != NULL);
KASSERT(parent_node != NULL);
KASSERT(qid == CTL_CREATE || qid >= 0);
/* Create the "ifq" node below the parent node. */
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ifq",
SYSCTL_DESCR("Protocol input queue controls"),
NULL, 0, NULL, 0,
qid, CTL_EOL);
/* Now create the standard child nodes below "ifq". */
rnode = cnode;
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "len",
SYSCTL_DESCR("Current input queue length"),
sysctl_pktq_nitems, 0, (void *)pq, 0,
IFQCTL_LEN, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "maxlen",
SYSCTL_DESCR("Maximum allowed input queue length"),
sysctl_pktq_maxlen, 0, (void *)pq, 0,
IFQCTL_MAXLEN, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "drops",
SYSCTL_DESCR("Packets dropped due to full input queue"),
sysctl_pktq_drops, 0, (void *)pq, 0,
IFQCTL_DROPS, CTL_EOL);
}
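/*
 * Illustrative sketch (not part of the original file): a protocol would
 * hang the standard "ifq" subtree off its own sysctl node roughly like
 * this.  "example_clog" and "example_sysctl_init" are hypothetical; the
 * parent node is assumed to have been created already with sysctl_createv().
 */
#if 0
static struct sysctllog *example_clog;

static void
example_sysctl_init(pktqueue_t *pq, const struct sysctlnode *parent)
{
	/* Creates parent/ifq/{len,maxlen,drops} under the given parent. */
	pktq_sysctl_setup(pq, &example_clog, parent, CTL_CREATE);
}
#endif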
/* $NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.21 2020/08/31 19:51:30 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include "opt_ktrace.h"
#include "opt_ptrace.h"
#include "opt_dtrace.h"
#endif
/* XXX To get syscall prototypes. */
#define SYSVSHM
#define SYSVSEM
#define SYSVMSG
#include <sys/param.h>
#include <sys/module.h>
#include <sys/sched.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/systm.h>
#include <sys/xcall.h>
#include <sys/ktrace.h>
#include <sys/ptrace.h>
int
sys_nomodule(struct lwp *l, const void *v, register_t *retval)
{
#ifdef MODULAR
const struct sysent *sy;
const struct emul *em;
const struct sc_autoload *auto_list;
u_int code;
/*
* Restart the syscall if we interrupted a module unload that
* failed. Acquiring kernconfig_lock delays us until any unload
* has been completed or rolled back.
*/
kernconfig_lock();
sy = l->l_sysent;
if (sy->sy_call != sys_nomodule) {
kernconfig_unlock();
return ERESTART;
}
/*
* Try to autoload a module to satisfy the request. If it
* works, retry the request.
*/
em = l->l_proc->p_emul;
code = sy - em->e_sysent;
if ((auto_list = em->e_sc_autoload) != NULL)
for (; auto_list->al_code > 0; auto_list++) {
if (auto_list->al_code != code) {
continue;
}
if (module_autoload(auto_list->al_module,
MODULE_CLASS_ANY) != 0 ||
sy->sy_call == sys_nomodule) {
break;
}
kernconfig_unlock();
return ERESTART;
}
kernconfig_unlock();
#endif /* MODULAR */
return sys_nosys(l, v, retval);
}
int
syscall_establish(const struct emul *em, const struct syscall_package *sp)
{
struct sysent *sy;
int i;
KASSERT(kernconfig_is_held());
if (em == NULL) {
em = &emul_netbsd;
}
sy = em->e_sysent;
/*
* Ensure that all preconditions are valid, since this is
* an all or nothing deal. Once a system call is entered,
* it can become busy and we could be unable to remove it
* on error.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
if (sp[i].sp_code >= SYS_NSYSENT)
return EINVAL;
if (sy[sp[i].sp_code].sy_call != sys_nomodule &&
sy[sp[i].sp_code].sy_call != sys_nosys) {
#ifdef DIAGNOSTIC
printf("syscall %d is busy\n", sp[i].sp_code);
#endif
return EBUSY;
}
}
/* Everything looks good, patch them in. */
for (i = 0; sp[i].sp_call != NULL; i++) {
sy[sp[i].sp_code].sy_call = sp[i].sp_call;
}
return 0;
}
int
syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
{
struct sysent *sy;
const uint32_t *sb;
lwp_t *l;
int i;
KASSERT(kernconfig_is_held());
if (em == NULL) {
em = &emul_netbsd;
}
sy = em->e_sysent;
sb = em->e_nomodbits;
/*
* First, patch the system calls to sys_nomodule or sys_nosys
* to gate further activity.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
sy[sp[i].sp_code].sy_call =
sb[sp[i].sp_code / 32] & (1 << (sp[i].sp_code % 32)) ?
sys_nomodule : sys_nosys;
}
/*
* Run a cross call to cycle through all CPUs. This does two
* things: lock activity provides a barrier and makes our update
* of sy_call visible to all CPUs, and upon return we can be sure
* that we see pertinent values of l_sysent posted by remote CPUs.
*/
xc_barrier(0);
/*
* Now it's safe to check l_sysent. Run through all LWPs and see
* if anyone is still using the system call.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
if (l->l_sysent == &sy[sp[i].sp_code]) {
break;
}
}
mutex_exit(&proc_lock);
if (l == NULL) {
continue;
}
/*
* We lose: one or more calls are still in use. Put back
* the old entrypoints and act like nothing happened.
* When we drop kernconfig_lock, any system calls held in
* sys_nomodule() will be restarted.
*/
for (i = 0; sp[i].sp_call != NULL; i++) {
sy[sp[i].sp_code].sy_call = sp[i].sp_call;
}
return EBUSY;
}
return 0;
}
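/*
 * Illustrative sketch (not part of the original file): how a kernel module
 * might register and remove a system call with the routines above.  The
 * names "sys_example", "EXAMPLE_SYSCALL_CODE", "example_syscalls" and the
 * two wrapper functions are hypothetical; only the sp_code/sp_call members
 * used above are assumed, with a NULL sp_call terminating the array.
 */
#if 0
static int	sys_example(struct lwp *, const void *, register_t *);
#define	EXAMPLE_SYSCALL_CODE	0	/* placeholder; a real module would use
					 * its reserved slot < SYS_NSYSENT */

static const struct syscall_package example_syscalls[] = {
	{ .sp_code = EXAMPLE_SYSCALL_CODE, .sp_call = sys_example },
	{ .sp_call = NULL },		/* terminator: NULL sp_call ends the list */
};

/*
 * Both routines assert that kernconfig_lock is held; a module's modcmd
 * handler normally runs with that lock already held.
 */
static int
example_attach_syscall(void)
{
	/* NULL emulation selects the native emul_netbsd table, as above. */
	return syscall_establish(NULL, example_syscalls);
}

static int
example_detach_syscall(void)
{
	/* Fails with EBUSY if some LWP is still executing the call. */
	return syscall_disestablish(NULL, example_syscalls);
}
#endif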
/*
* Return true if system call tracing is enabled for the specified process.
*/
bool
trace_is_enabled(struct proc *p)
{
#ifdef SYSCALL_DEBUG
return (true);
#endif
#ifdef KTRACE
if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET)))
return (true);
#endif
#ifdef PTRACE
if (ISSET(p->p_slflag, PSL_SYSCALL))
return (true);
#endif
return (false);
}
/*
* Start trace of particular system call. If process is being traced,
* this routine is called by MD syscall dispatch code just before
* a system call is actually executed.
*/
int
trace_enter(register_t code, const struct sysent *sy, const void *args)
{
int error = 0;
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
struct proc *p = curlwp->l_proc;
#endif
#ifdef KDTRACE_HOOKS
if (sy->sy_entry) {
struct emul *e = p->p_emul;
		if (e->e_dtrace_syscall)
			(*e->e_dtrace_syscall)(sy->sy_entry, code, sy, args,
			    NULL, 0);
}
#endif
#ifdef SYSCALL_DEBUG
scdebug_call(code, args);
#endif /* SYSCALL_DEBUG */
ktrsyscall(code, args, sy->sy_narg);
#ifdef PTRACE
if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
(PSL_SYSCALL|PSL_TRACED)) {
proc_stoptrace(TRAP_SCE, code, args, NULL, 0);
if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) {
/* tracer will emulate syscall for us */
error = EJUSTRETURN;
}
}
#endif
return error;
}
/*
* End trace of particular system call. If process is being traced,
* this routine is called by MD syscall dispatch code just after
* a system call finishes.
* MD caller guarantees the passed 'code' is within the supported
 * system call number range for the emulation the process runs under.
*/
void
trace_exit(register_t code, const struct sysent *sy, const void *args,
register_t rval[], int error)
{
#if defined(PTRACE) || defined(KDTRACE_HOOKS)
struct proc *p = curlwp->l_proc;
#endif
#ifdef KDTRACE_HOOKS
if (sy->sy_return) {
struct emul *e = p->p_emul;
		if (e->e_dtrace_syscall)
			(*p->p_emul->e_dtrace_syscall)(sy->sy_return, code, sy,
			    args, rval, error);
}
#endif
#ifdef SYSCALL_DEBUG
scdebug_ret(code, error, rval);
#endif /* SYSCALL_DEBUG */
ktrsysret(code, error, rval);
#ifdef PTRACE
if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) ==
(PSL_SYSCALL|PSL_TRACED)) {
proc_stoptrace(TRAP_SCX, code, args, rval, error);
}
CLR(p->p_slflag, PSL_SYSCALLEMU);
#endif
}
/* $NetBSD: tcp_var.h,v 1.198 2022/10/28 05:18:39 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_var.h 8.4 (Berkeley) 5/24/95
*/
#ifndef _NETINET_TCP_VAR_H_
#define _NETINET_TCP_VAR_H_
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_mbuftrace.h"
#endif
/*
* TCP kernel structures and variables.
*/
#include <sys/callout.h>
#ifdef TCP_SIGNATURE
/*
* Defines which are needed by the xform_tcp module and tcp_[in|out]put
* for SADB verification and lookup.
*/
#define TCP_SIGLEN 16 /* length of computed digest in bytes */
#define TCP_KEYLEN_MIN 1 /* minimum length of TCP-MD5 key */
#define TCP_KEYLEN_MAX 80 /* maximum length of TCP-MD5 key */
/*
* Only a single SA per host may be specified at this time. An SPI is
* needed in order for the KEY_LOOKUP_SA() lookup to work.
*/
#define TCP_SIG_SPI 0x1000
#endif /* TCP_SIGNATURE */
/*
* Tcp+ip header, after ip options removed.
*/
struct tcpiphdr {
struct ipovly ti_i; /* overlaid ip structure */
struct tcphdr ti_t; /* tcp header */
};
#ifdef CTASSERT
CTASSERT(sizeof(struct tcpiphdr) == 40);
#endif
#define ti_x1 ti_i.ih_x1
#define ti_pr ti_i.ih_pr
#define ti_len ti_i.ih_len
#define ti_src ti_i.ih_src
#define ti_dst ti_i.ih_dst
#define ti_sport ti_t.th_sport
#define ti_dport ti_t.th_dport
#define ti_seq ti_t.th_seq
#define ti_ack ti_t.th_ack
#define ti_x2 ti_t.th_x2
#define ti_off ti_t.th_off
#define ti_flags ti_t.th_flags
#define ti_win ti_t.th_win
#define ti_sum ti_t.th_sum
#define ti_urp ti_t.th_urp
/*
* SACK option block.
*/
struct sackblk {
tcp_seq left; /* Left edge of sack block. */
tcp_seq right; /* Right edge of sack block. */
};
TAILQ_HEAD(sackhead, sackhole);
struct sackhole {
tcp_seq start;
tcp_seq end;
tcp_seq rxmit;
TAILQ_ENTRY(sackhole) sackhole_q;
};
struct syn_cache;
/*
* Tcp control block, one per tcp; fields:
*/
struct tcpcb {
int t_family; /* address family on the wire */
struct ipqehead segq; /* sequencing queue */
int t_segqlen; /* length of the above */
callout_t t_timer[TCPT_NTIMERS];/* tcp timers */
short t_state; /* state of this connection */
short t_rxtshift; /* log(2) of rexmt exp. backoff */
uint32_t t_rxtcur; /* current retransmit value */
short t_dupacks; /* consecutive dup acks recd */
/*
* t_partialacks:
* <0 not in fast recovery.
* ==0 in fast recovery. has not received partial acks
* >0 in fast recovery. has received partial acks
*/
	short	t_partialacks;		/* partial acks during fast rexmit */
	u_short	t_peermss;		/* peer's maximum segment size */
	u_short	t_ourmss;		/* our maximum segment size */
u_short t_segsz; /* current segment size in use */
char t_force; /* 1 if forcing out a byte */
u_int t_flags;
#define TF_ACKNOW 0x0001 /* ack peer immediately */
#define TF_DELACK 0x0002 /* ack, but try to delay it */
#define TF_NODELAY 0x0004 /* don't delay packets to coalesce */
#define TF_NOOPT 0x0008 /* don't use tcp options */
#define TF_REQ_SCALE 0x0020 /* have/will request window scaling */
#define TF_RCVD_SCALE 0x0040 /* other side has requested scaling */
#define TF_REQ_TSTMP 0x0080 /* have/will request timestamps */
#define TF_RCVD_TSTMP 0x0100 /* a timestamp was received in SYN */
#define TF_SACK_PERMIT 0x0200 /* other side said I could SACK */
#define TF_SYN_REXMT 0x0400 /* rexmit timer fired on SYN */
#define TF_WILL_SACK 0x0800 /* try to use SACK */
#define TF_REASSEMBLING 0x1000 /* we're busy reassembling */
#define TF_DEAD 0x2000 /* dead and to-be-released */
#define TF_PMTUD_PEND 0x4000 /* Path MTU Discovery pending */
#define	TF_ECN_PERMIT	0x10000		/* other side said it is ECN-ready */
#define TF_ECN_SND_CWR 0x20000 /* ECN CWR in queue */
#define TF_ECN_SND_ECE 0x40000 /* ECN ECE in queue */
#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */
struct mbuf *t_template; /* skeletal packet for transmit */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
callout_t t_delack_ch; /* delayed ACK callout */
/*
* The following fields are used as in the protocol specification.
* See RFC793, Dec. 1981, page 21.
*/
/* send sequence variables */
tcp_seq snd_una; /* send unacknowledged */
tcp_seq snd_nxt; /* send next */
tcp_seq snd_up; /* send urgent pointer */
tcp_seq snd_wl1; /* window update seg seq number */
tcp_seq snd_wl2; /* window update seg ack number */
tcp_seq iss; /* initial send sequence number */
u_long snd_wnd; /* send window */
/*
* snd_recover
 * it's basically the same as the "recover" variable in RFC 2582 (NewReno).
* when entering fast retransmit, it's set to snd_max.
* newreno uses this to detect partial ack.
* snd_high
 * it's basically the same as the "send_high" variable in RFC 2582 (NewReno).
* on each RTO, it's set to snd_max.
* newreno uses this to avoid false fast retransmits.
*/
tcp_seq snd_recover;
tcp_seq snd_high;
/* receive sequence variables */
u_long rcv_wnd; /* receive window */
tcp_seq rcv_nxt; /* receive next */
tcp_seq rcv_up; /* receive urgent pointer */
tcp_seq irs; /* initial receive sequence number */
/*
* Additional variables for this implementation.
*/
/* receive variables */
tcp_seq rcv_adv; /* advertised window */
/*
* retransmit variables
*
* snd_max
* the highest sequence number we've ever sent.
* used to recognize retransmits.
*/
tcp_seq snd_max;
/* congestion control (for slow start, source quench, retransmit after loss) */
u_long snd_cwnd; /* congestion-controlled window */
	u_long	snd_ssthresh;		/* snd_cwnd size threshold for
					 * slow start exponential to
					 * linear switch
					 */
/* auto-sizing variables */
u_int rfbuf_cnt; /* recv buffer autoscaling byte count */
uint32_t rfbuf_ts; /* recv buffer autoscaling timestamp */
/*
* transmit timing stuff. See below for scale of srtt and rttvar.
* "Variance" is actually smoothed difference.
*/
uint32_t t_rcvtime; /* time last segment received */
uint32_t t_rtttime; /* time we started measuring rtt */
tcp_seq t_rtseq; /* sequence number being timed */
int32_t t_srtt; /* smoothed round-trip time */
int32_t t_rttvar; /* variance in round-trip time */
uint32_t t_rttmin; /* minimum rtt allowed */
u_long max_sndwnd; /* largest window peer has offered */
/* out-of-band data */
char t_oobflags; /* have some */
char t_iobc; /* input character */
#define TCPOOB_HAVEDATA 0x01
#define TCPOOB_HADDATA 0x02
short t_softerror; /* possible error not yet reported */
/* RFC 1323 variables */
u_char snd_scale; /* window scaling for send window */
u_char rcv_scale; /* window scaling for recv window */
u_char request_r_scale; /* pending window scaling */
u_char requested_s_scale;
u_int32_t ts_recent; /* timestamp echo data */
u_int32_t ts_recent_age; /* when last updated */
u_int32_t ts_timebase; /* our timebase */
tcp_seq last_ack_sent;
/* RFC 3465 variables */
u_long t_bytes_acked; /* ABC "bytes_acked" parameter */
/* SACK stuff */
#define TCP_SACK_MAX 3
#define TCPSACK_NONE 0
#define TCPSACK_HAVED 1
u_char rcv_sack_flags; /* SACK flags. */
struct sackblk rcv_dsack_block; /* RX D-SACK block. */
struct ipqehead timeq; /* time sequenced queue. */
struct sackhead snd_holes; /* TX SACK holes. */
int snd_numholes; /* Number of TX SACK holes. */
tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
tcp_seq sack_newdata; /* New data xmitted in this recovery
episode starts at this seq number*/
tcp_seq snd_fack; /* FACK TCP. Forward-most data held by
peer. */
/* CUBIC variables */
ulong snd_cubic_wmax; /* W_max */
ulong snd_cubic_wmax_last; /* Used for fast convergence */
ulong snd_cubic_ctime; /* Last congestion time */
/* pointer for syn cache entries*/
LIST_HEAD(, syn_cache) t_sc; /* list of entries by this tcb */
/* prediction of next mbuf when using large window sizes */
struct mbuf *t_lastm; /* last mbuf that data was sent from */
int t_inoff; /* data offset in previous mbuf */
int t_lastoff; /* last data address in mbuf chain */
int t_lastlen; /* last length read from mbuf chain */
/* Path-MTU discovery blackhole detection */
int t_mtudisc; /* perform mtudisc for this tcb */
/* Path-MTU Discovery Information */
u_int t_pmtud_mss_acked; /* MSS acked, lower bound for MTU */
u_int t_pmtud_mtu_sent; /* MTU used, upper bound for MTU */
tcp_seq t_pmtud_th_seq; /* TCP SEQ from ICMP payload */
u_int t_pmtud_nextmtu; /* Advertised Next-Hop MTU from ICMP */
u_short t_pmtud_ip_len; /* IP length from ICMP payload */
u_short t_pmtud_ip_hl; /* IP header length from ICMP payload */
uint8_t t_ecn_retries; /* # of ECN setup retries */
const struct tcp_congctl *t_congctl; /* per TCB congctl algorithm */
/* Keepalive per socket */
u_int t_keepinit;
u_int t_keepidle;
u_int t_keepintvl;
u_int t_keepcnt;
u_int t_maxidle; /* t_keepcnt * t_keepintvl */
u_int t_msl; /* MSL to use for this connexion */
/* maintain a few stats per connection: */
uint32_t t_rcvoopack; /* out-of-order packets received */
uint32_t t_sndrexmitpack; /* retransmit packets sent */
uint32_t t_sndzerowin; /* zero-window updates sent */
};
/*
* Macros to aid ECN TCP.
*/
#define TCP_ECN_ALLOWED(tp) (tp->t_flags & TF_ECN_PERMIT)
/*
* Macros to aid SACK/FACK TCP.
*/
#define TCP_SACK_ENABLED(tp) (tp->t_flags & TF_WILL_SACK)
#define TCP_FACK_FASTRECOV(tp) \
(TCP_SACK_ENABLED(tp) && \
(SEQ_GT(tp->snd_fack, tp->snd_una + tcprexmtthresh * tp->t_segsz)))
#ifdef _KERNEL
/*
* TCP reassembly queue locks.
*/
static __inline int tcp_reass_lock_try (struct tcpcb *)
__unused;
static __inline void tcp_reass_unlock (struct tcpcb *)
__unused;
static __inline int
tcp_reass_lock_try(struct tcpcb *tp)
{
int s;
/*
* Use splvm() -- we're blocking things that would cause
* mbuf allocation.
*/
s = splvm();
if (tp->t_flags & TF_REASSEMBLING) {
splx(s);
return (0);
}
tp->t_flags |= TF_REASSEMBLING;
splx(s);
return (1);
}
static __inline void
tcp_reass_unlock(struct tcpcb *tp)
{
int s;
s = splvm();
KASSERT((tp->t_flags & TF_REASSEMBLING) != 0);
tp->t_flags &= ~TF_REASSEMBLING;
splx(s);
}
#ifdef DIAGNOSTIC
#define TCP_REASS_LOCK(tp) \
do { \
if (tcp_reass_lock_try(tp) == 0) { \
printf("%s:%d: tcpcb %p reass already locked\n", \
__FILE__, __LINE__, tp); \
panic("tcp_reass_lock"); \
} \
} while (/*CONSTCOND*/ 0)
#define TCP_REASS_LOCK_CHECK(tp) \
do { \
if (((tp)->t_flags & TF_REASSEMBLING) == 0) { \
printf("%s:%d: tcpcb %p reass lock not held\n", \
__FILE__, __LINE__, tp); \
panic("tcp reass lock check"); \
} \
} while (/*CONSTCOND*/ 0)
#else
#define TCP_REASS_LOCK(tp) (void) tcp_reass_lock_try((tp))
#define TCP_REASS_LOCK_CHECK(tp) /* nothing */
#endif
#define TCP_REASS_UNLOCK(tp) tcp_reass_unlock((tp))
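/*
 * Illustrative sketch (not part of the original header): callers bracket
 * any manipulation of the reassembly queue with these macros; the "lock"
 * is only the TF_REASSEMBLING flag taken at splvm().  "example_reass_work"
 * is a hypothetical name.
 */
#if 0
static void
example_reass_work(struct tcpcb *tp)
{
	TCP_REASS_LOCK(tp);	/* panics under DIAGNOSTIC if already held */
	TCP_REASS_LOCK_CHECK(tp);
	/* ... insert into or scan tp->segq here ... */
	TCP_REASS_UNLOCK(tp);
}
#endif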
#endif /* _KERNEL */
/*
* Queue for delayed ACK processing.
*/
#ifdef _KERNEL
extern int tcp_delack_ticks;
void tcp_delack(void *);
#define TCP_RESTART_DELACK(tp) \
callout_reset(&(tp)->t_delack_ch, tcp_delack_ticks, \
tcp_delack, tp)
#define TCP_SET_DELACK(tp) \
do { \
if (((tp)->t_flags & TF_DELACK) == 0) { \
(tp)->t_flags |= TF_DELACK; \
TCP_RESTART_DELACK(tp); \
} \
} while (/*CONSTCOND*/0)
#define TCP_CLEAR_DELACK(tp) \
do { \
if ((tp)->t_flags & TF_DELACK) { \
(tp)->t_flags &= ~TF_DELACK; \
callout_stop(&(tp)->t_delack_ch); \
} \
} while (/*CONSTCOND*/0)
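/*
 * Illustrative sketch (not part of the original header): how an input path
 * might use these macros.  "example_ack_policy", "example_ack_sent" and the
 * want_delack predicate are hypothetical.
 */
#if 0
static void
example_ack_policy(struct tcpcb *tp, bool want_delack)
{
	if (want_delack)
		TCP_SET_DELACK(tp);		/* arm t_delack_ch once */
	else
		tp->t_flags |= TF_ACKNOW;	/* force an immediate ACK */
}

static void
example_ack_sent(struct tcpcb *tp)
{
	/* Once an ACK has actually been emitted: */
	TCP_CLEAR_DELACK(tp);
}
#endif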
#endif /* _KERNEL */
/*
* Compute the current timestamp for a connection.
*/
#define TCP_TIMESTAMP(tp) (tcp_now - (tp)->ts_timebase)
/*
* Handy way of passing around TCP option info.
*/
struct tcp_opt_info {
int ts_present;
u_int32_t ts_val;
u_int32_t ts_ecr;
u_int16_t maxseg;
};
#define TOF_SIGNATURE 0x0040 /* signature option present */
#define	TOF_SIGLEN	0x0080		/* signature length valid (RFC2385) */
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
#define sototcpcb(so) (intotcpcb(sotoinpcb(so)))
/*
* See RFC2988 for a discussion of RTO calculation; comments assume
* familiarity with that document.
*
* The smoothed round-trip time and estimated variance are stored as
* fixed point numbers. Historically, srtt was scaled by
* TCP_RTT_SHIFT bits, and rttvar by TCP_RTTVAR_SHIFT bits. Because
* the values coincide with the alpha and beta parameters suggested
* for RTO calculation (1/8 for srtt, 1/4 for rttvar), the combination
* of computing 1/8 of the new value and transforming it to the
* fixed-point representation required zero instructions. However,
* the storage representations no longer coincide with the alpha/beta
* shifts; instead, more fractional bits are present.
*
* The storage representation of srtt is 1/32 slow ticks, or 1/64 s.
* (The assumption that a slow tick is 500 ms should not be present in
* the code.)
*
* The storage representation of rttvar is 1/16 slow ticks, or 1/32 s.
* There may be some confusion about this in the code.
*
* For historical reasons, these scales are also used in smoothing the
* average (smoothed = (1/scale)sample + ((scale-1)/scale)smoothed).
* This results in alpha of 0.125 and beta of 0.25, following RFC2988
 * section 2.3.
*
* XXX Change SHIFT values to LGWEIGHT and REP_SHIFT, and adjust
* the code to use the correct ones.
*/
#define TCP_RTT_SHIFT 3 /* shift for srtt; 3 bits frac. */
#define TCP_RTTVAR_SHIFT 2 /* multiplier for rttvar; 2 bits */
/*
* Compute TCP retransmission timer, following RFC2988.
* This macro returns a value in slow timeout ticks.
*
* Section 2.2 requires that the RTO value be
* srtt + max(G, 4*RTTVAR)
* where G is the clock granularity.
*
* This comment has not necessarily been updated for the new storage
* representation:
*
* Because of the way we do the smoothing, srtt and rttvar
* will each average +1/2 tick of bias. When we compute
* the retransmit timer, we want 1/2 tick of rounding and
* 1 extra tick because of +-1/2 tick uncertainty in the
* firing of the timer. The bias will give us exactly the
* 1.5 tick we need. But, because the bias is
* statistical, we have to test that we don't drop below
* the minimum feasible timer (which is 2 ticks).
* This macro assumes that the value of 1<<TCP_RTTVAR_SHIFT
* is the same as the multiplier for rttvar.
*
* This macro appears to be wrong; it should be checking rttvar*4 in
* ticks and making sure we use 1 instead if rttvar*4 rounds to 0. It
* appears to be treating srtt as being in the old storage
* representation, resulting in a factor of 4 extra.
*/
#define TCP_REXMTVAL(tp) \
((((tp)->t_srtt >> TCP_RTT_SHIFT) + (tp)->t_rttvar) >> 2)
/*
* Compute the initial window for slow start.
*/
#define TCP_INITIAL_WINDOW(iw, segsz) \
uimin((iw) * (segsz), uimax(2 * (segsz), tcp_init_win_max[(iw)]))
/*
* TCP statistics.
* Each counter is an unsigned 64-bit value.
*
* Many of these should be kept per connection, but that's inconvenient
* at the moment.
*/
#define TCP_STAT_CONNATTEMPT 0 /* connections initiated */
#define TCP_STAT_ACCEPTS 1 /* connections accepted */
#define TCP_STAT_CONNECTS 2 /* connections established */
#define TCP_STAT_DROPS 3 /* connections dropped */
#define TCP_STAT_CONNDROPS 4 /* embryonic connections dropped */
#define TCP_STAT_CLOSED 5 /* conn. closed (includes drops) */
#define TCP_STAT_SEGSTIMED 6 /* segs where we tried to get rtt */
#define TCP_STAT_RTTUPDATED 7 /* times we succeeded */
#define TCP_STAT_DELACK 8 /* delayed ACKs sent */
#define TCP_STAT_TIMEOUTDROP 9 /* conn. dropped in rxmt timeout */
#define TCP_STAT_REXMTTIMEO 10 /* retransmit timeouts */
#define TCP_STAT_PERSISTTIMEO 11 /* persist timeouts */
#define TCP_STAT_KEEPTIMEO 12 /* keepalive timeouts */
#define TCP_STAT_KEEPPROBE 13 /* keepalive probes sent */
#define TCP_STAT_KEEPDROPS 14 /* connections dropped in keepalive */
#define TCP_STAT_PERSISTDROPS 15 /* connections dropped in persist */
#define TCP_STAT_CONNSDRAINED 16 /* connections drained due to memory
shortage */
#define TCP_STAT_PMTUBLACKHOLE 17 /* PMTUD blackhole detected */
#define TCP_STAT_SNDTOTAL 18 /* total packets sent */
#define	TCP_STAT_SNDPACK	19	/* data packets sent */
#define TCP_STAT_SNDBYTE 20 /* data bytes sent */
#define TCP_STAT_SNDREXMITPACK 21 /* data packets retransmitted */
#define TCP_STAT_SNDREXMITBYTE 22 /* data bytes retransmitted */
#define TCP_STAT_SNDACKS 23 /* ACK-only packets sent */
#define TCP_STAT_SNDPROBE 24 /* window probes sent */
#define TCP_STAT_SNDURG 25 /* packets sent with URG only */
#define TCP_STAT_SNDWINUP 26 /* window update-only packets sent */
#define TCP_STAT_SNDCTRL 27 /* control (SYN|FIN|RST) packets sent */
#define TCP_STAT_RCVTOTAL 28 /* total packets received */
#define TCP_STAT_RCVPACK 29 /* packets received in sequence */
#define TCP_STAT_RCVBYTE 30 /* bytes received in sequence */
#define TCP_STAT_RCVBADSUM 31 /* packets received with cksum errs */
#define TCP_STAT_RCVBADOFF 32 /* packets received with bad offset */
#define TCP_STAT_RCVMEMDROP 33 /* packets dropped for lack of memory */
#define TCP_STAT_RCVSHORT 34 /* packets received too short */
#define TCP_STAT_RCVDUPPACK 35 /* duplicate-only packets received */
#define TCP_STAT_RCVDUPBYTE 36 /* duplicate-only bytes received */
#define TCP_STAT_RCVPARTDUPPACK 37 /* packets with some duplicate data */
#define TCP_STAT_RCVPARTDUPBYTE 38 /* dup. bytes in part-dup. packets */
#define TCP_STAT_RCVOOPACK 39 /* out-of-order packets received */
#define TCP_STAT_RCVOOBYTE 40 /* out-of-order bytes received */
#define TCP_STAT_RCVPACKAFTERWIN 41 /* packets with data after window */
#define TCP_STAT_RCVBYTEAFTERWIN 42 /* bytes received after window */
#define TCP_STAT_RCVAFTERCLOSE 43 /* packets received after "close" */
#define TCP_STAT_RCVWINPROBE 44 /* rcvd window probe packets */
#define TCP_STAT_RCVDUPACK 45 /* rcvd duplicate ACKs */
#define TCP_STAT_RCVACKTOOMUCH 46 /* rcvd ACKs for unsent data */
#define TCP_STAT_RCVACKPACK 47 /* rcvd ACK packets */
#define TCP_STAT_RCVACKBYTE 48 /* bytes ACKed by rcvd ACKs */
#define TCP_STAT_RCVWINUPD 49 /* rcvd window update packets */
#define TCP_STAT_PAWSDROP 50 /* segments dropped due to PAWS */
#define TCP_STAT_PREDACK 51 /* times hdr predict OK for ACKs */
#define TCP_STAT_PREDDAT 52 /* times hdr predict OK for data pkts */
#define TCP_STAT_PCBHASHMISS 53 /* input packets missing PCB hash */
#define TCP_STAT_NOPORT 54 /* no socket on port */
#define TCP_STAT_BADSYN 55 /* received ACK for which we have
no SYN in compressed state */
#define TCP_STAT_DELAYED_FREE 56 /* delayed pool_put() of tcpcb */
#define TCP_STAT_SC_ADDED 57 /* # of sc entries added */
#define TCP_STAT_SC_COMPLETED 58 /* # of sc connections completed */
#define TCP_STAT_SC_TIMED_OUT 59 /* # of sc entries timed out */
#define TCP_STAT_SC_OVERFLOWED 60 /* # of sc drops due to overflow */
#define TCP_STAT_SC_RESET 61 /* # of sc drops due to RST */
#define TCP_STAT_SC_UNREACH 62 /* # of sc drops due to ICMP unreach */
#define TCP_STAT_SC_BUCKETOVERFLOW 63 /* # of sc drops due to bucket ovflow */
#define TCP_STAT_SC_ABORTED 64 /* # of sc entries aborted (no mem) */
#define TCP_STAT_SC_DUPESYN 65 /* # of duplicate SYNs received */
#define TCP_STAT_SC_DROPPED 66 /* # of SYNs dropped (no route/mem) */
#define TCP_STAT_SC_COLLISIONS 67 /* # of sc hash collisions */
#define TCP_STAT_SC_RETRANSMITTED 68 /* # of sc retransmissions */
#define TCP_STAT_SC_DELAYED_FREE 69 /* # of delayed pool_put()s */
#define TCP_STAT_SELFQUENCH 70 /* # of ENOBUFS we get on output */
#define TCP_STAT_BADSIG 71 /* # of drops due to bad signature */
#define TCP_STAT_GOODSIG 72 /* # of packets with good signature */
#define TCP_STAT_ECN_SHS 73 /* # of successful ECN handshakes */
#define TCP_STAT_ECN_CE 74 /* # of packets with CE bit */
#define TCP_STAT_ECN_ECT 75 /* # of packets with ECT(0) bit */
#define TCP_NSTATS 76
/*
* Names for TCP sysctl objects.
*/
#define TCPCTL_RFC1323 1 /* RFC1323 timestamps/scaling */
#define TCPCTL_SENDSPACE 2 /* default send buffer */
#define TCPCTL_RECVSPACE 3 /* default recv buffer */
#define TCPCTL_MSSDFLT 4 /* default seg size */
#define TCPCTL_SYN_CACHE_LIMIT 5 /* max size of comp. state engine */
#define TCPCTL_SYN_BUCKET_LIMIT 6 /* max size of hash bucket */
#if 0 /*obsoleted*/
#define TCPCTL_SYN_CACHE_INTER 7 /* interval of comp. state timer */
#endif
#define TCPCTL_INIT_WIN 8 /* initial window */
#define TCPCTL_MSS_IFMTU 9 /* mss from interface, not in_maxmtu */
#define TCPCTL_SACK 10 /* RFC2018 selective acknowledgement */
#define TCPCTL_WSCALE 11 /* RFC1323 window scaling */
#define TCPCTL_TSTAMP 12 /* RFC1323 timestamps */
#if 0 /*obsoleted*/
#define TCPCTL_COMPAT_42 13 /* 4.2BSD TCP bug work-arounds */
#endif
#define TCPCTL_CWM 14 /* Congestion Window Monitoring */
#define TCPCTL_CWM_BURSTSIZE 15 /* burst size allowed by CWM */
#define TCPCTL_ACK_ON_PUSH 16 /* ACK immediately on PUSH */
#define TCPCTL_KEEPIDLE 17 /* keepalive idle time */
#define TCPCTL_KEEPINTVL 18 /* keepalive probe interval */
#define TCPCTL_KEEPCNT 19 /* keepalive count */
#define TCPCTL_SLOWHZ 20 /* PR_SLOWHZ (read-only) */
#define TCPCTL_NEWRENO 21 /* NewReno Congestion Control */
#define TCPCTL_LOG_REFUSED 22 /* Log refused connections */
#if 0 /*obsoleted*/
#define TCPCTL_RSTRATELIMIT 23 /* RST rate limit */
#endif
#define TCPCTL_RSTPPSLIMIT 24 /* RST pps limit */
#define TCPCTL_DELACK_TICKS 25 /* # ticks to delay ACK */
#define TCPCTL_INIT_WIN_LOCAL 26 /* initial window for local nets */
#define TCPCTL_IDENT 27 /* rfc 931 identd */
#define TCPCTL_ACKDROPRATELIMIT 28 /* SYN/RST -> ACK rate limit */
#define TCPCTL_LOOPBACKCKSUM 29 /* do TCP checksum on loopback */
#define TCPCTL_STATS 30 /* TCP statistics */
#define TCPCTL_DEBUG 31 /* TCP debug sockets */
#define TCPCTL_DEBX 32 /* # of tcp debug sockets */
#define TCPCTL_DROP 33 /* drop tcp connection */
#define TCPCTL_MSL 34 /* Max Segment Life */
#ifdef _KERNEL
extern struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
extern const struct pr_usrreqs tcp_usrreqs;
extern u_int32_t tcp_now; /* for RFC 1323 timestamps */
extern int tcp_do_rfc1323; /* enabled/disabled? */
extern int tcp_do_sack; /* SACK enabled/disabled? */
extern int tcp_do_win_scale; /* RFC1323 window scaling enabled/disabled? */
extern int tcp_do_timestamps; /* RFC1323 timestamps enabled/disabled? */
extern int tcp_mssdflt; /* default seg size */
extern int tcp_minmss; /* minimal seg size */
extern int tcp_msl; /* max segment life */
extern int tcp_init_win; /* initial window */
extern int tcp_init_win_local; /* initial window for local nets */
extern int tcp_init_win_max[11];/* max sizes for values of tcp_init_win_* */
extern int tcp_mss_ifmtu; /* take MSS from interface, not in_maxmtu */
extern int tcp_cwm; /* enable Congestion Window Monitoring */
extern int tcp_cwm_burstsize; /* burst size allowed by CWM */
extern int tcp_ack_on_push; /* ACK immediately on PUSH */
extern int tcp_log_refused; /* log refused connections */
extern int tcp_do_ecn; /* TCP ECN enabled/disabled? */
extern int tcp_ecn_maxretries; /* Max ECN setup retries */
extern int tcp_do_rfc1948; /* ISS by cryptographic hash */
extern int tcp_sack_tp_maxholes; /* Max holes per connection. */
extern int tcp_sack_globalmaxholes; /* Max holes per system. */
extern int tcp_sack_globalholes; /* Number of holes present. */
extern int tcp_do_abc; /* RFC3465 ABC enabled/disabled? */
extern int tcp_abc_aggressive; /* 1: L=2*SMSS 0: L=1*SMSS */
extern int tcp_msl_enable; /* enable TIME_WAIT truncation */
extern int tcp_msl_loop; /* MSL for loopback */
extern int tcp_msl_local; /* MSL for 'local' */
extern int tcp_msl_remote; /* MSL otherwise */
extern int tcp_msl_remote_threshold; /* RTT threshold */
extern int tcp_rttlocal; /* Use RTT to decide who's 'local' */
extern int tcp4_vtw_enable;
extern int tcp6_vtw_enable;
extern int tcp_vtw_was_enabled;
extern int tcp_vtw_entries;
extern int tcp_rst_ppslim;
extern int tcp_ackdrop_ppslim;
#ifdef MBUFTRACE
extern struct mowner tcp_rx_mowner;
extern struct mowner tcp_tx_mowner;
extern struct mowner tcp_reass_mowner;
extern struct mowner tcp_sock_mowner;
extern struct mowner tcp_sock_rx_mowner;
extern struct mowner tcp_sock_tx_mowner;
extern struct mowner tcp_mowner;
#endif
extern int tcp_do_autorcvbuf;
extern int tcp_autorcvbuf_inc;
extern int tcp_autorcvbuf_max;
extern int tcp_do_autosndbuf;
extern int tcp_autosndbuf_inc;
extern int tcp_autosndbuf_max;
struct secasvar;
void tcp_canceltimers(struct tcpcb *);
struct tcpcb *
tcp_close(struct tcpcb *);
int tcp_isdead(struct tcpcb *);
#ifdef INET6
void *tcp6_ctlinput(int, const struct sockaddr *, void *);
#endif
void *tcp_ctlinput(int, const struct sockaddr *, void *);
int tcp_ctloutput(int, struct socket *, struct sockopt *);
struct tcpcb *
tcp_disconnect1(struct tcpcb *);
struct tcpcb *
tcp_drop(struct tcpcb *, int);
#ifdef TCP_SIGNATURE
int tcp_signature_apply(void *, void *, u_int);
struct secasvar *tcp_signature_getsav(struct mbuf *);
int tcp_signature(struct mbuf *, struct tcphdr *, int, struct secasvar *,
char *);
#endif
void tcp_drain(void);
void tcp_drainstub(void);
void tcp_established(struct tcpcb *);
void tcp_init(void);
void tcp_init_common(unsigned);
#ifdef INET6
int tcp6_input(struct mbuf **, int *, int);
#endif
void tcp_input(struct mbuf *, int, int);
u_int tcp_hdrsz(struct tcpcb *);
u_long tcp_mss_to_advertise(const struct ifnet *, int);
void tcp_mss_from_peer(struct tcpcb *, int);
void tcp_tcpcb_template(void);
struct tcpcb *
tcp_newtcpcb(int, struct inpcb *);
void tcp_notify(struct inpcb *, int);
u_int tcp_optlen(struct tcpcb *);
int tcp_output(struct tcpcb *);
void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
void tcp_quench(struct inpcb *);
void tcp_mtudisc(struct inpcb *, int);
#ifdef INET6
void tcp6_mtudisc_callback(struct in6_addr *);
#endif
void tcpipqent_init(void);
struct ipqent *tcpipqent_alloc(void);
void tcpipqent_free(struct ipqent *);
int tcp_respond(struct tcpcb *, struct mbuf *, struct mbuf *,
struct tcphdr *, tcp_seq, tcp_seq, int);
void tcp_rmx_rtt(struct tcpcb *);
void tcp_setpersist(struct tcpcb *);
#ifdef TCP_SIGNATURE
int tcp_signature_compute(struct mbuf *, struct tcphdr *, int, int,
int, u_char *, u_int);
#endif
void tcp_fasttimo(void);
struct mbuf *
tcp_template(struct tcpcb *);
void tcp_trace(short, short, struct tcpcb *, struct mbuf *, int);
struct tcpcb *
tcp_usrclosed(struct tcpcb *);
void tcp_usrreq_init(void);
void tcp_xmit_timer(struct tcpcb *, uint32_t);
tcp_seq tcp_new_iss(struct tcpcb *);
tcp_seq tcp_new_iss1(void *, void *, u_int16_t, u_int16_t, size_t);
void tcp_sack_init(void);
void tcp_new_dsack(struct tcpcb *, tcp_seq, u_int32_t);
void tcp_sack_option(struct tcpcb *, const struct tcphdr *,
const u_char *, int);
void tcp_del_sackholes(struct tcpcb *, const struct tcphdr *);
void tcp_free_sackholes(struct tcpcb *);
void tcp_sack_adjust(struct tcpcb *tp);
struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt);
int tcp_sack_numblks(const struct tcpcb *);
#define TCP_SACK_OPTLEN(nblks) ((nblks) * 8 + 2 + 2)
void tcp_statinc(u_int);
void tcp_statadd(u_int, uint64_t);
int tcp_input_checksum(int, struct mbuf *, const struct tcphdr *, int, int,
int);
int tcp_dooptions(struct tcpcb *, const u_char *, int,
struct tcphdr *, struct mbuf *, int, struct tcp_opt_info *);
#endif
#endif /* !_NETINET_TCP_VAR_H_ */
/* $NetBSD: vfs_quotactl.c,v 1.40 2014/06/28 22:27:50 dholland Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by David A. Holland.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_quotactl.c,v 1.40 2014/06/28 22:27:50 dholland Exp $$");
#include <sys/mount.h>
#include <sys/quotactl.h>
int
vfs_quotactl_stat(struct mount *mp, struct quotastat *info)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_STAT;
args.u.stat.qc_info = info;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_idtypestat(struct mount *mp, int idtype,
struct quotaidtypestat *info)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_IDTYPESTAT;
args.u.idtypestat.qc_idtype = idtype;
args.u.idtypestat.qc_info = info;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_objtypestat(struct mount *mp, int objtype,
struct quotaobjtypestat *info)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_OBJTYPESTAT;
args.u.objtypestat.qc_objtype = objtype;
args.u.objtypestat.qc_info = info;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_get(struct mount *mp, const struct quotakey *key,
struct quotaval *val)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_GET;
args.u.get.qc_key = key;
args.u.get.qc_val = val;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_put(struct mount *mp, const struct quotakey *key,
const struct quotaval *val)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_PUT;
args.u.put.qc_key = key;
args.u.put.qc_val = val;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_del(struct mount *mp, const struct quotakey *key)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_DEL;
args.u.del.qc_key = key;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_cursoropen(struct mount *mp, struct quotakcursor *cursor)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_CURSOROPEN;
args.u.cursoropen.qc_cursor = cursor;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_cursorclose(struct mount *mp, struct quotakcursor *cursor)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_CURSORCLOSE;
args.u.cursorclose.qc_cursor = cursor;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_cursorskipidtype(struct mount *mp, struct quotakcursor *cursor,
int idtype)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_CURSORSKIPIDTYPE;
args.u.cursorskipidtype.qc_cursor = cursor;
args.u.cursorskipidtype.qc_idtype = idtype;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_cursorget(struct mount *mp, struct quotakcursor *cursor,
struct quotakey *keys, struct quotaval *vals, unsigned maxnum,
unsigned *ret)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_CURSORGET;
args.u.cursorget.qc_cursor = cursor;
args.u.cursorget.qc_keys = keys;
args.u.cursorget.qc_vals = vals;
args.u.cursorget.qc_maxnum = maxnum;
args.u.cursorget.qc_ret = ret;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_cursoratend(struct mount *mp, struct quotakcursor *cursor,
int *ret)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_CURSORATEND;
args.u.cursoratend.qc_cursor = cursor;
args.u.cursoratend.qc_ret = ret;
return VFS_QUOTACTL(mp, &args);
}
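/*
 * Illustrative sketch (not part of the original file): iterating all quota
 * records with the cursor operations above.  "example_dump_quotas" and
 * EXAMPLE_NBATCH are hypothetical, the cursor is assumed to fit on the
 * stack as defined in sys/quotactl.h, and error handling is abbreviated.
 */
#if 0
#define	EXAMPLE_NBATCH	8

static int
example_dump_quotas(struct mount *mp)
{
	struct quotakcursor cursor;
	struct quotakey keys[EXAMPLE_NBATCH];
	struct quotaval vals[EXAMPLE_NBATCH];
	unsigned n;
	int atend = 0, error;

	error = vfs_quotactl_cursoropen(mp, &cursor);
	if (error)
		return error;
	while (!atend) {
		error = vfs_quotactl_cursorget(mp, &cursor, keys, vals,
		    EXAMPLE_NBATCH, &n);
		if (error)
			break;
		/* ... consume keys[0..n-1] and vals[0..n-1] here ... */
		error = vfs_quotactl_cursoratend(mp, &cursor, &atend);
		if (error)
			break;
	}
	(void)vfs_quotactl_cursorclose(mp, &cursor);
	return error;
}
#endif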
int
vfs_quotactl_cursorrewind(struct mount *mp, struct quotakcursor *cursor)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_CURSORREWIND;
args.u.cursorrewind.qc_cursor = cursor;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_quotaon(struct mount *mp, int idtype, const char *path)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_QUOTAON;
args.u.quotaon.qc_idtype = idtype;
args.u.quotaon.qc_quotafile = path;
return VFS_QUOTACTL(mp, &args);
}
int
vfs_quotactl_quotaoff(struct mount *mp, int idtype)
{
struct quotactl_args args;
args.qc_op = QUOTACTL_QUOTAOFF;
args.u.quotaoff.qc_idtype = idtype;
return VFS_QUOTACTL(mp, &args);
}
/* $NetBSD: kern_mutex_obj.c,v 1.15 2023/10/02 21:03:55 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mutex_obj.c,v 1.15 2023/10/02 21:03:55 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/mutex.h>
#include <sys/kmem.h>
/* Mutex cache */
#define MUTEX_OBJ_MAGIC 0x5aa3c85d
struct kmutexobj {
kmutex_t mo_lock;
u_int mo_magic;
u_int mo_refcnt;
uint8_t mo_pad[COHERENCY_UNIT - sizeof(kmutex_t) -
sizeof(u_int) * 2];
};
/*
* mutex_obj_alloc:
*
* Allocate a single lock object, waiting for memory if needed.
*/
kmutex_t *
mutex_obj_alloc(kmutex_type_t type, int ipl)
{
struct kmutexobj *mo;
mo = kmem_intr_alloc(sizeof(*mo), KM_SLEEP);
KASSERT(ALIGNED_POINTER(mo, coherency_unit));
_mutex_init(&mo->mo_lock, type, ipl,
(uintptr_t)__builtin_return_address(0));
mo->mo_magic = MUTEX_OBJ_MAGIC;
mo->mo_refcnt = 1;
return (kmutex_t *)mo;
}
/*
 * mutex_obj_tryalloc:
*
* Allocate a single lock object, failing if no memory available.
*/
kmutex_t *
mutex_obj_tryalloc(kmutex_type_t type, int ipl)
{
struct kmutexobj *mo;
mo = kmem_intr_alloc(sizeof(*mo), KM_NOSLEEP);
KASSERT(ALIGNED_POINTER(mo, coherency_unit));
if (__predict_true(mo != NULL)) {
_mutex_init(&mo->mo_lock, type, ipl,
(uintptr_t)__builtin_return_address(0));
mo->mo_magic = MUTEX_OBJ_MAGIC;
mo->mo_refcnt = 1;
}
return (kmutex_t *)mo;
}
/*
* mutex_obj_hold:
*
* Add a single reference to a lock object. A reference to the object
* must already be held, and must be held across this call.
*/
void
mutex_obj_hold(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC,
"%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)",
__func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC);
KASSERTMSG(mo->mo_refcnt > 0,
"%s: lock %p: mo->mo_refcnt (%#x) == 0",
__func__, mo, mo->mo_refcnt);
atomic_inc_uint(&mo->mo_refcnt);
}
/*
* mutex_obj_free:
*
* Drop a reference from a lock object. If the last reference is being
* dropped, free the object and return true. Otherwise, return false.
*/
bool
mutex_obj_free(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
KASSERTMSG(mo->mo_magic == MUTEX_OBJ_MAGIC,
"%s: lock %p: mo->mo_magic (%#x) != MUTEX_OBJ_MAGIC (%#x)",
__func__, mo, mo->mo_magic, MUTEX_OBJ_MAGIC);
KASSERTMSG(mo->mo_refcnt > 0,
"%s: lock %p: mo->mo_refcnt (%#x) == 0",
__func__, mo, mo->mo_refcnt);
membar_release();
if (atomic_dec_uint_nv(&mo->mo_refcnt) > 0) {
return false;
}
membar_acquire();
mutex_destroy(&mo->mo_lock);
kmem_intr_free(mo, sizeof(*mo));
return true;
}
/*
* mutex_obj_refcnt:
*
* Return the reference count on a lock object.
*/
u_int
mutex_obj_refcnt(kmutex_t *lock)
{
struct kmutexobj *mo = (struct kmutexobj *)lock;
return mo->mo_refcnt;
}
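/*
 * Illustrative sketch (not compiled): typical life cycle of a refcounted
 * lock object using the interfaces above.  "struct foo" and its member
 * names are hypothetical.
 */
#if 0
struct foo {
	kmutex_t *f_lock;		/* shared, refcounted lock */
};

static void
foo_init(struct foo *f)
{

	f->f_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
}

static void
foo_share(struct foo *f)
{

	/* Every additional holder takes its own reference. */
	mutex_obj_hold(f->f_lock);
}

static void
foo_fini(struct foo *f)
{

	/* The object is destroyed only when the last reference is dropped. */
	(void)mutex_obj_free(f->f_lock);
	f->f_lock = NULL;
}
#endif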
/* $NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $ */
/*-
* Copyright (c) 1997-2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_mount.c,v 1.105 2024/04/19 00:45:41 riastradh Exp $");
#include "veriexec.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/device.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/namei.h>
#include <sys/extattr.h>
#include <sys/verified_exec.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vfs_syscalls.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm_swap.h>
enum mountlist_type {
ME_MOUNT,
ME_MARKER
};
struct mountlist_entry {
TAILQ_ENTRY(mountlist_entry) me_list; /* Mount list. */
	struct mount *me_mount;			/* Actual mount if ME_MOUNT,
						   otherwise the mount the
						   marker is currently at. */
enum mountlist_type me_type; /* Mount or marker. */
};
struct mount_iterator {
struct mountlist_entry mi_entry;
};
static struct vnode *vfs_vnode_iterator_next1(struct vnode_iterator *,
bool (*)(void *, struct vnode *), void *, bool);
/* Root filesystem. */
vnode_t * rootvnode;
/* Mounted filesystem list. */
static TAILQ_HEAD(mountlist, mountlist_entry) mountlist;
static kmutex_t mountlist_lock __cacheline_aligned;
int vnode_offset_next_by_lru /* XXX: ugly hack for pstat.c */
= offsetof(vnode_impl_t, vi_lrulist.tqe_next);
kmutex_t vfs_list_lock __cacheline_aligned;
static specificdata_domain_t mount_specificdata_domain;
static kmutex_t mntid_lock;
static kmutex_t mountgen_lock __cacheline_aligned;
static uint64_t mountgen;
void
vfs_mount_sysinit(void)
{
TAILQ_INIT(&mountlist);
mutex_init(&mountlist_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&vfs_list_lock, MUTEX_DEFAULT, IPL_NONE);
mount_specificdata_domain = specificdata_domain_create();
mutex_init(&mntid_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&mountgen_lock, MUTEX_DEFAULT, IPL_NONE);
mountgen = 0;
}
struct mount *
vfs_mountalloc(struct vfsops *vfsops, vnode_t *vp)
{
struct mount *mp;
int error __diagused;
mp = kmem_zalloc(sizeof(*mp), KM_SLEEP);
mp->mnt_op = vfsops;
mp->mnt_refcnt = 1;
TAILQ_INIT(&mp->mnt_vnodelist);
mp->mnt_renamelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_vnodelock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_updating = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
mp->mnt_vnodecovered = vp;
mount_initspecific(mp);
error = fstrans_mount(mp);
KASSERT(error == 0);
mutex_enter(&mountgen_lock);
mp->mnt_gen = mountgen++;
mutex_exit(&mountgen_lock);
return mp;
}
/*
* vfs_rootmountalloc: lookup a filesystem type, and if found allocate and
* initialize a mount structure for it.
*
* Devname is usually updated by mount(8) after booting.
*/
int
vfs_rootmountalloc(const char *fstypename, const char *devname,
struct mount **mpp)
{
struct vfsops *vfsp = NULL;
struct mount *mp;
int error __diagused;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(vfsp, &vfs_list, vfs_list)
if (!strncmp(vfsp->vfs_name, fstypename,
sizeof(mp->mnt_stat.f_fstypename)))
break;
if (vfsp == NULL) {
mutex_exit(&vfs_list_lock);
return (ENODEV);
}
vfsp->vfs_refcount++;
mutex_exit(&vfs_list_lock);
if ((mp = vfs_mountalloc(vfsp, NULL)) == NULL)
return ENOMEM;
error = vfs_busy(mp);
KASSERT(error == 0);
mp->mnt_flag = MNT_RDONLY;
(void)strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfs_name,
sizeof(mp->mnt_stat.f_fstypename));
mp->mnt_stat.f_mntonname[0] = '/';
mp->mnt_stat.f_mntonname[1] = '\0';
mp->mnt_stat.f_mntfromname[sizeof(mp->mnt_stat.f_mntfromname) - 1] =
'\0';
(void)copystr(devname, mp->mnt_stat.f_mntfromname,
sizeof(mp->mnt_stat.f_mntfromname) - 1, 0);
*mpp = mp;
return 0;
}
/*
* vfs_getnewfsid: get a new unique fsid.
*/
void
vfs_getnewfsid(struct mount *mp)
{
static u_short xxxfs_mntid;
struct mountlist_entry *me;
fsid_t tfsid;
int mtype;
mutex_enter(&mntid_lock);
if (xxxfs_mntid == 0)
++xxxfs_mntid;
mtype = makefstype(mp->mnt_op->vfs_name);
tfsid.__fsid_val[0] = makedev(mtype & 0xff, xxxfs_mntid);
tfsid.__fsid_val[1] = mtype;
	/* Always increment so parallel mounts do not get the same fsid. */
xxxfs_mntid++;
/*
* Directly walk mountlist to prevent deadlock through
* mountlist_iterator_next() -> vfs_busy().
*/
mutex_enter(&mountlist_lock);
	for (me = TAILQ_FIRST(&mountlist); me != TAILQ_END(&mountlist); ) {
		if (me->me_type == ME_MOUNT &&
me->me_mount->mnt_stat.f_fsidx.__fsid_val[0] ==
tfsid.__fsid_val[0] &&
me->me_mount->mnt_stat.f_fsidx.__fsid_val[1] ==
tfsid.__fsid_val[1]) {
tfsid.__fsid_val[0]++;
xxxfs_mntid++;
me = TAILQ_FIRST(&mountlist);
} else {
me = TAILQ_NEXT(me, me_list);
}
}
mutex_exit(&mountlist_lock);
mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1];
mp->mnt_stat.f_fsid = mp->mnt_stat.f_fsidx.__fsid_val[0];
mutex_exit(&mntid_lock);
}
/*
* Lookup a mount point by filesystem identifier.
*
* XXX Needs to add a reference to the mount point.
*/
struct mount *
vfs_getvfs(fsid_t *fsid)
{
mount_iterator_t *iter;
struct mount *mp;
	mountlist_iterator_init(&iter);
	while ((mp = mountlist_iterator_next(iter)) != NULL) {
		if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
mountlist_iterator_destroy(iter);
return mp;
}
}
mountlist_iterator_destroy(iter);
return NULL;
}
/*
* Take a reference to a mount structure.
*/
void
vfs_ref(struct mount *mp)
{
	KASSERT(mp->mnt_refcnt > 0 || mutex_owned(&mountlist_lock));
atomic_inc_uint(&mp->mnt_refcnt);
}
/*
* Drop a reference to a mount structure, freeing if the last reference.
*/
void
vfs_rele(struct mount *mp)
{
membar_release();
if (__predict_true((int)atomic_dec_uint_nv(&mp->mnt_refcnt) > 0)) {
return;
}
membar_acquire();
/*
* Nothing else has visibility of the mount: we can now
* free the data structures.
*/
KASSERT(mp->mnt_refcnt == 0);
specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
mutex_obj_free(mp->mnt_updating);
mutex_obj_free(mp->mnt_renamelock);
mutex_obj_free(mp->mnt_vnodelock);
	if (mp->mnt_op != NULL) {
		vfs_delref(mp->mnt_op);
}
fstrans_unmount(mp);
/*
* Final free of mp gets done from fstrans_mount_dtor().
*
 * Prevents this memory from being reused as a mount before
* fstrans releases all references to it.
*/
}
/*
* Mark a mount point as busy, and gain a new reference to it. Used to
* prevent the file system from being unmounted during critical sections.
*
* vfs_busy can be called multiple times and by multiple threads
* and must be accompanied by the same number of vfs_unbusy calls.
*
* => The caller must hold a pre-existing reference to the mount.
* => Will fail if the file system is being unmounted, or is unmounted.
*/
static inline int
_vfs_busy(struct mount *mp, bool wait)
{
KASSERT(mp->mnt_refcnt > 0);
if (wait) {
fstrans_start(mp);
} else {
if (fstrans_start_nowait(mp))
return EBUSY;
}
if (__predict_false((mp->mnt_iflag & IMNT_GONE) != 0)) {
fstrans_done(mp);
return ENOENT;
}
vfs_ref(mp);
return 0;
}
int
vfs_busy(struct mount *mp)
{
	return _vfs_busy(mp, true);
}
int
vfs_trybusy(struct mount *mp)
{
return _vfs_busy(mp, false);
}
/*
* Unbusy a busy filesystem.
*
* Every successful vfs_busy() call must be undone by a vfs_unbusy() call.
*/
void
vfs_unbusy(struct mount *mp)
{
	KASSERT(mp->mnt_refcnt > 0);
fstrans_done(mp);
vfs_rele(mp);
}
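/*
 * Illustrative sketch (not compiled): a hypothetical caller that already
 * holds a reference to "mp" and wants to keep the file system from being
 * unmounted while it works on it.
 */
#if 0
	int error;

	error = vfs_busy(mp);
	if (error == 0) {
		/* ... safe to use mp here, unmount is held off ... */
		vfs_unbusy(mp);
	}
#endif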
/*
 * Change a file system's lower mount.
* Both the current and the new lower mount may be NULL. The caller
* guarantees exclusive access to the mount and holds a pre-existing
* reference to the new lower mount.
*/
int
vfs_set_lowermount(struct mount *mp, struct mount *lowermp)
{
struct mount *oldlowermp;
int error;
#ifdef DEBUG
/*
* Limit the depth of file system stack so kernel sanitizers
* may stress mount/unmount without exhausting the kernel stack.
*/
int depth;
struct mount *mp2;
	for (depth = 0, mp2 = lowermp; mp2; depth++, mp2 = mp2->mnt_lower) {
		if (depth == 23)
return EINVAL;
}
#endif
if (lowermp) {
if (lowermp == dead_rootmount)
return ENOENT;
error = vfs_busy(lowermp);
if (error)
return error;
vfs_ref(lowermp);
}
oldlowermp = mp->mnt_lower;
mp->mnt_lower = lowermp;
if (lowermp)
		vfs_unbusy(lowermp);
	if (oldlowermp)
		vfs_rele(oldlowermp);
return 0;
}
struct vnode_iterator {
vnode_impl_t vi_vnode;
};
void
vfs_vnode_iterator_init(struct mount *mp, struct vnode_iterator **vnip)
{
vnode_t *vp;
vnode_impl_t *vip;
vp = vnalloc_marker(mp);
vip = VNODE_TO_VIMPL(vp);
mutex_enter(mp->mnt_vnodelock);
TAILQ_INSERT_HEAD(&mp->mnt_vnodelist, vip, vi_mntvnodes);
vp->v_usecount = 1;
mutex_exit(mp->mnt_vnodelock);
*vnip = (struct vnode_iterator *)vip;
}
void
vfs_vnode_iterator_destroy(struct vnode_iterator *vni)
{
vnode_impl_t *mvip = &vni->vi_vnode;
vnode_t *mvp = VIMPL_TO_VNODE(mvip);
kmutex_t *lock;
	KASSERT(vnis_marker(mvp));
	if (vrefcnt(mvp) != 0) {
lock = mvp->v_mount->mnt_vnodelock;
mutex_enter(lock);
TAILQ_REMOVE(&mvp->v_mount->mnt_vnodelist, mvip, vi_mntvnodes);
mvp->v_usecount = 0;
mutex_exit(lock);
}
vnfree_marker(mvp);
}
static struct vnode *
vfs_vnode_iterator_next1(struct vnode_iterator *vni,
bool (*f)(void *, struct vnode *), void *cl, bool do_wait)
{
vnode_impl_t *mvip = &vni->vi_vnode;
struct mount *mp = VIMPL_TO_VNODE(mvip)->v_mount;
vnode_t *vp;
vnode_impl_t *vip;
kmutex_t *lock;
int error;
KASSERT(vnis_marker(VIMPL_TO_VNODE(mvip)));
lock = mp->mnt_vnodelock;
do {
mutex_enter(lock);
vip = TAILQ_NEXT(mvip, vi_mntvnodes);
TAILQ_REMOVE(&mp->mnt_vnodelist, mvip, vi_mntvnodes);
VIMPL_TO_VNODE(mvip)->v_usecount = 0;
again:
if (vip == NULL) {
mutex_exit(lock);
return NULL;
}
vp = VIMPL_TO_VNODE(vip);
KASSERT(vp != NULL);
mutex_enter(vp->v_interlock);
		if (vnis_marker(vp) ||
		    vdead_check(vp, (do_wait ? 0 : VDEAD_NOWAIT)) ||
		    (f && !(*f)(cl, vp))) {
mutex_exit(vp->v_interlock);
vip = TAILQ_NEXT(vip, vi_mntvnodes);
goto again;
}
TAILQ_INSERT_AFTER(&mp->mnt_vnodelist, vip, mvip, vi_mntvnodes);
VIMPL_TO_VNODE(mvip)->v_usecount = 1;
mutex_exit(lock);
error = vcache_vget(vp);
		KASSERT(error == 0 || error == ENOENT);
	} while (error != 0);
return vp;
}
struct vnode *
vfs_vnode_iterator_next(struct vnode_iterator *vni,
bool (*f)(void *, struct vnode *), void *cl)
{
return vfs_vnode_iterator_next1(vni, f, cl, false);
}
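/*
 * Illustrative sketch (not compiled): walking all vnodes of a mount "mp"
 * with the iterator above.  Each returned vnode carries a reference that
 * the caller must drop with vrele().
 */
#if 0
	struct vnode_iterator *marker;
	vnode_t *vp;

	vfs_vnode_iterator_init(mp, &marker);
	while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL)) != NULL) {
		/* ... inspect vp ... */
		vrele(vp);
	}
	vfs_vnode_iterator_destroy(marker);
#endif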
/*
* Move a vnode from one mount queue to another.
*/
void
vfs_insmntque(vnode_t *vp, struct mount *mp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
struct mount *omp;
kmutex_t *lock;
KASSERT(mp == NULL || (mp->mnt_iflag & IMNT_UNMOUNT) == 0 ||
vp->v_tag == VT_VFS);
/*
* Delete from old mount point vnode list, if on one.
*/
if ((omp = vp->v_mount) != NULL) {
lock = omp->mnt_vnodelock;
mutex_enter(lock);
TAILQ_REMOVE(&vp->v_mount->mnt_vnodelist, vip, vi_mntvnodes);
mutex_exit(lock);
}
/*
* Insert into list of vnodes for the new mount point, if
* available. The caller must take a reference on the mount
 * structure and donate it to the vnode.
*/
if ((vp->v_mount = mp) != NULL) {
lock = mp->mnt_vnodelock;
mutex_enter(lock);
TAILQ_INSERT_TAIL(&mp->mnt_vnodelist, vip, vi_mntvnodes);
mutex_exit(lock);
}
if (omp != NULL) {
/* Release reference to old mount. */
vfs_rele(omp);
}
}
/*
* Remove any vnodes in the vnode table belonging to mount point mp.
*
 * If FORCECLOSE is not specified, there should not be any active vnodes;
 * return an error if any are found (nb: this is a user error, not a
* system error). If FORCECLOSE is specified, detach any active vnodes
* that are found.
*
* If WRITECLOSE is set, only flush out regular file vnodes open for
* writing.
*
* SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
*/
#ifdef DEBUG
int busyprt = 0; /* print out busy vnodes */
struct ctldebug debug1 = { "busyprt", &busyprt };
#endif
static vnode_t *
vflushnext(struct vnode_iterator *marker, int *when)
{
	if (getticks() > *when) {
		yield();
*when = getticks() + hz / 10;
}
preempt_point();
return vfs_vnode_iterator_next1(marker, NULL, NULL, true);
}
/*
* Flush one vnode. Referenced on entry, unreferenced on return.
*/
static int
vflush_one(vnode_t *vp, vnode_t *skipvp, int flags)
{
int error;
struct vattr vattr;
	if (vp == skipvp ||
	    ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM))) {
vrele(vp);
return 0;
}
/*
* If WRITECLOSE is set, only flush out regular file
* vnodes open for writing or open and unlinked.
*/
if ((flags & WRITECLOSE)) {
if (vp->v_type != VREG) {
vrele(vp);
return 0;
}
error = vn_lock(vp, LK_EXCLUSIVE);
		if (error) {
			KASSERT(error == ENOENT);
vrele(vp);
return 0;
}
error = VOP_FSYNC(vp, curlwp->l_cred, FSYNC_WAIT, 0, 0);
if (error == 0)
error = VOP_GETATTR(vp, &vattr, curlwp->l_cred);
		VOP_UNLOCK(vp);
		if (error) {
vrele(vp);
return error;
}
		if (vp->v_writecount == 0 && vattr.va_nlink > 0) {
			vrele(vp);
return 0;
}
}
/*
* First try to recycle the vnode.
*/
if (vrecycle(vp))
return 0;
/*
* If FORCECLOSE is set, forcibly close the vnode.
* For block or character devices, revert to an
* anonymous device. For all other files, just
* kill them.
*/
if (flags & FORCECLOSE) {
		if (vrefcnt(vp) > 1 &&
		    (vp->v_type == VBLK || vp->v_type == VCHR))
			vcache_make_anon(vp);
else
vgone(vp);
return 0;
}
vrele(vp);
return EBUSY;
}
int
vflush(struct mount *mp, vnode_t *skipvp, int flags)
{
vnode_t *vp;
struct vnode_iterator *marker;
int busy, error, when, retries = 2;
do {
busy = error = when = 0;
/*
* First, flush out any vnode references from the
* deferred vrele list.
*/
vrele_flush(mp);
		vfs_vnode_iterator_init(mp, &marker);
		while ((vp = vflushnext(marker, &when)) != NULL) {
			error = vflush_one(vp, skipvp, flags);
			if (error == EBUSY) {
error = 0;
busy++;
#ifdef DEBUG
				if (busyprt && retries == 0)
					vprint("vflush: busy vnode", vp);
#endif
} else if (error != 0) {
break;
}
}
vfs_vnode_iterator_destroy(marker);
	} while (error == 0 && busy > 0 && retries-- > 0);
	if (error)
return error;
if (busy)
return EBUSY;
return 0;
}
/*
* Mount a file system.
*/
/*
* Scan all active processes to see if any of them have a current or root
* directory onto which the new filesystem has just been mounted. If so,
* replace them with the new mount point.
*/
static void
mount_checkdirs(vnode_t *olddp)
{
vnode_t *newdp, *rele1, *rele2;
struct cwdinfo *cwdi;
struct proc *p;
bool retry;
if (vrefcnt(olddp) == 1) {
return;
}
if (VFS_ROOT(olddp->v_mountedhere, LK_EXCLUSIVE, &newdp))
panic("mount: lost mount");
do {
retry = false;
mutex_enter(&proc_lock);
		PROCLIST_FOREACH(p, &allproc) {
			if ((cwdi = p->p_cwdi) == NULL)
continue;
/*
* Cannot change to the old directory any more,
* so even if we see a stale value it is not a
* problem.
*/
if (cwdi->cwdi_cdir != olddp &&
cwdi->cwdi_rdir != olddp)
continue;
retry = true;
rele1 = NULL;
rele2 = NULL;
atomic_inc_uint(&cwdi->cwdi_refcnt);
mutex_exit(&proc_lock);
rw_enter(&cwdi->cwdi_lock, RW_WRITER);
if (cwdi->cwdi_cdir == olddp) {
rele1 = cwdi->cwdi_cdir;
vref(newdp);
cwdi->cwdi_cdir = newdp;
}
if (cwdi->cwdi_rdir == olddp) {
rele2 = cwdi->cwdi_rdir;
vref(newdp);
cwdi->cwdi_rdir = newdp;
}
rw_exit(&cwdi->cwdi_lock);
cwdfree(cwdi);
			if (rele1 != NULL)
				vrele(rele1);
			if (rele2 != NULL)
				vrele(rele2);
mutex_enter(&proc_lock);
break;
}
mutex_exit(&proc_lock);
} while (retry);
if (rootvnode == olddp) {
vrele(rootvnode);
vref(newdp);
rootvnode = newdp;
}
vput(newdp);
}
/*
* Start extended attributes
*/
static int
start_extattr(struct mount *mp)
{
int error;
error = VFS_EXTATTRCTL(mp, EXTATTR_CMD_START, NULL, 0, NULL);
if (error)
printf("%s: failed to start extattr: error = %d\n",
mp->mnt_stat.f_mntonname, error);
return error;
}
int
mount_domount(struct lwp *l, vnode_t **vpp, struct vfsops *vfsops,
const char *path, int flags, void *data, size_t *data_len)
{
vnode_t *vp = *vpp;
struct mount *mp;
struct pathbuf *pb;
struct nameidata nd;
int error, error2;
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_NEW, vp, KAUTH_ARG(flags), data);
if (error) {
vfs_delref(vfsops);
return error;
}
/* Cannot make a non-dir a mount-point (from here anyway). */
if (vp->v_type != VDIR) {
vfs_delref(vfsops);
return ENOTDIR;
}
if (flags & MNT_EXPORTED) {
vfs_delref(vfsops);
return EINVAL;
}
if ((mp = vfs_mountalloc(vfsops, vp)) == NULL) {
vfs_delref(vfsops);
return ENOMEM;
}
mp->mnt_stat.f_owner = kauth_cred_geteuid(l->l_cred);
/*
* The underlying file system may refuse the mount for
* various reasons. Allow the user to force it to happen.
*
* Set the mount level flags.
*/
mp->mnt_flag = flags & (MNT_BASIC_FLAGS | MNT_FORCE | MNT_IGNORE);
error = VFS_MOUNT(mp, path, data, data_len);
mp->mnt_flag &= ~MNT_OP_FLAGS;
if (error != 0) {
vfs_rele(mp);
return error;
}
/* Suspend new file system before taking mnt_updating. */
do {
error2 = vfs_suspend(mp, 0);
	} while (error2 == EINTR || error2 == ERESTART);
	KASSERT(error2 == 0 || error2 == EOPNOTSUPP);
mutex_enter(mp->mnt_updating);
/*
* Validate and prepare the mount point.
*/
error = pathbuf_copyin(path, &pb);
if (error != 0) {
goto err_mounted;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
error = namei(&nd);
pathbuf_destroy(pb);
if (error != 0) {
goto err_mounted;
}
if (nd.ni_vp != vp) {
vput(nd.ni_vp);
error = EINVAL;
goto err_mounted;
}
if (vp->v_mountedhere != NULL) {
vput(nd.ni_vp);
error = EBUSY;
goto err_mounted;
}
error = vinvalbuf(vp, V_SAVE, l->l_cred, l, 0, 0);
if (error != 0) {
vput(nd.ni_vp);
goto err_mounted;
}
/*
* Put the new filesystem on the mount list after root.
*/
cache_purge(vp);
mp->mnt_iflag &= ~IMNT_WANTRDWR;
	mountlist_append(mp);
	if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
		vfs_syncer_add_to_worklist(mp);
vp->v_mountedhere = mp;
vput(nd.ni_vp);
mount_checkdirs(vp);
mutex_exit(mp->mnt_updating);
	if (error2 == 0)
		vfs_resume(mp);
/* Hold an additional reference to the mount across VFS_START(). */
vfs_ref(mp);
(void) VFS_STATVFS(mp, &mp->mnt_stat);
error = VFS_START(mp, 0);
if (error) {
		vrele(vp);
	} else if (flags & MNT_EXTATTR) {
		if (start_extattr(mp) != 0)
mp->mnt_flag &= ~MNT_EXTATTR;
}
/* Drop reference held for VFS_START(). */
vfs_rele(mp);
*vpp = NULL;
return error;
err_mounted:
if (VFS_UNMOUNT(mp, MNT_FORCE) != 0)
panic("Unmounting fresh file system failed");
mutex_exit(mp->mnt_updating);
	if (error2 == 0)
		vfs_resume(mp);
	vfs_set_lowermount(mp, NULL);
vfs_rele(mp);
return error;
}
/*
* Do the actual file system unmount. File system is assumed to have
* been locked by the caller.
*
 * => Caller holds a reference to the mount, explicitly for dounmount().
*/
int
dounmount(struct mount *mp, int flags, struct lwp *l)
{
struct vnode *coveredvp, *vp;
struct vnode_impl *vip;
int error, async, used_syncer, used_extattr;
const bool was_suspended = fstrans_is_owner(mp);
#if NVERIEXEC > 0
error = veriexec_unmountchk(mp);
if (error)
return (error);
#endif /* NVERIEXEC > 0 */
if (!was_suspended) {
error = vfs_suspend(mp, 0);
if (error) {
return error;
}
}
KASSERT((mp->mnt_iflag & IMNT_GONE) == 0);
used_syncer = (mp->mnt_iflag & IMNT_ONWORKLIST) != 0;
used_extattr = mp->mnt_flag & MNT_EXTATTR;
mp->mnt_iflag |= IMNT_UNMOUNT;
mutex_enter(mp->mnt_updating);
async = mp->mnt_flag & MNT_ASYNC;
mp->mnt_flag &= ~MNT_ASYNC;
cache_purgevfs(mp); /* remove cache entries for this file sys */
	if (used_syncer)
		vfs_syncer_remove_from_worklist(mp);
error = 0;
if (((mp->mnt_flag & MNT_RDONLY) == 0) && ((flags & MNT_FORCE) == 0)) {
error = VFS_SYNC(mp, MNT_WAIT, l->l_cred);
}
if (error == 0 || (flags & MNT_FORCE)) {
error = VFS_UNMOUNT(mp, flags);
}
if (error) {
mp->mnt_iflag &= ~IMNT_UNMOUNT;
		if ((mp->mnt_flag & (MNT_RDONLY | MNT_ASYNC)) == 0)
			vfs_syncer_add_to_worklist(mp);
mp->mnt_flag |= async;
mutex_exit(mp->mnt_updating);
		if (!was_suspended)
			vfs_resume(mp);
		if (used_extattr) {
			if (start_extattr(mp) != 0)
mp->mnt_flag &= ~MNT_EXTATTR;
else
mp->mnt_flag |= MNT_EXTATTR;
}
return (error);
}
mutex_exit(mp->mnt_updating);
/*
 * Mark the filesystem as gone to prevent further unmounts after
 * the mnt_umounting lock is gone; this also prevents vfs_busy()
 * from succeeding.
*/
mp->mnt_iflag |= IMNT_GONE;
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		coveredvp->v_mountedhere = NULL;
}
	if (!was_suspended)
		vfs_resume(mp);
mountlist_remove(mp);
if ((vip = TAILQ_FIRST(&mp->mnt_vnodelist)) != NULL) {
vp = VIMPL_TO_VNODE(vip);
vprint("dangling", vp);
panic("unmount: dangling vnode");
}
vfs_hooks_unmount(mp);
vfs_set_lowermount(mp, NULL);
vfs_rele(mp); /* reference from mount() */
	if (coveredvp != NULLVP) {
		vrele(coveredvp);
}
return (0);
}
/*
* Unmount all file systems.
* We traverse the list in reverse order under the assumption that doing so
* will avoid needing to worry about dependencies.
*/
bool
vfs_unmountall(struct lwp *l)
{
printf("unmounting file systems...\n");
return vfs_unmountall1(l, true, true);
}
static void
vfs_unmount_print(struct mount *mp, const char *pfx)
{
aprint_verbose("%sunmounted %s on %s type %s\n", pfx,
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_fstypename);
}
/*
* Return the mount with the highest generation less than "gen".
*/
static struct mount *
vfs_unmount_next(uint64_t gen)
{
mount_iterator_t *iter;
struct mount *mp, *nmp;
nmp = NULL;
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_next(iter)) != NULL) {
if ((nmp == NULL || mp->mnt_gen > nmp->mnt_gen) &&
mp->mnt_gen < gen) {
if (nmp != NULL)
vfs_rele(nmp);
nmp = mp;
vfs_ref(nmp);
}
}
mountlist_iterator_destroy(iter);
return nmp;
}
bool
vfs_unmount_forceone(struct lwp *l)
{
struct mount *mp;
int error;
mp = vfs_unmount_next(mountgen);
if (mp == NULL) {
return false;
}
#ifdef DEBUG
printf("forcefully unmounting %s (%s)...\n",
mp->mnt_stat.f_mntonname, mp->mnt_stat.f_mntfromname);
#endif
if ((error = dounmount(mp, MNT_FORCE, l)) == 0) {
vfs_unmount_print(mp, "forcefully ");
return true;
} else {
vfs_rele(mp);
}
#ifdef DEBUG
printf("forceful unmount of %s failed with error %d\n",
mp->mnt_stat.f_mntonname, error);
#endif
return false;
}
bool
vfs_unmountall1(struct lwp *l, bool force, bool verbose)
{
struct mount *mp;
mount_iterator_t *iter;
bool any_error = false, progress = false;
uint64_t gen;
int error;
gen = mountgen;
for (;;) {
mp = vfs_unmount_next(gen);
if (mp == NULL)
break;
gen = mp->mnt_gen;
#ifdef DEBUG
printf("unmounting %p %s (%s)...\n",
(void *)mp, mp->mnt_stat.f_mntonname,
mp->mnt_stat.f_mntfromname);
#endif
if ((error = dounmount(mp, force ? MNT_FORCE : 0, l)) == 0) {
vfs_unmount_print(mp, "");
progress = true;
} else {
vfs_rele(mp);
if (verbose) {
printf("unmount of %s failed with error %d\n",
mp->mnt_stat.f_mntonname, error);
}
any_error = true;
}
}
if (verbose) {
printf("unmounting done\n");
}
if (any_error && verbose) {
printf("WARNING: some file systems would not unmount\n");
}
/* If the mountlist is empty it is time to remove swap. */
mountlist_iterator_init(&iter);
if (mountlist_iterator_next(iter) == NULL) {
uvm_swap_shutdown(l);
}
mountlist_iterator_destroy(iter);
return progress;
}
void
vfs_sync_all(struct lwp *l)
{
printf("syncing disks... ");
/* remove user processes from run queue */
suspendsched();
(void)spl0();
/* avoid coming back this way again if we panic. */
doing_shutdown = 1;
do_sys_sync(l);
/* Wait for sync to finish. */
if (vfs_syncwait() != 0) {
#if defined(DDB) && defined(DEBUG_HALT_BUSY)
Debugger();
#endif
printf("giving up\n");
return;
} else
printf("done\n");
}
/*
* Sync and unmount file systems before shutting down.
*/
void
vfs_shutdown(void)
{
lwp_t *l = curlwp;
vfs_sync_all(l);
/*
	 * If we have panicked, do not make the situation potentially
* worse by unmounting the file systems.
*/
if (panicstr != NULL) {
return;
}
/* Unmount file systems. */
vfs_unmountall(l);
}
/*
* Print a list of supported file system types (used by vfs_mountroot)
*/
static void
vfs_print_fstypes(void)
{
struct vfsops *v;
int cnt = 0;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list)
++cnt;
mutex_exit(&vfs_list_lock);
if (cnt == 0) {
printf("WARNING: No file system modules have been loaded.\n");
return;
}
printf("Supported file systems:");
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
printf(" %s", v->vfs_name);
}
mutex_exit(&vfs_list_lock);
printf("\n");
}
/*
* Mount the root file system. If the operator didn't specify a
* file system to use, try all possible file systems until one
* succeeds.
*/
int
vfs_mountroot(void)
{
struct vfsops *v;
int error = ENODEV;
if (root_device == NULL)
panic("vfs_mountroot: root device unknown");
switch (device_class(root_device)) {
case DV_IFNET:
if (rootdev != NODEV)
panic("vfs_mountroot: rootdev set for DV_IFNET "
"(0x%llx -> %llu,%llu)",
(unsigned long long)rootdev,
(unsigned long long)major(rootdev),
(unsigned long long)minor(rootdev));
break;
case DV_DISK:
if (rootdev == NODEV)
panic("vfs_mountroot: rootdev not set for DV_DISK");
if (bdevvp(rootdev, &rootvp))
panic("vfs_mountroot: can't get vnode for rootdev");
vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(rootvp, FREAD, FSCRED);
VOP_UNLOCK(rootvp);
if (error) {
printf("vfs_mountroot: can't open root device\n");
return (error);
}
break;
case DV_VIRTUAL:
break;
default:
printf("%s: inappropriate for root file system\n",
device_xname(root_device));
return (ENODEV);
}
/*
* If user specified a root fs type, use it. Make sure the
* specified type exists and has a mount_root()
*/
if (strcmp(rootfstype, ROOT_FSTYPE_ANY) != 0) {
v = vfs_getopsbyname(rootfstype);
error = EFTYPE;
if (v != NULL) {
if (v->vfs_mountroot != NULL) {
error = (v->vfs_mountroot)();
}
v->vfs_refcount--;
}
goto done;
}
/*
* Try each file system currently configured into the kernel.
*/
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (v->vfs_mountroot == NULL)
continue;
#ifdef DEBUG
aprint_normal("mountroot: trying %s...\n", v->vfs_name);
#endif
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
error = (*v->vfs_mountroot)();
mutex_enter(&vfs_list_lock);
v->vfs_refcount--;
if (!error) {
aprint_normal("root file system type: %s\n",
v->vfs_name);
break;
}
}
mutex_exit(&vfs_list_lock);
if (v == NULL) {
vfs_print_fstypes();
printf("no file system for %s", device_xname(root_device));
if (device_class(root_device) == DV_DISK)
printf(" (dev 0x%llx)", (unsigned long long)rootdev);
printf("\n");
error = EFTYPE;
}
done:
if (error && device_class(root_device) == DV_DISK) {
vn_lock(rootvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(rootvp, FREAD, FSCRED);
VOP_UNLOCK(rootvp);
vrele(rootvp);
}
if (error == 0) {
mount_iterator_t *iter;
struct mount *mp;
mountlist_iterator_init(&iter);
mp = mountlist_iterator_next(iter);
KASSERT(mp != NULL);
mountlist_iterator_destroy(iter);
mp->mnt_flag |= MNT_ROOTFS;
mp->mnt_op->vfs_refcount++;
/*
* Get the vnode for '/'. Set cwdi0.cwdi_cdir to
* reference it, and donate it the reference grabbed
* with VFS_ROOT().
*/
error = VFS_ROOT(mp, LK_NONE, &rootvnode);
if (error)
panic("cannot find root vnode, error=%d", error);
cwdi0.cwdi_cdir = rootvnode;
cwdi0.cwdi_rdir = NULL;
/*
* Now that root is mounted, we can fixup initproc's CWD
* info. All other processes are kthreads, which merely
* share proc0's CWD info.
*/
initproc->p_cwdi->cwdi_cdir = rootvnode;
vref(initproc->p_cwdi->cwdi_cdir);
initproc->p_cwdi->cwdi_rdir = NULL;
/*
* Enable loading of modules from the filesystem
*/
module_load_vfs_init();
}
return (error);
}
/*
* mount_specific_key_create --
* Create a key for subsystem mount-specific data.
*/
int
mount_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return specificdata_key_create(mount_specificdata_domain, keyp, dtor);
}
/*
* mount_specific_key_delete --
* Delete a key for subsystem mount-specific data.
*/
void
mount_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(mount_specificdata_domain, key);
}
/*
* mount_initspecific --
* Initialize a mount's specificdata container.
*/
void
mount_initspecific(struct mount *mp)
{
int error __diagused;
error = specificdata_init(mount_specificdata_domain,
&mp->mnt_specdataref);
KASSERT(error == 0);
}
/*
* mount_finispecific --
* Finalize a mount's specificdata container.
*/
void
mount_finispecific(struct mount *mp)
{
specificdata_fini(mount_specificdata_domain, &mp->mnt_specdataref);
}
/*
* mount_getspecific --
* Return mount-specific data corresponding to the specified key.
*/
void *
mount_getspecific(struct mount *mp, specificdata_key_t key)
{
return specificdata_getspecific(mount_specificdata_domain,
&mp->mnt_specdataref, key);
}
/*
* mount_setspecific --
* Set mount-specific data corresponding to the specified key.
*/
void
mount_setspecific(struct mount *mp, specificdata_key_t key, void *data)
{
specificdata_setspecific(mount_specificdata_domain,
&mp->mnt_specdataref, key, data);
}
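/*
 * Illustrative sketch (not compiled): attaching hypothetical subsystem
 * data to a mount "mp" via the specificdata interfaces above.
 */
#if 0
	specificdata_key_t foo_key;	/* hypothetical key */
	void *data;			/* hypothetical data, set elsewhere */

	if (mount_specific_key_create(&foo_key, NULL) == 0) {
		mount_setspecific(mp, foo_key, data);
		data = mount_getspecific(mp, foo_key);
		mount_specific_key_delete(foo_key);	/* at teardown */
	}
#endif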
/*
* Check to see if a filesystem is mounted on a block device.
*/
int
vfs_mountedon(vnode_t *vp)
{
vnode_t *vq;
int error = 0;
if (vp->v_type != VBLK)
return ENOTBLK;
if (spec_node_getmountedfs(vp) != NULL)
return EBUSY;
if (spec_node_lookup_by_dev(vp->v_type, vp->v_rdev, VDEAD_NOWAIT, &vq)
== 0) {
if (spec_node_getmountedfs(vq) != NULL)
error = EBUSY;
vrele(vq);
}
return error;
}
/*
* Check if a device pointed to by vp is mounted.
*
* Returns:
* EINVAL if it's not a disk
* EBUSY if it's a disk and mounted
* 0 if it's a disk and not mounted
*/
int
rawdev_mounted(vnode_t *vp, vnode_t **bvpp)
{
vnode_t *bvp;
dev_t dev;
int d_type;
bvp = NULL;
d_type = D_OTHER;
if (iskmemvp(vp))
return EINVAL;
switch (vp->v_type) {
case VCHR: {
const struct cdevsw *cdev;
dev = vp->v_rdev;
cdev = cdevsw_lookup(dev);
if (cdev != NULL) {
dev_t blkdev;
blkdev = devsw_chr2blk(dev);
			if (blkdev != NODEV) {
				if (vfinddev(blkdev, VBLK, &bvp) != 0) {
					d_type = (cdev->d_flag & D_TYPEMASK);
/* XXX: what if bvp disappears? */
vrele(bvp);
}
}
}
break;
}
case VBLK: {
const struct bdevsw *bdev;
dev = vp->v_rdev;
bdev = bdevsw_lookup(dev);
		if (bdev != NULL)
			d_type = (bdev->d_flag & D_TYPEMASK);
bvp = vp;
break;
}
default:
break;
}
if (d_type != D_DISK)
return EINVAL;
if (bvpp != NULL)
*bvpp = bvp;
/*
* XXX: This is bogus. We should be failing the request
* XXX: not only if this specific slice is mounted, but
* XXX: if it's on a disk with any other mounted slice.
*/
if (vfs_mountedon(bvp))
return EBUSY;
return 0;
}
/*
* Make a 'unique' number from a mount type name.
*/
long
makefstype(const char *type)
{
long rv;
for (rv = 0; *type; type++) {
rv <<= 2;
rv ^= *type;
}
return rv;
}
static struct mountlist_entry *
mountlist_alloc(enum mountlist_type type, struct mount *mp)
{
struct mountlist_entry *me;
me = kmem_zalloc(sizeof(*me), KM_SLEEP);
me->me_mount = mp;
me->me_type = type;
return me;
}
static void
mountlist_free(struct mountlist_entry *me)
{
kmem_free(me, sizeof(*me));
}
void
mountlist_iterator_init(mount_iterator_t **mip)
{
struct mountlist_entry *me;
me = mountlist_alloc(ME_MARKER, NULL);
mutex_enter(&mountlist_lock);
TAILQ_INSERT_HEAD(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
*mip = (mount_iterator_t *)me;
}
void
mountlist_iterator_destroy(mount_iterator_t *mi)
{
struct mountlist_entry *marker = &mi->mi_entry;
	if (marker->me_mount != NULL)
		vfs_unbusy(marker->me_mount);
mutex_enter(&mountlist_lock);
TAILQ_REMOVE(&mountlist, marker, me_list);
mutex_exit(&mountlist_lock);
mountlist_free(marker);
}
/*
* Return the next mount or NULL for this iterator.
* Mark it busy on success.
*/
static inline struct mount *
_mountlist_iterator_next(mount_iterator_t *mi, bool wait)
{
struct mountlist_entry *me, *marker = &mi->mi_entry;
struct mount *mp;
int error;
	if (marker->me_mount != NULL) {
		vfs_unbusy(marker->me_mount);
marker->me_mount = NULL;
}
mutex_enter(&mountlist_lock);
for (;;) {
KASSERT(marker->me_type == ME_MARKER);
me = TAILQ_NEXT(marker, me_list);
if (me == NULL) {
/* End of list: keep marker and return. */
mutex_exit(&mountlist_lock);
return NULL;
}
		TAILQ_REMOVE(&mountlist, marker, me_list);
		TAILQ_INSERT_AFTER(&mountlist, me, marker, me_list);
/* Skip other markers. */
if (me->me_type != ME_MOUNT)
continue;
/* Take an initial reference for vfs_busy() below. */
mp = me->me_mount;
		KASSERT(mp != NULL);
		vfs_ref(mp);
mutex_exit(&mountlist_lock);
/* Try to mark this mount busy and return on success. */
if (wait)
error = vfs_busy(mp);
else
error = vfs_trybusy(mp);
if (error == 0) {
vfs_rele(mp);
marker->me_mount = mp;
return mp;
}
vfs_rele(mp);
mutex_enter(&mountlist_lock);
}
}
struct mount *
mountlist_iterator_next(mount_iterator_t *mi)
{
return _mountlist_iterator_next(mi, true);
}
struct mount *
mountlist_iterator_trynext(mount_iterator_t *mi)
{
return _mountlist_iterator_next(mi, false);
}
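/*
 * Illustrative sketch (not compiled): the usual way to walk the mount
 * list with the iterator above; each mount returned is busied until the
 * next call or until the iterator is destroyed.
 */
#if 0
	mount_iterator_t *iter;
	struct mount *mp;

	mountlist_iterator_init(&iter);
	while ((mp = mountlist_iterator_next(iter)) != NULL) {
		/* ... mp is busy here ... */
	}
	mountlist_iterator_destroy(iter);
#endif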
/*
* Attach new mount to the end of the mount list.
*/
void
mountlist_append(struct mount *mp)
{
struct mountlist_entry *me;
me = mountlist_alloc(ME_MOUNT, mp);
mutex_enter(&mountlist_lock);
TAILQ_INSERT_TAIL(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
}
/*
* Remove mount from mount list.
 */
void
mountlist_remove(struct mount *mp)
{
struct mountlist_entry *me;
mutex_enter(&mountlist_lock);
	TAILQ_FOREACH(me, &mountlist, me_list)
		if (me->me_type == ME_MOUNT && me->me_mount == mp)
break;
KASSERT(me != NULL);
TAILQ_REMOVE(&mountlist, me, me_list);
mutex_exit(&mountlist_lock);
mountlist_free(me);
}
/*
* Unlocked variant to traverse the mountlist.
* To be used from DDB only.
*/
struct mount *
_mountlist_next(struct mount *mp)
{
struct mountlist_entry *me;
if (mp == NULL) {
me = TAILQ_FIRST(&mountlist);
} else {
TAILQ_FOREACH(me, &mountlist, me_list)
if (me->me_type == ME_MOUNT && me->me_mount == mp)
break;
if (me != NULL)
me = TAILQ_NEXT(me, me_list);
}
while (me != NULL && me->me_type != ME_MOUNT)
me = TAILQ_NEXT(me, me_list);
return (me ? me->me_mount : NULL);
}
/* $NetBSD: in_var.h,v 1.103 2022/11/19 08:00:51 yamt Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1985, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_var.h 8.2 (Berkeley) 1/9/95
*/
#ifndef _NETINET_IN_VAR_H_
#define _NETINET_IN_VAR_H_
#include <sys/queue.h>
#define IN_IFF_TENTATIVE 0x01 /* tentative address */
#define IN_IFF_DUPLICATED 0x02 /* DAD detected duplicate */
#define IN_IFF_DETACHED 0x04 /* may be detached from the link */
#define IN_IFF_TRYTENTATIVE 0x08 /* intent to try DAD */
#define IN_IFFBITS \
"\020\1TENTATIVE\2DUPLICATED\3DETACHED\4TRYTENTATIVE"
/* do not input/output */
#define IN_IFF_NOTREADY \
(IN_IFF_TRYTENTATIVE | IN_IFF_TENTATIVE | IN_IFF_DUPLICATED)
/*
* Interface address, Internet version. One of these structures
* is allocated for each interface with an Internet address.
* The ifaddr structure contains the protocol-independent part
* of the structure and is assumed to be first.
*/
struct in_ifaddr {
struct ifaddr ia_ifa; /* protocol-independent info */
#define ia_ifp ia_ifa.ifa_ifp
#define ia_flags ia_ifa.ifa_flags
/* ia_{,sub}net{,mask} in host order */
u_int32_t ia_net; /* network number of interface */
u_int32_t ia_netmask; /* mask of net part */
u_int32_t ia_subnet; /* subnet number, including net */
u_int32_t ia_subnetmask; /* mask of subnet part */
struct in_addr ia_netbroadcast; /* to recognize net broadcasts */
LIST_ENTRY(in_ifaddr) ia_hash; /* entry in bucket of inet addresses */
TAILQ_ENTRY(in_ifaddr) ia_list; /* list of internet addresses */
struct sockaddr_in ia_addr; /* reserve space for interface name */
struct sockaddr_in ia_dstaddr; /* reserve space for broadcast addr */
#define ia_broadaddr ia_dstaddr
struct sockaddr_in ia_sockmask; /* reserve space for general netmask */
LIST_HEAD(, in_multi) ia_multiaddrs; /* list of multicast addresses */
struct in_multi *ia_allhosts; /* multicast address record for
the allhosts multicast group */
uint16_t ia_idsalt; /* ip_id salt for this ia */
int ia4_flags; /* address flags */
void (*ia_dad_start) (struct ifaddr *); /* DAD start function */
void (*ia_dad_stop) (struct ifaddr *); /* DAD stop function */
time_t ia_dad_defended; /* last time of DAD defence */
#ifdef _KERNEL
struct pslist_entry ia_hash_pslist_entry;
struct pslist_entry ia_pslist_entry;
#endif
};
struct in_nbrinfo {
char ifname[IFNAMSIZ]; /* if name, e.g. "en0" */
struct in_addr addr; /* IPv4 address of the neighbor */
long asked; /* number of queries already sent for this addr */
int state; /* reachability state */
int expire; /* lifetime for NDP state transition */
};
#ifdef _KERNEL
static __inline void
ia4_acquire(struct in_ifaddr *ia, struct psref *psref)
{
KASSERT(ia != NULL);
ifa_acquire(&ia->ia_ifa, psref);
}
static __inline void
ia4_release(struct in_ifaddr *ia, struct psref *psref)
{
if (ia == NULL)
return;
ifa_release(&ia->ia_ifa, psref);
}
#endif
struct in_aliasreq {
char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct sockaddr_in ifra_addr;
struct sockaddr_in ifra_dstaddr;
#define ifra_broadaddr ifra_dstaddr
struct sockaddr_in ifra_mask;
};
/*
* Given a pointer to an in_ifaddr (ifaddr),
* return a pointer to the addr as a sockaddr_in.
*/
#define IA_SIN(ia) (&(((struct in_ifaddr *)(ia))->ia_addr))
#ifdef _KERNEL
/* Note: 61, 127, 251, 509, 1021, 2039 are good. */
#ifndef IN_IFADDR_HASH_SIZE
#define IN_IFADDR_HASH_SIZE 509
#endif
/*
* This is a bit unconventional, and wastes a little bit of space, but
* because we want a very even hash function we don't use & in_ifaddrhash
* here, but rather % the hash size, which should obviously be prime.
*/
#define IN_IFADDR_HASH(x) in_ifaddrhashtbl[(u_long)(x) % IN_IFADDR_HASH_SIZE]
LIST_HEAD(in_ifaddrhashhead, in_ifaddr); /* Type of the hash head */
TAILQ_HEAD(in_ifaddrhead, in_ifaddr); /* Type of the list head */
extern u_long in_ifaddrhash; /* size of hash table - 1 */
extern struct in_ifaddrhashhead *in_ifaddrhashtbl; /* Hash table head */
extern struct in_ifaddrhead in_ifaddrhead; /* List head (in ip_input) */
extern pserialize_t in_ifaddrhash_psz;
extern struct pslist_head *in_ifaddrhashtbl_pslist;
extern u_long in_ifaddrhash_pslist;
extern struct pslist_head in_ifaddrhead_pslist;
#define IN_IFADDR_HASH_PSLIST(x) \
in_ifaddrhashtbl_pslist[(u_long)(x) % IN_IFADDR_HASH_SIZE]
#define IN_ADDRHASH_READER_FOREACH(__ia, __addr) \
PSLIST_READER_FOREACH((__ia), &IN_IFADDR_HASH_PSLIST(__addr), \
struct in_ifaddr, ia_hash_pslist_entry)
#define IN_ADDRHASH_WRITER_INSERT_HEAD(__ia) \
PSLIST_WRITER_INSERT_HEAD( \
&IN_IFADDR_HASH_PSLIST((__ia)->ia_addr.sin_addr.s_addr), \
(__ia), ia_hash_pslist_entry)
#define IN_ADDRHASH_WRITER_REMOVE(__ia) \
PSLIST_WRITER_REMOVE((__ia), ia_hash_pslist_entry)
#define IN_ADDRHASH_ENTRY_INIT(__ia) \
PSLIST_ENTRY_INIT((__ia), ia_hash_pslist_entry);
#define IN_ADDRHASH_ENTRY_DESTROY(__ia) \
PSLIST_ENTRY_DESTROY((__ia), ia_hash_pslist_entry);
#define IN_ADDRHASH_READER_NEXT(__ia) \
PSLIST_READER_NEXT((__ia), struct in_ifaddr, ia_hash_pslist_entry)
#define IN_ADDRLIST_ENTRY_INIT(__ia) \
PSLIST_ENTRY_INIT((__ia), ia_pslist_entry)
#define IN_ADDRLIST_ENTRY_DESTROY(__ia) \
PSLIST_ENTRY_DESTROY((__ia), ia_pslist_entry);
#define IN_ADDRLIST_READER_EMPTY() \
(PSLIST_READER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry) == NULL)
#define IN_ADDRLIST_READER_FIRST() \
PSLIST_READER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry)
#define IN_ADDRLIST_READER_NEXT(__ia) \
PSLIST_READER_NEXT((__ia), struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_READER_FOREACH(__ia) \
PSLIST_READER_FOREACH((__ia), &in_ifaddrhead_pslist, \
struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_INSERT_HEAD(__ia) \
PSLIST_WRITER_INSERT_HEAD(&in_ifaddrhead_pslist, (__ia), \
ia_pslist_entry)
#define IN_ADDRLIST_WRITER_REMOVE(__ia) \
PSLIST_WRITER_REMOVE((__ia), ia_pslist_entry)
#define IN_ADDRLIST_WRITER_FOREACH(__ia) \
PSLIST_WRITER_FOREACH((__ia), &in_ifaddrhead_pslist, \
struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_FIRST() \
PSLIST_WRITER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry)
#define IN_ADDRLIST_WRITER_NEXT(__ia) \
PSLIST_WRITER_NEXT((__ia), struct in_ifaddr, ia_pslist_entry)
#define IN_ADDRLIST_WRITER_INSERT_AFTER(__ia, __new) \
PSLIST_WRITER_INSERT_AFTER((__ia), (__new), ia_pslist_entry)
#define IN_ADDRLIST_WRITER_EMPTY() \
(PSLIST_WRITER_FIRST(&in_ifaddrhead_pslist, struct in_ifaddr, \
ia_pslist_entry) == NULL)
#define IN_ADDRLIST_WRITER_INSERT_TAIL(__new) \
do { \
if (IN_ADDRLIST_WRITER_EMPTY()) { \
IN_ADDRLIST_WRITER_INSERT_HEAD((__new)); \
} else { \
struct in_ifaddr *__ia; \
IN_ADDRLIST_WRITER_FOREACH(__ia) { \
if (IN_ADDRLIST_WRITER_NEXT(__ia) == NULL) { \
IN_ADDRLIST_WRITER_INSERT_AFTER(__ia,\
(__new)); \
break; \
} \
} \
} \
} while (0)
extern const int inetctlerrmap[];
/*
* Find whether an internet address (in_addr) belongs to one
* of our interfaces (in_ifaddr). NULL if the address isn't ours.
*/
static __inline struct in_ifaddr *
in_get_ia(struct in_addr addr)
{
struct in_ifaddr *ia;
	IN_ADDRHASH_READER_FOREACH(ia, addr.s_addr) {
		if (in_hosteq(ia->ia_addr.sin_addr, addr))
break;
}
return ia;
}
static __inline struct in_ifaddr *
in_get_ia_psref(struct in_addr addr, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia(addr);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
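/*
 * Illustrative sketch (not compiled): looking up one of our addresses
 * with a psref.  The psref machinery requires the LWP to be bound to a
 * CPU for the duration (curlwp_bind()/curlwp_bindx()); "addr" is a
 * hypothetical struct in_addr.
 */
#if 0
	struct in_ifaddr *ia;
	struct psref psref;
	int bound;

	bound = curlwp_bind();
	ia = in_get_ia_psref(addr, &psref);
	if (ia != NULL) {
		/* ... use ia while the psref is held ... */
		ia4_release(ia, &psref);
	}
	curlwp_bindx(bound);
#endif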
/*
* Find whether an internet address (in_addr) belongs to a specified
* interface. NULL if the address isn't ours.
*/
static __inline struct in_ifaddr *
in_get_ia_on_iface(struct in_addr addr, struct ifnet *ifp)
{
struct in_ifaddr *ia;
IN_ADDRHASH_READER_FOREACH(ia, addr.s_addr) {
if (in_hosteq(ia->ia_addr.sin_addr, addr) &&
ia->ia_ifp == ifp)
break;
}
return ia;
}
static __inline struct in_ifaddr *
in_get_ia_on_iface_psref(struct in_addr addr, struct ifnet *ifp, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia_on_iface(addr, ifp);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
/*
* Find an internet address structure (in_ifaddr) corresponding
* to a given interface (ifnet structure).
*/
static __inline struct in_ifaddr *
in_get_ia_from_ifp(struct ifnet *ifp)
{
struct ifaddr *ifa;
	IFADDR_READER_FOREACH(ifa, ifp) {
		if (ifa->ifa_addr->sa_family == AF_INET)
break;
}
return ifatoia(ifa);
}
static __inline struct in_ifaddr *
in_get_ia_from_ifp_psref(struct ifnet *ifp, struct psref *psref)
{
struct in_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in_get_ia_from_ifp(ifp);
if (ia != NULL)
ia4_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
#include <netinet/in_selsrc.h>
/*
* IPv4 per-interface state.
*/
struct in_ifinfo {
struct lltable *ii_llt; /* ARP state */
struct in_ifsysctl *ii_selsrc;
#ifdef MBUFTRACE
struct mowner ii_mowner;
#endif
};
#endif /* _KERNEL */
/*
* Internet multicast address structure. There is one of these for each IP
* multicast group to which this host belongs on a given network interface.
* They are kept in a linked list, rooted in the interface's in_ifaddr
* structure.
*/
struct router_info;
struct in_multi {
LIST_ENTRY(in_multi) inm_list; /* list of multicast addresses */
struct router_info *inm_rti; /* router version info */
struct ifnet *inm_ifp; /* back pointer to ifnet */
struct in_addr inm_addr; /* IP multicast address */
u_int inm_refcount; /* no. membership claims by sockets */
u_int inm_timer; /* IGMP membership report timer */
u_int inm_state; /* state of membership */
};
#ifdef _KERNEL
#include <net/pktqueue.h>
#include <sys/cprng.h>
extern pktqueue_t *ip_pktq;
extern int ip_dad_count; /* Duplicate Address Detection probes */
static inline bool
ip_dad_enabled(void)
{
#if NARP > 0
return ip_dad_count > 0;
#else
return false;
#endif
}
#if defined(INET) && NARP > 0
extern int arp_debug;
#define ARPLOGADDR(a) IN_PRINT(_ipbuf, a)
#define ARPLOG(level, fmt, args...) \
do { \
char _ipbuf[INET_ADDRSTRLEN]; \
(void)_ipbuf; \
if (arp_debug) \
log(level, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#else
#define ARPLOG(level, fmt, args...)
#endif
/*
* Structure used by functions below to remember position when stepping
* through all of the in_multi records.
*/
struct in_multistep {
int i_n;
struct in_multi *i_inm;
};
bool in_multi_group(struct in_addr, struct ifnet *, int);
struct in_multi *in_first_multi(struct in_multistep *);
struct in_multi *in_next_multi(struct in_multistep *);
struct in_multi *in_lookup_multi(struct in_addr, struct ifnet *);
struct in_multi *in_addmulti(struct in_addr *, struct ifnet *);
void in_delmulti(struct in_multi *);
void in_multi_lock(int);
void in_multi_unlock(void);
int in_multi_lock_held(void);
struct ifaddr;
int in_ifinit(struct ifnet *, struct in_ifaddr *,
const struct sockaddr_in *, const struct sockaddr_in *, int);
void in_savemkludge(struct in_ifaddr *);
void in_restoremkludge(struct in_ifaddr *, struct ifnet *);
void in_purgemkludge(struct ifnet *);
void in_setmaxmtu(void);
int in_control(struct socket *, u_long, void *, struct ifnet *);
void in_purgeaddr(struct ifaddr *);
void in_purgeif(struct ifnet *);
void in_addrhash_insert(struct in_ifaddr *);
void in_addrhash_remove(struct in_ifaddr *);
int ipflow_fastforward(struct mbuf *);
extern uint16_t ip_id;
extern int ip_do_randomid;
static __inline uint16_t
ip_randomid(void)
{
uint16_t id = (uint16_t)cprng_fast32();
return id ? id : 1;
}
/*
* ip_newid_range: "allocate" num contiguous IP IDs.
*
* => Return the first ID.
*/
static __inline uint16_t
ip_newid_range(const struct in_ifaddr *ia, u_int num)
{
uint16_t id;
if (ip_do_randomid) {
/* XXX ignore num */
return ip_randomid();
}
/* Never allow an IP ID of 0 (detect wrap). */
	if ((uint16_t)(ip_id + num) < ip_id) {
		ip_id = 1;
}
id = htons(ip_id);
ip_id += num;
return id;
}
static __inline uint16_t
ip_newid(const struct in_ifaddr *ia)
{
return ip_newid_range(ia, 1);
}
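/*
 * Illustrative sketch (not compiled): filling in the ID of a hypothetical
 * outgoing IPv4 header "ip" for the source address "ia".  The value
 * returned by ip_newid() is already in network byte order.
 */
#if 0
	ip->ip_id = ip_newid(ia);
#endif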
#ifdef SYSCTLFN_PROTO
int sysctl_inpcblist(SYSCTLFN_PROTO);
#endif
#define LLTABLE(ifp) \
((struct in_ifinfo *)(ifp)->if_afdata[AF_INET])->ii_llt
#endif /* !_KERNEL */
/* INET6 stuff */
#include <netinet6/in6_var.h>
#endif /* !_NETINET_IN_VAR_H_ */
/* $NetBSD: popcount32.c,v 1.5 2015/05/29 19:39:41 matt Exp $ */
/*-
* Copyright (c) 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Joerg Sonnenberger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: popcount32.c,v 1.5 2015/05/29 19:39:41 matt Exp $");
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <limits.h>
#include <stdint.h>
#include <strings.h>
#else
#include <lib/libkern/libkern.h>
#include <machine/limits.h>
#endif
#ifndef popcount32 // might be a builtin
/*
 * This is a hybrid bit-counting algorithm that combines parallel counting
 * with multiplication.  The idea is to sum up the bits in each byte, so
 * that the final accumulation can be done with a single multiplication.
 * If the platform has a slow multiplication instruction, the multiply can
 * be replaced by the commented-out version below.
*/
unsigned int
popcount32(uint32_t v)
{
unsigned int c;
v = v - ((v >> 1) & 0x55555555U);
v = (v & 0x33333333U) + ((v >> 2) & 0x33333333U);
v = (v + (v >> 4)) & 0x0f0f0f0fU;
c = (v * 0x01010101U) >> 24;
/*
* v = (v >> 16) + v;
* v = (v >> 8) + v;
* c = v & 255;
*/
return c;
}
#if UINT_MAX == 0xffffffffU
__strong_alias(popcount, popcount32)
#endif
#if ULONG_MAX == 0xffffffffU
__strong_alias(popcountl, popcount32)
#endif
#endif /* !popcount32 */
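/*
 * Illustrative self-checks (not compiled) showing the expected results of
 * popcount32() for a few inputs; assert() would come from <assert.h> in
 * userland, or could be replaced by KASSERT() in the kernel.
 */
#if 0
	assert(popcount32(0x00000000U) == 0);
	assert(popcount32(0x00000001U) == 1);
	assert(popcount32(0xff00ff00U) == 16);
	assert(popcount32(0xffffffffU) == 32);
#endif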
/* $NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel condition variable implementation.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_condvar.c,v 1.63 2023/11/02 10:31:55 martin Exp $");
#include <sys/param.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* Accessors for the private contents of the kcondvar_t data type.
*
* cv_opaque[0] sleepq_t
* cv_opaque[1] description for ps(1)
*
* cv_opaque[0] is protected by the interlock passed to cv_wait() (enqueue
* only), and the sleep queue lock acquired with sleepq_hashlock() (enqueue
* and dequeue).
*
* cv_opaque[1] (the wmesg) is static and does not change throughout the life
* of the CV.
*/
#define CV_SLEEPQ(cv) ((sleepq_t *)(cv)->cv_opaque)
#define CV_WMESG(cv) ((const char *)(cv)->cv_opaque[1])
#define CV_SET_WMESG(cv, v) (cv)->cv_opaque[1] = __UNCONST(v)
#define CV_DEBUG_P(cv) (CV_WMESG(cv) != nodebug)
#define CV_RA ((uintptr_t)__builtin_return_address(0))
static void cv_unsleep(lwp_t *, bool);
static inline void cv_wakeup_one(kcondvar_t *);
static inline void cv_wakeup_all(kcondvar_t *);
syncobj_t cv_syncobj = {
.sobj_name = "cv",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = cv_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
static const char deadcv[] = "deadcv";
/*
* cv_init:
*
* Initialize a condition variable for use.
*/
void
cv_init(kcondvar_t *cv, const char *wmesg)
{
KASSERT(wmesg != NULL);
CV_SET_WMESG(cv, wmesg);
sleepq_init(CV_SLEEPQ(cv));
}
/*
* cv_destroy:
*
* Tear down a condition variable.
*/
void
cv_destroy(kcondvar_t *cv)
{
sleepq_destroy(CV_SLEEPQ(cv));
#ifdef DIAGNOSTIC
KASSERT(cv_is_valid(cv));
KASSERT(!cv_has_waiters(cv));
CV_SET_WMESG(cv, deadcv);
#endif
}
/*
* cv_enter:
*
* Look up and lock the sleep queue corresponding to the given
* condition variable, and increment the number of waiters.
*/
static inline int
cv_enter(kcondvar_t *cv, kmutex_t *mtx, lwp_t *l, bool catch_p)
{
sleepq_t *sq;
kmutex_t *mp;
int nlocks;
KASSERT(cv_is_valid(cv));
KASSERT(!cpu_intr_p());
KASSERT((l->l_pflag & LP_INTR) == 0 || panicstr != NULL);
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, cv, CV_WMESG(cv), &cv_syncobj, catch_p);
mutex_exit(mtx);
KASSERT(cv_has_waiters(cv));
return nlocks;
}
/*
* cv_unsleep:
*
* Remove an LWP from the condition variable and sleep queue. This
* is called when the LWP has not been awoken normally but instead
* interrupted: for example, when a signal is received. Must be
* called with the LWP locked. Will unlock if "unlock" is true.
*/
static void
cv_unsleep(lwp_t *l, bool unlock)
{
kcondvar_t *cv __diagused;
cv = (kcondvar_t *)(uintptr_t)l->l_wchan;
KASSERT(l->l_wchan == (wchan_t)cv);
KASSERT(l->l_sleepq == CV_SLEEPQ(cv));
KASSERT(cv_is_valid(cv));
KASSERT(cv_has_waiters(cv));
sleepq_unsleep(l, unlock);
}
/*
* cv_wait:
*
* Wait non-interruptibly on a condition variable until awoken.
*/
void
cv_wait(kcondvar_t *cv, kmutex_t *mtx)
{
lwp_t *l = curlwp;
int nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, false);
(void)sleepq_block(0, false, &cv_syncobj, nlocks);
mutex_enter(mtx);
}
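/*
* Typical usage sketch (illustrative only; the "sc" softc fields are
* hypothetical). The condition is always re-checked in a loop, since
* a wakeup may be spurious:
*
* mutex_enter(&sc->sc_lock);
* while (!sc->sc_ready)
* cv_wait(&sc->sc_cv, &sc->sc_lock);
* ... consume the now-true condition ...
* mutex_exit(&sc->sc_lock);
*
* and on the waking side:
*
* mutex_enter(&sc->sc_lock);
* sc->sc_ready = true;
* cv_broadcast(&sc->sc_cv);
* mutex_exit(&sc->sc_lock);
*/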
/*
* cv_wait_sig:
*
* Wait on a condition variable until awoken or a signal is received.
* Will also return early if the process is exiting. Returns zero if
* awoken normally, ERESTART if a signal was received and the system
* call is restartable, or EINTR otherwise.
*/
int
cv_wait_sig(kcondvar_t *cv, kmutex_t *mtx)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, true);
error = sleepq_block(0, true, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
/*
* cv_timedwait:
*
* Wait on a condition variable until awoken or the specified timeout
* expires. Returns zero if awoken normally or EWOULDBLOCK if the
* timeout expired.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
cv_timedwait(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, false);
error = sleepq_block(timo, false, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
/*
* cv_timedwait_sig:
*
* Wait on a condition variable until awoken, the timeout expires, or a
* signal is received. Will also return early if the process is
* exiting. Returns zero if awoken normally, EWOULDBLOCK if the
* timeout expires, ERESTART if a signal was received and the system
* call is restartable, or EINTR otherwise.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
cv_timedwait_sig(kcondvar_t *cv, kmutex_t *mtx, int timo)
{
lwp_t *l = curlwp;
int error, nlocks;
KASSERT(mutex_owned(mtx));
nlocks = cv_enter(cv, mtx, l, true);
error = sleepq_block(timo, true, &cv_syncobj, nlocks);
mutex_enter(mtx);
return error;
}
/*
* Given a number of seconds, sec, and 2^64ths of a second, frac, we
* want a number of ticks for a timeout:
*
* timo = hz*(sec + frac/2^64)
* = hz*sec + hz*frac/2^64
* = hz*sec + hz*(frachi*2^32 + fraclo)/2^64
* = hz*sec + hz*frachi/2^32 + hz*fraclo/2^64,
*
* where frachi is the high 32 bits of frac and fraclo is the
* low 32 bits.
*
* We assume hz < INT_MAX/2 < UINT32_MAX, so
*
* hz*fraclo/2^64 < fraclo*2^32/2^64 <= 1,
*
* since fraclo < 2^32.
*
* We clamp the result at INT_MAX/2 for a timeout in ticks, since we
* can't represent timeouts higher than INT_MAX in cv_timedwait, and
* spurious wakeup is OK. Moreover, we don't want to wrap around,
* because we compute end - start in ticks in order to compute the
* remaining timeout, and that difference cannot wrap around, so we use
* a timeout less than INT_MAX. Using INT_MAX/2 provides plenty of
* margin for paranoia and will exceed most waits in practice by far.
*/
static unsigned
bintime2timo(const struct bintime *bt)
{
KASSERT(hz < INT_MAX/2);
CTASSERT(INT_MAX/2 < UINT32_MAX);
if (bt->sec > ((INT_MAX/2)/hz))
return INT_MAX/2;
if ((hz*(bt->frac >> 32) >> 32) > (INT_MAX/2 - hz*bt->sec))
return INT_MAX/2;
return hz*bt->sec + (hz*(bt->frac >> 32) >> 32);
}
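/*
* Worked example (illustrative), assuming hz = 100: a bintime of
* 1.5 seconds has sec = 1 and frac = 2^63, so
*
* hz*sec = 100
* hz*(frac >> 32) >> 32 = (100 * 2^31) >> 32 = 50
*
* and bintime2timo() returns 150 ticks, i.e. 1.5 seconds at 100 Hz.
*/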
/*
* timo is in units of ticks. We want units of seconds and 2^64ths of
* a second. We know hz = 1 sec/tick, and 2^64 = 1 sec/(2^64th of a
* second), from which we can conclude 2^64 / hz = 1 (2^64th of a
* second)/tick. So for the fractional part, we compute
*
* frac = rem * 2^64 / hz
* = ((rem * 2^32) / hz) * 2^32
*
* Using truncating integer division instead of real division will
* leave us with only about 32 bits of precision, which means about
* 1/4-nanosecond resolution, which is good enough for our purposes.
*/
static struct bintime
timo2bintime(unsigned timo)
{
return (struct bintime) {
.sec = timo / hz,
.frac = (((uint64_t)(timo % hz) << 32)/hz << 32),
};
}
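/*
* Worked example (illustrative), again with hz = 100: timo = 150
* ticks gives sec = 150/100 = 1 and rem = 50, so
*
* frac = ((50 << 32)/100) << 32 = 2^31 << 32 = 2^63,
*
* i.e. half a second, recovering the 1.5-second bintime above.
*/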
/*
* cv_timedwaitbt:
*
* Wait on a condition variable until awoken or the specified
* timeout expires. Returns zero if awoken normally or
* EWOULDBLOCK if the timeout expires.
*
* On entry, bt is a timeout in bintime. cv_timedwaitbt subtracts
* the time slept, so on exit, bt is the time remaining after
* sleeping, clamped to zero if the complete time has elapsed.
* No infinite timeout; use cv_wait instead.
*
* epsilon is a requested maximum error in timeout (excluding
* spurious wakeups). Currently not used, will be used in the
* future to choose between low- and high-resolution timers.
* Actual wakeup time will be somewhere in [t, t + max(e, r) + s)
* where r is the finest resolution of clock available and s is
* scheduling delays for scheduler overhead and competing threads.
* Time is measured by the interrupt source implementing the
* timeout, not by another timecounter.
*/
int
cv_timedwaitbt(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
const struct bintime *epsilon __diagused)
{
struct bintime slept;
unsigned start, end;
int timo;
int error;
KASSERTMSG(bt->sec >= 0, "negative timeout");
KASSERTMSG(epsilon != NULL, "specify maximum requested delay");
/* If there's nothing left to wait for, time out. */
if (bt->sec == 0 && bt->frac == 0)
return EWOULDBLOCK;
/* Convert to ticks, but clamp to be >=1. */
timo = bintime2timo(bt);
KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
if (timo == 0)
timo = 1;
/*
* getticks() is technically int, but nothing special
* happens on overflow, so we assume two's-complement
* wraparound and just treat it as unsigned.
*/
start = getticks();
error = cv_timedwait(cv, mtx, timo);
end = getticks();
/*
* Set it to the time left, or zero, whichever is larger. We
* do not fail with EWOULDBLOCK here because this may have been
* an explicit wakeup, so the caller needs to check before they
* give up or else cv_signal would be lost.
*/
slept = timo2bintime(end - start);
if (bintimecmp(bt, &slept, <=)) {
bt->sec = 0;
bt->frac = 0;
} else {
/* bt := bt - slept */
bintime_sub(bt, &slept);
}
return error;
}
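/*
* Usage sketch (illustrative; the "sc" fields and the zero epsilon are
* hypothetical choices): wait up to one second for a condition,
* resuming with the remaining time after each wakeup:
*
* struct bintime bt = { .sec = 1, .frac = 0 };
* struct bintime epsilon = { .sec = 0, .frac = 0 };
* int error = 0;
*
* mutex_enter(&sc->sc_lock);
* while (!sc->sc_done && error != EWOULDBLOCK)
* error = cv_timedwaitbt(&sc->sc_cv, &sc->sc_lock,
* &bt, &epsilon);
* mutex_exit(&sc->sc_lock);
*/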
/*
* cv_timedwaitbt_sig:
*
* Wait on a condition variable until awoken, the specified
* timeout expires, or interrupted by a signal. Returns zero if
* awoken normally, EWOULDBLOCK if the timeout expires, or
* EINTR/ERESTART if interrupted by a signal.
*
* On entry, bt is a timeout in bintime. cv_timedwaitbt_sig
* subtracts the time slept, so on exit, bt is the time remaining
* after sleeping. No infinite timeout; use cv_wait_sig instead.
*
* epsilon is a requested maximum error in timeout (excluding
* spurious wakeups). Currently not used, will be used in the
* future to choose between low- and high-resolution timers.
*/
int
cv_timedwaitbt_sig(kcondvar_t *cv, kmutex_t *mtx, struct bintime *bt,
const struct bintime *epsilon __diagused)
{
struct bintime slept;
unsigned start, end;
int timo;
int error;
KASSERTMSG(bt->sec >= 0, "negative timeout");
KASSERTMSG(epsilon != NULL, "specify maximum requested delay");
/* If there's nothing left to wait for, time out. */
if (bt->sec == 0 && bt->frac == 0)
return EWOULDBLOCK;
/* Convert to ticks, but clamp to be >=1. */
timo = bintime2timo(bt);
KASSERTMSG(timo >= 0, "negative ticks: %d", timo);
if (timo == 0)
timo = 1;
/*
* getticks() is technically int, but nothing special
* happens on overflow, so we assume two's-complement
* wraparound and just treat it as unsigned.
*/
start = getticks();
error = cv_timedwait_sig(cv, mtx, timo);
end = getticks();
/*
* Set it to the time left, or zero, whichever is larger. We
* do not fail with EWOULDBLOCK here because this may have been
* an explicit wakeup, so the caller needs to check before they
* give up or else cv_signal would be lost.
*/
slept = timo2bintime(end - start);
if (bintimecmp(bt, &slept, <=)) {
bt->sec = 0;
bt->frac = 0;
} else {
/* bt := bt - slept */
bintime_sub(bt, &slept);
}
return error;
}
/*
* cv_signal:
*
* Wake the highest priority LWP waiting on a condition variable. Must
* be called with the interlocking mutex held or just after it has been
* released (so the awoken LWP will see the changed condition).
*/
void
cv_signal(kcondvar_t *cv)
{
KASSERT(cv_is_valid(cv));
if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
/*
* The compiler usually turns this into a tail call (i.e. a jmp),
* because the arguments are the same and there are no locals.
*/
cv_wakeup_one(cv);
}
}
/*
* cv_wakeup_one:
*
* Slow path for cv_signal(). Deliberately marked __noinline to
* prevent the compiler pulling it in to cv_signal(), which adds
* extra prologue and epilogue code.
*/
static __noinline void
cv_wakeup_one(kcondvar_t *cv)
{
sleepq_t *sq;
kmutex_t *mp;
lwp_t *l;
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
if (__predict_true((l = LIST_FIRST(sq)) != NULL)) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
KASSERT(l->l_wchan == cv);
sleepq_remove(sq, l, true);
}
mutex_spin_exit(mp);
}
/*
* cv_broadcast:
*
* Wake all LWPs waiting on a condition variable. Must be called with
* the interlocking mutex held or just after it has been released (so
* the awoken LWP will see the changed condition).
*/
void
cv_broadcast(kcondvar_t *cv)
{
KASSERT(cv_is_valid(cv));
if (__predict_false(!LIST_EMPTY(CV_SLEEPQ(cv)))) {
/*
* The compiler usually turns this into a tail call (i.e. a jmp),
* because the arguments are the same and there are no locals.
*/
cv_wakeup_all(cv);
}
}
/*
* cv_wakeup_all:
*
* Slow path for cv_broadcast(). Deliberately marked __noinline to
* prevent the compiler pulling it in to cv_broadcast(), which adds
* extra prologue and epilogue code.
*/
static __noinline void
cv_wakeup_all(kcondvar_t *cv)
{
sleepq_t *sq;
kmutex_t *mp;
lwp_t *l;
mp = sleepq_hashlock(cv);
sq = CV_SLEEPQ(cv);
while ((l = LIST_FIRST(sq)) != NULL) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
KASSERT(l->l_wchan == cv);
sleepq_remove(sq, l, true);
}
mutex_spin_exit(mp);
}
/*
* cv_has_waiters:
*
* For diagnostic assertions: return non-zero if a condition
* variable has waiters.
*/
bool
cv_has_waiters(kcondvar_t *cv)
{
return !LIST_EMPTY(CV_SLEEPQ(cv));
}
/*
* cv_is_valid:
*
* For diagnostic assertions: return non-zero if a condition
* variable appears to be valid. No locks need be held.
*/
bool
cv_is_valid(kcondvar_t *cv)
{
return CV_WMESG(cv) != deadcv && CV_WMESG(cv) != NULL;
}
/* $NetBSD: scsi_base.c,v 1.93 2019/05/03 16:06:56 mlelstv Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scsi_base.c,v 1.93 2019/05/03 16:06:56 mlelstv Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/errno.h>
#include <sys/device.h>
#include <sys/proc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_disk.h>
#include <dev/scsipi/scsiconf.h>
#include <dev/scsipi/scsipi_base.h>
static void scsi_print_xfer_mode(struct scsipi_periph *);
/*
* Do a SCSI operation, asking the device to run as SCSI-2 if it can.
*/
int
scsi_change_def(struct scsipi_periph *periph, int flags)
{
struct scsi_changedef cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_CHANGE_DEFINITION;
cmd.how = SC_SCSI_2;
return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, 100000, NULL, flags));
}
/*
* Ask the SCSI driver to perform a command for us. Tell it where to
* read/write the data and how long the data is supposed to be. If we
* have a buf to associate with the transfer, we need that too.
*/
void
scsi_scsipi_cmd(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
SC_DEBUG(periph, SCSIPI_DB2, ("scsi_scsipi_cmd\n"));
/*
* Set the LUN in the CDB if we have an older device. We also
* set it for more modern SCSI-2 devices "just in case".
*/
if (periph->periph_version <= 2)
xs->cmd->bytes[0] |=
((periph->periph_lun << SCSI_CMD_LUN_SHIFT) &
SCSI_CMD_LUN_MASK);
}
/*
* Utility routines often used in SCSI code
*/
/*
* Print out the periph's address info.
*/
void
scsi_print_addr(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
struct scsipi_adapter *adapt = chan->chan_adapter;
printf("%s(%s:%d:%d:%d): ", periph->periph_dev != NULL ?
device_xname(periph->periph_dev) : "probe",
device_xname(adapt->adapt_dev),
chan->chan_channel, periph->periph_target,
periph->periph_lun);
}
/*
* Kill off all pending xfers for a periph.
*
* Must be called with channel lock held
*/
void
scsi_kill_pending(struct scsipi_periph *periph)
{
struct scsipi_xfer *xs;
TAILQ_FOREACH(xs, &periph->periph_xferq, device_q) {
callout_stop(&xs->xs_callout);
scsi_print_addr(periph);
printf("killed ");
scsipi_print_cdb(xs->cmd);
xs->error = XS_DRIVER_STUFFUP;
scsipi_done(xs);
}
}
/*
* scsi_print_xfer_mode:
*
* Print a parallel SCSI periph's capabilities.
*/
static void
scsi_print_xfer_mode(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
struct scsipi_adapter *adapt = chan->chan_adapter;
int period, freq, speed, mbs;
if (periph->periph_dev)
aprint_normal_dev(periph->periph_dev, "");
else
aprint_normal("probe(%s:%d:%d:%d): ",
device_xname(adapt->adapt_dev),
chan->chan_channel, periph->periph_target,
periph->periph_lun);
if (periph->periph_mode & (PERIPH_CAP_SYNC | PERIPH_CAP_DT)) {
period = scsipi_sync_factor_to_period(periph->periph_period);
aprint_normal("sync (%d.%02dns offset %d)",
period / 100, period % 100, periph->periph_offset);
} else
aprint_normal("async");
if (periph->periph_mode & PERIPH_CAP_WIDE32)
aprint_normal(", 32-bit");
else if (periph->periph_mode & (PERIPH_CAP_WIDE16 | PERIPH_CAP_DT))
aprint_normal(", 16-bit");
else
aprint_normal(", 8-bit");
if (periph->periph_mode & (PERIPH_CAP_SYNC | PERIPH_CAP_DT)) {
freq = scsipi_sync_factor_to_freq(periph->periph_period);
speed = freq;
if (periph->periph_mode & PERIPH_CAP_WIDE32)
speed *= 4;
else if (periph->periph_mode &
(PERIPH_CAP_WIDE16 | PERIPH_CAP_DT))
speed *= 2;
mbs = speed / 1000;
if (mbs > 0) {
aprint_normal(" (%d.%03dMB/s)", mbs,
speed % 1000);
} else
aprint_normal(" (%dKB/s)", speed % 1000);
}
aprint_normal(" transfers");
if (periph->periph_mode & PERIPH_CAP_TQING)
aprint_normal(", tagged queueing");
aprint_normal("\n");
}
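/*
* For example (illustrative), assuming scsipi_sync_factor_to_freq()
* reports 20000 for a Fast-20 period factor: a wide-16 device doubles
* that to speed = 40000, so the line printed above ends in
* "16-bit (40.000MB/s) transfers".
*/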
/*
* scsi_async_event_xfer_mode:
*
* Update the xfer mode for all parallel SCSI periphs sharing the
* specified I_T Nexus.
*/
void
scsi_async_event_xfer_mode(struct scsipi_channel *chan, void *arg)
{
struct scsipi_xfer_mode *xm = arg;
struct scsipi_periph *periph;
int lun, announce, mode, period, offset;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, xm->xm_target, lun);
if (periph == NULL)
continue;
announce = 0;
/*
* Clamp the xfer mode down to this periph's capabilities.
*/
mode = xm->xm_mode & periph->periph_cap;
if (mode & PERIPH_CAP_SYNC) {
period = xm->xm_period;
offset = xm->xm_offset;
} else {
period = 0;
offset = 0;
}
/*
* If we do not have a valid xfer mode yet, or the parameters
* are different, announce them.
*/
if ((periph->periph_flags & PERIPH_MODE_VALID) == 0 ||
periph->periph_mode != mode ||
periph->periph_period != period ||
periph->periph_offset != offset)
announce = 1;
periph->periph_mode = mode;
periph->periph_period = period;
periph->periph_offset = offset;
periph->periph_flags |= PERIPH_MODE_VALID;
if (announce)
scsi_print_xfer_mode(periph);
}
}
/*
* scsi_fc_sas_async_event_xfer_mode:
*
* Update the xfer mode for all SAS/FC periphs sharing the
* specified I_T Nexus.
*/
void
scsi_fc_sas_async_event_xfer_mode(struct scsipi_channel *chan, void *arg)
{
struct scsipi_xfer_mode *xm = arg;
struct scsipi_periph *periph;
int lun, announce, mode;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, xm->xm_target, lun);
if (periph == NULL)
continue;
announce = 0;
/*
* Clamp the xfer mode down to this periph's capabilities.
*/
mode = xm->xm_mode & periph->periph_cap;
/*
* If we do not have a valid xfer mode yet, or the parameters
* are different, announce them.
*/
if ((periph->periph_flags & PERIPH_MODE_VALID) == 0 ||
periph->periph_mode != mode)
announce = 1;
periph->periph_mode = mode;
periph->periph_flags |= PERIPH_MODE_VALID;
if (announce &&
(periph->periph_mode & PERIPH_CAP_TQING) != 0) {
aprint_normal_dev(periph->periph_dev,
"tagged queueing\n");
}
}
}
/* $NetBSD: hci_ioctl.c,v 1.15 2021/09/21 15:03:08 christos Exp $ */
/*-
* Copyright (c) 2005 Iain Hibbert.
* Copyright (c) 2006 Itronix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Itronix Inc. may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: hci_ioctl.c,v 1.15 2021/09/21 15:03:08 christos Exp $");
#include <sys/param.h>
#include <sys/domain.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <netbt/bluetooth.h>
#include <netbt/hci.h>
#include <netbt/l2cap.h>
#include <netbt/rfcomm.h>
#ifdef BLUETOOTH_DEBUG
#define BDADDR(bd) (bd).b[5], (bd).b[4], (bd).b[3], \
(bd).b[2], (bd).b[1], (bd).b[0]
static void
hci_dump(void)
{
struct hci_unit *unit;
struct hci_link *link;
struct l2cap_channel *chan;
struct rfcomm_session *rs;
struct rfcomm_dlc *dlc;
uprintf("HCI:\n");
SIMPLEQ_FOREACH(unit, &hci_unit_list, hci_next) {
uprintf("UNIT %s: flags 0x%4.4x, "
"num_cmd=%d, num_acl=%d, num_sco=%d\n",
device_xname(unit->hci_dev), unit->hci_flags,
unit->hci_num_cmd_pkts,
unit->hci_num_acl_pkts,
unit->hci_num_sco_pkts);
TAILQ_FOREACH(link, &unit->hci_links, hl_next) {
uprintf("+HANDLE #%d: %s "
"raddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, "
"state %d, refcnt %d\n",
link->hl_handle,
(link->hl_type == HCI_LINK_ACL ? "ACL":"SCO"),
BDADDR(link->hl_bdaddr),
link->hl_state, link->hl_refcnt);
}
}
uprintf("L2CAP:\n");
LIST_FOREACH(chan, &l2cap_active_list, lc_ncid) {
uprintf("CID #%d state %d, psm=0x%4.4x, "
"laddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, "
"raddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n",
chan->lc_lcid, chan->lc_state, chan->lc_raddr.bt_psm,
BDADDR(chan->lc_laddr.bt_bdaddr),
BDADDR(chan->lc_raddr.bt_bdaddr));
}
LIST_FOREACH(chan, &l2cap_listen_list, lc_ncid) {
uprintf("LISTEN psm=0x%4.4x, "
"laddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n",
chan->lc_laddr.bt_psm,
BDADDR(chan->lc_laddr.bt_bdaddr));
}
uprintf("RFCOMM:\n");
LIST_FOREACH(rs, &rfcomm_session_active, rs_next) {
chan = rs->rs_l2cap;
uprintf("SESSION: state=%d, flags=0x%4.4x, psm 0x%4.4x "
"laddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, "
"raddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n",
rs->rs_state, rs->rs_flags, chan->lc_raddr.bt_psm,
BDADDR(chan->lc_laddr.bt_bdaddr),
BDADDR(chan->lc_raddr.bt_bdaddr));
LIST_FOREACH(dlc, &rs->rs_dlcs, rd_next) {
uprintf("+DLC channel=%d, dlci=%d, "
"state=%d, flags=0x%4.4x, rxcred=%d, rxsize=%ld, "
"txcred=%d, pending=%d, txqlen=%d\n",
dlc->rd_raddr.bt_channel, dlc->rd_dlci,
dlc->rd_state, dlc->rd_flags,
dlc->rd_rxcred, (unsigned long)dlc->rd_rxsize,
dlc->rd_txcred, dlc->rd_pending,
(dlc->rd_txbuf ? dlc->rd_txbuf->m_pkthdr.len : 0));
}
}
LIST_FOREACH(rs, &rfcomm_session_listen, rs_next) {
chan = rs->rs_l2cap;
uprintf("LISTEN: psm 0x%4.4x, "
"laddr=%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n",
chan->lc_laddr.bt_psm,
BDADDR(chan->lc_laddr.bt_bdaddr));
LIST_FOREACH(dlc, &rs->rs_dlcs, rd_next)
uprintf("+DLC channel=%d\n", dlc->rd_laddr.bt_channel);
}
}
#undef BDADDR
#endif
int
hci_ioctl_pcb(unsigned long cmd, void *data)
{
struct btreq *btr = data;
struct hci_unit *unit;
int err = 0;
DPRINTFN(1, "cmd %#lx\n", cmd);
switch(cmd) {
#ifdef BLUETOOTH_DEBUG
case SIOCBTDUMP:
hci_dump();
return 0;
#endif
/*
* Get unit info based on address rather than name
*/
case SIOCGBTINFOA:
unit = hci_unit_lookup(&btr->btr_bdaddr);
if (unit == NULL)
return ENXIO;
break;
/*
* The remaining ioctl's all use the same btreq structure and
* index on the name of the device, so we look that up first.
*/
case SIOCNBTINFO:
/* empty name means give the first unit */
if (btr->btr_name[0] == '\0') {
unit = NULL;
break;
}
/* else fall through and look it up */
/* FALLTHROUGH */
case SIOCGBTINFO:
case SIOCSBTFLAGS:
case SIOCSBTPOLICY:
case SIOCSBTPTYPE:
case SIOCGBTSTATS:
case SIOCZBTSTATS:
case SIOCSBTSCOMTU:
case SIOCGBTFEAT:
SIMPLEQ_FOREACH(unit, &hci_unit_list, hci_next) {
if (strncmp(device_xname(unit->hci_dev),
btr->btr_name, HCI_DEVNAME_SIZE) == 0)
break;
}
if (unit == NULL)
return ENXIO;
break;
default: /* not one of mine */
return EPASSTHROUGH;
}
switch(cmd) {
case SIOCNBTINFO: /* get next info */
if (unit)
unit = SIMPLEQ_NEXT(unit, hci_next);
else
unit = SIMPLEQ_FIRST(&hci_unit_list);
if (unit == NULL) {
err = ENXIO;
break;
}
/* FALLTHROUGH */
case SIOCGBTINFO: /* get unit info */
/* FALLTHROUGH */
case SIOCGBTINFOA: /* get info by address */
memset(btr, 0, sizeof(struct btreq));
strlcpy(btr->btr_name, device_xname(unit->hci_dev), HCI_DEVNAME_SIZE);
bdaddr_copy(&btr->btr_bdaddr, &unit->hci_bdaddr);
btr->btr_flags = unit->hci_flags;
btr->btr_num_cmd = unit->hci_num_cmd_pkts;
btr->btr_num_acl = unit->hci_num_acl_pkts;
btr->btr_num_sco = unit->hci_num_sco_pkts;
btr->btr_acl_mtu = unit->hci_max_acl_size;
btr->btr_sco_mtu = unit->hci_max_sco_size;
btr->btr_max_acl = unit->hci_max_acl_pkts;
btr->btr_max_sco = unit->hci_max_sco_pkts;
btr->btr_packet_type = unit->hci_packet_type;
btr->btr_link_policy = unit->hci_link_policy;
break;
case SIOCSBTFLAGS: /* set unit flags (privileged) */
err = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd),
btr, NULL);
if (err)
break;
if ((unit->hci_flags & BTF_UP) && (btr->btr_flags & BTF_UP) == 0) {
hci_disable(unit);
unit->hci_flags &= ~BTF_UP;
}
unit->hci_flags &= ~BTF_MASTER;
unit->hci_flags |= (btr->btr_flags & (BTF_INIT | BTF_MASTER));
if ((unit->hci_flags & BTF_UP) == 0 && (btr->btr_flags & BTF_UP)) {
err = hci_enable(unit);
if (err)
break;
unit->hci_flags |= BTF_UP;
}
btr->btr_flags = unit->hci_flags;
break;
case SIOCSBTPOLICY: /* set unit link policy (privileged) */
err = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd),
btr, NULL);
if (err)
break;
unit->hci_link_policy = btr->btr_link_policy;
unit->hci_link_policy &= unit->hci_lmp_mask;
btr->btr_link_policy = unit->hci_link_policy;
break;
case SIOCSBTPTYPE: /* set unit packet types (privileged) */
err = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd),
btr, NULL);
if (err)
break;
unit->hci_packet_type = btr->btr_packet_type;
unit->hci_packet_type &= unit->hci_acl_mask;
btr->btr_packet_type = unit->hci_packet_type;
break;
case SIOCGBTSTATS: /* get unit statistics */
(*unit->hci_if->get_stats)(unit->hci_dev, &btr->btr_stats, 0);
break;
case SIOCZBTSTATS: /* get & reset unit statistics */
err = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd),
btr, NULL);
if (err)
break;
(*unit->hci_if->get_stats)(unit->hci_dev, &btr->btr_stats, 1);
break;
case SIOCSBTSCOMTU: /* set sco_mtu value for unit */
/*
* This is a temporary ioctl and may not be supported
* in the future. It exists because if SCO packets that are not
* an integer number of frame sizes are sent to USB Bluetooth
* controllers, the USB bus locks up.
*/
err = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_BLUETOOTH_SETPRIV, unit, KAUTH_ARG(cmd),
btr, NULL);
if (err)
break;
unit->hci_max_sco_size = btr->btr_sco_mtu;
break;
case SIOCGBTFEAT: /* get unit features */
memset(btr, 0, sizeof(struct btreq));
strlcpy(btr->btr_name, device_xname(unit->hci_dev), HCI_DEVNAME_SIZE);
memcpy(btr->btr_features0, unit->hci_feat0, HCI_FEATURES_SIZE);
memcpy(btr->btr_features1, unit->hci_feat1, HCI_FEATURES_SIZE);
memcpy(btr->btr_features2, unit->hci_feat2, HCI_FEATURES_SIZE);
break;
default:
err = EFAULT;
break;
}
return err;
}
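/*
* Userland sketch (illustrative): these ioctls are issued on a
* Bluetooth HCI socket and, except for SIOCGBTINFOA, are keyed by
* the device name, e.g.
*
* struct btreq btr;
*
* memset(&btr, 0, sizeof(btr));
* strlcpy(btr.btr_name, "ubt0", sizeof(btr.btr_name));
* if (ioctl(s, SIOCGBTINFO, &btr) == 0)
* ... btr.btr_bdaddr, btr.btr_flags etc. are now filled in ...
*/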
/* $NetBSD: ufs_wapbl.h,v 1.19 2020/04/11 17:43:54 jdolecek Exp $ */
/*-
* Copyright (c) 2003,2006,2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _UFS_UFS_UFS_WAPBL_H_
#define _UFS_UFS_UFS_WAPBL_H_
#if defined(_KERNEL_OPT)
#include "opt_wapbl.h"
#endif
/*
* Information for the journal location stored in the superblock.
* We store the journal version, some flags, the journal location
* type, and some location specific "locators" that identify where
* the log itself is located.
*/
/* fs->fs_journal_version */
#define UFS_WAPBL_VERSION 1
/* fs->fs_journal_location */
#define UFS_WAPBL_JOURNALLOC_NONE 0
#define UFS_WAPBL_JOURNALLOC_END_PARTITION 1
#define UFS_WAPBL_EPART_ADDR 0 /* locator slots */
#define UFS_WAPBL_EPART_COUNT 1
#define UFS_WAPBL_EPART_BLKSZ 2
#define UFS_WAPBL_EPART_UNUSED 3
#define UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM 2
#define UFS_WAPBL_INFS_ADDR 0 /* locator slots */
#define UFS_WAPBL_INFS_COUNT 1
#define UFS_WAPBL_INFS_BLKSZ 2
#define UFS_WAPBL_INFS_INO 3
/* fs->fs_journal_flags */
#define UFS_WAPBL_FLAGS_CREATE_LOG 0x1
#define UFS_WAPBL_FLAGS_CLEAR_LOG 0x2
/*
* The journal size is limited to between 1MB and 64MB.
* The default journal size is the filesystem size divided by
* the scale factor - this is 1M of journal per 1GB of filesystem
* space.
*
* XXX: Is 64MB too limiting? If user explicitly asks for more, allow it?
*/
#define UFS_WAPBL_JOURNAL_SCALE 1024
#define UFS_WAPBL_MIN_JOURNAL_SIZE (1024 * 1024)
#define UFS_WAPBL_MAX_JOURNAL_SIZE (64 * 1024 * 1024)
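/*
* For example, with UFS_WAPBL_JOURNAL_SCALE = 1024 a 10GB file system
* gets a 10MB journal, a 512MB file system would compute 512KB and is
* clamped to the 1MB minimum, and anything over 64GB is clamped to
* the 64MB maximum.
*/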
#if defined(WAPBL)
static __inline int
ufs_wapbl_begin(struct mount *mp, const char *file, int line)
{
if (mp->mnt_wapbl) {
int error;
error = wapbl_begin(mp->mnt_wapbl, file, line);
if (error)
return error;
}
return 0;
}
static __inline void
ufs_wapbl_end(struct mount *mp)
{
if (mp->mnt_wapbl) {
wapbl_end(mp->mnt_wapbl);
}
}
#define UFS_WAPBL_BEGIN(mp) \
ufs_wapbl_begin(mp, __func__, __LINE__)
#define UFS_WAPBL_END(mp) ufs_wapbl_end(mp)
#define UFS_WAPBL_UPDATE(vp, access, modify, flags) \
if ((vp)->v_mount->mnt_wapbl) { \
UFS_UPDATE(vp, access, modify, flags); \
}
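/*
* Usage sketch (illustrative): metadata operations in the UFS code are
* bracketed with these macros so that the changes land in the journal:
*
* if ((error = UFS_WAPBL_BEGIN(mp)) != 0)
* return error;
* ... modify metadata, e.g. UFS_WAPBL_UPDATE(vp, NULL, NULL, 0) ...
* UFS_WAPBL_END(mp);
*/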
#ifdef DIAGNOSTIC
#define UFS_WAPBL_JLOCK_ASSERT(mp) \
if (mp->mnt_wapbl) wapbl_jlock_assert(mp->mnt_wapbl)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp) \
if (mp->mnt_wapbl) wapbl_junlock_assert(mp->mnt_wapbl)
#else
#define UFS_WAPBL_JLOCK_ASSERT(mp)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp)
#endif
#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) \
if (mp->mnt_wapbl) wapbl_register_inode(mp->mnt_wapbl, ino, mode)
#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) \
if (mp->mnt_wapbl) wapbl_unregister_inode(mp->mnt_wapbl, ino, mode)
#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len, cookiep) \
(mp->mnt_wapbl) \
? wapbl_register_deallocation(mp->mnt_wapbl, blk, len, \
false, cookiep) \
: 0
#define UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(mp, blk, len) \
( \
(mp->mnt_wapbl) \
? wapbl_register_deallocation(mp->mnt_wapbl, blk, len, \
true, NULL) \
: 0 \
)
#define UFS_WAPBL_UNREGISTER_DEALLOCATION(mp, cookie) \
if (mp->mnt_wapbl) wapbl_unregister_deallocation(mp->mnt_wapbl, cookie)
#else /* ! WAPBL */
#define UFS_WAPBL_BEGIN(mp) (__USE(mp), 0)
#define UFS_WAPBL_END(mp) do { } while (0)
#define UFS_WAPBL_UPDATE(vp, access, modify, flags) do { } while (0)
#define UFS_WAPBL_JLOCK_ASSERT(mp)
#define UFS_WAPBL_JUNLOCK_ASSERT(mp)
#define UFS_WAPBL_REGISTER_INODE(mp, ino, mode) do { } while (0)
#define UFS_WAPBL_UNREGISTER_INODE(mp, ino, mode) do { } while (0)
#define UFS_WAPBL_REGISTER_DEALLOCATION(mp, blk, len, cookiep) 0
#define UFS_WAPBL_REGISTER_DEALLOCATION_FORCE(mp, blk, len) 0
#define UFS_WAPBL_UNREGISTER_DEALLOCATION(mp, cookie) do { } while (0)
#endif
#endif /* !_UFS_UFS_UFS_WAPBL_H_ */
/* $NetBSD: prop_number.c,v 1.34 2022/08/03 21:13:46 riastradh Exp $ */
/*-
* Copyright (c) 2006, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include "prop_object_impl.h"
#include <prop/prop_number.h>
#include <sys/rbtree.h>
#if defined(_KERNEL)
#include <sys/systm.h>
#elif defined(_STANDALONE)
#include <sys/param.h>
#include <lib/libkern/libkern.h>
#else
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#endif
struct _prop_number_value {
union {
int64_t pnu_signed;
uint64_t pnu_unsigned;
} pnv_un;
#define pnv_signed pnv_un.pnu_signed
#define pnv_unsigned pnv_un.pnu_unsigned
unsigned int pnv_is_unsigned :1,
:31;
};
struct _prop_number {
struct _prop_object pn_obj;
struct rb_node pn_link;
struct _prop_number_value pn_value;
};
_PROP_POOL_INIT(_prop_number_pool, sizeof(struct _prop_number), "propnmbr")
static _prop_object_free_rv_t
_prop_number_free(prop_stack_t, prop_object_t *);
static bool _prop_number_externalize(
struct _prop_object_externalize_context *,
void *);
static _prop_object_equals_rv_t
_prop_number_equals(prop_object_t, prop_object_t,
void **, void **,
prop_object_t *, prop_object_t *);
static void _prop_number_lock(void);
static void _prop_number_unlock(void);
static const struct _prop_object_type _prop_object_type_number = {
.pot_type = PROP_TYPE_NUMBER,
.pot_free = _prop_number_free,
.pot_extern = _prop_number_externalize,
.pot_equals = _prop_number_equals,
.pot_lock = _prop_number_lock,
.pot_unlock = _prop_number_unlock,
};
#define prop_object_is_number(x) \
((x) != NULL && (x)->pn_obj.po_type == &_prop_object_type_number)
/*
* Number objects are immutable, and we are likely to have many number
* objects that have the same value. So, to save memory, we unique'ify
* numbers so we only have one copy of each.
*/
static int
_prop_number_compare_values(const struct _prop_number_value *pnv1,
const struct _prop_number_value *pnv2)
{
/* Signed numbers are sorted before unsigned numbers. */
if (pnv1->pnv_is_unsigned) {
if (! pnv2->pnv_is_unsigned)
return (1);
if (pnv1->pnv_unsigned < pnv2->pnv_unsigned)
return (-1);
if (pnv1->pnv_unsigned > pnv2->pnv_unsigned)
return (1);
return (0);
}
if (pnv2->pnv_is_unsigned)
return (-1);
if (pnv1->pnv_signed < pnv2->pnv_signed)
return (-1);
if (pnv1->pnv_signed > pnv2->pnv_signed)
return (1);
return (0);
}
static int
/*ARGSUSED*/
_prop_number_rb_compare_nodes(void *ctx _PROP_ARG_UNUSED,
const void *n1, const void *n2)
{
const struct _prop_number *pn1 = n1;
const struct _prop_number *pn2 = n2;
return _prop_number_compare_values(&pn1->pn_value, &pn2->pn_value);
}
static int
/*ARGSUSED*/
_prop_number_rb_compare_key(void *ctx _PROP_ARG_UNUSED,
const void *n, const void *v)
{
const struct _prop_number *pn = n;
const struct _prop_number_value *pnv = v;
return _prop_number_compare_values(&pn->pn_value, pnv);
}
static const rb_tree_ops_t _prop_number_rb_tree_ops = {
.rbto_compare_nodes = _prop_number_rb_compare_nodes,
.rbto_compare_key = _prop_number_rb_compare_key,
.rbto_node_offset = offsetof(struct _prop_number, pn_link),
.rbto_context = NULL
};
static struct rb_tree _prop_number_tree;
_PROP_MUTEX_DECL_STATIC(_prop_number_tree_mutex)
/* ARGSUSED */
static _prop_object_free_rv_t
_prop_number_free(prop_stack_t stack, prop_object_t *obj)
{
prop_number_t pn = *obj;
rb_tree_remove_node(&_prop_number_tree, pn);
_PROP_POOL_PUT(_prop_number_pool, pn);
return (_PROP_OBJECT_FREE_DONE);
}
_PROP_ONCE_DECL(_prop_number_init_once)
static int
_prop_number_init(void)
{
_PROP_MUTEX_INIT(_prop_number_tree_mutex);
rb_tree_init(&_prop_number_tree, &_prop_number_rb_tree_ops);
return 0;
}
static void
_prop_number_lock(void)
{
/* XXX: init necessary? */
_PROP_ONCE_RUN(_prop_number_init_once, _prop_number_init);
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
}
static void
_prop_number_unlock(void)
{
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
}
static bool
_prop_number_externalize(struct _prop_object_externalize_context *ctx,
void *v)
{
prop_number_t pn = v;
char tmpstr[32];
/*
* For unsigned numbers, we output in hex. For signed numbers,
* we output in decimal.
*/
if (pn->pn_value.pnv_is_unsigned)
snprintf(tmpstr, sizeof(tmpstr), "0x%" PRIx64,
pn->pn_value.pnv_unsigned);
else
snprintf(tmpstr, sizeof(tmpstr), "%" PRIi64,
pn->pn_value.pnv_signed);
if (_prop_object_externalize_start_tag(ctx, "integer") == false ||
_prop_object_externalize_append_cstring(ctx, tmpstr) == false ||
_prop_object_externalize_end_tag(ctx, "integer") == false)
return (false);
return (true);
}
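/*
* For example, prop_number_create_unsigned(4660) externalizes as
*
* <integer>0x1234</integer>
*
* while prop_number_create_signed(-42) externalizes as
*
* <integer>-42</integer>
*/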
/* ARGSUSED */
static _prop_object_equals_rv_t
_prop_number_equals(prop_object_t v1, prop_object_t v2,
void **stored_pointer1, void **stored_pointer2,
prop_object_t *next_obj1, prop_object_t *next_obj2)
{
prop_number_t num1 = v1;
prop_number_t num2 = v2;
/*
* There is only ever one copy of a number object at any given
* time, so we can reduce this to a simple pointer equality check
* in the common case.
*/
if (num1 == num2)
return (_PROP_OBJECT_EQUALS_TRUE);
/*
* If the numbers are the same signed-ness, then we know they
* cannot be equal because they would have had pointer equality.
*/
if (num1->pn_value.pnv_is_unsigned == num2->pn_value.pnv_is_unsigned)
return (_PROP_OBJECT_EQUALS_FALSE);
/*
* We now have one signed value and one unsigned value. We can
* compare them iff:
* - The unsigned value is not larger than the signed value
* can represent.
* - The signed value is not smaller than the unsigned value
* can represent.
*/
if (num1->pn_value.pnv_is_unsigned) {
/*
* num1 is unsigned and num2 is signed.
*/
if (num1->pn_value.pnv_unsigned > INTMAX_MAX)
return (_PROP_OBJECT_EQUALS_FALSE);
if (num2->pn_value.pnv_signed < 0)
return (_PROP_OBJECT_EQUALS_FALSE);
} else {
/*
* num1 is signed and num2 is unsigned.
*/
if (num1->pn_value.pnv_signed < 0)
return (_PROP_OBJECT_EQUALS_FALSE);
if (num2->pn_value.pnv_unsigned > INTMAX_MAX)
return (_PROP_OBJECT_EQUALS_FALSE);
}
if (num1->pn_value.pnv_signed == num2->pn_value.pnv_signed)
return _PROP_OBJECT_EQUALS_TRUE;
else
return _PROP_OBJECT_EQUALS_FALSE;
}
static prop_number_t
_prop_number_alloc(const struct _prop_number_value *pnv)
{
prop_number_t opn, pn, rpn;
_PROP_ONCE_RUN(_prop_number_init_once, _prop_number_init);
/*
* Check to see if this already exists in the tree. If it does,
* we just retain it and return it.
*/
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
opn = rb_tree_find_node(&_prop_number_tree, pnv);
if (opn != NULL) {
prop_object_retain(opn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
return (opn);
}
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
/*
* Not in the tree. Create it now.
*/
pn = _PROP_POOL_GET(_prop_number_pool);
if (pn == NULL)
return (NULL);
_prop_object_init(&pn->pn_obj, &_prop_object_type_number);
pn->pn_value = *pnv;
/*
* We dropped the mutex when we allocated the new object, so
* we have to check again if it is in the tree.
*/
_PROP_MUTEX_LOCK(_prop_number_tree_mutex);
opn = rb_tree_find_node(&_prop_number_tree, pnv);
if (opn != NULL) {
prop_object_retain(opn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
_PROP_POOL_PUT(_prop_number_pool, pn);
return (opn);
}
rpn = rb_tree_insert_node(&_prop_number_tree, pn);
_PROP_ASSERT(rpn == pn);
_PROP_MUTEX_UNLOCK(_prop_number_tree_mutex);
return (rpn);
}
/*
* prop_number_create_signed --
* Create a prop_number_t and initialize it with the
* provided signed value.
*/
prop_number_t
prop_number_create_signed(intmax_t val)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
pnv.pnv_signed = val;
pnv.pnv_is_unsigned = false;
return (_prop_number_alloc(&pnv));
}
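/*
* Because numbers are uniquified, creating the same value twice just
* retains the existing object. For example (illustrative):
*
* prop_number_t a = prop_number_create_signed(42);
* prop_number_t b = prop_number_create_signed(42);
*
* leaves a == b pointing at one object with a reference count of two;
* each reference must still be dropped with prop_object_release().
*/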
_PROP_DEPRECATED(prop_number_create_integer,
"this program uses prop_number_create_integer(), "
"which is deprecated; use prop_number_create_signed() instead.")
prop_number_t
prop_number_create_integer(int64_t val)
{
return prop_number_create_signed(val);
}
/*
* prop_number_create_unsigned --
* Create a prop_number_t and initialize it with the
* provided unsigned value.
*/
prop_number_t
prop_number_create_unsigned(uintmax_t val)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
pnv.pnv_unsigned = val;
pnv.pnv_is_unsigned = true;
return (_prop_number_alloc(&pnv));
}
_PROP_DEPRECATED(prop_number_create_unsigned_integer,
"this program uses prop_number_create_unsigned_integer(), "
"which is deprecated; use prop_number_create_unsigned() instead.")
prop_number_t
prop_number_create_unsigned_integer(uint64_t val)
{
return prop_number_create_unsigned(val);
}
/*
* prop_number_copy --
* Copy a prop_number_t.
*/
prop_number_t
prop_number_copy(prop_number_t opn)
{
if (! prop_object_is_number(opn))
return (NULL);
/*
* Because we only ever allocate one object for any given
* value, this can be reduced to a simple retain operation.
*/
prop_object_retain(opn);
return (opn);
}
/*
* prop_number_unsigned --
* Returns true if the prop_number_t has an unsigned value.
*/
bool
prop_number_unsigned(prop_number_t pn)
{
return (pn->pn_value.pnv_is_unsigned);
}
/*
* prop_number_size --
* Return the size, in bits, required to hold the value of
* the specified number.
*/
int
prop_number_size(prop_number_t pn)
{
struct _prop_number_value *pnv;
if (! prop_object_is_number(pn))
return (0);
pnv = &pn->pn_value;
if (pnv->pnv_is_unsigned) {
if (pnv->pnv_unsigned > UINT32_MAX)
return (64);
if (pnv->pnv_unsigned > UINT16_MAX)
return (32);
if (pnv->pnv_unsigned > UINT8_MAX)
return (16);
return (8);
}
if (pnv->pnv_signed > INT32_MAX || pnv->pnv_signed < INT32_MIN)
return (64);
if (pnv->pnv_signed > INT16_MAX || pnv->pnv_signed < INT16_MIN)
return (32);
if (pnv->pnv_signed > INT8_MAX || pnv->pnv_signed < INT8_MIN)
return (16);
return (8);
}
/*
* prop_number_signed_value --
* Get the signed value of a prop_number_t.
*/
intmax_t
prop_number_signed_value(prop_number_t pn)
{
/*
* XXX Impossible to distinguish between "not a prop_number_t"
* XXX and "prop_number_t has a value of 0".
*/
if (! prop_object_is_number(pn))
return (0);
return (pn->pn_value.pnv_signed);
}
_PROP_DEPRECATED(prop_number_integer_value,
"this program uses prop_number_integer_value(), "
"which is deprecated; use prop_number_signed_value() instead.")
int64_t
prop_number_integer_value(prop_number_t pn)
{
return prop_number_signed_value(pn);
}
/*
* prop_number_unsigned_value --
* Get the unsigned value of a prop_number_t.
*/
uintmax_t
prop_number_unsigned_value(prop_number_t pn)
{
/*
* XXX Impossible to distinguish between "not a prop_number_t"
* XXX and "prop_number_t has a value of 0".
*/
if (! prop_object_is_number(pn))
return (0);
return (pn->pn_value.pnv_unsigned);
}
_PROP_DEPRECATED(prop_number_unsigned_integer_value,
"this program uses prop_number_unsigned_integer_value(), "
"which is deprecated; use prop_number_unsigned_value() instead.")
uint64_t
prop_number_unsigned_integer_value(prop_number_t pn)
{
return prop_number_unsigned_value(pn);
}
/*
* prop_number_[...]_value --
* Retrieve the bounds-checked value as the specified type.
* Returns true if successful.
*/
#define TEMPLATE(name, typ, minv, maxv) \
bool \
prop_number_ ## name ## _value(prop_number_t pn, typ * const valp) \
{ \
\
if (! prop_object_is_number(pn)) \
return (false); \
\
if (pn->pn_value.pnv_is_unsigned) { \
if (pn->pn_value.pnv_unsigned > (maxv)) \
return (false); \
*valp = (typ) pn->pn_value.pnv_unsigned; \
} else { \
if ((pn->pn_value.pnv_signed > 0 && \
(uintmax_t)pn->pn_value.pnv_signed > (maxv)) || \
pn->pn_value.pnv_signed < (minv)) \
return (false); \
*valp = (typ) pn->pn_value.pnv_signed; \
} \
\
return (true); \
}
TEMPLATE(schar, signed char, SCHAR_MIN, SCHAR_MAX)
TEMPLATE(short, short, SHRT_MIN, SHRT_MAX)
TEMPLATE(int, int, INT_MIN, INT_MAX)
TEMPLATE(long, long, LONG_MIN, LONG_MAX)
TEMPLATE(longlong, long long, LLONG_MIN, LLONG_MAX)
TEMPLATE(intptr, intptr_t, INTPTR_MIN, INTPTR_MAX)
TEMPLATE(int8, int8_t, INT8_MIN, INT8_MAX)
TEMPLATE(int16, int16_t, INT16_MIN, INT16_MAX)
TEMPLATE(int32, int32_t, INT32_MIN, INT32_MAX)
TEMPLATE(int64, int64_t, INT64_MIN, INT64_MAX)
TEMPLATE(uchar, unsigned char, 0, UCHAR_MAX)
TEMPLATE(ushort, unsigned short, 0, USHRT_MAX)
TEMPLATE(uint, unsigned int, 0, UINT_MAX)
TEMPLATE(ulong, unsigned long, 0, ULONG_MAX)
TEMPLATE(ulonglong, unsigned long long, 0, ULLONG_MAX)
TEMPLATE(uintptr, uintptr_t, 0, UINTPTR_MAX)
TEMPLATE(uint8, uint8_t, 0, UINT8_MAX)
TEMPLATE(uint16, uint16_t, 0, UINT16_MAX)
TEMPLATE(uint32, uint32_t, 0, UINT32_MAX)
TEMPLATE(uint64, uint64_t, 0, UINT64_MAX)
#undef TEMPLATE
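/*
* Usage sketch (illustrative; the dictionary and its "count" key are
* hypothetical): the generated accessors fail cleanly when the stored
* value does not fit the requested type:
*
* prop_number_t pn = prop_dictionary_get(dict, "count");
* int count;
*
* if (pn == NULL || !prop_number_int_value(pn, &count))
* ... missing, not a number, or out of range for int ...
*/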
/*
* prop_number_equals --
* Return true if two numbers are equivalent.
*/
bool
prop_number_equals(prop_number_t num1, prop_number_t num2)
{
if (!prop_object_is_number(num1) || !prop_object_is_number(num2))
return (false);
return (prop_object_equals(num1, num2));
}
/*
* prop_number_equals_signed --
* Return true if the number is equivalent to the specified signed
* value.
*/
bool
prop_number_equals_signed(prop_number_t pn, intmax_t val)
{
if (! prop_object_is_number(pn))
return (false);
if (pn->pn_value.pnv_is_unsigned &&
(pn->pn_value.pnv_unsigned > INTMAX_MAX || val < 0))
return (false);
return (pn->pn_value.pnv_signed == val);
}
_PROP_DEPRECATED(prop_number_equals_integer,
"this program uses prop_number_equals_integer(), "
"which is deprecated; use prop_number_equals_signed() instead.")
bool
prop_number_equals_integer(prop_number_t pn, int64_t val)
{
return prop_number_equals_signed(pn, val);
}
/*
* prop_number_equals_unsigned --
* Return true if the number is equivalent to the specified
* unsigned value.
*/
bool
prop_number_equals_unsigned(prop_number_t pn, uintmax_t val)
{
if (! prop_object_is_number(pn))
return (false);
if (! pn->pn_value.pnv_is_unsigned &&
(pn->pn_value.pnv_signed < 0 || val > INT64_MAX))
return (false);
return (pn->pn_value.pnv_unsigned == val);
}
_PROP_DEPRECATED(prop_number_equals_unsigned_integer,
"this program uses prop_number_equals_unsigned_integer(), "
"which is deprecated; use prop_number_equals_unsigned() instead.")
bool
prop_number_equals_unsigned_integer(prop_number_t pn, uint64_t val)
{
return prop_number_equals_unsigned(pn, val);
}
static bool
_prop_number_internalize_unsigned(struct _prop_object_internalize_context *ctx,
struct _prop_number_value *pnv)
{
char *cp;
_PROP_ASSERT(/*CONSTCOND*/sizeof(unsigned long long) ==
sizeof(uint64_t));
#ifndef _KERNEL
errno = 0;
#endif
pnv->pnv_unsigned = (uint64_t) strtoull(ctx->poic_cp, &cp, 0);
#ifndef _KERNEL /* XXX can't check for ERANGE in the kernel */
if (pnv->pnv_unsigned == UINT64_MAX && errno == ERANGE)
return (false);
#endif
pnv->pnv_is_unsigned = true;
ctx->poic_cp = cp;
return (true);
}
static bool
_prop_number_internalize_signed(struct _prop_object_internalize_context *ctx,
struct _prop_number_value *pnv)
{
char *cp;
_PROP_ASSERT(/*CONSTCOND*/sizeof(long long) == sizeof(int64_t));
#ifndef _KERNEL
errno = 0;
#endif
pnv->pnv_signed = (int64_t) strtoll(ctx->poic_cp, &cp, 0);
#ifndef _KERNEL /* XXX can't check for ERANGE in the kernel */
if ((pnv->pnv_signed == INT64_MAX || pnv->pnv_signed == INT64_MIN) &&
errno == ERANGE)
return (false);
#endif
pnv->pnv_is_unsigned = false;
ctx->poic_cp = cp;
return (true);
}
/*
* _prop_number_internalize --
* Parse a <number>...</number> and return the object created from
* the external representation.
*/
/* ARGSUSED */
bool
_prop_number_internalize(prop_stack_t stack, prop_object_t *obj,
struct _prop_object_internalize_context *ctx)
{
struct _prop_number_value pnv;
memset(&pnv, 0, sizeof(pnv));
/* No attributes, no empty elements. */
if (ctx->poic_tagattr != NULL || ctx->poic_is_empty_element)
return (true);
/*
* If the first character is '-', then we treat as signed.
* If the first two characters are "0x" (i.e. the number is
* in hex), then we treat as unsigned. Otherwise, we try
* signed first, and if that fails (presumably due to ERANGE),
* then we switch to unsigned.
*/
if (ctx->poic_cp[0] == '-') {
if (_prop_number_internalize_signed(ctx, &pnv) == false)
return (true);
} else if (ctx->poic_cp[0] == '0' && ctx->poic_cp[1] == 'x') {
if (_prop_number_internalize_unsigned(ctx, &pnv) == false)
return (true);
} else {
if (_prop_number_internalize_signed(ctx, &pnv) == false &&
_prop_number_internalize_unsigned(ctx, &pnv) == false)
return (true);
}
if (_prop_object_internalize_find_tag(ctx, "integer",
_PROP_TAG_TYPE_END) == false)
return (true);
*obj = _prop_number_alloc(&pnv);
return (true);
}
/* $NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $ */
/*
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Kentaro A. Kurahone.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
* $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $
*/
/*
* @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.36 2018/05/18 18:58:51 maxv Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_tcp_debug.h"
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif
#ifndef INET6
#include <netinet/ip6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_debug.h>
/* SACK block pool. */
static struct pool sackhole_pool;
void
tcp_sack_init(void)
{
pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0,
"sackholepl", NULL, IPL_SOFTNET);
}
static struct sackhole *
sack_allochole(struct tcpcb *tp)
{
struct sackhole *hole;
if (tp->snd_numholes >= tcp_sack_tp_maxholes ||
tcp_sack_globalholes >= tcp_sack_globalmaxholes) {
return NULL;
}
hole = pool_get(&sackhole_pool, PR_NOWAIT);
if (hole == NULL) {
return NULL;
}
tp->snd_numholes++;
tcp_sack_globalholes++;
return hole;
}
static struct sackhole *
sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end,
struct sackhole *prev)
{
struct sackhole *hole;
hole = sack_allochole(tp);
if (hole == NULL) {
return NULL;
}
hole->start = hole->rxmit = start;
hole->end = end;
if (prev != NULL) {
TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q);
} else {
TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q);
}
return hole;
}
static struct sackhole *
sack_removehole(struct tcpcb *tp, struct sackhole *hole)
{
struct sackhole *next;
next = TAILQ_NEXT(hole, sackhole_q);
tp->snd_numholes--;
tcp_sack_globalholes--;
TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q);
pool_put(&sackhole_pool, hole);
return next;
}
/*
* tcp_new_dsack: record the reception of a duplicated segment.
*/
void
tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len)
{
if (TCP_SACK_ENABLED(tp)) {
tp->rcv_dsack_block.left = seq;
tp->rcv_dsack_block.right = seq + len;
tp->rcv_sack_flags |= TCPSACK_HAVED;
}
}
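/*
* Example: if a segment covering [1000, 1500) arrives a second time,
* tcp_new_dsack(tp, 1000, 500) records rcv_dsack_block = {1000, 1500}
* and sets TCPSACK_HAVED, so the next ACK we send can carry a D-SACK
* block (RFC 2883) that reports the duplicate to the peer.
*/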
/*
* tcp_sack_option: parse the given SACK option and update the scoreboard.
*/
void
tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp,
int optlen)
{
struct sackblk
t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)];
struct sackblk *sack = NULL;
struct sackhole *cur = NULL;
struct sackhole *tmp = NULL;
const char *lp = cp + 2;
int i, j, num_sack_blks;
tcp_seq left, right, acked;
/*
* If we aren't processing SACK responses, if this is not an ACK,
* or if the peer sent us a SACK option with an invalid length,
* don't update the scoreboard.
*/
if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) ||
(optlen % 8 != 2 || optlen < 10)) {
return;
}
/*
* If we don't want any SACK holes to be allocated, just return.
*/
if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) {
return;
}
/* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */
if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
return;
/*
* Extract SACK blocks.
*
* Note that t_sack_block is sorted so that we only need to do
* one pass over the sequence number space. (SACK "fast-path")
*/
num_sack_blks = optlen / 8;
acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una;
for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) {
memcpy(&left, lp, sizeof(uint32_t));
memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t));
left = ntohl(left);
right = ntohl(right);
if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) ||
SEQ_GEQ(left, right)) {
/* SACK entry that's old, or invalid. */
i--;
num_sack_blks--;
continue;
}
/* Insertion sort. */
for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left);
j--) {
t_sack_block[j].left = t_sack_block[j - 1].left;
t_sack_block[j].right = t_sack_block[j - 1].right;
}
t_sack_block[j].left = left;
t_sack_block[j].right = right;
}
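/*
* For example, if the blocks arrive in the order {9000,10000}
* {5000,6000} (both new and in-window), t_sack_block[] ends up as
* {5000,6000} {9000,10000}, so the scoreboard update below can walk
* the hole list and the blocks in a single pass.
*/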
/* Update the scoreboard. */
cur = TAILQ_FIRST(&tp->snd_holes);
for (i = 0; i < num_sack_blks; i++) {
sack = &t_sack_block[i];
/*
* FACK TCP. Update snd_fack so we can enter Fast
* Recovery early.
*/
if (SEQ_GEQ(sack->right, tp->snd_fack))
tp->snd_fack = sack->right;
if (TAILQ_EMPTY(&tp->snd_holes)) {
/* First hole. */
cur = sack_inserthole(tp, th->th_ack, sack->left, NULL);
if (cur == NULL) {
/* ENOBUFS, bail out */
return;
}
tp->rcv_lastsack = sack->right;
continue; /* With next sack block */
}
/* Go through the list of holes. */
while (cur) {
if (SEQ_LEQ(sack->right, cur->start))
/* SACKs data before the current hole */
break; /* No use going through more holes */
if (SEQ_GEQ(sack->left, cur->end)) {
/* SACKs data beyond the current hole */
cur = TAILQ_NEXT(cur, sackhole_q);
continue;
}
if (SEQ_LEQ(sack->left, cur->start)) {
/* Data acks at least the beginning of hole */
if (SEQ_GEQ(sack->right, cur->end)) {
/* Acks entire hole, so delete hole */
cur = sack_removehole(tp, cur);
break;
}
/* Otherwise, move start of hole forward */
cur->start = sack->right;
cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
break;
}
if (SEQ_GEQ(sack->right, cur->end)) {
/* Move end of hole backward. */
cur->end = sack->left;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur = TAILQ_NEXT(cur, sackhole_q);
break;
}
if (SEQ_LT(cur->start, sack->left) &&
SEQ_GT(cur->end, sack->right)) {
/*
* ACKs some data in middle of a hole; need to
* split current hole
*/
tmp = sack_inserthole(tp, sack->right, cur->end,
cur);
if (tmp == NULL) {
return;
}
tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start);
cur->end = sack->left;
cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
cur = tmp;
break;
}
}
/* At this point, we have reached the tail of the list. */
if (SEQ_LT(tp->rcv_lastsack, sack->left)) {
/*
* Need to append new hole at end.
*/
cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left,
NULL);
if (cur == NULL) {
return;
}
}
if (SEQ_LT(tp->rcv_lastsack, sack->right)) {
tp->rcv_lastsack = sack->right;
}
}
}
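/*
* Scoreboard sketch of the common case: with snd_una == th_ack == 1000
* and an empty scoreboard, a SACK block {2000,3000} creates the hole
* [1000, 2000) (rxmit == 1000) and sets rcv_lastsack = 3000. A later
* block {4000,5000} appends a second hole [3000, 4000) and advances
* rcv_lastsack to 5000.
*/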
/*
* tcp_del_sackholes: remove holes covered by a cumulative ACK.
*/
void
tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th)
{
/* Max because this could be an older ack that just arrived. */
tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
th->th_ack : tp->snd_una;
struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
while (cur) {
if (SEQ_LEQ(cur->end, lastack)) {
cur = sack_removehole(tp, cur);
} else if (SEQ_LT(cur->start, lastack)) {
cur->start = lastack;
if (SEQ_LT(cur->rxmit, cur->start))
cur->rxmit = cur->start;
break;
} else
break;
}
}
/*
* tcp_free_sackholes: clear the scoreboard.
*/
void
tcp_free_sackholes(struct tcpcb *tp)
{
struct sackhole *sack;
/* Free up the SACK hole list. */
while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) {
sack_removehole(tp, sack);
}
KASSERT(tp->snd_numholes == 0);
}
/*
* Returns pointer to a sackhole if there are any pending retransmissions;
* NULL otherwise.
*/
struct sackhole *
tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
{
struct sackhole *cur = NULL;
if (!TCP_SACK_ENABLED(tp))
return (NULL);
*sack_bytes_rexmt = 0;
TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
if (SEQ_LT(cur->rxmit, cur->end)) {
if (SEQ_LT(cur->rxmit, tp->snd_una)) {
/* old SACK hole */
continue;
}
*sack_bytes_rexmt += (cur->rxmit - cur->start);
break;
}
*sack_bytes_rexmt += (cur->rxmit - cur->start);
}
return (cur);
}
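/*
* Minimal usage sketch (local names are illustrative): the transmit
* path asks for the next hole to repair and sends from its rxmit mark:
*
*	int sack_bytes_rxmt;
*	struct sackhole *p = tcp_sack_output(tp, &sack_bytes_rxmt);
*	if (p != NULL)
*		... retransmit [p->rxmit, p->end) and advance p->rxmit ...
*/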
/*
* After a timeout, the SACK list may be rebuilt. This SACK information
* should be used to avoid retransmitting SACKed data. This function
* traverses the SACK list to see if snd_nxt should be moved forward.
*/
void
tcp_sack_adjust(struct tcpcb *tp)
{
struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes);
struct sackhole *n = NULL;
if (TAILQ_EMPTY(&tp->snd_holes))
return; /* No holes */
if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
return; /* We're already beyond any SACKed blocks */
/*
* Two cases for which we want to advance snd_nxt:
* i) snd_nxt lies between end of one hole and beginning of another
* ii) snd_nxt lies between end of last hole and rcv_lastsack
*/
while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) {
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
if (SEQ_GEQ(tp->snd_nxt, n->start))
cur = n;
else {
tp->snd_nxt = n->start;
return;
}
}
if (SEQ_LT(tp->snd_nxt, cur->end))
return;
tp->snd_nxt = tp->rcv_lastsack;
return;
}
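/*
* Example: with holes [1000,2000) and [3000,4000) and
* rcv_lastsack == 5000, a post-timeout snd_nxt of 2500 lies in SACKed
* data between the two holes and is advanced to 3000, while a snd_nxt
* of 4500 (beyond the last hole) is advanced to rcv_lastsack.
*/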
/*
* tcp_sack_numblks: return the number of SACK blocks to send.
*/
int
tcp_sack_numblks(const struct tcpcb *tp)
{
int numblks;
if (!TCP_SACK_ENABLED(tp)) {
return 0;
}
numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) +
tp->t_segqlen;
if (numblks == 0) {
return 0;
}
if (numblks > TCP_SACK_MAX) {
numblks = TCP_SACK_MAX;
}
return numblks;
}
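/*
* Each SACK block takes 8 bytes on the wire plus the 2-byte option
* header, so a bare option could carry (40 - 2) / 8 = 4 blocks;
* TCP_SACK_MAX keeps the count low enough for the option to coexist
* with the others we send (notably the 12 bytes used by timestamps).
*/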
#if defined(DDB)
void sack_dump(const struct tcpcb *);
void
sack_dump(const struct tcpcb *tp)
{
const struct sackhole *cur;
printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n",
tp->snd_una, tp->snd_max);
printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n",
tp->rcv_lastsack, tp->snd_fack);
printf("numholes=%d\n", tp->snd_numholes);
TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) {
printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n",
cur->start, cur->end, cur->rxmit);
}
}
#endif /* defined(DDB) */
/* $NetBSD: uvm_map.c,v 1.411 2024/02/09 22:08:38 andvar Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_map.c 8.3 (Berkeley) 1/12/94
* from: Id: uvm_map.c,v 1.1.2.27 1998/02/07 01:16:54 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_map.c: uvm map operations
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_map.c,v 1.411 2024/02/09 22:08:38 andvar Exp $");
#include "opt_ddb.h"
#include "opt_pax.h"
#include "opt_uvmhist.h"
#include "opt_uvm.h"
#include "opt_sysv.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/pax.h>
#include <sys/vnode.h>
#include <sys/filedesc.h>
#include <sys/lockdebug.h>
#include <sys/atomic.h>
#include <sys/sysctl.h>
#ifndef __USER_VA0_IS_SAFE
#include <sys/kauth.h>
#include "opt_user_va0_disable_default.h"
#endif
#include <sys/shm.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#if defined(DDB) || defined(DEBUGPRINT)
#include <uvm/uvm_ddb.h>
#endif
#ifdef UVMHIST
#ifndef UVMHIST_MAPHIST_SIZE
#define UVMHIST_MAPHIST_SIZE 100
#endif
static struct kern_history_ent maphistbuf[UVMHIST_MAPHIST_SIZE];
UVMHIST_DEFINE(maphist) = UVMHIST_INITIALIZER(maphist, maphistbuf);
#endif
#if !defined(UVMMAP_COUNTERS)
#define UVMMAP_EVCNT_DEFINE(name) /* nothing */
#define UVMMAP_EVCNT_INCR(ev) /* nothing */
#define UVMMAP_EVCNT_DECR(ev) /* nothing */
#else /* defined(UVMMAP_COUNTERS) */
#include <sys/evcnt.h>
#define UVMMAP_EVCNT_DEFINE(name) \
struct evcnt uvmmap_evcnt_##name = EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, \
"uvmmap", #name); \
EVCNT_ATTACH_STATIC(uvmmap_evcnt_##name);
#define UVMMAP_EVCNT_INCR(ev) uvmmap_evcnt_##ev.ev_count++
#define UVMMAP_EVCNT_DECR(ev) uvmmap_evcnt_##ev.ev_count--
#endif /* defined(UVMMAP_COUNTERS) */
UVMMAP_EVCNT_DEFINE(ubackmerge)
UVMMAP_EVCNT_DEFINE(uforwmerge)
UVMMAP_EVCNT_DEFINE(ubimerge)
UVMMAP_EVCNT_DEFINE(unomerge)
UVMMAP_EVCNT_DEFINE(kbackmerge)
UVMMAP_EVCNT_DEFINE(kforwmerge)
UVMMAP_EVCNT_DEFINE(kbimerge)
UVMMAP_EVCNT_DEFINE(knomerge)
UVMMAP_EVCNT_DEFINE(map_call)
UVMMAP_EVCNT_DEFINE(mlk_call)
UVMMAP_EVCNT_DEFINE(mlk_hint)
UVMMAP_EVCNT_DEFINE(mlk_tree)
UVMMAP_EVCNT_DEFINE(mlk_treeloop)
const char vmmapbsy[] = "vmmapbsy";
/*
* cache for dynamically-allocated map entries.
*/
static struct pool_cache uvm_map_entry_cache;
#ifdef PMAP_GROWKERNEL
/*
* This global represents the end of the kernel virtual address
* space. If we want to exceed this, we must grow the kernel
* virtual address space dynamically.
*
* Note, this variable is locked by kernel_map's lock.
*/
vaddr_t uvm_maxkaddr;
#endif
#ifndef __USER_VA0_IS_SAFE
#ifndef __USER_VA0_DISABLE_DEFAULT
#define __USER_VA0_DISABLE_DEFAULT 1
#endif
#ifdef USER_VA0_DISABLE_DEFAULT /* kernel config option overrides */
#undef __USER_VA0_DISABLE_DEFAULT
#define __USER_VA0_DISABLE_DEFAULT USER_VA0_DISABLE_DEFAULT
#endif
int user_va0_disable = __USER_VA0_DISABLE_DEFAULT;
#endif
/*
* macros
*/
/*
* uvm_map_align_va: round down or up virtual address
*/
static __inline void
uvm_map_align_va(vaddr_t *vap, vsize_t align, int topdown)
{
KASSERT(powerof2(align));
if (align != 0 && (*vap & (align - 1)) != 0) {
if (topdown)
*vap = rounddown2(*vap, align);
else
*vap = roundup2(*vap, align);
}
}
/*
* UVM_ET_ISCOMPATIBLE: check some requirements for map entry merging
*/
extern struct vm_map *pager_map;
#define UVM_ET_ISCOMPATIBLE(ent, type, uobj, meflags, \
prot, maxprot, inh, adv, wire) \
((ent)->etype == (type) && \
(((ent)->flags ^ (meflags)) & (UVM_MAP_NOMERGE)) == 0 && \
(ent)->object.uvm_obj == (uobj) && \
(ent)->protection == (prot) && \
(ent)->max_protection == (maxprot) && \
(ent)->inheritance == (inh) && \
(ent)->advice == (adv) && \
(ent)->wired_count == (wire))
/*
* uvm_map_entry_link: insert entry into a map
*
* => map must be locked
*/
#define uvm_map_entry_link(map, after_where, entry) do { \
uvm_mapent_check(entry); \
(map)->nentries++; \
(entry)->prev = (after_where); \
(entry)->next = (after_where)->next; \
(entry)->prev->next = (entry); \
(entry)->next->prev = (entry); \
uvm_rb_insert((map), (entry)); \
} while (/*CONSTCOND*/ 0)
/*
* uvm_map_entry_unlink: remove entry from a map
*
* => map must be locked
*/
#define uvm_map_entry_unlink(map, entry) do { \
KASSERT((entry) != (map)->first_free); \
KASSERT((entry) != (map)->hint); \
uvm_mapent_check(entry); \
(map)->nentries--; \
(entry)->next->prev = (entry)->prev; \
(entry)->prev->next = (entry)->next; \
uvm_rb_remove((map), (entry)); \
} while (/*CONSTCOND*/ 0)
/*
* SAVE_HINT: saves the specified entry as the hint for future lookups.
*
* => map need not be locked.
*/
#define SAVE_HINT(map, check, value) do { \
if ((map)->hint == (check)) \
(map)->hint = (value); \
} while (/*CONSTCOND*/ 0)
/*
* clear_hints: ensure that hints don't point to the entry.
*
* => map must be write-locked.
*/
static void
clear_hints(struct vm_map *map, struct vm_map_entry *ent)
{
SAVE_HINT(map, ent, ent->prev);
if (map->first_free == ent) {
map->first_free = ent->prev;
}
}
/*
* VM_MAP_RANGE_CHECK: check and correct range
*
* => map must at least be read locked
*/
#define VM_MAP_RANGE_CHECK(map, start, end) do { \
if (start < vm_map_min(map)) \
start = vm_map_min(map); \
if (end > vm_map_max(map)) \
end = vm_map_max(map); \
if (start > end) \
start = end; \
} while (/*CONSTCOND*/ 0)
/*
* local prototypes
*/
static struct vm_map_entry *
uvm_mapent_alloc(struct vm_map *, int);
static void uvm_mapent_copy(struct vm_map_entry *, struct vm_map_entry *);
static void uvm_mapent_free(struct vm_map_entry *);
#if defined(DEBUG)
static void _uvm_mapent_check(const struct vm_map_entry *, int);
#define uvm_mapent_check(map) _uvm_mapent_check(map, __LINE__)
#else /* defined(DEBUG) */
#define uvm_mapent_check(e) /* nothing */
#endif /* defined(DEBUG) */
static void uvm_map_entry_unwire(struct vm_map *, struct vm_map_entry *);
static void uvm_map_reference_amap(struct vm_map_entry *, int);
static int uvm_map_space_avail(vaddr_t *, vsize_t, voff_t, vsize_t, int,
int, struct vm_map_entry *);
static void uvm_map_unreference_amap(struct vm_map_entry *, int);
int _uvm_map_sanity(struct vm_map *);
int _uvm_tree_sanity(struct vm_map *);
static vsize_t uvm_rb_maxgap(const struct vm_map_entry *);
#define ROOT_ENTRY(map) ((struct vm_map_entry *)(map)->rb_tree.rbt_root)
#define LEFT_ENTRY(entry) ((struct vm_map_entry *)(entry)->rb_node.rb_left)
#define RIGHT_ENTRY(entry) ((struct vm_map_entry *)(entry)->rb_node.rb_right)
#define PARENT_ENTRY(map, entry) \
(ROOT_ENTRY(map) == (entry) \
? NULL : (struct vm_map_entry *)RB_FATHER(&(entry)->rb_node))
/*
* These get filled in if/when SYSVSHM shared memory code is loaded
*
* We do this with function pointers rather than #ifdef SYSVSHM so the
* SYSVSHM code can be loaded and unloaded
*/
void (*uvm_shmexit)(struct vmspace *) = NULL;
void (*uvm_shmfork)(struct vmspace *, struct vmspace *) = NULL;
static int
uvm_map_compare_nodes(void *ctx, const void *nparent, const void *nkey)
{
const struct vm_map_entry *eparent = nparent;
const struct vm_map_entry *ekey = nkey;
KASSERT(eparent->start < ekey->start || eparent->start >= ekey->end);
KASSERT(ekey->start < eparent->start || ekey->start >= eparent->end);
if (eparent->start < ekey->start)
return -1;
if (eparent->end >= ekey->start)
return 1;
return 0;
}
static int
uvm_map_compare_key(void *ctx, const void *nparent, const void *vkey)
{
const struct vm_map_entry *eparent = nparent;
const vaddr_t va = *(const vaddr_t *) vkey;
if (eparent->start < va)
return -1;
if (eparent->end >= va)
return 1;
return 0;
}
static const rb_tree_ops_t uvm_map_tree_ops = {
.rbto_compare_nodes = uvm_map_compare_nodes,
.rbto_compare_key = uvm_map_compare_key,
.rbto_node_offset = offsetof(struct vm_map_entry, rb_node),
.rbto_context = NULL
};
/*
* uvm_rb_gap: return the gap size between our entry and next entry.
*/
static inline vsize_t
uvm_rb_gap(const struct vm_map_entry *entry)
{
KASSERT(entry->next != NULL);
return entry->next->start - entry->end;
}
static vsize_t
uvm_rb_maxgap(const struct vm_map_entry *entry)
{
struct vm_map_entry *child;
vsize_t maxgap = entry->gap;
/*
* We need maxgap to be the largest gap of us or any of our
* descendants. Since each of our children's maxgap is the
* cached value of their largest gap of themselves or their
* descendants, we can just use that value and avoid recursing
* down the tree to calculate it.
*/
if ((child = LEFT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
maxgap = child->maxgap;
if ((child = RIGHT_ENTRY(entry)) != NULL && maxgap < child->maxgap)
maxgap = child->maxgap;
return maxgap;
}
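/*
* For example, if an entry's own gap is 4 pages, its left subtree's
* cached maxgap is 64 pages and its right subtree's is 8 pages, the
* entry's maxgap becomes 64 pages. uvm_map_findspace() relies on this
* to skip whole subtrees whose maxgap is smaller than the allocation
* being placed.
*/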
static void
uvm_rb_fixup(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *parent;
KASSERT(entry->gap == uvm_rb_gap(entry));
entry->maxgap = uvm_rb_maxgap(entry);
while ((parent = PARENT_ENTRY(map, entry)) != NULL) {
struct vm_map_entry *brother;
vsize_t maxgap = parent->gap;
unsigned int which;
KDASSERT(parent->gap == uvm_rb_gap(parent));
if (maxgap < entry->maxgap)
maxgap = entry->maxgap;
/*
* Since we work towards the root, we know entry's maxgap
* value is OK, but its brothers may now be out-of-date due
* to rebalancing. So refresh it.
*/
which = RB_POSITION(&entry->rb_node) ^ RB_DIR_OTHER;
brother = (struct vm_map_entry *)parent->rb_node.rb_nodes[which];
if (brother != NULL) {
KDASSERT(brother->gap == uvm_rb_gap(brother));
brother->maxgap = uvm_rb_maxgap(brother);
if (maxgap < brother->maxgap)
maxgap = brother->maxgap;
}
parent->maxgap = maxgap;
entry = parent;
}
}
static void
uvm_rb_insert(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *ret __diagused;
entry->gap = entry->maxgap = uvm_rb_gap(entry);
if (entry->prev != &map->header)
entry->prev->gap = uvm_rb_gap(entry->prev);
ret = rb_tree_insert_node(&map->rb_tree, entry);
KASSERTMSG(ret == entry,
"uvm_rb_insert: map %p: duplicate entry %p", map, ret);
/*
* If the previous entry is not our immediate left child, then it's an
* ancestor and will be fixed up on the way to the root. We don't
* have to check entry->prev against &map->header since &map->header
* will never be in the tree.
*/
uvm_rb_fixup(map,
LEFT_ENTRY(entry) == entry->prev ? entry->prev : entry);
}
static void
uvm_rb_remove(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_map_entry *prev_parent = NULL, *next_parent = NULL;
/*
* If we are removing an interior node, then an adjacent node will
* be used to replace its position in the tree. Therefore we will
* need to fixup the tree starting at the parent of the replacement
* node. So record their parents for later use.
*/
if (entry->prev != &map->header)
prev_parent = PARENT_ENTRY(map, entry->prev);
if (entry->next != &map->header)
next_parent = PARENT_ENTRY(map, entry->next);
rb_tree_remove_node(&map->rb_tree, entry);
/*
* If the previous node has a new parent, fixup the tree starting
* at the previous node's old parent.
*/
if (entry->prev != &map->header) {
/*
* Update the previous entry's gap due to our absence.
*/
entry->prev->gap = uvm_rb_gap(entry->prev);
uvm_rb_fixup(map, entry->prev);
if (prev_parent != NULL && prev_parent != entry &&
prev_parent != PARENT_ENTRY(map, entry->prev))
uvm_rb_fixup(map, prev_parent);
}
/*
* If the next node has a new parent, fixup the tree starting
* at the next node's old parent.
*/
if (entry->next != &map->header) {
uvm_rb_fixup(map, entry->next);
if (next_parent != NULL && next_parent != entry &&
next_parent != PARENT_ENTRY(map, entry->next))
uvm_rb_fixup(map, next_parent);
}
}
#if defined(DEBUG)
int uvm_debug_check_map = 0;
int uvm_debug_check_rbtree = 0;
#define uvm_map_check(map, name) \
_uvm_map_check((map), (name), __FILE__, __LINE__)
static void
_uvm_map_check(struct vm_map *map, const char *name,
const char *file, int line)
{
if ((uvm_debug_check_map && _uvm_map_sanity(map)) ||
(uvm_debug_check_rbtree && _uvm_tree_sanity(map))) {
panic("uvm_map_check failed: \"%s\" map=%p (%s:%d)",
name, map, file, line);
}
}
#else /* defined(DEBUG) */
#define uvm_map_check(map, name) /* nothing */
#endif /* defined(DEBUG) */
#if defined(DEBUG) || defined(DDB)
int
_uvm_map_sanity(struct vm_map *map)
{
bool first_free_found = false;
bool hint_found = false;
const struct vm_map_entry *e;
struct vm_map_entry *hint = map->hint;
e = &map->header;
for (;;) {
if (map->first_free == e) {
first_free_found = true;
} else if (!first_free_found && e->next->start > e->end) {
printf("first_free %p should be %p\n",
map->first_free, e);
return -1;
}
if (hint == e) {
hint_found = true;
}
e = e->next;
if (e == &map->header) {
break;
}
}
if (!first_free_found) {
printf("stale first_free\n");
return -1;
}
if (!hint_found) {
printf("stale hint\n");
return -1;
}
return 0;
}
int
_uvm_tree_sanity(struct vm_map *map)
{
struct vm_map_entry *tmp, *trtmp;
int n = 0, i = 1;
for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
if (tmp->gap != uvm_rb_gap(tmp)) {
printf("%d/%d gap %#lx != %#lx %s\n",
n + 1, map->nentries,
(ulong)tmp->gap, (ulong)uvm_rb_gap(tmp),
tmp->next == &map->header ? "(last)" : "");
goto error;
}
/*
* If any entries are out of order, tmp->gap will be unsigned
* and will likely exceed the size of the map.
*/
if (tmp->gap >= vm_map_max(map) - vm_map_min(map)) {
printf("too large gap %zu\n", (size_t)tmp->gap);
goto error;
}
n++;
}
if (n != map->nentries) {
printf("nentries: %d vs %d\n", n, map->nentries);
goto error;
}
trtmp = NULL;
for (tmp = map->header.next; tmp != &map->header; tmp = tmp->next) {
if (tmp->maxgap != uvm_rb_maxgap(tmp)) {
printf("maxgap %#lx != %#lx\n",
(ulong)tmp->maxgap,
(ulong)uvm_rb_maxgap(tmp));
goto error;
}
if (trtmp != NULL && trtmp->start >= tmp->start) {
printf("corrupt: 0x%"PRIxVADDR" >= 0x%"PRIxVADDR"\n",
trtmp->start, tmp->start);
goto error;
}
trtmp = tmp;
}
for (tmp = map->header.next; tmp != &map->header;
tmp = tmp->next, i++) {
trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_LEFT);
if (trtmp == NULL)
trtmp = &map->header;
if (tmp->prev != trtmp) {
printf("lookup: %d: %p->prev=%p: %p\n",
i, tmp, tmp->prev, trtmp);
goto error;
}
trtmp = rb_tree_iterate(&map->rb_tree, tmp, RB_DIR_RIGHT);
if (trtmp == NULL)
trtmp = &map->header;
if (tmp->next != trtmp) {
printf("lookup: %d: %p->next=%p: %p\n",
i, tmp, tmp->next, trtmp);
goto error;
}
trtmp = rb_tree_find_node(&map->rb_tree, &tmp->start);
if (trtmp != tmp) {
printf("lookup: %d: %p - %p: %p\n", i, tmp, trtmp,
PARENT_ENTRY(map, tmp));
goto error;
}
}
return (0);
error:
return (-1);
}
#endif /* defined(DEBUG) || defined(DDB) */
/*
* vm_map_lock: acquire an exclusive (write) lock on a map.
*
* => The locking protocol provides for guaranteed upgrade from shared ->
* exclusive by whichever thread currently has the map marked busy.
* See "LOCKING PROTOCOL NOTES" in uvm_map.h. This is horrible; among
* other problems, it defeats any fairness guarantees provided by RW
* locks.
*/
void
vm_map_lock(struct vm_map *map)
{
for (;;) {
rw_enter(&map->lock, RW_WRITER);
if (map->busy == NULL || map->busy == curlwp) {
break;
}
mutex_enter(&map->misc_lock);
rw_exit(&map->lock);
if (map->busy != NULL) {
cv_wait(&map->cv, &map->misc_lock);
}
mutex_exit(&map->misc_lock);
}
map->timestamp++;
}
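/*
* A sketch of the busy dance this protocol supports (see the LOCKING
* PROTOCOL NOTES in uvm_map.h for the authoritative description): a
* thread that must drop the write lock across a long operation marks
* the map busy so that it alone can reacquire it exclusively:
*
*	vm_map_lock(map);
*	vm_map_busy(map);
*	vm_map_unlock(map);
*	...long-running work, possibly sleeping...
*	vm_map_lock(map);
*	vm_map_unbusy(map);
*	vm_map_unlock(map);
*/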
/*
* vm_map_lock_try: try to lock a map, failing if it is already locked.
*/
bool
vm_map_lock_try(struct vm_map *map)
{
if (!rw_tryenter(&map->lock, RW_WRITER)) {
return false;
}
if (map->busy != NULL) {
rw_exit(&map->lock);
return false;
}
map->timestamp++;
return true;
}
/*
* vm_map_unlock: release an exclusive lock on a map.
*/
void
vm_map_unlock(struct vm_map *map)
{
KASSERT(rw_write_held(&map->lock));
KASSERT(map->busy == NULL || map->busy == curlwp);
rw_exit(&map->lock);
}
/*
* vm_map_unbusy: mark the map as unbusy, and wake any waiters that
* want an exclusive lock.
*/
void
vm_map_unbusy(struct vm_map *map)
{
KASSERT(map->busy == curlwp);
/*
* Safe to clear 'busy' and 'waiters' with only a read lock held:
*
* o they can only be set with a write lock held
* o writers are blocked out with a read or write hold
* o at any time, only one thread owns the set of values
*/
mutex_enter(&map->misc_lock);
map->busy = NULL;
cv_broadcast(&map->cv);
mutex_exit(&map->misc_lock);
}
/*
* vm_map_lock_read: acquire a shared (read) lock on a map.
*/
void
vm_map_lock_read(struct vm_map *map)
{
rw_enter(&map->lock, RW_READER);
}
/*
* vm_map_unlock_read: release a shared lock on a map.
*/
void
vm_map_unlock_read(struct vm_map *map)
{
rw_exit(&map->lock);
}
/*
* vm_map_busy: mark a map as busy.
*
* => the caller must hold the map write locked
*/
void
vm_map_busy(struct vm_map *map)
{
KASSERT(rw_write_held(&map->lock));
KASSERT(map->busy == NULL);
map->busy = curlwp;
}
/*
* vm_map_locked_p: return true if the map is write locked.
*
* => only for debug purposes like KASSERTs.
* => should not be used to verify that a map is not locked.
*/
bool
vm_map_locked_p(struct vm_map *map)
{
return rw_write_held(&map->lock);
}
/*
* uvm_mapent_alloc: allocate a map entry
*/
static struct vm_map_entry *
uvm_mapent_alloc(struct vm_map *map, int flags)
{
struct vm_map_entry *me;
int pflags = (flags & UVM_FLAG_NOWAIT) ? PR_NOWAIT : PR_WAITOK;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
me = pool_cache_get(&uvm_map_entry_cache, pflags);
if (__predict_false(me == NULL)) {
return NULL;
}
me->flags = 0;
UVMHIST_LOG(maphist, "<- new entry=%#jx [kentry=%jd]", (uintptr_t)me,
(map == kernel_map), 0, 0);
return me;
}
/*
* uvm_mapent_free: free map entry
*/
static void
uvm_mapent_free(struct vm_map_entry *me)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"<- freeing map entry=%#jx [flags=%#jx]",
(uintptr_t)me, me->flags, 0, 0);
pool_cache_put(&uvm_map_entry_cache, me);
}
/*
* uvm_mapent_copy: copy a map entry, preserving flags
*/
static inline void
uvm_mapent_copy(struct vm_map_entry *src, struct vm_map_entry *dst)
{
memcpy(dst, src, sizeof(*dst));
dst->flags = 0;
}
#if defined(DEBUG)
static void
_uvm_mapent_check(const struct vm_map_entry *entry, int line)
{
if (entry->start >= entry->end) {
goto bad;
}
if (UVM_ET_ISOBJ(entry)) {
if (entry->object.uvm_obj == NULL) {
goto bad;
}
} else if (UVM_ET_ISSUBMAP(entry)) {
if (entry->object.sub_map == NULL) {
goto bad;
}
} else {
if (entry->object.uvm_obj != NULL ||
entry->object.sub_map != NULL) {
goto bad;
}
}
if (!UVM_ET_ISOBJ(entry)) {
if (entry->offset != 0) {
goto bad;
}
}
return;
bad:
panic("%s: bad entry %p, line %d", __func__, entry, line);
}
#endif /* defined(DEBUG) */
/*
* uvm_map_entry_unwire: unwire a map entry
*
* => map should be locked by caller
*/
static inline void
uvm_map_entry_unwire(struct vm_map *map, struct vm_map_entry *entry)
{
entry->wired_count = 0;
uvm_fault_unwire_locked(map, entry->start, entry->end);
}
/*
* wrapper for calling amap_ref()
*/
static inline void
uvm_map_reference_amap(struct vm_map_entry *entry, int flags)
{
amap_ref(entry->aref.ar_amap, entry->aref.ar_pageoff,
(entry->end - entry->start) >> PAGE_SHIFT, flags);
}
/*
* wrapper for calling amap_unref()
*/
static inline void
uvm_map_unreference_amap(struct vm_map_entry *entry, int flags)
{
amap_unref(entry->aref.ar_amap, entry->aref.ar_pageoff,
(entry->end - entry->start) >> PAGE_SHIFT, flags);
}
/*
* uvm_map_init: init mapping system at boot time.
*/
void
uvm_map_init(void)
{
/*
* first, init logging system.
*/
UVMHIST_FUNC(__func__);
UVMHIST_LINK_STATIC(maphist);
UVMHIST_LINK_STATIC(pdhist);
UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist,"<starting uvm map system>", 0, 0, 0, 0);
/*
* initialize the global lock for kernel map entry.
*/
mutex_init(&uvm_kentry_lock, MUTEX_DRIVER, IPL_VM);
}
/*
* uvm_map_init_caches: init mapping system caches.
*/
void
uvm_map_init_caches(void)
{
/*
* initialize caches.
*/
pool_cache_bootstrap(&uvm_map_entry_cache, sizeof(struct vm_map_entry),
coherency_unit, 0, PR_LARGECACHE, "vmmpepl", NULL, IPL_NONE, NULL,
NULL, NULL);
}
/*
* clippers
*/
/*
* uvm_mapent_splitadj: adjust map entries for splitting, after uvm_mapent_copy.
*/
static void
uvm_mapent_splitadj(struct vm_map_entry *entry1, struct vm_map_entry *entry2,
vaddr_t splitat)
{
vaddr_t adj;
KASSERT(entry1->start < splitat);
KASSERT(splitat < entry1->end);
adj = splitat - entry1->start;
entry1->end = entry2->start = splitat;
if (entry1->aref.ar_amap) {
amap_splitref(&entry1->aref, &entry2->aref, adj);
}
if (UVM_ET_ISSUBMAP(entry1)) {
/* ... unlikely to happen, but play it safe */
uvm_map_reference(entry1->object.sub_map);
} else if (UVM_ET_ISOBJ(entry1)) {
KASSERT(entry1->object.uvm_obj != NULL); /* suppress coverity */
entry2->offset += adj;
if (entry1->object.uvm_obj->pgops &&
entry1->object.uvm_obj->pgops->pgo_reference)
entry1->object.uvm_obj->pgops->pgo_reference(
entry1->object.uvm_obj);
}
}
/*
* uvm_map_clip_start: ensure that the entry begins at or after
* the starting address, if it doesn't we split the entry.
*
* => caller should use UVM_MAP_CLIP_START macro rather than calling
* this directly
* => map must be locked by caller
*/
void
uvm_map_clip_start(struct vm_map *map, struct vm_map_entry *entry,
vaddr_t start)
{
struct vm_map_entry *new_entry;
/* uvm_map_simplify_entry(map, entry); */ /* XXX */
uvm_map_check(map, "clip_start entry");
uvm_mapent_check(entry);
/*
* Split off the front portion. note that we must insert the new
* entry BEFORE this one, so that this entry has the specified
* starting address.
*/
new_entry = uvm_mapent_alloc(map, 0);
uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
uvm_mapent_splitadj(new_entry, entry, start);
uvm_map_entry_link(map, entry->prev, new_entry);
uvm_map_check(map, "clip_start leave");
}
/*
* uvm_map_clip_end: ensure that the entry ends at or before
* the ending address, if it doesn't we split the entry
*
* => caller should use UVM_MAP_CLIP_END macro rather than calling
* this directly
* => map must be locked by caller
*/
void
uvm_map_clip_end(struct vm_map *map, struct vm_map_entry *entry, vaddr_t end)
{
struct vm_map_entry *new_entry;
uvm_map_check(map, "clip_end entry");
uvm_mapent_check(entry);
/*
* Create a new entry and insert it
* AFTER the specified entry
*/
new_entry = uvm_mapent_alloc(map, 0);
uvm_mapent_copy(entry, new_entry); /* entry -> new_entry */
uvm_mapent_splitadj(entry, new_entry, end);
uvm_map_entry_link(map, entry, new_entry);
uvm_map_check(map, "clip_end leave");
}
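/*
* Clipping example: given an entry spanning [0x1000, 0x5000),
* UVM_MAP_CLIP_START(map, entry, 0x3000) leaves "entry" as
* [0x3000, 0x5000) with a new entry [0x1000, 0x3000) linked before it,
* while UVM_MAP_CLIP_END(map, entry, 0x3000) leaves "entry" as
* [0x1000, 0x3000) with the new entry [0x3000, 0x5000) linked after
* it. Amap and object references and offsets are adjusted by
* uvm_mapent_splitadj() above.
*/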
/*
* M A P - m a i n e n t r y p o i n t
*/
/*
* uvm_map: establish a valid mapping in a map
*
* => assume startp is page aligned.
* => assume size is a multiple of PAGE_SIZE.
* => assume sys_mmap provides enough of a "hint" to have us skip
* over text/data/bss area.
* => map must be unlocked (we will lock it)
* => <uobj,uoffset> value meanings (4 cases):
* [1] <NULL,uoffset> == uoffset is a hint for PMAP_PREFER
* [2] <NULL,UVM_UNKNOWN_OFFSET> == don't PMAP_PREFER
* [3] <uobj,uoffset> == normal mapping
* [4] <uobj,UVM_UNKNOWN_OFFSET> == uvm_map finds offset based on VA
*
* case [4] is for kernel mappings where we don't know the offset until
* we've found a virtual address. note that kernel object offsets are
* always relative to vm_map_min(kernel_map).
*
* => if `align' is non-zero, we align the virtual address to the specified
* alignment.
* this is provided as a mechanism for large pages.
*
* => XXXCDC: need way to map in external amap?
*/
int
uvm_map(struct vm_map *map, vaddr_t *startp /* IN/OUT */, vsize_t size,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags)
{
struct uvm_map_args args;
struct vm_map_entry *new_entry;
int error;
KASSERT((size & PAGE_MASK) == 0);
KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
/*
* for pager_map, allocate the new entry first to avoid sleeping
* for memory while we have the map locked.
*/
new_entry = NULL;
if (map == pager_map) {
new_entry = uvm_mapent_alloc(map, (flags & UVM_FLAG_NOWAIT));
if (__predict_false(new_entry == NULL))
return ENOMEM;
}
if (map == pager_map)
flags |= UVM_FLAG_NOMERGE;
error = uvm_map_prepare(map, *startp, size, uobj, uoffset, align,
flags, &args);
if (!error) {
error = uvm_map_enter(map, &args, new_entry);
*startp = args.uma_start;
} else if (new_entry) {
uvm_mapent_free(new_entry);
}
#if defined(DEBUG)
if (!error && VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
uvm_km_check_empty(map, *startp, *startp + size);
}
#endif /* defined(DEBUG) */
return error;
}
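/*
* A typical anonymous kernel mapping via uvm_map() looks roughly like
* the sketch below (flag encoding via the standard UVM_MAPFLAG macro;
* error handling omitted; "size" is assumed page aligned):
*
*	vaddr_t va = vm_map_min(kernel_map);
*	int error = uvm_map(kernel_map, &va, size, NULL,
*	    UVM_UNKNOWN_OFFSET, 0,
*	    UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW, UVM_INH_NONE,
*	    UVM_ADV_RANDOM, 0));
*
* On success *startp (va here) holds the chosen address; this is
* case [2] above (no uobj, no PMAP_PREFER hint).
*/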
/*
* uvm_map_prepare:
*
* called with map unlocked.
* on success, returns the map locked.
*/
int
uvm_map_prepare(struct vm_map *map, vaddr_t start, vsize_t size,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, uvm_flag_t flags,
struct uvm_map_args *args)
{
struct vm_map_entry *prev_entry;
vm_prot_t prot = UVM_PROTECTION(flags);
vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%jx, flags=%#jx)",
(uintptr_t)map, start, size, flags);
UVMHIST_LOG(maphist, " uobj/offset %#jx/%jd", (uintptr_t)uobj,
uoffset,0,0);
/*
* detect a popular device driver bug.
*/
KASSERT(doing_shutdown || curlwp != NULL);
/*
* zero-sized mapping doesn't make any sense.
*/
KASSERT(size > 0);
KASSERT((~flags & (UVM_FLAG_NOWAIT | UVM_FLAG_WAITVA)) != 0);
uvm_map_check(map, "map entry");
/*
* check sanity of protection code
*/
if ((prot & maxprot) != prot) {
UVMHIST_LOG(maphist, "<- prot. failure: prot=%#jx, max=%#jx",
prot, maxprot,0,0);
return EACCES;
}
/*
* figure out where to put new VM range
*/
retry:
if (vm_map_lock_try(map) == false) {
if ((flags & UVM_FLAG_TRYLOCK) != 0) {
return EAGAIN;
}
vm_map_lock(map); /* could sleep here */
}
if (flags & UVM_FLAG_UNMAP) {
KASSERT(flags & UVM_FLAG_FIXED);
KASSERT((flags & UVM_FLAG_NOWAIT) == 0);
/*
* Set prev_entry to what it will need to be after any existing
* entries are removed later in uvm_map_enter().
*/
if (uvm_map_lookup_entry(map, start, &prev_entry)) {
if (start == prev_entry->start)
prev_entry = prev_entry->prev;
else
UVM_MAP_CLIP_END(map, prev_entry, start);
SAVE_HINT(map, map->hint, prev_entry);
}
} else {
prev_entry = uvm_map_findspace(map, start, size, &start,
uobj, uoffset, align, flags);
}
if (prev_entry == NULL) {
unsigned int timestamp;
timestamp = map->timestamp;
UVMHIST_LOG(maphist,"waiting va timestamp=%#jx",
timestamp,0,0,0);
map->flags |= VM_MAP_WANTVA;
vm_map_unlock(map);
/*
* try to reclaim kva and wait until someone does unmap.
* fragile locking here, so we awaken every second to
* recheck the condition.
*/
mutex_enter(&map->misc_lock);
while ((map->flags & VM_MAP_WANTVA) != 0 &&
map->timestamp == timestamp) {
if ((flags & UVM_FLAG_WAITVA) == 0) {
mutex_exit(&map->misc_lock);
UVMHIST_LOG(maphist,
"<- uvm_map_findspace failed!", 0,0,0,0);
return ENOMEM;
} else {
cv_timedwait(&map->cv, &map->misc_lock, hz);
}
}
mutex_exit(&map->misc_lock);
goto retry;
}
#ifdef PMAP_GROWKERNEL
/*
* If the kernel pmap can't map the requested space,
* then allocate more resources for it.
*/
if (map == kernel_map && uvm_maxkaddr < (start + size))
uvm_maxkaddr = pmap_growkernel(start + size);
#endif
UVMMAP_EVCNT_INCR(map_call);
/*
* if uobj is null, then uoffset is either a VAC hint for PMAP_PREFER
* [typically from uvm_map_reserve] or it is UVM_UNKNOWN_OFFSET. in
* either case we want to zero it before storing it in the map entry
* (because it looks strange and confusing when debugging...)
*
* if uobj is not null
* if uoffset is not UVM_UNKNOWN_OFFSET then we have a normal mapping
* and we do not need to change uoffset.
* if uoffset is UVM_UNKNOWN_OFFSET then we need to find the offset
* now (based on the starting address of the map). this case is
* for kernel object mappings where we don't know the offset until
* the virtual address is found (with uvm_map_findspace). the
* offset is the distance we are from the start of the map.
*/
if (uobj == NULL) {
uoffset = 0;
} else {
if (uoffset == UVM_UNKNOWN_OFFSET) {
KASSERT(UVM_OBJ_IS_KERN_OBJECT(uobj));
uoffset = start - vm_map_min(kernel_map);
}
}
args->uma_flags = flags;
args->uma_prev = prev_entry;
args->uma_start = start;
args->uma_size = size;
args->uma_uobj = uobj;
args->uma_uoffset = uoffset;
UVMHIST_LOG(maphist, "<- done!", 0,0,0,0);
return 0;
}
/*
* uvm_map_enter:
*
* called with map locked.
* unlock the map before returning.
*/
int
uvm_map_enter(struct vm_map *map, const struct uvm_map_args *args,
struct vm_map_entry *new_entry)
{
struct vm_map_entry *prev_entry = args->uma_prev;
struct vm_map_entry *dead = NULL, *dead_entries = NULL;
const uvm_flag_t flags = args->uma_flags;
const vm_prot_t prot = UVM_PROTECTION(flags);
const vm_prot_t maxprot = UVM_MAXPROTECTION(flags);
const vm_inherit_t inherit = UVM_INHERIT(flags);
const int amapwaitflag = (flags & UVM_FLAG_NOWAIT) ?
AMAP_EXTEND_NOWAIT : 0;
const int advice = UVM_ADVICE(flags);
vaddr_t start = args->uma_start;
vsize_t size = args->uma_size;
struct uvm_object *uobj = args->uma_uobj;
voff_t uoffset = args->uma_uoffset;
const int kmap = (vm_map_pmap(map) == pmap_kernel());
int merged = 0;
int error;
int newetype;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, start=%#jx, size=%ju, flags=%#jx)",
(uintptr_t)map, start, size, flags);
UVMHIST_LOG(maphist, " uobj/offset %#jx/%jd", (uintptr_t)uobj,
uoffset,0,0);
KASSERT(map->hint == prev_entry); /* bimerge case assumes this */
KASSERT(vm_map_locked_p(map));
KASSERT((flags & (UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP)) !=
(UVM_FLAG_NOWAIT | UVM_FLAG_UNMAP));
if (uobj)
newetype = UVM_ET_OBJ;
else
newetype = 0;
if (flags & UVM_FLAG_COPYONW) {
newetype |= UVM_ET_COPYONWRITE;
if ((flags & UVM_FLAG_OVERLAY) == 0)
newetype |= UVM_ET_NEEDSCOPY;
}
/*
* For mappings with unmap, remove any old entries now. Adding the new
* entry cannot fail because that can only happen if UVM_FLAG_NOWAIT
* is set, and we do not support nowait and unmap together.
*/
if (flags & UVM_FLAG_UNMAP) {
KASSERT(flags & UVM_FLAG_FIXED);
uvm_unmap_remove(map, start, start + size, &dead_entries, 0);
#ifdef DEBUG
struct vm_map_entry *tmp_entry __diagused;
bool rv __diagused;
rv = uvm_map_lookup_entry(map, start, &tmp_entry);
KASSERT(!rv);
KASSERTMSG(prev_entry == tmp_entry,
"args %p prev_entry %p tmp_entry %p",
args, prev_entry, tmp_entry);
#endif
SAVE_HINT(map, map->hint, prev_entry);
}
/*
* try and insert in map by extending previous entry, if possible.
* XXX: we don't try and pull back the next entry. might be useful
* for a stack, but we are currently allocating our stack in advance.
*/
if (flags & UVM_FLAG_NOMERGE)
goto nomerge;
if (prev_entry->end == start &&
prev_entry != &map->header &&
UVM_ET_ISCOMPATIBLE(prev_entry, newetype, uobj, 0,
prot, maxprot, inherit, advice, 0)) {
if (uobj && prev_entry->offset +
(prev_entry->end - prev_entry->start) != uoffset)
goto forwardmerge;
/*
* can't extend a shared amap. note: no need to lock amap to
* look at refs since we don't care about its exact value.
* if it is one (i.e. we have the only reference) it will stay there
*/
if (prev_entry->aref.ar_amap &&
amap_refs(prev_entry->aref.ar_amap) != 1) {
goto forwardmerge;
}
if (prev_entry->aref.ar_amap) {
error = amap_extend(prev_entry, size,
amapwaitflag | AMAP_EXTEND_FORWARDS);
if (error)
goto nomerge;
}
if (kmap) {
UVMMAP_EVCNT_INCR(kbackmerge);
} else {
UVMMAP_EVCNT_INCR(ubackmerge);
}
UVMHIST_LOG(maphist," starting back merge", 0, 0, 0, 0);
/*
* drop our reference to uobj since we are extending a reference
* that we already have (the ref count can not drop to zero).
*/
if (uobj && uobj->pgops->pgo_detach)
uobj->pgops->pgo_detach(uobj);
/*
* Now that we've merged the entries, note that we've grown
* and our gap has shrunk. Then fix the tree.
*/
prev_entry->end += size;
prev_entry->gap -= size;
uvm_rb_fixup(map, prev_entry);
uvm_map_check(map, "map backmerged");
UVMHIST_LOG(maphist,"<- done (via backmerge)!", 0, 0, 0, 0);
merged++;
}
forwardmerge:
if (prev_entry->next->start == (start + size) &&
prev_entry->next != &map->header &&
UVM_ET_ISCOMPATIBLE(prev_entry->next, newetype, uobj, 0,
prot, maxprot, inherit, advice, 0)) {
if (uobj && prev_entry->next->offset != uoffset + size)
goto nomerge;
/*
* can't extend a shared amap. note: no need to lock amap to
* look at refs since we don't care about its exact value.
* if it is one (i.e. we have the only reference) it will stay there.
*
* note that we also can't merge two amaps, so if we
* merged with the previous entry which has an amap,
* and the next entry also has an amap, we give up.
*
* Interesting cases:
* amap, new, amap -> give up second merge (single fwd extend)
* amap, new, none -> double forward extend (extend again here)
* none, new, amap -> double backward extend (done here)
* uobj, new, amap -> single backward extend (done here)
*
* XXX should we attempt to deal with someone refilling
* the deallocated region between two entries that are
* backed by the same amap (ie, arefs is 2, "prev" and
* "next" refer to it, and adding this allocation will
* close the hole, thus restoring arefs to 1 and
* deallocating the "next" vm_map_entry)? -- @@@
*/
if (prev_entry->next->aref.ar_amap &&
(amap_refs(prev_entry->next->aref.ar_amap) != 1 ||
(merged && prev_entry->aref.ar_amap))) {
goto nomerge;
}
if (merged) {
/*
* Try to extend the amap of the previous entry to
* cover the next entry as well. If it doesn't work
* just skip on, don't actually give up, since we've
* already completed the back merge.
*/
if (prev_entry->aref.ar_amap) {
if (amap_extend(prev_entry,
prev_entry->next->end -
prev_entry->next->start,
amapwaitflag | AMAP_EXTEND_FORWARDS))
goto nomerge;
}
/*
* Try to extend the amap of the *next* entry
* back to cover the new allocation *and* the
* previous entry as well (the previous merge
* didn't have an amap already otherwise we
* wouldn't be checking here for an amap). If
* it doesn't work just skip on, again, don't
* actually give up, since we've already
* completed the back merge.
*/
else if (prev_entry->next->aref.ar_amap) {
if (amap_extend(prev_entry->next,
prev_entry->end -
prev_entry->start,
amapwaitflag | AMAP_EXTEND_BACKWARDS))
goto nomerge;
}
} else {
/*
* Pull the next entry's amap backwards to cover this
* new allocation.
*/
if (prev_entry->next->aref.ar_amap) {
error = amap_extend(prev_entry->next, size,
amapwaitflag | AMAP_EXTEND_BACKWARDS);
if (error)
goto nomerge;
}
}
if (merged) {
if (kmap) {
UVMMAP_EVCNT_DECR(kbackmerge);
UVMMAP_EVCNT_INCR(kbimerge);
} else {
UVMMAP_EVCNT_DECR(ubackmerge);
UVMMAP_EVCNT_INCR(ubimerge);
}
} else {
if (kmap) {
UVMMAP_EVCNT_INCR(kforwmerge);
} else {
UVMMAP_EVCNT_INCR(uforwmerge);
}
}
UVMHIST_LOG(maphist," starting forward merge", 0, 0, 0, 0);
/*
* drop our reference to uobj since we are extending a reference
* that we already have (the ref count can not drop to zero).
*/
if (uobj && uobj->pgops->pgo_detach)
uobj->pgops->pgo_detach(uobj);
if (merged) {
dead = prev_entry->next;
prev_entry->end = dead->end;
uvm_map_entry_unlink(map, dead);
if (dead->aref.ar_amap != NULL) {
prev_entry->aref = dead->aref;
dead->aref.ar_amap = NULL;
}
} else {
prev_entry->next->start -= size;
if (prev_entry != &map->header) {
prev_entry->gap -= size;
KASSERT(prev_entry->gap == uvm_rb_gap(prev_entry));
uvm_rb_fixup(map, prev_entry);
}
if (uobj)
prev_entry->next->offset = uoffset;
}
uvm_map_check(map, "map forwardmerged");
UVMHIST_LOG(maphist,"<- done forwardmerge", 0, 0, 0, 0);
merged++;
}
nomerge:
if (!merged) {
UVMHIST_LOG(maphist," allocating new map entry", 0, 0, 0, 0);
if (kmap) {
UVMMAP_EVCNT_INCR(knomerge);
} else {
UVMMAP_EVCNT_INCR(unomerge);
}
/*
* allocate new entry and link it in.
*/
if (new_entry == NULL) {
new_entry = uvm_mapent_alloc(map,
(flags & UVM_FLAG_NOWAIT));
if (__predict_false(new_entry == NULL)) {
error = ENOMEM;
goto done;
}
}
new_entry->start = start;
new_entry->end = new_entry->start + size;
new_entry->object.uvm_obj = uobj;
new_entry->offset = uoffset;
new_entry->etype = newetype;
if (flags & UVM_FLAG_NOMERGE) {
new_entry->flags |= UVM_MAP_NOMERGE;
}
new_entry->protection = prot;
new_entry->max_protection = maxprot;
new_entry->inheritance = inherit;
new_entry->wired_count = 0;
new_entry->advice = advice;
if (flags & UVM_FLAG_OVERLAY) {
/*
* to_add: for BSS we overallocate a little since we
* are likely to extend
*/
vaddr_t to_add = (flags & UVM_FLAG_AMAPPAD) ?
UVM_AMAP_CHUNK << PAGE_SHIFT : 0;
struct vm_amap *amap = amap_alloc(size, to_add,
(flags & UVM_FLAG_NOWAIT));
if (__predict_false(amap == NULL)) {
error = ENOMEM;
goto done;
}
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = amap;
} else {
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = NULL;
}
uvm_map_entry_link(map, prev_entry, new_entry);
/*
* Update the free space hint
*/
if ((map->first_free == prev_entry) &&
(prev_entry->end >= new_entry->start))
map->first_free = new_entry;
new_entry = NULL;
}
map->size += size;
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
error = 0;
done:
vm_map_unlock(map);
if (new_entry) {
uvm_mapent_free(new_entry);
}
if (dead) {
KDASSERT(merged);
uvm_mapent_free(dead);
}
if (dead_entries)
uvm_unmap_detach(dead_entries, 0);
return error;
}
/*
* uvm_map_lookup_entry_bytree: lookup an entry in tree
*/
static inline bool
uvm_map_lookup_entry_bytree(struct vm_map *map, vaddr_t address,
struct vm_map_entry **entry /* OUT */)
{
struct vm_map_entry *prev = &map->header;
struct vm_map_entry *cur = ROOT_ENTRY(map);
while (cur) {
UVMMAP_EVCNT_INCR(mlk_treeloop);
if (address >= cur->start) {
if (address < cur->end) {
*entry = cur;
return true;
}
prev = cur;
cur = RIGHT_ENTRY(cur);
} else
cur = LEFT_ENTRY(cur);
}
*entry = prev;
return false;
}
/*
* uvm_map_lookup_entry: find map entry at or before an address
*
* => map must at least be read-locked by caller
* => entry is returned in "entry"
* => return value is true if address is in the returned entry
*/
bool
uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
struct vm_map_entry **entry /* OUT */)
{
struct vm_map_entry *cur;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,addr=%#jx,ent=%#jx)",
(uintptr_t)map, address, (uintptr_t)entry, 0);
/*
* make a quick check to see if we are already looking at
* the entry we want (which is usually the case). note also
* that we don't need to save the hint here... it is the
* same hint (unless we are at the header, in which case the
* hint didn't buy us anything anyway).
*/
cur = map->hint;
UVMMAP_EVCNT_INCR(mlk_call);
if (cur != &map->header && address >= cur->start && cur->end > address) {
UVMMAP_EVCNT_INCR(mlk_hint);
*entry = cur;
UVMHIST_LOG(maphist,"<- got it via hint (%#jx)",
(uintptr_t)cur, 0, 0, 0);
uvm_mapent_check(*entry);
return (true);
}
uvm_map_check(map, __func__);
/*
* lookup in the tree.
*/
UVMMAP_EVCNT_INCR(mlk_tree);
if (__predict_true(uvm_map_lookup_entry_bytree(map, address, entry))) {
SAVE_HINT(map, map->hint, *entry);
UVMHIST_LOG(maphist,"<- search got it (%#jx)",
(uintptr_t)cur, 0, 0, 0);
KDASSERT((*entry)->start <= address);
KDASSERT(address < (*entry)->end);
uvm_mapent_check(*entry);
return (true);
}
SAVE_HINT(map, map->hint, *entry);
UVMHIST_LOG(maphist,"<- failed!",0,0,0,0);
KDASSERT((*entry) == &map->header || (*entry)->end <= address);
KDASSERT((*entry)->next == &map->header ||
address < (*entry)->next->start);
return (false);
}
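/*
* Example: in a map with entries A = [0x1000,0x2000) and
* B = [0x4000,0x5000), a lookup of 0x4800 returns true with
* *entry == B, while a lookup of 0x3000 (in the gap) returns false
* with *entry == A, the entry immediately before the address.
*/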
/*
* See if the range between start and start + length fits in the gap
* between entry->end and entry->next->start. Returns 1 if it fits,
* 0 if it doesn't, and -1 if the address wraps around.
*/
static int
uvm_map_space_avail(vaddr_t *start, vsize_t length, voff_t uoffset,
vsize_t align, int flags, int topdown, struct vm_map_entry *entry)
{
vaddr_t end;
#ifdef PMAP_PREFER
/*
* push start address forward as needed to avoid VAC alias problems.
* we only do this if a valid offset is specified.
*/
if (uoffset != UVM_UNKNOWN_OFFSET)
PMAP_PREFER(uoffset, start, length, topdown);
#endif
if ((flags & UVM_FLAG_COLORMATCH) != 0) {
KASSERT(align < uvmexp.ncolors);
if (uvmexp.ncolors > 1) {
const u_int colormask = uvmexp.colormask;
const u_int colorsize = colormask + 1;
vaddr_t hint = atop(*start);
const u_int color = hint & colormask;
if (color != align) {
hint -= color; /* adjust to color boundary */
KASSERT((hint & colormask) == 0);
if (topdown) {
if (align > color)
hint -= colorsize;
} else {
if (align < color)
hint += colorsize;
}
*start = ptoa(hint + align); /* adjust to color */
}
}
} else {
KASSERT(powerof2(align));
uvm_map_align_va(start, align, topdown);
/*
* XXX Should we PMAP_PREFER() here again?
* eh...i think we're okay
*/
}
/*
* Find the end of the proposed new region. Be sure we didn't
* wrap around the address; if so, we lose. Otherwise, if the
* proposed new region fits before the next entry, we win.
*/
end = *start + length;
if (end < *start)
return (-1);
if (entry->next->start >= end && *start >= entry->end)
return (1);
return (0);
}
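/*
* Color-matching example for the UVM_FLAG_COLORMATCH path above: with
* uvmexp.ncolors == 4 (colormask 3) and a requested color ("align")
* of 2, a bottom-up hint at page 0x12345 (color 1) is rounded down to
* the color boundary 0x12344 and then offset to page 0x12346, the
* nearest page at or above the hint with the desired color; top-down,
* the same hint moves back to page 0x12342.
*/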
static void
uvm_findspace_invariants(struct vm_map *map, vaddr_t orig_hint, vaddr_t length,
struct uvm_object *uobj, voff_t uoffset, vsize_t align, int flags,
vaddr_t hint, struct vm_map_entry *entry, int line)
{
const int topdown = map->flags & VM_MAP_TOPDOWN;
KASSERTMSG( topdown || hint >= orig_hint,
"map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR
" length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE
" flags=%#x entry=%p (uvm_map_findspace line %d)",
map, hint, orig_hint,
length, uobj, (unsigned long long)uoffset, align,
flags, entry, line);
#ifndef __sh3__ /* XXXRO: kern/51254 */
KASSERTMSG(!topdown || hint <= orig_hint,
#else
if (__predict_false(!(!topdown || hint <= orig_hint)))
printf(
#endif
"map=%p hint=%#"PRIxVADDR" orig_hint=%#"PRIxVADDR
" length=%#"PRIxVSIZE" uobj=%p uoffset=%#llx align=%"PRIxVSIZE
" flags=%#x entry=%p (uvm_map_findspace line %d)",
map, hint, orig_hint,
length, uobj, (unsigned long long)uoffset, align,
flags, entry, line);
}
/*
* uvm_map_findspace: find "length" sized space in "map".
*
* => "hint" is a hint about where we want it, unless UVM_FLAG_FIXED is
* set in "flags" (in which case we insist on using "hint").
* => "result" is VA returned
* => uobj/uoffset are to be used to handle VAC alignment, if required
* => if "align" is non-zero, we attempt to align to that value.
* => caller must at least have read-locked map
* => returns NULL on failure, or pointer to prev. map entry if success
* => note this is a cross between the old vm_map_findspace and vm_map_find
*/
struct vm_map_entry *
uvm_map_findspace(struct vm_map *map, vaddr_t hint, vsize_t length,
vaddr_t *result /* OUT */, struct uvm_object *uobj, voff_t uoffset,
vsize_t align, int flags)
{
#define INVARIANTS() \
uvm_findspace_invariants(map, orig_hint, length, uobj, uoffset, align,\
flags, hint, entry, __LINE__)
struct vm_map_entry *entry = NULL;
struct vm_map_entry *child, *prev, *tmp;
vaddr_t orig_hint __diagused;
const int topdown = map->flags & VM_MAP_TOPDOWN;
int avail;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, hint=%#jx, len=%ju, flags=%#jx...",
(uintptr_t)map, hint, length, flags);
UVMHIST_LOG(maphist, " uobj=%#jx, uoffset=%#jx, align=%#jx)",
(uintptr_t)uobj, uoffset, align, 0);
KASSERT((flags & UVM_FLAG_COLORMATCH) != 0 || powerof2(align));
KASSERT((flags & UVM_FLAG_COLORMATCH) == 0 || align < uvmexp.ncolors);
KASSERT((flags & UVM_FLAG_FIXED) == 0 || align == 0);
uvm_map_check(map, "map_findspace entry");
/*
* Clamp the hint to the VM map's min/max address, and remember
* the original hint so clamped. If we are aligning, then we
* may have to try again with no alignment constraint if we
* fail the first time.
*
* We use the original hint to verify later that the search has
* been monotonic -- that is, nonincreasing or nondecreasing,
* according to topdown or !topdown respectively. But the
* clamping is not monotonic.
*/
if (hint < vm_map_min(map)) { /* check ranges ... */
if (flags & UVM_FLAG_FIXED) {
UVMHIST_LOG(maphist,"<- VA below map range",0,0,0,0);
return (NULL);
}
hint = vm_map_min(map);
}
if (hint > vm_map_max(map)) {
UVMHIST_LOG(maphist,"<- VA %#jx > range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
return (NULL);
}
orig_hint = hint;
INVARIANTS();
UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
/*
* hint may not be aligned properly; we need to round it up or down
* before proceeding further.
*/
if ((flags & UVM_FLAG_COLORMATCH) == 0) {
uvm_map_align_va(&hint, align, topdown);
INVARIANTS();
}
UVMHIST_LOG(maphist,"<- VA %#jx vs range [%#jx->%#jx]",
hint, vm_map_min(map), vm_map_max(map), 0);
/*
* Look for the first possible address; if there's already
* something at this address, we have to start after it.
*/
/*
* @@@: there are four, no, eight cases to consider.
*
* 0: found, fixed, bottom up -> fail
* 1: found, fixed, top down -> fail
* 2: found, not fixed, bottom up -> start after entry->end,
* loop up
* 3: found, not fixed, top down -> start before entry->start,
* loop down
* 4: not found, fixed, bottom up -> check entry->next->start, fail
* 5: not found, fixed, top down -> check entry->next->start, fail
* 6: not found, not fixed, bottom up -> check entry->next->start,
* loop up
* 7: not found, not fixed, top down -> check entry->next->start,
* loop down
*
* as you can see, it reduces to roughly five cases; note that
* adding top down mapping only adds one unique case (without
* it, there would be four cases).
*/
if ((flags & UVM_FLAG_FIXED) == 0 &&
hint == (topdown ? vm_map_max(map) : vm_map_min(map))) {
/*
* The uvm_map_findspace algorithm is monotonic -- for
* topdown VM it starts with a high hint and returns a
* lower free address; for !topdown VM it starts with a
* low hint and returns a higher free address. As an
* optimization, start with the first (highest for
* topdown, lowest for !topdown) free address.
*
* XXX This `optimization' probably doesn't actually do
* much in practice unless userland explicitly passes
* the VM map's minimum or maximum address, which
* varies from machine to machine (VM_MAX/MIN_ADDRESS,
* e.g. 0x7fbfdfeff000 on amd64 but 0xfffffffff000 on
* aarch64) and may vary according to other factors
* like sysctl vm.user_va0_disable. In particular, if
* the user specifies 0 as a hint to mmap, then mmap
* will choose a default address which is usually _not_
* VM_MAX/MIN_ADDRESS but something else instead like
* VM_MAX_ADDRESS - stack size - guard page overhead,
* in which case this branch is never hit.
*
* In fact, this branch appears to have been broken for
* two decades between when topdown was introduced in
* ~2003 and when it was adapted to handle the topdown
* case without violating the monotonicity assertion in
* 2022. Maybe Someone^TM should either ditch the
* optimization or find a better way to do it.
*/
entry = map->first_free;
} else {
if (uvm_map_lookup_entry(map, hint, &entry)) {
/* "hint" address already in use ... */
if (flags & UVM_FLAG_FIXED) {
UVMHIST_LOG(maphist, "<- fixed & VA in use",
0, 0, 0, 0);
return (NULL);
}
if (topdown)
/* Start from lower gap. */
entry = entry->prev;
} else if (flags & UVM_FLAG_FIXED) {
if (entry->next->start >= hint + length &&
hint + length > hint)
goto found;
/* "hint" address is gap but too small */
UVMHIST_LOG(maphist, "<- fixed mapping failed",
0, 0, 0, 0);
return (NULL); /* only one shot at it ... */
} else {
/*
* See if given hint fits in this gap.
*/
avail = uvm_map_space_avail(&hint, length,
uoffset, align, flags, topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
if (topdown) {
/*
* Still there is a chance to fit
* if hint > entry->end.
*/
} else {
/* Start from higher gap. */
entry = entry->next;
if (entry == &map->header)
goto notfound;
goto nextgap;
}
}
}
/*
* Note that the UVM_FLAG_FIXED case has already been handled above.
*/
KDASSERT((flags & UVM_FLAG_FIXED) == 0);
/* Try to find the space in the red-black tree */
/* Check slot before any entry */
if (topdown) {
KASSERTMSG(entry->next->start >= vm_map_min(map),
"map=%p entry=%p entry->next=%p"
" entry->next->start=0x%"PRIxVADDR" min=0x%"PRIxVADDR,
map, entry, entry->next,
entry->next->start, vm_map_min(map));
if (length > entry->next->start - vm_map_min(map))
hint = vm_map_min(map); /* XXX goto wraparound? */
else
hint = entry->next->start - length;
KASSERT(hint >= vm_map_min(map));
} else {
hint = entry->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align, flags,
topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
nextgap:
KDASSERT((flags & UVM_FLAG_FIXED) == 0);
/* If there is not enough space in the whole tree, we fail */
tmp = ROOT_ENTRY(map);
if (tmp == NULL || tmp->maxgap < length)
goto notfound;
prev = NULL; /* previous candidate */
/* Find an entry close to hint that has enough space */
for (; tmp;) {
KASSERT(tmp->next->start == tmp->end + tmp->gap);
if (topdown) {
if (tmp->next->start < hint + length &&
    (prev == NULL || tmp->end > prev->end)) {
if (tmp->gap >= length)
prev = tmp;
else if ((child = LEFT_ENTRY(tmp)) != NULL
&& child->maxgap >= length)
prev = tmp;
}
} else {
if (tmp->end >= hint &&
    (prev == NULL || tmp->end < prev->end)) {
if (tmp->gap >= length)
prev = tmp;
else if ((child = RIGHT_ENTRY(tmp)) != NULL
&& child->maxgap >= length)
prev = tmp;
}
}
if (tmp->next->start < hint + length)
child = RIGHT_ENTRY(tmp);
else if (tmp->end > hint)
child = LEFT_ENTRY(tmp);
else {
if (tmp->gap >= length)
break;
if (topdown)
child = LEFT_ENTRY(tmp);
else
child = RIGHT_ENTRY(tmp);
}
if (child == NULL || child->maxgap < length)
break;
tmp = child;
}
if (tmp != NULL && tmp->start < hint && hint < tmp->next->start) {
/*
* Check if the entry that we found satisfies the
* space requirement
*/
if (topdown) {
if (hint > tmp->next->start - length)
hint = tmp->next->start - length;
} else {
if (hint < tmp->end)
hint = tmp->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, tmp);
INVARIANTS();
switch (avail) {
case 1:
entry = tmp;
goto found;
case -1:
goto wraparound;
}
if (tmp->gap >= length)
goto listsearch;
}
if (prev == NULL)
goto notfound;
if (topdown) {
KASSERT(orig_hint >= prev->next->start - length ||
prev->next->start - length > prev->next->start);
hint = prev->next->start - length;
} else {
KASSERT(orig_hint <= prev->end);
hint = prev->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, prev);
INVARIANTS();
switch (avail) {
case 1:
entry = prev;
goto found;
case -1:
goto wraparound;
}
if (prev->gap >= length)
goto listsearch;
if (topdown)
tmp = LEFT_ENTRY(prev);
else
tmp = RIGHT_ENTRY(prev);
for (;;) {
KASSERT(tmp);
KASSERTMSG(tmp->maxgap >= length,
"tmp->maxgap=0x%"PRIxVSIZE" length=0x%"PRIxVSIZE,
tmp->maxgap, length);
if (topdown)
child = RIGHT_ENTRY(tmp);
else
child = LEFT_ENTRY(tmp);
if (child && child->maxgap >= length) {
tmp = child;
continue;
}
if (tmp->gap >= length)
break;
if (topdown)
tmp = LEFT_ENTRY(tmp);
else
tmp = RIGHT_ENTRY(tmp);
}
if (topdown) {
KASSERT(orig_hint >= tmp->next->start - length ||
tmp->next->start - length > tmp->next->start);
hint = tmp->next->start - length;
} else {
KASSERT(orig_hint <= tmp->end);
hint = tmp->end;
}
INVARIANTS();
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, tmp);
INVARIANTS();
switch (avail) {
case 1:
entry = tmp;
goto found;
case -1:
goto wraparound;
}
/*
* The tree fails to find an entry because of offset or alignment
* restrictions. Search the list instead.
*/
listsearch:
/*
* Look through the rest of the map, trying to fit a new region in
* the gap between existing regions, or after the very last region.
* note: entry->end = base VA of current gap,
* entry->next->start = VA of end of current gap
*/
INVARIANTS();
for (;;) {
/* Update hint for current gap. */
hint = topdown ? entry->next->start - length : entry->end;
INVARIANTS();
/* See if it fits. */
avail = uvm_map_space_avail(&hint, length, uoffset, align,
flags, topdown, entry);
INVARIANTS();
switch (avail) {
case 1:
goto found;
case -1:
goto wraparound;
}
/* Advance to next/previous gap */
if (topdown) {
if (entry == &map->header) {
UVMHIST_LOG(maphist, "<- failed (off start)",
0,0,0,0);
goto notfound;
}
entry = entry->prev;
} else {
entry = entry->next;
if (entry == &map->header) {
UVMHIST_LOG(maphist, "<- failed (off end)",
0,0,0,0);
goto notfound;
}
}
}
found:
SAVE_HINT(map, map->hint, entry);
*result = hint;
UVMHIST_LOG(maphist,"<- got it! (result=%#jx)", hint, 0,0,0);
INVARIANTS();
KASSERT(entry->end <= hint);
KASSERT(hint + length <= entry->next->start);
return (entry);
wraparound:
UVMHIST_LOG(maphist, "<- failed (wrap around)", 0,0,0,0);
return (NULL);
notfound:
UVMHIST_LOG(maphist, "<- failed (notfound)", 0,0,0,0);
return (NULL);
#undef INVARIANTS
}
/*
* U N M A P - m a i n h e l p e r f u n c t i o n s
*/
/*
* uvm_unmap_remove: remove mappings from a vm_map (from "start" up to "stop")
*
* => caller must check alignment and size
* => map must be locked by caller
* => we return a list of map entries that we've removed from the map
* in "entry_list"
*/
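/*
 * Minimal usage sketch (an illustration only, roughly what the
 * uvm_unmap1() helper does): remove the mappings with the map locked,
 * then drop the object/amap references with the map unlocked, as the
 * comments inside the function explain.
 *
 *	struct vm_map_entry *dead_entries;
 *
 *	vm_map_lock(map);
 *	uvm_unmap_remove(map, start, end, &dead_entries, 0);
 *	vm_map_unlock(map);
 *	if (dead_entries != NULL)
 *		uvm_unmap_detach(dead_entries, 0);
 */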
void
uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map_entry **entry_list /* OUT */, int flags)
{
struct vm_map_entry *entry, *first_entry, *next;
vaddr_t len;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx, start=%#jx, end=%#jx)",
(uintptr_t)map, start, end, 0);
VM_MAP_RANGE_CHECK(map, start, end);
uvm_map_check(map, "unmap_remove entry");
/*
* find first entry
*/
if (uvm_map_lookup_entry(map, start, &first_entry) == true) {
/* clip and go... */
entry = first_entry;
UVM_MAP_CLIP_START(map, entry, start);
/* critical! prevents stale hint */
SAVE_HINT(map, entry, entry->prev);
} else {
entry = first_entry->next;
}
/*
* save the free space hint
*/
if (map->first_free != &map->header && map->first_free->start >= start)
    map->first_free = entry->prev;
/*
* note: we now re-use first_entry for a different task. we remove
* a number of map entries from the map and save them in a linked
* list headed by "first_entry". once we remove them from the map
* the caller should unlock the map and drop the references to the
* backing objects [c.f. uvm_unmap_detach]. the object is to
* separate unmapping from reference dropping. why?
* [1] the map has to be locked for unmapping
* [2] the map need not be locked for reference dropping
* [3] dropping references may trigger pager I/O, and if we hit
* a pager that does synchronous I/O we may have to wait for it.
* [4] we would like all waiting for I/O to occur with maps unlocked
* so that we don't block other threads.
*/
first_entry = NULL;
*entry_list = NULL;
/*
* break up the area into map entry sized regions and unmap. note
* that all mappings have to be removed before we can even consider
* dropping references to amaps or VM objects (otherwise we could end
* up with a mapping to a page on the free list which would be very bad)
*/
while ((entry != &map->header) && (entry->start < end)) {
KASSERT((entry->flags & UVM_MAP_STATIC) == 0);
UVM_MAP_CLIP_END(map, entry, end);
next = entry->next;
len = entry->end - entry->start;
/*
* unwire before removing addresses from the pmap; otherwise
* unwiring will put the entries back into the pmap (XXX).
*/
if (VM_MAPENT_ISWIRED(entry)) {
uvm_map_entry_unwire(map, entry);
}
if (flags & UVM_FLAG_VAONLY) {
/* nothing */
} else if ((map->flags & VM_MAP_PAGEABLE) == 0) {
/*
* if the map is non-pageable, any pages mapped there
* must be wired and entered with pmap_kenter_pa(),
* and we should free any such pages immediately.
* this is mostly used for kmem_map.
*/
KASSERT(vm_map_pmap(map) == pmap_kernel());
uvm_km_pgremove_intrsafe(map, entry->start, entry->end);
} else if (UVM_ET_ISOBJ(entry) &&
UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj)) {
panic("%s: kernel object %p %p\n",
__func__, map, entry);
} else if (UVM_ET_ISOBJ(entry) || entry->aref.ar_amap) {
/*
* remove mappings the standard way. lock object
* and/or amap to ensure vm_page state does not
* change while in pmap_remove().
*/
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(entry, RW_WRITER);
#else
uvm_map_lock_entry(entry, RW_READER);
#endif
pmap_remove(map->pmap, entry->start, entry->end);
/*
* note: if map is dying, leave pmap_update() for
* later. if the map is to be reused (exec) then
* pmap_update() will be called. if the map is
* being disposed of (exit) then pmap_destroy()
* will be called.
*/
if ((map->flags & VM_MAP_DYING) == 0) {
pmap_update(vm_map_pmap(map));
} else {
KASSERT(vm_map_pmap(map) != pmap_kernel());
}
uvm_map_unlock_entry(entry);
}
#if defined(UVMDEBUG)
/*
* check if there's remaining mapping,
* which is a bug in caller.
*/
vaddr_t va;
for (va = entry->start; va < entry->end;
va += PAGE_SIZE) {
if (pmap_extract(vm_map_pmap(map), va, NULL)) {
panic("%s: %#"PRIxVADDR" has mapping",
__func__, va);
}
}
if (VM_MAP_IS_KERNEL(map) && (flags & UVM_FLAG_NOWAIT) == 0) {
uvm_km_check_empty(map, entry->start, entry->end);
}
#endif /* defined(UVMDEBUG) */
/*
* remove entry from map and put it on our list of entries
* that we've nuked. then go to next entry.
*/
UVMHIST_LOG(maphist, " removed map entry %#jx",
(uintptr_t)entry, 0, 0, 0);
/* critical! prevents stale hint */
SAVE_HINT(map, entry, entry->prev);
uvm_map_entry_unlink(map, entry);
KASSERT(map->size >= len);
map->size -= len;
entry->prev = NULL;
entry->next = first_entry;
first_entry = entry;
entry = next;
}
uvm_map_check(map, "unmap_remove leave");
/*
* now we've cleaned up the map and are ready for the caller to drop
* references to the mapped objects.
*/
*entry_list = first_entry;
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
if (map->flags & VM_MAP_WANTVA) {
mutex_enter(&map->misc_lock);
map->flags &= ~VM_MAP_WANTVA;
cv_broadcast(&map->cv);
mutex_exit(&map->misc_lock);
}
}
/*
* uvm_unmap_detach: drop references in a chain of map entries
*
* => we will free the map entries as we traverse the list.
*/
void
uvm_unmap_detach(struct vm_map_entry *first_entry, int flags)
{
struct vm_map_entry *next_entry;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
while (first_entry) {
KASSERT(!VM_MAPENT_ISWIRED(first_entry));
UVMHIST_LOG(maphist,
" detach %#jx: amap=%#jx, obj=%#jx, submap?=%jd",
(uintptr_t)first_entry,
(uintptr_t)first_entry->aref.ar_amap,
(uintptr_t)first_entry->object.uvm_obj,
UVM_ET_ISSUBMAP(first_entry));
/*
* drop reference to amap, if we've got one
*/
if (first_entry->aref.ar_amap) uvm_map_unreference_amap(first_entry, flags);
/*
* drop reference to our backing object, if we've got one
*/
KASSERT(!UVM_ET_ISSUBMAP(first_entry));
if (UVM_ET_ISOBJ(first_entry) &&
first_entry->object.uvm_obj->pgops->pgo_detach) {
(*first_entry->object.uvm_obj->pgops->pgo_detach)
(first_entry->object.uvm_obj);
}
next_entry = first_entry->next;
uvm_mapent_free(first_entry);
first_entry = next_entry;
}
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
/*
* E X T R A C T I O N F U N C T I O N S
*/
/*
* uvm_map_reserve: reserve space in a vm_map for future use.
*
* => we reserve space in a map by putting a dummy map entry in the
* map (dummy means obj=NULL, amap=NULL, prot=VM_PROT_NONE)
* => map should be unlocked (we will write lock it)
* => we return true if we were able to reserve space
* => XXXCDC: should be inline?
*/
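/*
 * Illustrative sketch only (argument choices are assumptions; see the
 * real call in uvm_map_extract() below): reserve "len" bytes of blank,
 * VM_PROT_NONE virtual space starting from a hint.
 *
 *	vaddr_t va = vm_map_min(map);
 *
 *	if (!uvm_map_reserve(map, len, 0, 0, &va, 0))
 *		return ENOMEM;
 *	... "va" now names a len-byte reservation, to be filled in later,
 *	    e.g. via uvm_map_replace() ...
 */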
int
uvm_map_reserve(struct vm_map *map, vsize_t size,
vaddr_t offset /* hint for pmap_prefer */,
vsize_t align /* alignment */,
vaddr_t *raddr /* IN:hint, OUT: reserved VA */,
uvm_flag_t flags /* UVM_FLAG_FIXED or UVM_FLAG_COLORMATCH or 0 */)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, size=%#jx, offset=%#jx, addr=%#jx)",
(uintptr_t)map, size, offset, (uintptr_t)raddr);
size = round_page(size);
/*
* reserve some virtual space.
*/
if (uvm_map(map, raddr, size, NULL, offset, align,
UVM_MAPFLAG(UVM_PROT_NONE, UVM_PROT_NONE, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_NOMERGE|flags)) != 0) {
UVMHIST_LOG(maphist, "<- done (no VM)", 0,0,0,0);
return (false);
}
UVMHIST_LOG(maphist, "<- done (*raddr=%#jx)", *raddr,0,0,0);
return (true);
}
/*
* uvm_map_replace: replace a reserved (blank) area of memory with
* real mappings.
*
* => caller must WRITE-LOCK the map
* => we return true if replacement was a success
* => we expect the newents chain to have nnewents entries on it and
* we expect newents->prev to point to the last entry on the list
* => note newents is allowed to be NULL
*/
static int
uvm_map_replace(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map_entry *newents, int nnewents, vsize_t nsize,
struct vm_map_entry **oldentryp)
{
struct vm_map_entry *oldent, *last;
uvm_map_check(map, "map_replace entry");
/*
* first find the blank map entry at the specified address
*/
if (!uvm_map_lookup_entry(map, start, &oldent)) {
return (false);
}
/*
* check to make sure we have a proper blank entry
*/
if (end < oldent->end) {
UVM_MAP_CLIP_END(map, oldent, end);
}
if (oldent->start != start || oldent->end != end ||
    oldent->object.uvm_obj != NULL || oldent->aref.ar_amap != NULL) {
return (false);
}
#ifdef DIAGNOSTIC
/*
* sanity check the newents chain
*/
{
struct vm_map_entry *tmpent = newents;
int nent = 0;
vsize_t sz = 0;
vaddr_t cur = start;
while (tmpent) {
nent++;
sz += tmpent->end - tmpent->start;
if (tmpent->start < cur)
panic("uvm_map_replace1");
if (tmpent->start >= tmpent->end || tmpent->end > end) {
panic("uvm_map_replace2: "
"tmpent->start=%#"PRIxVADDR
", tmpent->end=%#"PRIxVADDR
", end=%#"PRIxVADDR,
tmpent->start, tmpent->end, end);
}
cur = tmpent->end;
if (tmpent->next) {
if (tmpent->next->prev != tmpent) panic("uvm_map_replace3");
} else {
if (newents->prev != tmpent) panic("uvm_map_replace4");
}
tmpent = tmpent->next;
}
if (nent != nnewents)
panic("uvm_map_replace5");
if (sz != nsize)
panic("uvm_map_replace6");
}
#endif
/*
* map entry is a valid blank! replace it. (this does all the
* work of map entry link/unlink...).
*/
if (newents) {
last = newents->prev;
/* critical: flush stale hints out of map */
SAVE_HINT(map, map->hint, newents);
if (map->first_free == oldent) map->first_free = last;
last->next = oldent->next;
last->next->prev = last;
/* Fix RB tree */
uvm_rb_remove(map, oldent);
newents->prev = oldent->prev;
newents->prev->next = newents;
map->nentries = map->nentries + (nnewents - 1);
/* Fixup the RB tree */
{
int i;
struct vm_map_entry *tmp;
tmp = newents;
for (i = 0; i < nnewents && tmp; i++) {
uvm_rb_insert(map, tmp);
tmp = tmp->next;
}
}
} else {
/* NULL list of new entries: just remove the old one */
clear_hints(map, oldent);
uvm_map_entry_unlink(map, oldent);
}
map->size -= end - start - nsize;
uvm_map_check(map, "map_replace leave");
/*
* now we can free the old blank entry and return.
*/
*oldentryp = oldent;
return (true);
}
/*
* uvm_map_extract: extract a mapping from a map and put it somewhere
* (maybe removing the old mapping)
*
* => maps should be unlocked (we will write lock them)
* => returns 0 on success, error code otherwise
* => start must be page aligned
* => len must be page sized
* => flags:
* UVM_EXTRACT_REMOVE: remove mappings from srcmap
* UVM_EXTRACT_CONTIG: abort if unmapped area (advisory only)
* UVM_EXTRACT_QREF: for a temporary extraction do quick obj refs
* UVM_EXTRACT_FIXPROT: set prot to maxprot as we go
* UVM_EXTRACT_PROT_ALL: set prot to UVM_PROT_ALL as we go
* >>>NOTE: if you set REMOVE, you are not allowed to use CONTIG or QREF!<<<
* >>>NOTE: QREF's must be unmapped via the QREF path, thus should only
* be used from within the kernel in a kernel level map <<<
*/
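/*
 * Illustrative call (the flag combination is an assumption, not a
 * recipe): temporarily double-map part of a source map into a kernel
 * level map using quick references, insisting that the source range be
 * fully mapped and taking the maximum protection.
 *
 *	vaddr_t kva;
 *	int error;
 *
 *	error = uvm_map_extract(srcmap, start, len, kernel_map, &kva,
 *	    UVM_EXTRACT_QREF | UVM_EXTRACT_CONTIG | UVM_EXTRACT_FIXPROT);
 *	if (error)
 *		return error;
 *	... use [kva, kva + len), then undo the temporary mapping as the
 *	    QREF note above requires ...
 */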
int
uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
struct vm_map *dstmap, vaddr_t *dstaddrp, int flags)
{
vaddr_t dstaddr, end, newend, oldoffset, fudge, orig_fudge;
struct vm_map_entry *chain, *endchain, *entry, *orig_entry, *newentry,
*deadentry, *oldentry;
struct vm_map_entry *resentry = NULL; /* a dummy reservation entry */
vsize_t elen __unused;
int nchain, error, copy_ok;
vsize_t nsize;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(srcmap=%#jx,start=%#jx, len=%#jx",
(uintptr_t)srcmap, start, len, 0);
UVMHIST_LOG(maphist," ...,dstmap=%#jx, flags=%#jx)",
(uintptr_t)dstmap, flags, 0, 0);
/*
* step 0: sanity check: start must be on a page boundary, length
* must be page sized. can't ask for CONTIG/QREF if you asked for
* REMOVE.
*/
KASSERTMSG((start & PAGE_MASK) == 0, "start=0x%"PRIxVADDR, start);
KASSERTMSG((len & PAGE_MASK) == 0, "len=0x%"PRIxVADDR, len);
KASSERT((flags & UVM_EXTRACT_REMOVE) == 0 ||
(flags & (UVM_EXTRACT_CONTIG|UVM_EXTRACT_QREF)) == 0);
/*
* step 1: reserve space in the target map for the extracted area
*/
if ((flags & UVM_EXTRACT_RESERVED) == 0) {
dstaddr = vm_map_min(dstmap);
if (!uvm_map_reserve(dstmap, len, start,
atop(start) & uvmexp.colormask, &dstaddr,
UVM_FLAG_COLORMATCH))
return (ENOMEM);
KASSERT((atop(start ^ dstaddr) & uvmexp.colormask) == 0);
*dstaddrp = dstaddr; /* pass address back to caller */
UVMHIST_LOG(maphist, " dstaddr=%#jx", dstaddr,0,0,0);
} else {
dstaddr = *dstaddrp;
}
/*
* step 2: setup for the extraction process loop by init'ing the
* map entry chain, locking src map, and looking up the first useful
* entry in the map.
*/
end = start + len;
newend = dstaddr + len;
chain = endchain = NULL;
nchain = 0;
nsize = 0;
vm_map_lock(srcmap);
if (uvm_map_lookup_entry(srcmap, start, &entry)) {
/* "start" is within an entry */
if (flags & UVM_EXTRACT_QREF) {
/*
* for quick references we don't clip the entry, so
* the entry may map space "before" the starting
* virtual address... this is the "fudge" factor
* (which can be non-zero only the first time
* through the "while" loop in step 3).
*/
fudge = start - entry->start;
} else {
/*
* normal reference: we clip the map to fit (thus
* fudge is zero)
*/
UVM_MAP_CLIP_START(srcmap, entry, start);
SAVE_HINT(srcmap, srcmap->hint, entry->prev);
fudge = 0;
}
} else {
/* "start" is not within an entry ... skip to next entry */
if (flags & UVM_EXTRACT_CONTIG) {
error = EINVAL;
goto bad; /* definite hole here ... */
}
entry = entry->next;
fudge = 0;
}
/* save values from srcmap for step 6 */
orig_entry = entry;
orig_fudge = fudge;
/*
* step 3: now start looping through the map entries, extracting
* as we go.
*/
while (entry->start < end && entry != &srcmap->header) {
/* if we are not doing a quick reference, clip it */
if ((flags & UVM_EXTRACT_QREF) == 0) UVM_MAP_CLIP_END(srcmap, entry, end);
/* clear needs_copy (allow chunking) */
if (UVM_ET_ISNEEDSCOPY(entry)) {
amap_copy(srcmap, entry,
AMAP_COPY_NOWAIT|AMAP_COPY_NOMERGE, start, end);
if (UVM_ET_ISNEEDSCOPY(entry)) { /* failed? */
error = ENOMEM;
goto bad;
}
/* amap_copy could clip (during chunk)! update fudge */
if (fudge) {
fudge = start - entry->start;
orig_fudge = fudge;
}
}
/* calculate the offset of this from "start" */
oldoffset = (entry->start + fudge) - start;
/* allocate a new map entry */
newentry = uvm_mapent_alloc(dstmap, 0);
if (newentry == NULL) {
error = ENOMEM;
goto bad;
}
/* set up new map entry */
newentry->next = NULL;
newentry->prev = endchain;
newentry->start = dstaddr + oldoffset;
newentry->end =
newentry->start + (entry->end - (entry->start + fudge));
if (newentry->end > newend || newentry->end < newentry->start)
newentry->end = newend;
newentry->object.uvm_obj = entry->object.uvm_obj;
if (newentry->object.uvm_obj) {
if (newentry->object.uvm_obj->pgops->pgo_reference)
    newentry->object.uvm_obj->pgops->
pgo_reference(newentry->object.uvm_obj);
newentry->offset = entry->offset + fudge;
} else {
newentry->offset = 0;
}
newentry->etype = entry->etype;
if (flags & UVM_EXTRACT_PROT_ALL) {
newentry->protection = newentry->max_protection =
UVM_PROT_ALL;
} else {
newentry->protection = (flags & UVM_EXTRACT_FIXPROT) ?
entry->max_protection : entry->protection;
newentry->max_protection = entry->max_protection;
}
newentry->inheritance = entry->inheritance;
newentry->wired_count = 0;
newentry->aref.ar_amap = entry->aref.ar_amap;
if (newentry->aref.ar_amap) {
newentry->aref.ar_pageoff =
entry->aref.ar_pageoff + (fudge >> PAGE_SHIFT);
uvm_map_reference_amap(newentry, AMAP_SHARED |
((flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0));
} else {
newentry->aref.ar_pageoff = 0;
}
newentry->advice = entry->advice;
if ((flags & UVM_EXTRACT_QREF) != 0) {
newentry->flags |= UVM_MAP_NOMERGE;
}
/* now link it on the chain */
nchain++;
nsize += newentry->end - newentry->start;
if (endchain == NULL) {
chain = endchain = newentry;
} else {
endchain->next = newentry;
endchain = newentry;
}
/* end of 'while' loop! */
if ((flags & UVM_EXTRACT_CONTIG) && entry->end < end && (entry->next == &srcmap->header ||
entry->next->start != entry->end)) {
error = EINVAL;
goto bad;
}
entry = entry->next;
fudge = 0;
}
/*
* step 4: close off chain (in format expected by uvm_map_replace)
*/
if (chain) chain->prev = endchain;
/*
* step 5: attempt to lock the dest map so we can pmap_copy.
* note usage of copy_ok:
* 1 => dstmap locked, pmap_copy ok, and we "replace" here (step 5)
* 0 => dstmap unlocked, NO pmap_copy, and we will "replace" in step 7
*/
if (srcmap == dstmap || vm_map_lock_try(dstmap) == true) {
copy_ok = 1;
if (!uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
nchain, nsize, &resentry)) {
if (srcmap != dstmap)
vm_map_unlock(dstmap);
error = EIO;
goto bad;
}
} else {
copy_ok = 0;
/* replace deferred until step 7 */
}
/*
* step 6: traverse the srcmap a second time to do the following:
* - if we got a lock on the dstmap do pmap_copy
* - if UVM_EXTRACT_REMOVE remove the entries
* we make use of orig_entry and orig_fudge (saved in step 2)
*/
if (copy_ok || (flags & UVM_EXTRACT_REMOVE)) {
/* purge possible stale hints from srcmap */
if (flags & UVM_EXTRACT_REMOVE) {
SAVE_HINT(srcmap, srcmap->hint, orig_entry->prev);
if (srcmap->first_free != &srcmap->header &&
srcmap->first_free->start >= start)
srcmap->first_free = orig_entry->prev;
}
entry = orig_entry;
fudge = orig_fudge;
deadentry = NULL; /* for UVM_EXTRACT_REMOVE */
while (entry->start < end && entry != &srcmap->header) {
if (copy_ok) {
oldoffset = (entry->start + fudge) - start;
elen = MIN(end, entry->end) -
(entry->start + fudge);
pmap_copy(dstmap->pmap, srcmap->pmap,
dstaddr + oldoffset, elen,
entry->start + fudge);
}
/* we advance "entry" in the following if statement */
if (flags & UVM_EXTRACT_REMOVE) {
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(entry, RW_WRITER);
#else
uvm_map_lock_entry(entry, RW_READER);
#endif
pmap_remove(srcmap->pmap, entry->start,
entry->end);
uvm_map_unlock_entry(entry);
oldentry = entry; /* save entry */
entry = entry->next; /* advance */
uvm_map_entry_unlink(srcmap, oldentry);
/* add to dead list */
oldentry->next = deadentry;
deadentry = oldentry;
} else {
entry = entry->next; /* advance */
}
/* end of 'while' loop */
fudge = 0;
}
pmap_update(srcmap->pmap);
/*
* unlock dstmap. we will dispose of deadentry in
* step 7 if needed
*/
if (copy_ok && srcmap != dstmap) vm_map_unlock(dstmap);
} else {
deadentry = NULL;
}
/*
* step 7: we are done with the source map, unlock. if copy_ok
* is 0 then we have not replaced the dummy mapping in dstmap yet
* and we need to do so now.
*/
vm_map_unlock(srcmap);
if ((flags & UVM_EXTRACT_REMOVE) && deadentry)
    uvm_unmap_detach(deadentry, 0);	/* dispose of old entries */
/* now do the replacement if we didn't do it in step 5 */
if (copy_ok == 0) {
vm_map_lock(dstmap);
error = uvm_map_replace(dstmap, dstaddr, dstaddr+len, chain,
nchain, nsize, &resentry);
vm_map_unlock(dstmap);
if (error == false) {
error = EIO;
goto bad2;
}
}
if (resentry != NULL) uvm_mapent_free(resentry);
return (0);
/*
* bad: failure recovery
*/
bad:
vm_map_unlock(srcmap);
bad2: /* src already unlocked */
if (chain)
uvm_unmap_detach(chain,
(flags & UVM_EXTRACT_QREF) ? AMAP_REFALL : 0);
if (resentry != NULL)
    uvm_mapent_free(resentry);
if ((flags & UVM_EXTRACT_RESERVED) == 0) {
uvm_unmap(dstmap, dstaddr, dstaddr+len); /* ??? */
}
return (error);
}
/* end of extraction functions */
/*
* uvm_map_submap: punch down part of a map into a submap
*
* => only the kernel_map is allowed to be submapped
* => the purpose of submapping is to break up the locking granularity
* of a larger map
* => the range specified must have been mapped previously with a uvm_map()
* call [with uobj==NULL] to create a blank map entry in the main map.
* [And it had better still be blank!]
* => maps which contain submaps should never be copied or forked.
* => to remove a submap, use uvm_unmap() on the main map
* and then uvm_map_deallocate() the submap.
* => main map must be unlocked.
* => submap must have been init'd and have a zero reference count.
* [need not be locked as we don't actually reference it]
*/
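/*
 * Typical setup sketch (hedged; cf. the kernel submap creation path,
 * e.g. uvm_km_suballoc()): first create a blank entry covering
 * [min, max) in the kernel map with uvm_map() and uobj == NULL, then
 * punch the submap in.
 *
 *	... uvm_map(kernel_map, &min, max - min, NULL, ...) ...
 *	error = uvm_map_submap(kernel_map, min, max, submap);
 */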
int
uvm_map_submap(struct vm_map *map, vaddr_t start, vaddr_t end,
struct vm_map *submap)
{
struct vm_map_entry *entry;
int error;
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &entry)) {
UVM_MAP_CLIP_START(map, entry, start);
UVM_MAP_CLIP_END(map, entry, end); /* to be safe */
} else {
entry = NULL;
}
if (entry != NULL &&
entry->start == start && entry->end == end &&
entry->object.uvm_obj == NULL && entry->aref.ar_amap == NULL &&
!UVM_ET_ISCOPYONWRITE(entry) && !UVM_ET_ISNEEDSCOPY(entry)) {
entry->etype |= UVM_ET_SUBMAP;
entry->object.sub_map = submap;
entry->offset = 0;
uvm_map_reference(submap);
error = 0;
} else {
error = EINVAL;
}
vm_map_unlock(map);
return error;
}
/*
* uvm_map_protect_user: change map protection on behalf of the user.
* Enforces PAX settings as necessary.
*/
int
uvm_map_protect_user(struct lwp *l, vaddr_t start, vaddr_t end,
vm_prot_t new_prot)
{
int error;
if ((error = PAX_MPROTECT_VALIDATE(l, new_prot)))
return error;
return uvm_map_protect(&l->l_proc->p_vmspace->vm_map, start, end,
new_prot, false);
}
/*
* uvm_map_protect: change map protection
*
* => set_max means set max_protection.
* => map must be unlocked.
*/
#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
~VM_PROT_WRITE : VM_PROT_ALL)
int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t new_prot, bool set_max)
{
struct vm_map_entry *current, *entry;
int error = 0;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_prot=%#jx)",
(uintptr_t)map, start, end, new_prot);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &entry)) {
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = entry->next;
}
/*
* make a first pass to check for protection violations.
*/
current = entry;
while ((current != &map->header) && (current->start < end)) {
if (UVM_ET_ISSUBMAP(current)) {
error = EINVAL;
goto out;
}
if ((new_prot & current->max_protection) != new_prot) {
error = EACCES;
goto out;
}
/*
* Don't allow VM_PROT_EXECUTE to be set on entries that
* point to vnodes that are associated with a NOEXEC file
* system.
*/
if (UVM_ET_ISOBJ(current) &&
UVM_OBJ_IS_VNODE(current->object.uvm_obj)) {
struct vnode *vp =
(struct vnode *) current->object.uvm_obj;
if ((new_prot & VM_PROT_EXECUTE) != 0 &&
(vp->v_mount->mnt_flag & MNT_NOEXEC) != 0) {
error = EACCES;
goto out;
}
}
current = current->next;
}
/* go back and fix up protections (no need to clip this time). */
current = entry;
while ((current != &map->header) && (current->start < end)) {
vm_prot_t old_prot;
UVM_MAP_CLIP_END(map, current, end);
old_prot = current->protection;
if (set_max)
current->protection =
(current->max_protection = new_prot) & old_prot;
else
current->protection = new_prot;
/*
* update physical map if necessary. worry about copy-on-write
* here -- CHECK THIS XXX
*/
if (current->protection != old_prot) {
/* update pmap! */
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(current, RW_WRITER);
#else
uvm_map_lock_entry(current, RW_READER);
#endif
pmap_protect(map->pmap, current->start, current->end,
current->protection & MASK(current));
uvm_map_unlock_entry(current);
/*
* If this entry points at a vnode, and the
* protection includes VM_PROT_EXECUTE, mark
* the vnode as VEXECMAP.
*/
if (UVM_ET_ISOBJ(current)) {
struct uvm_object *uobj =
current->object.uvm_obj;
if (UVM_OBJ_IS_VNODE(uobj) &&
(current->protection & VM_PROT_EXECUTE)) {
vn_markexec((struct vnode *) uobj);
}
}
}
/*
* If the map is configured to lock any future mappings,
* wire this entry now if the old protection was VM_PROT_NONE
* and the new protection is not VM_PROT_NONE.
*/
if ((map->flags & VM_MAP_WIREFUTURE) != 0 && VM_MAPENT_ISWIRED(current) == 0 &&
old_prot == VM_PROT_NONE &&
new_prot != VM_PROT_NONE) {
/*
* We must call pmap_update() here because the
* pmap_protect() call above might have removed some
* pmap entries and uvm_map_pageable() might create
* some new pmap entries that rely on the prior
* removals being completely finished.
*/
pmap_update(map->pmap);
if (uvm_map_pageable(map, current->start,
current->end, false,
UVM_LK_ENTER|UVM_LK_EXIT) != 0) {
/*
* If locking the entry fails, remember the
* error if it's the first one. Note we
* still continue setting the protection in
* the map, but will return the error
* condition regardless.
*
* XXX Ignore what the actual error is,
* XXX just call it a resource shortage
* XXX so that it doesn't get confused
* XXX what uvm_map_protect() itself would
* XXX normally return.
*/
error = ENOMEM;
}
}
current = current->next;
}
pmap_update(map->pmap);
out:
vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done, error=%jd",error,0,0,0);
return error;
}
#undef MASK
/*
* uvm_map_inherit: set inheritance code for range of addrs in map.
*
* => map must be unlocked
* => note that the inherit code is used during a "fork". see fork
* code for details.
*/
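/*
 * Example sketch (minherit(2)-style use): keep the range shared with
 * children across fork().
 *
 *	error = uvm_map_inherit(map, start, end, MAP_INHERIT_SHARE);
 */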
int
uvm_map_inherit(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_inherit_t new_inheritance)
{
struct vm_map_entry *entry, *temp_entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_inh=%#jx)",
(uintptr_t)map, start, end, new_inheritance);
switch (new_inheritance) {
case MAP_INHERIT_NONE:
case MAP_INHERIT_COPY:
case MAP_INHERIT_SHARE:
case MAP_INHERIT_ZERO:
break;
default:
UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
return EINVAL;
}
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &temp_entry)) {
entry = temp_entry;
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = temp_entry->next;
}
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
entry->inheritance = new_inheritance;
entry = entry->next;
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
/*
* uvm_map_advice: set advice code for range of addrs in map.
*
* => map must be unlocked
*/
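/*
 * Example sketch (madvise(2)-style use): declare sequential access so
 * the fault path can read ahead more aggressively.
 *
 *	error = uvm_map_advice(map, start, end, MADV_SEQUENTIAL);
 */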
int
uvm_map_advice(struct vm_map *map, vaddr_t start, vaddr_t end, int new_advice)
{
struct vm_map_entry *entry, *temp_entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_adv=%#jx)",
(uintptr_t)map, start, end, new_advice);
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (uvm_map_lookup_entry(map, start, &temp_entry)) {
entry = temp_entry;
UVM_MAP_CLIP_START(map, entry, start);
} else {
entry = temp_entry->next;
}
/*
* XXXJRT: disallow holes?
*/
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
switch (new_advice) {
case MADV_NORMAL:
case MADV_RANDOM:
case MADV_SEQUENTIAL:
/* nothing special here */
break;
default:
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (INVALID ARG)",0,0,0,0);
return EINVAL;
}
entry->advice = new_advice;
entry = entry->next;
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
/*
* uvm_map_willneed: apply MADV_WILLNEED
*/
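/*
 * Example sketch (madvise(MADV_WILLNEED)-style use):
 *
 *	error = uvm_map_willneed(&p->p_vmspace->vm_map, start, end);
 */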
int
uvm_map_willneed(struct vm_map *map, vaddr_t start, vaddr_t end)
{
struct vm_map_entry *entry;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx)",
(uintptr_t)map, start, end, 0);
vm_map_lock_read(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!uvm_map_lookup_entry(map, start, &entry)) {
entry = entry->next;
}
while (entry->start < end) {
struct vm_amap * const amap = entry->aref.ar_amap;
struct uvm_object * const uobj = entry->object.uvm_obj;
KASSERT(entry != &map->header);
KASSERT(start < entry->end);
/*
* For now, we handle only the easy but commonly-requested case.
* ie. start prefetching of backing uobj pages.
*
* XXX It might be useful to pmap_enter() the already-in-core
* pages by inventing a "weak" mode for uvm_fault() which would
* only do the PGO_LOCKED pgo_get().
*/
if (UVM_ET_ISOBJ(entry) && amap == NULL && uobj != NULL) {
off_t offset;
off_t size;
offset = entry->offset;
if (start < entry->start) {
offset += entry->start - start;
}
size = entry->offset + (entry->end - entry->start);
if (entry->end < end) {
size -= end - entry->end;
}
uvm_readahead(uobj, offset, size);
}
entry = entry->next;
}
vm_map_unlock_read(map);
UVMHIST_LOG(maphist,"<- done (OK)",0,0,0,0);
return 0;
}
/*
* uvm_map_pageable: sets the pageability of a range in a map.
*
* => wires map entries. should not be used for transient page locking.
* for that, use uvm_fault_wire()/uvm_fault_unwire() (see uvm_vslock()).
* => regions specified as not pageable require lock-down (wired) memory
* and page tables.
* => map must never be read-locked
* => if islocked is true, map is already write-locked
* => we always unlock the map, since we must downgrade to a read-lock
* to call uvm_fault_wire()
* => XXXCDC: check this and try and clean it up.
*/
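/*
 * Illustrative mlock(2)/munlock(2)-style calls (a sketch only; the map
 * is handed in unlocked and no UVM_LK_* flags are passed):
 *
 *	wire:	error = uvm_map_pageable(&p->p_vmspace->vm_map,
 *		    start, end, false, 0);
 *	unwire:	error = uvm_map_pageable(&p->p_vmspace->vm_map,
 *		    start, end, true, 0);
 */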
int
uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
bool new_pageable, int lockflags)
{
struct vm_map_entry *entry, *start_entry, *failed_entry;
int rv;
#ifdef DIAGNOSTIC
u_int timestamp_save;
#endif
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,new_pageable=%ju)",
(uintptr_t)map, start, end, new_pageable);
KASSERT(map->flags & VM_MAP_PAGEABLE);
if ((lockflags & UVM_LK_ENTER) == 0)
    vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
/*
* only one pageability change may take place at one time, since
* uvm_fault_wire assumes it will be called only once for each
* wiring/unwiring. therefore, we have to make sure we're actually
* changing the pageability for the entire region. we do so before
* making any changes.
*/
if (uvm_map_lookup_entry(map, start, &start_entry) == false) {
if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (fault)",0,0,0,0);
return EFAULT;
}
entry = start_entry;
if (start == end) { /* nothing required */
if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (nothing)",0,0,0,0);
return 0;
}
/*
* handle wiring and unwiring separately.
*/
if (new_pageable) { /* unwire */
UVM_MAP_CLIP_START(map, entry, start);
/*
* unwiring. first ensure that the range to be unwired is
* really wired down and that there are no holes.
*/
while ((entry != &map->header) && (entry->start < end)) {
if (entry->wired_count == 0 ||
    (entry->end < end && (entry->next == &map->header ||
entry->next->start > entry->end))) {
if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done (INVAL)",0,0,0,0);
return EINVAL;
}
entry = entry->next;
}
/*
* POSIX 1003.1b - a single munlock call unlocks a region,
* regardless of the number of mlock calls made on that
* region.
*/
entry = start_entry;
while ((entry != &map->header) && (entry->start < end)) {
UVM_MAP_CLIP_END(map, entry, end);
if (VM_MAPENT_ISWIRED(entry))
    uvm_map_entry_unwire(map, entry);
entry = entry->next;
}
if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
return 0;
}
/*
* wire case: in two passes [XXXCDC: ugly block of code here]
*
* 1: holding the write lock, we create any anonymous maps that need
* to be created. then we clip each map entry to the region to
* be wired and increment its wiring count.
*
* 2: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the pages for any newly wired area (wired_count == 1).
*
* downgrading to a read lock for uvm_fault_wire avoids a possible
* deadlock with another thread that may have faulted on one of
* the pages to be wired (it would mark the page busy, blocking
* us, then in turn block on the map lock that we hold). because
* of problems in the recursive lock package, we cannot upgrade
* to a write lock in vm_map_lookup. thus, any actions that
* require the write lock must be done beforehand. because we
* keep the read lock on the map, the copy-on-write status of the
* entries we modify here cannot change.
*/
while ((entry != &map->header) && (entry->start < end)) {
if (VM_MAPENT_ISWIRED(entry) == 0) {	/* not already wired? */
/*
* perform actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map
* for a copy-on-write region, or an anonymous map
* for a zero-fill region. (XXXCDC: submap case
* ok?)
*/
if (!UVM_ET_ISSUBMAP(entry)) {	/* not submap */
if (UVM_ET_ISNEEDSCOPY(entry) &&
    ((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL))) {
amap_copy(map, entry, 0, start, end);
/* XXXCDC: wait OK? */
}
}
}
UVM_MAP_CLIP_START(map, entry, start);
UVM_MAP_CLIP_END(map, entry, end);
entry->wired_count++;
/*
* Check for holes
*/
if (entry->protection == VM_PROT_NONE || (entry->end < end && (entry->next == &map->header ||
entry->next->start > entry->end))) {
/*
* found one. amap creation actions do not need to
* be undone, but the wired counts need to be restored.
*/
while (entry != &map->header && entry->end > start) {
entry->wired_count--;
entry = entry->prev;
}
if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (INVALID WIRE)",0,0,0,0);
return EINVAL;
}
entry = entry->next;
}
/*
* Pass 2.
*/
#ifdef DIAGNOSTIC
timestamp_save = map->timestamp;
#endif
vm_map_busy(map);
vm_map_unlock(map);
rv = 0;
entry = start_entry;
while (entry != &map->header && entry->start < end) {
if (entry->wired_count == 1) {
rv = uvm_fault_wire(map, entry->start, entry->end,
entry->max_protection, 1);
if (rv) {
/*
* wiring failed. break out of the loop.
* we'll clean up the map below, once we
* have a write lock again.
*/
break;
}
}
entry = entry->next;
}
if (rv) { /* failed? */
/*
* Get back to an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
#ifdef DIAGNOSTIC
if (timestamp_save + 1 != map->timestamp)
panic("uvm_map_pageable: stale map");
#endif
/*
* first drop the wiring count on all the entries
* which haven't actually been wired yet.
*/
failed_entry = entry;
while (entry != &map->header && entry->start < end) {
entry->wired_count--;
entry = entry->next;
}
/*
* now, unwire all the entries that were successfully
* wired above.
*/
entry = start_entry;
while (entry != failed_entry) {
entry->wired_count--;
if (VM_MAPENT_ISWIRED(entry) == 0) uvm_map_entry_unwire(map, entry);
entry = entry->next;
}
if ((lockflags & UVM_LK_EXIT) == 0) vm_map_unlock(map);
UVMHIST_LOG(maphist, "<- done (RV=%jd)", rv,0,0,0);
return (rv);
}
if ((lockflags & UVM_LK_EXIT) == 0) {
vm_map_unbusy(map);
} else {
/*
* Get back to an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
}
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
return 0;
}
/*
* uvm_map_pageable_all: special case of uvm_map_pageable - affects
* all mapped regions.
*
* => map must not be locked.
* => if no flags are specified, all regions are unwired.
* => XXXJRT: has some of the same problems as uvm_map_pageable() above.
*/
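/*
 * Illustrative mlockall(2)/munlockall(2)-style calls (a sketch only;
 * "limit" is an assumed wired-memory limit supplied by the caller):
 *
 *	wire:	error = uvm_map_pageable_all(map,
 *		    MCL_CURRENT | MCL_FUTURE, limit);
 *	unwire:	error = uvm_map_pageable_all(map, 0, 0);
 */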
int
uvm_map_pageable_all(struct vm_map *map, int flags, vsize_t limit)
{
struct vm_map_entry *entry, *failed_entry;
vsize_t size;
int rv;
#ifdef DIAGNOSTIC
u_int timestamp_save;
#endif
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,flags=%#jx)", (uintptr_t)map, flags,
0, 0);
KASSERT(map->flags & VM_MAP_PAGEABLE);
vm_map_lock(map);
/*
* handle wiring and unwiring separately.
*/
if (flags == 0) { /* unwire */
/*
* POSIX 1003.1b -- munlockall unlocks all regions,
* regardless of how many times mlockall has been called.
*/
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (VM_MAPENT_ISWIRED(entry)) uvm_map_entry_unwire(map, entry);
}
map->flags &= ~VM_MAP_WIREFUTURE;
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (OK UNWIRE)",0,0,0,0);
return 0;
}
if (flags & MCL_FUTURE) {
/*
* must wire all future mappings; remember this.
*/
map->flags |= VM_MAP_WIREFUTURE;
}
if ((flags & MCL_CURRENT) == 0) {
/*
* no more work to do!
*/
UVMHIST_LOG(maphist,"<- done (OK no wire)",0,0,0,0);
vm_map_unlock(map);
return 0;
}
/*
* wire case: in three passes [XXXCDC: ugly block of code here]
*
* 1: holding the write lock, count all pages mapped by non-wired
* entries. if this would cause us to go over our limit, we fail.
*
* 2: still holding the write lock, we create any anonymous maps that
* need to be created. then we increment each entry's wiring count.
*
* 3: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the pages for any newly wired area (wired_count == 1).
*
* downgrading to a read lock for uvm_fault_wire avoids a possible
* deadlock with another thread that may have faulted on one of
* the pages to be wired (it would mark the page busy, blocking
* us, then in turn block on the map lock that we hold). because
* of problems in the recursive lock package, we cannot upgrade
* to a write lock in vm_map_lookup. thus, any actions that
* require the write lock must be done beforehand. because we
* keep the read lock on the map, the copy-on-write status of the
* entries we modify here cannot change.
*/
for (size = 0, entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->protection != VM_PROT_NONE &&
VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
size += entry->end - entry->start;
}
}
if (atop(size) + uvmexp.wired > uvmexp.wiredmax) {
vm_map_unlock(map);
return ENOMEM;
}
if (limit != 0 &&
(size + ptoa(pmap_wired_count(vm_map_pmap(map))) > limit)) {
vm_map_unlock(map);
return ENOMEM;
}
/*
* Pass 2.
*/
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
if (VM_MAPENT_ISWIRED(entry) == 0) { /* not already wired? */
/*
* perform actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map
* for a copy-on-write region, or an anonymous map
* for a zero-fill region. (XXXCDC: submap case
* ok?)
*/
if (!UVM_ET_ISSUBMAP(entry)) {	/* not submap */
if (UVM_ET_ISNEEDSCOPY(entry) &&
    ((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL))) {
amap_copy(map, entry, 0, entry->start,
entry->end);
/* XXXCDC: wait OK? */
}
}
}
entry->wired_count++;
}
/*
* Pass 3.
*/
#ifdef DIAGNOSTIC
timestamp_save = map->timestamp;
#endif
vm_map_busy(map);
vm_map_unlock(map);
rv = 0;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (entry->wired_count == 1) {
rv = uvm_fault_wire(map, entry->start, entry->end,
entry->max_protection, 1);
if (rv) {
/*
* wiring failed. break out of the loop.
* we'll clean up the map below, once we
* have a write lock again.
*/
break;
}
}
}
if (rv) {
/*
* Get back an exclusive (write) lock.
*/
vm_map_lock(map);
vm_map_unbusy(map);
#ifdef DIAGNOSTIC
if (timestamp_save + 1 != map->timestamp)
panic("uvm_map_pageable_all: stale map");
#endif
/*
* first drop the wiring count on all the entries
* which haven't actually been wired yet.
*
* Skip VM_PROT_NONE entries like we did above.
*/
failed_entry = entry;
for (/* nothing */; entry != &map->header;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
entry->wired_count--;
}
/*
* now, unwire all the entries that were successfully
* wired above.
*
* Skip VM_PROT_NONE entries like we did above.
*/
for (entry = map->header.next; entry != failed_entry;
entry = entry->next) {
if (entry->protection == VM_PROT_NONE)
continue;
entry->wired_count--;
if (VM_MAPENT_ISWIRED(entry)) uvm_map_entry_unwire(map, entry);
}
vm_map_unlock(map);
UVMHIST_LOG(maphist,"<- done (RV=%jd)", rv,0,0,0);
return (rv);
}
vm_map_unbusy(map);
UVMHIST_LOG(maphist,"<- done (OK WIRE)",0,0,0,0);
return 0;
}
/*
* uvm_map_clean: clean out a map range
*
* => valid flags:
* if (flags & PGO_CLEANIT): dirty pages are cleaned first
* if (flags & PGO_SYNCIO): dirty pages are written synchronously
* if (flags & PGO_DEACTIVATE): any cached pages are deactivated after clean
* if (flags & PGO_FREE): any cached pages are freed after clean
* => returns an error if any part of the specified range isn't mapped
* => never a need to flush amap layer since the anonymous memory has
* no permanent home, but may deactivate pages there
* => called from sys_msync() and sys_madvise()
* => caller must not have map locked
*/
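/*
 * Illustrative calls (the flag combinations are assumptions about
 * typical msync(2)/madvise(2)-style use, not a fixed recipe):
 *
 *	error = uvm_map_clean(map, start, end, PGO_CLEANIT | PGO_SYNCIO);
 *		write dirty pages back synchronously
 *	error = uvm_map_clean(map, start, end, PGO_DEACTIVATE);
 *		deactivate any cached pages in the range
 */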
int
uvm_map_clean(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{
struct vm_map_entry *current, *entry;
struct uvm_object *uobj;
struct vm_amap *amap;
struct vm_anon *anon;
struct vm_page *pg;
vaddr_t offset;
vsize_t size;
voff_t uoff;
int error, refs;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(map=%#jx,start=%#jx,end=%#jx,flags=%#jx)",
(uintptr_t)map, start, end, flags);
KASSERT((flags & (PGO_FREE|PGO_DEACTIVATE)) !=
(PGO_FREE|PGO_DEACTIVATE));
vm_map_lock(map);
VM_MAP_RANGE_CHECK(map, start, end);
if (!uvm_map_lookup_entry(map, start, &entry)) {
vm_map_unlock(map);
return EFAULT;
}
/*
* Make a first pass to check for holes and wiring problems.
*/
for (current = entry; current->start < end; current = current->next) {
if (UVM_ET_ISSUBMAP(current)) {
vm_map_unlock(map);
return EINVAL;
}
if ((flags & PGO_FREE) != 0 && VM_MAPENT_ISWIRED(entry)) {
vm_map_unlock(map);
return EBUSY;
}
if (end <= current->end) {
break;
}
if (current->end != current->next->start) {
vm_map_unlock(map);
return EFAULT;
}
}
vm_map_busy(map);
vm_map_unlock(map);
error = 0;
for (current = entry; start < end; current = current->next) {
amap = current->aref.ar_amap; /* upper layer */
uobj = current->object.uvm_obj; /* lower layer */
KASSERT(start >= current->start);
/*
* No amap cleaning necessary if:
*
* (1) There's no amap.
*
* (2) We're not deactivating or freeing pages.
*/
if (amap == NULL || (flags & (PGO_DEACTIVATE|PGO_FREE)) == 0)
goto flush_object;
offset = start - current->start;
size = MIN(end, current->end) - start;
amap_lock(amap, RW_WRITER);
for ( ; size != 0; size -= PAGE_SIZE, offset += PAGE_SIZE) {
anon = amap_lookup(¤t->aref, offset);
if (anon == NULL)
continue;
KASSERT(anon->an_lock == amap->am_lock);
pg = anon->an_page;
if (pg == NULL) {
continue;
}
if (pg->flags & PG_BUSY) {
continue;
}
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
/*
* In these first 3 cases, we just deactivate the page.
*/
case PGO_CLEANIT|PGO_FREE:
case PGO_CLEANIT|PGO_DEACTIVATE:
case PGO_DEACTIVATE:
deactivate_it:
/*
* skip the page if it's loaned or wired,
* since it shouldn't be on a paging queue
* at all in these cases.
*/
if (pg->loan_count != 0 ||
pg->wire_count != 0) {
continue;
}
KASSERT(pg->uanon == anon);
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
continue;
case PGO_FREE:
/*
* If there are multiple references to
* the amap, just deactivate the page.
*/
if (amap_refs(amap) > 1)
goto deactivate_it;
/* skip the page if it's wired */
if (pg->wire_count != 0) {
continue;
}
amap_unadd(¤t->aref, offset);
refs = --anon->an_ref;
if (refs == 0) {
uvm_anfree(anon);
}
continue;
}
}
amap_unlock(amap);
flush_object:
/*
* flush pages if we've got a valid backing object.
* note that we must always clean object pages before
* freeing them since otherwise we could reveal stale
* data from files.
*/
uoff = current->offset + (start - current->start);
size = MIN(end, current->end) - start;
if (uobj != NULL) {
rw_enter(uobj->vmobjlock, RW_WRITER);
if (uobj->pgops->pgo_put != NULL)
error = (uobj->pgops->pgo_put)(uobj, uoff,
uoff + size, flags | PGO_CLEANIT);
else
error = 0;
}
start += size;
}
vm_map_unbusy(map);
return error;
}
/*
* uvm_map_checkprot: check protection in map
*
* => must allow specified protection in a fully allocated region.
* => map must be read or write locked by caller.
*/
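/*
 * Example sketch: with the map at least read-locked, verify that
 * [start, end) is fully mapped and allows read/write access.
 *
 *	vm_map_lock_read(map);
 *	ok = uvm_map_checkprot(map, start, end,
 *	    VM_PROT_READ | VM_PROT_WRITE);
 *	vm_map_unlock_read(map);
 */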
bool
uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t protection)
{
struct vm_map_entry *entry;
struct vm_map_entry *tmp_entry;
if (!uvm_map_lookup_entry(map, start, &tmp_entry)) {
return (false);
}
entry = tmp_entry;
while (start < end) {
if (entry == &map->header) {
return (false);
}
/*
* no holes allowed
*/
if (start < entry->start) {
return (false);
}
/*
* check protection associated with entry
*/
if ((entry->protection & protection) != protection) {
return (false);
}
start = entry->end;
entry = entry->next;
}
return (true);
}
/*
* uvmspace_alloc: allocate a vmspace structure.
*
* - structure includes vm_map and pmap
* - XXX: no locking on this structure
* - refcnt set to 1, rest must be init'd by caller
*/
struct vmspace *
uvmspace_alloc(vaddr_t vmin, vaddr_t vmax, bool topdown)
{
struct vmspace *vm;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
vm = kmem_alloc(sizeof(*vm), KM_SLEEP);
uvmspace_init(vm, NULL, vmin, vmax, topdown);
UVMHIST_LOG(maphist,"<- done (vm=%#jx)", (uintptr_t)vm, 0, 0, 0);
return (vm);
}
/*
* uvmspace_init: initialize a vmspace structure.
*
* - XXX: no locking on this structure
* - refcnt set to 1, rest must be init'd by caller
*/
void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin,
vaddr_t vmax, bool topdown)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(vm=%#jx, pmap=%#jx, vmin=%#jx, vmax=%#jx",
(uintptr_t)vm, (uintptr_t)pmap, vmin, vmax);
UVMHIST_LOG(maphist, " topdown=%ju)", topdown, 0, 0, 0);
memset(vm, 0, sizeof(*vm));
uvm_map_setup(&vm->vm_map, vmin, vmax, VM_MAP_PAGEABLE
| (topdown ? VM_MAP_TOPDOWN : 0)
);
if (pmap)
pmap_reference(pmap);
else
pmap = pmap_create();
vm->vm_map.pmap = pmap;
vm->vm_refcnt = 1;
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
}
/*
* uvmspace_share: share a vmspace between two processes
*
* - used for vfork, threads(?)
*/
void
uvmspace_share(struct proc *p1, struct proc *p2)
{
uvmspace_addref(p1->p_vmspace);
p2->p_vmspace = p1->p_vmspace;
}
#if 0
/*
* uvmspace_unshare: ensure that process "p" has its own, unshared, vmspace
*
* - XXX: no locking on vmspace
*/
void
uvmspace_unshare(struct lwp *l)
{
struct proc *p = l->l_proc;
struct vmspace *nvm, *ovm = p->p_vmspace;
if (ovm->vm_refcnt == 1)
/* nothing to do: vmspace isn't shared in the first place */
return;
/* make a new vmspace, still holding old one */
nvm = uvmspace_fork(ovm);
kpreempt_disable();
pmap_deactivate(l); /* unbind old vmspace */
p->p_vmspace = nvm;
pmap_activate(l); /* switch to new vmspace */
kpreempt_enable();
uvmspace_free(ovm); /* drop reference to old vmspace */
}
#endif
/*
* uvmspace_spawn: a new process has been spawned and needs a vmspace
*/
void
uvmspace_spawn(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
{
struct proc *p = l->l_proc;
struct vmspace *nvm;
#ifdef __HAVE_CPU_VMSPACE_EXEC
cpu_vmspace_exec(l, start, end);
#endif
nvm = uvmspace_alloc(start, end, topdown);
kpreempt_disable();
p->p_vmspace = nvm;
pmap_activate(l);
kpreempt_enable();
}
/*
* uvmspace_exec: the process wants to exec a new program
*/
void
uvmspace_exec(struct lwp *l, vaddr_t start, vaddr_t end, bool topdown)
{
struct proc *p = l->l_proc;
struct vmspace *nvm, *ovm = p->p_vmspace;
struct vm_map *map;
int flags;
KASSERT(ovm != NULL);
#ifdef __HAVE_CPU_VMSPACE_EXEC
cpu_vmspace_exec(l, start, end);
#endif
map = &ovm->vm_map;
/*
* see if more than one process is using this vmspace...
*/
if (ovm->vm_refcnt == 1
&& topdown == ((ovm->vm_map.flags & VM_MAP_TOPDOWN) != 0)) {
/*
* if p is the only process using its vmspace then we can safely
* recycle that vmspace for the program that is being exec'd.
* But only if TOPDOWN matches the requested value for the new
* vm space!
*/
/*
* SYSV SHM semantics require us to kill all segments on an exec
*/
if (uvm_shmexit && ovm->vm_shm)
(*uvm_shmexit)(ovm);
/*
* POSIX 1003.1b -- "lock future mappings" is revoked
* when a process execs another program image.
*/
map->flags &= ~VM_MAP_WIREFUTURE;
/*
* now unmap the old program.
*
* XXX set VM_MAP_DYING for the duration, so pmap_update()
* is not called until the pmap has been totally cleared out
* after pmap_remove_all(), or it can confuse some pmap
* implementations. it would be nice to handle this by
* deferring the pmap_update() while it is known the address
* space is not visible to any user LWP other than curlwp,
* but there isn't an elegant way of inferring that right
* now.
*/
flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
map->flags |= VM_MAP_DYING;
uvm_unmap1(map, vm_map_min(map), vm_map_max(map), flags);
map->flags &= ~VM_MAP_DYING;
pmap_update(map->pmap);
KASSERT(map->header.prev == &map->header);
KASSERT(map->nentries == 0);
/*
* resize the map
*/
vm_map_setmin(map, start);
vm_map_setmax(map, end);
} else {
/*
* p's vmspace is being shared, so we can't reuse it for p since
* it is still being used for others. allocate a new vmspace
* for p
*/
nvm = uvmspace_alloc(start, end, topdown);
/*
* install new vmspace and drop our ref to the old one.
*/
kpreempt_disable();
pmap_deactivate(l);
p->p_vmspace = nvm;
pmap_activate(l);
kpreempt_enable();
uvmspace_free(ovm);
}
}
/*
* uvmspace_addref: add a reference to a vmspace.
*/
void
uvmspace_addref(struct vmspace *vm)
{
KASSERT((vm->vm_map.flags & VM_MAP_DYING) == 0);
KASSERT(vm->vm_refcnt > 0);
atomic_inc_uint(&vm->vm_refcnt);
}
/*
* uvmspace_free: free a vmspace data structure
*/
void
uvmspace_free(struct vmspace *vm)
{
struct vm_map_entry *dead_entries;
struct vm_map *map = &vm->vm_map;
int flags;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(vm=%#jx) ref=%jd", (uintptr_t)vm,
vm->vm_refcnt, 0, 0);
membar_release();
if (atomic_dec_uint_nv(&vm->vm_refcnt) > 0)
return;
membar_acquire();
/*
* at this point, there should be no other references to the map.
* delete all of the mappings, then destroy the pmap.
*/
map->flags |= VM_MAP_DYING;
flags = pmap_remove_all(map->pmap) ? UVM_FLAG_VAONLY : 0;
/* Get rid of any SYSV shared memory segments. */
if (uvm_shmexit && vm->vm_shm != NULL)
    (*uvm_shmexit)(vm);
if (map->nentries) {
uvm_unmap_remove(map, vm_map_min(map), vm_map_max(map),
&dead_entries, flags);
		if (dead_entries != NULL)
			uvm_unmap_detach(dead_entries, 0);
}
	KASSERT(map->nentries == 0);
	KASSERT(map->size == 0);
mutex_destroy(&map->misc_lock);
rw_destroy(&map->lock);
cv_destroy(&map->cv);
pmap_destroy(map->pmap);
kmem_free(vm, sizeof(*vm));
}
static struct vm_map_entry *
uvm_mapent_clone(struct vm_map *new_map, struct vm_map_entry *old_entry,
int flags)
{
struct vm_map_entry *new_entry;
new_entry = uvm_mapent_alloc(new_map, 0);
/* old_entry -> new_entry */
uvm_mapent_copy(old_entry, new_entry);
/* new pmap has nothing wired in it */
new_entry->wired_count = 0;
/*
* gain reference to object backing the map (can't
* be a submap, already checked this case).
*/
	if (new_entry->aref.ar_amap)
		uvm_map_reference_amap(new_entry, flags);

	if (new_entry->object.uvm_obj &&
new_entry->object.uvm_obj->pgops->pgo_reference)
new_entry->object.uvm_obj->pgops->pgo_reference(
new_entry->object.uvm_obj);
/* insert entry at end of new_map's entry list */
uvm_map_entry_link(new_map, new_map->header.prev,
new_entry);
return new_entry;
}
/*
* share the mapping: this means we want the old and
* new entries to share amaps and backing objects.
*/
static void
uvm_mapent_forkshared(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
/*
* if the old_entry needs a new amap (due to prev fork)
* then we need to allocate it now so that we have
* something we own to share with the new_entry. [in
* other words, we need to clear needs_copy]
*/
if (UVM_ET_ISNEEDSCOPY(old_entry)) {
/* get our own amap, clears needs_copy */
amap_copy(old_map, old_entry, AMAP_COPY_NOCHUNK,
0, 0);
/* XXXCDC: WAITOK??? */
}
uvm_mapent_clone(new_map, old_entry, AMAP_SHARED);
}
static void
uvm_mapent_forkcopy(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
struct vm_map_entry *new_entry;
/*
* copy-on-write the mapping (using mmap's
* MAP_PRIVATE semantics)
*
* allocate new_entry, adjust reference counts.
* (note that new references are read-only).
*/
new_entry = uvm_mapent_clone(new_map, old_entry, 0);
new_entry->etype |=
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
/*
* the new entry will need an amap. it will either
* need to be copied from the old entry or created
* from scratch (if the old entry does not have an
* amap). can we defer this process until later
* (by setting "needs_copy") or do we need to copy
* the amap now?
*
* we must copy the amap now if any of the following
* conditions hold:
* 1. the old entry has an amap and that amap is
* being shared. this means that the old (parent)
* process is sharing the amap with another
* process. if we do not clear needs_copy here
* we will end up in a situation where both the
* parent and child process are referring to the
* same amap with "needs_copy" set. if the
* parent write-faults, the fault routine will
* clear "needs_copy" in the parent by allocating
* a new amap. this is wrong because the
* parent is supposed to be sharing the old amap
* and the new amap will break that.
*
* 2. if the old entry has an amap and a non-zero
* wire count then we are going to have to call
* amap_cow_now to avoid page faults in the
* parent process. since amap_cow_now requires
* "needs_copy" to be clear we might as well
* clear it here as well.
*
*/
	if (old_entry->aref.ar_amap != NULL) {
		if ((amap_flags(old_entry->aref.ar_amap) & AMAP_SHARED) != 0 ||
VM_MAPENT_ISWIRED(old_entry)) {
amap_copy(new_map, new_entry,
AMAP_COPY_NOCHUNK, 0, 0);
/* XXXCDC: M_WAITOK ... ok? */
}
}
/*
* if the parent's entry is wired down, then the
* parent process does not want page faults on
* access to that memory. this means that we
* cannot do copy-on-write because we can't write
* protect the old entry. in this case we
* resolve all copy-on-write faults now, using
* amap_cow_now. note that we have already
* allocated any needed amap (above).
*/
if (VM_MAPENT_ISWIRED(old_entry)) {
/*
* resolve all copy-on-write faults now
* (note that there is nothing to do if
* the old mapping does not have an amap).
*/
		if (old_entry->aref.ar_amap)
			amap_cow_now(new_map, new_entry);
} else {
/*
* setup mappings to trigger copy-on-write faults
* we must write-protect the parent if it has
* an amap and it is not already "needs_copy"...
* if it is already "needs_copy" then the parent
* has already been write-protected by a previous
* fork operation.
*/
if (old_entry->aref.ar_amap &&
!UVM_ET_ISNEEDSCOPY(old_entry)) {
if (old_entry->max_protection & VM_PROT_WRITE) {
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
uvm_map_lock_entry(old_entry, RW_WRITER);
#else
uvm_map_lock_entry(old_entry, RW_READER);
#endif
pmap_protect(old_map->pmap,
old_entry->start, old_entry->end,
				    old_entry->protection & ~VM_PROT_WRITE);
				uvm_map_unlock_entry(old_entry);
}
old_entry->etype |= UVM_ET_NEEDSCOPY;
}
}
}
/*
* zero the mapping: the new entry will be zero initialized
*/
static void
uvm_mapent_forkzero(struct vm_map *new_map, struct vm_map *old_map,
struct vm_map_entry *old_entry)
{
struct vm_map_entry *new_entry;
new_entry = uvm_mapent_clone(new_map, old_entry, 0);
new_entry->etype |=
(UVM_ET_COPYONWRITE|UVM_ET_NEEDSCOPY);
	if (new_entry->aref.ar_amap) {
		uvm_map_unreference_amap(new_entry, 0);
new_entry->aref.ar_pageoff = 0;
new_entry->aref.ar_amap = NULL;
}
	if (UVM_ET_ISOBJ(new_entry)) {
		if (new_entry->object.uvm_obj->pgops->pgo_detach)
			new_entry->object.uvm_obj->pgops->pgo_detach(
new_entry->object.uvm_obj);
new_entry->object.uvm_obj = NULL;
new_entry->offset = 0;
new_entry->etype &= ~UVM_ET_OBJ;
}
}
/*
* F O R K - m a i n e n t r y p o i n t
*/
/*
* uvmspace_fork: fork a process' main map
*
* => create a new vmspace for child process from parent.
* => parent's map must not be locked.
*/
struct vmspace *
uvmspace_fork(struct vmspace *vm1)
{
struct vmspace *vm2;
struct vm_map *old_map = &vm1->vm_map;
struct vm_map *new_map;
struct vm_map_entry *old_entry;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
vm_map_lock(old_map);
vm2 = uvmspace_alloc(vm_map_min(old_map), vm_map_max(old_map),
vm1->vm_map.flags & VM_MAP_TOPDOWN);
memcpy(&vm2->vm_startcopy, &vm1->vm_startcopy,
(char *) (vm1 + 1) - (char *) &vm1->vm_startcopy);
new_map = &vm2->vm_map; /* XXX */
old_entry = old_map->header.next;
new_map->size = old_map->size;
/*
* go entry-by-entry
*/
while (old_entry != &old_map->header) {
/*
* first, some sanity checks on the old entry
*/
		KASSERT(!UVM_ET_ISSUBMAP(old_entry));
		KASSERT(UVM_ET_ISCOPYONWRITE(old_entry) ||
!UVM_ET_ISNEEDSCOPY(old_entry));
switch (old_entry->inheritance) {
case MAP_INHERIT_NONE:
/*
* drop the mapping, modify size
*/
new_map->size -= old_entry->end - old_entry->start;
break;
case MAP_INHERIT_SHARE:
uvm_mapent_forkshared(new_map, old_map, old_entry);
break;
case MAP_INHERIT_COPY:
uvm_mapent_forkcopy(new_map, old_map, old_entry);
break;
case MAP_INHERIT_ZERO:
uvm_mapent_forkzero(new_map, old_map, old_entry);
break;
default:
KASSERT(0);
break;
}
old_entry = old_entry->next;
}
pmap_update(old_map->pmap);
	vm_map_unlock(old_map);

	if (uvm_shmfork && vm1->vm_shm)
		(*uvm_shmfork)(vm1, vm2);
#ifdef PMAP_FORK
pmap_fork(vm1->vm_map.pmap, vm2->vm_map.pmap);
#endif
UVMHIST_LOG(maphist,"<- done",0,0,0,0);
return (vm2);
}
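/*
 * Illustrative sketch, compiled out: the per-entry inheritance values
 * handled by the switch above are normally set from userland with
 * minherit(2).  The userland fragment below (hypothetical, with error
 * handling trimmed) shows how a mapping ends up on the
 * uvm_mapent_forkshared() path rather than the default copy-on-write
 * path.
 */
#if 0	/* example, userland code */
#include <sys/mman.h>
#include <unistd.h>

static int
share_across_fork(size_t len)
{
	void *p;

	/* Anonymous, writable, private mapping. */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);
	if (p == MAP_FAILED)
		return -1;

	/* Request MAP_INHERIT_SHARE instead of the default copy. */
	if (minherit(p, len, MAP_INHERIT_SHARE) == -1)
		return -1;

	if (fork() == 0) {
		/* child: this store is visible to the parent */
		*(int *)p = 1;
		_exit(0);
	}
	return 0;
}
#endif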
/*
* uvm_mapent_trymerge: try to merge an entry with its neighbors.
*
* => called with map locked.
* => return non-zero if successfully merged.
*/
int
uvm_mapent_trymerge(struct vm_map *map, struct vm_map_entry *entry, int flags)
{
struct uvm_object *uobj;
struct vm_map_entry *next;
struct vm_map_entry *prev;
vsize_t size;
int merged = 0;
bool copying;
int newetype;
if (entry->aref.ar_amap != NULL) {
return 0;
}
if ((entry->flags & UVM_MAP_NOMERGE) != 0) {
return 0;
}
uobj = entry->object.uvm_obj;
size = entry->end - entry->start;
copying = (flags & UVM_MERGE_COPYING) != 0;
newetype = copying ? (entry->etype & ~UVM_ET_NEEDSCOPY) : entry->etype;
next = entry->next;
	if (next != &map->header &&
	    next->start == entry->end &&
	    ((copying && next->aref.ar_amap != NULL &&
	    amap_refs(next->aref.ar_amap) == 1) ||
	    (!copying && next->aref.ar_amap == NULL)) &&
	    UVM_ET_ISCOMPATIBLE(next, newetype,
uobj, entry->flags, entry->protection,
entry->max_protection, entry->inheritance, entry->advice,
entry->wired_count) && (uobj == NULL || entry->offset + size == next->offset)) {
int error;
if (copying) {
error = amap_extend(next, size,
AMAP_EXTEND_NOWAIT|AMAP_EXTEND_BACKWARDS);
} else {
error = 0;
}
		if (error == 0) {
			if (uobj) {
				if (uobj->pgops->pgo_detach) {
					uobj->pgops->pgo_detach(uobj);
				}
			}
entry->end = next->end;
			clear_hints(map, next);
			uvm_map_entry_unlink(map, next);
			if (copying) {
				entry->aref = next->aref;
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
uvm_map_check(map, "trymerge forwardmerge");
uvm_mapent_free(next);
merged++;
}
}
prev = entry->prev;
	if (prev != &map->header &&
	    prev->end == entry->start &&
	    ((copying && !merged && prev->aref.ar_amap != NULL &&
	    amap_refs(prev->aref.ar_amap) == 1) ||
	    (!copying && prev->aref.ar_amap == NULL)) &&
	    UVM_ET_ISCOMPATIBLE(prev, newetype,
uobj, entry->flags, entry->protection,
entry->max_protection, entry->inheritance, entry->advice,
entry->wired_count) && (uobj == NULL ||
prev->offset + prev->end - prev->start == entry->offset)) {
int error;
if (copying) {
error = amap_extend(prev, size,
AMAP_EXTEND_NOWAIT|AMAP_EXTEND_FORWARDS);
} else {
error = 0;
}
		if (error == 0) {
			if (uobj) {
				if (uobj->pgops->pgo_detach) {
					uobj->pgops->pgo_detach(uobj);
				}
				entry->offset = prev->offset;
			}
entry->start = prev->start;
			clear_hints(map, prev);
			uvm_map_entry_unlink(map, prev);
			if (copying) {
				entry->aref = prev->aref;
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
uvm_map_check(map, "trymerge backmerge");
uvm_mapent_free(prev);
merged++;
}
}
return merged;
}
/*
* uvm_map_setup: init map
*
* => map must not be in service yet.
*/
void
uvm_map_setup(struct vm_map *map, vaddr_t vmin, vaddr_t vmax, int flags)
{
rb_tree_init(&map->rb_tree, &uvm_map_tree_ops);
map->header.next = map->header.prev = &map->header;
map->nentries = 0;
map->size = 0;
map->ref_count = 1;
vm_map_setmin(map, vmin);
vm_map_setmax(map, vmax);
map->flags = flags;
map->first_free = &map->header;
map->hint = &map->header;
map->timestamp = 0;
map->busy = NULL;
rw_init(&map->lock);
cv_init(&map->cv, "vm_map");
mutex_init(&map->misc_lock, MUTEX_DRIVER, IPL_NONE);
}
/*
* U N M A P - m a i n e n t r y p o i n t
*/
/*
* uvm_unmap1: remove mappings from a vm_map (from "start" up to "stop")
*
* => caller must check alignment and size
* => map must be unlocked (we will lock it)
* => flags is UVM_FLAG_QUANTUM or 0.
*/
void
uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{
struct vm_map_entry *dead_entries;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (map=%#jx, start=%#jx, end=%#jx)",
(uintptr_t)map, start, end, 0);
KASSERTMSG(start < end,
"%s: map %p: start %#jx < end %#jx", __func__, map,
(uintmax_t)start, (uintmax_t)end);
	if (map == kernel_map) {
		LOCKDEBUG_MEM_CHECK((void *)start, end - start);
}
/*
* work now done by helper functions. wipe the pmap's and then
* detach from the dead entries...
*/
vm_map_lock(map);
uvm_unmap_remove(map, start, end, &dead_entries, flags);
	vm_map_unlock(map);

	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
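/*
 * Illustrative sketch, compiled out: removing a page-aligned range of
 * mappings with uvm_unmap1().  Most callers go through the uvm_unmap()
 * convenience wrapper; passing flags of 0 here is assumed to be
 * equivalent for the common case.  The map and range are hypothetical.
 */
#if 0	/* example */
static void
example_unmap_range(struct vm_map *map, vaddr_t va, vsize_t len)
{

	KASSERT((va & PAGE_MASK) == 0);
	KASSERT((len & PAGE_MASK) == 0);
	uvm_unmap1(map, va, va + len, 0);
}
#endif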
/*
* uvm_map_reference: add reference to a map
*
* => map need not be locked
*/
void
uvm_map_reference(struct vm_map *map)
{
atomic_inc_uint(&map->ref_count);
}
void
uvm_map_lock_entry(struct vm_map_entry *entry, krw_t op)
{

	if (entry->aref.ar_amap != NULL) {
		amap_lock(entry->aref.ar_amap, op);
}
	if (UVM_ET_ISOBJ(entry)) {
		rw_enter(entry->object.uvm_obj->vmobjlock, op);
}
}
void
uvm_map_unlock_entry(struct vm_map_entry *entry)
{

	if (UVM_ET_ISOBJ(entry)) {
		rw_exit(entry->object.uvm_obj->vmobjlock);
}
	if (entry->aref.ar_amap != NULL) {
		amap_unlock(entry->aref.ar_amap);
}
}
#define UVM_VOADDR_TYPE_MASK 0x3UL
#define UVM_VOADDR_TYPE_UOBJ 0x1UL
#define UVM_VOADDR_TYPE_ANON 0x2UL
#define UVM_VOADDR_OBJECT_MASK ~UVM_VOADDR_TYPE_MASK
#define UVM_VOADDR_GET_TYPE(voa) \
((voa)->object & UVM_VOADDR_TYPE_MASK)
#define UVM_VOADDR_GET_OBJECT(voa) \
((voa)->object & UVM_VOADDR_OBJECT_MASK)
#define UVM_VOADDR_SET_OBJECT(voa, obj, type) \
do { \
KASSERT(((uintptr_t)(obj) & UVM_VOADDR_TYPE_MASK) == 0); \
(voa)->object = ((uintptr_t)(obj)) | (type); \
} while (/*CONSTCOND*/0)
#define UVM_VOADDR_GET_UOBJ(voa) \
((struct uvm_object *)UVM_VOADDR_GET_OBJECT(voa))
#define UVM_VOADDR_SET_UOBJ(voa, uobj) \
UVM_VOADDR_SET_OBJECT(voa, uobj, UVM_VOADDR_TYPE_UOBJ)
#define UVM_VOADDR_GET_ANON(voa) \
((struct vm_anon *)UVM_VOADDR_GET_OBJECT(voa))
#define UVM_VOADDR_SET_ANON(voa, anon) \
UVM_VOADDR_SET_OBJECT(voa, anon, UVM_VOADDR_TYPE_ANON)
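/*
 * Sketch of the encoding used above, compiled out: the owner pointer
 * (uvm_object or vm_anon) is assumed to be at least 4-byte aligned, so
 * its two low bits are free to carry the type tag, and the SET/GET
 * macros round-trip the pointer unchanged.
 */
#if 0	/* example */
static void
uvm_voaddr_tag_example(struct uvm_voaddr *voa, struct vm_anon *anon)
{

	UVM_VOADDR_SET_ANON(voa, anon);
	KASSERT(UVM_VOADDR_GET_TYPE(voa) == UVM_VOADDR_TYPE_ANON);
	KASSERT(UVM_VOADDR_GET_ANON(voa) == anon);
}
#endif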
/*
* uvm_voaddr_acquire: returns the virtual object address corresponding
* to the specified virtual address.
*
* => resolves COW so the true page identity is tracked.
*
* => acquires a reference on the page's owner (uvm_object or vm_anon)
*/
bool
uvm_voaddr_acquire(struct vm_map * const map, vaddr_t const va,
struct uvm_voaddr * const voaddr)
{
struct vm_map_entry *entry;
struct vm_anon *anon = NULL;
bool result = false;
bool exclusive = false;
void (*unlock_fn)(struct vm_map *);
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist,"(map=%#jx,va=%#jx)", (uintptr_t)map, va, 0, 0);
const vaddr_t start = trunc_page(va);
const vaddr_t end = round_page(va+1);
lookup_again:
if (__predict_false(exclusive)) {
vm_map_lock(map);
unlock_fn = vm_map_unlock;
} else {
vm_map_lock_read(map);
unlock_fn = vm_map_unlock_read;
}
if (__predict_false(!uvm_map_lookup_entry(map, start, &entry))) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (no entry)",0,0,0,0);
return false;
}
if (__predict_false(entry->protection == VM_PROT_NONE)) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (PROT_NONE)",0,0,0,0);
return false;
}
/*
* We have a fast path for the common case of "no COW resolution
* needed" whereby we have taken a read lock on the map and if
* we don't encounter any need to create a vm_anon then great!
* But if we do, we loop around again, instead taking an exclusive
* lock so that we can perform the fault.
*
* In the event that we have to resolve the fault, we do nearly the
* same work as uvm_map_pageable() does:
*
* 1: holding the write lock, we create any anonymous maps that need
* to be created. however, we do NOT need to clip the map entries
* in this case.
*
* 2: we downgrade to a read lock, and call uvm_fault_wire to fault
* in the page (assuming the entry is not already wired). this
* is done because we need the vm_anon to be present.
*/
if (__predict_true(!VM_MAPENT_ISWIRED(entry))) {
bool need_fault = false;
/*
* perform the actions of vm_map_lookup that need the
* write lock on the map: create an anonymous map for
* a copy-on-write region, or an anonymous map for
* a zero-fill region.
*/
if (__predict_false(UVM_ET_ISSUBMAP(entry))) {
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (submap)",0,0,0,0);
return false;
}
if (__predict_false(UVM_ET_ISNEEDSCOPY(entry) &&
((entry->max_protection & VM_PROT_WRITE) ||
(entry->object.uvm_obj == NULL)))) {
if (!exclusive) {
/* need to take the slow path */
KASSERT(unlock_fn == vm_map_unlock_read);
vm_map_unlock_read(map);
exclusive = true;
goto lookup_again;
}
need_fault = true;
amap_copy(map, entry, 0, start, end);
/* XXXCDC: wait OK? */
}
/*
* do a quick check to see if the fault has already
* been resolved to the upper layer.
*/
if (__predict_true(entry->aref.ar_amap != NULL &&
need_fault == false)) {
amap_lock(entry->aref.ar_amap, RW_WRITER);
anon = amap_lookup(&entry->aref, start - entry->start);
if (__predict_true(anon != NULL)) {
/* amap unlocked below */
goto found_anon;
}
amap_unlock(entry->aref.ar_amap);
need_fault = true;
}
/*
* we predict this test as false because if we reach
* this point, then we are likely dealing with a
* shared memory region backed by a uvm_object, in
* which case a fault to create the vm_anon is not
* necessary.
*/
if (__predict_false(need_fault)) {
if (exclusive) {
vm_map_busy(map);
vm_map_unlock(map);
unlock_fn = vm_map_unbusy;
}
if (uvm_fault_wire(map, start, end,
entry->max_protection, 1)) {
/* wiring failed */
unlock_fn(map);
UVMHIST_LOG(maphist,"<- done (wire failed)",
0,0,0,0);
return false;
}
/*
* now that we have resolved the fault, we can unwire
* the page.
*/
if (exclusive) {
vm_map_lock(map);
vm_map_unbusy(map);
unlock_fn = vm_map_unlock;
}
uvm_fault_unwire_locked(map, start, end);
}
}
/* check the upper layer */
if (entry->aref.ar_amap) {
amap_lock(entry->aref.ar_amap, RW_WRITER);
anon = amap_lookup(&entry->aref, start - entry->start);
if (anon) {
found_anon: KASSERT(anon->an_lock == entry->aref.ar_amap->am_lock);
anon->an_ref++;
rw_obj_hold(anon->an_lock);
KASSERT(anon->an_ref != 0);
UVM_VOADDR_SET_ANON(voaddr, anon);
voaddr->offset = va & PAGE_MASK;
result = true;
}
amap_unlock(entry->aref.ar_amap);
}
/* check the lower layer */
if (!result && UVM_ET_ISOBJ(entry)) {
struct uvm_object *uobj = entry->object.uvm_obj;
KASSERT(uobj != NULL);
(*uobj->pgops->pgo_reference)(uobj);
UVM_VOADDR_SET_UOBJ(voaddr, uobj);
voaddr->offset = entry->offset + (va - entry->start);
result = true;
}
unlock_fn(map);
if (result) {
UVMHIST_LOG(maphist,
"<- done OK (type=%jd,owner=%#jx,offset=%#jx)",
UVM_VOADDR_GET_TYPE(voaddr),
UVM_VOADDR_GET_OBJECT(voaddr),
voaddr->offset, 0);
} else {
UVMHIST_LOG(maphist,"<- done (failed)",0,0,0,0);
}
return result;
}
/*
* uvm_voaddr_release: release the references held by the
* virtual object address.
*/
void
uvm_voaddr_release(struct uvm_voaddr * const voaddr)
{
switch (UVM_VOADDR_GET_TYPE(voaddr)) {
case UVM_VOADDR_TYPE_UOBJ: {
struct uvm_object * const uobj = UVM_VOADDR_GET_UOBJ(voaddr);
KASSERT(uobj != NULL);
KASSERT(uobj->pgops->pgo_detach != NULL);
(*uobj->pgops->pgo_detach)(uobj);
break;
}
case UVM_VOADDR_TYPE_ANON: {
struct vm_anon * const anon = UVM_VOADDR_GET_ANON(voaddr);
krwlock_t *lock;
KASSERT(anon != NULL);
rw_enter((lock = anon->an_lock), RW_WRITER);
KASSERT(anon->an_ref > 0);
if (--anon->an_ref == 0) {
uvm_anfree(anon);
}
rw_exit(lock);
rw_obj_free(lock);
break;
}
default:
panic("uvm_voaddr_release: bad type");
}
memset(voaddr, 0, sizeof(*voaddr));
}
/*
* uvm_voaddr_compare: compare two uvm_voaddr objects.
*
* => memcmp() semantics
*/
int
uvm_voaddr_compare(const struct uvm_voaddr * const voaddr1,
const struct uvm_voaddr * const voaddr2)
{
const uintptr_t type1 = UVM_VOADDR_GET_TYPE(voaddr1);
const uintptr_t type2 = UVM_VOADDR_GET_TYPE(voaddr2);
KASSERT(type1 == UVM_VOADDR_TYPE_UOBJ ||
type1 == UVM_VOADDR_TYPE_ANON);
KASSERT(type2 == UVM_VOADDR_TYPE_UOBJ ||
type2 == UVM_VOADDR_TYPE_ANON);
if (type1 < type2)
return -1;
if (type1 > type2)
return 1;
const uintptr_t addr1 = UVM_VOADDR_GET_OBJECT(voaddr1);
const uintptr_t addr2 = UVM_VOADDR_GET_OBJECT(voaddr2);
if (addr1 < addr2)
return -1;
if (addr1 > addr2)
return 1;
if (voaddr1->offset < voaddr2->offset)
return -1;
if (voaddr1->offset > voaddr2->offset)
return 1;
return 0;
}
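/*
 * Illustrative sketch, compiled out: the expected calling pattern for
 * the uvm_voaddr interface is acquire, compare, release.  The function
 * below is hypothetical and only shows how a caller would decide
 * whether two user virtual addresses resolve to the same backing
 * object and offset.
 */
#if 0	/* example */
static bool
uvm_voaddr_same_backing(struct vm_map *map, vaddr_t va1, vaddr_t va2)
{
	struct uvm_voaddr voa1, voa2;
	bool same;

	if (!uvm_voaddr_acquire(map, va1, &voa1))
		return false;
	if (!uvm_voaddr_acquire(map, va2, &voa2)) {
		uvm_voaddr_release(&voa1);
		return false;
	}

	/* memcmp()-style result: 0 means same owner and offset. */
	same = (uvm_voaddr_compare(&voa1, &voa2) == 0);

	uvm_voaddr_release(&voa2);
	uvm_voaddr_release(&voa1);
	return same;
}
#endif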
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_map_printit: actually prints the map
*/
void
uvm_map_printit(struct vm_map *map, bool full,
void (*pr)(const char *, ...))
{
struct vm_map_entry *entry;
(*pr)("MAP %p: [%#lx->%#lx]\n", map, vm_map_min(map),
vm_map_max(map));
(*pr)("\t#ent=%d, sz=%d, ref=%d, version=%d, flags=%#x\n",
map->nentries, map->size, map->ref_count, map->timestamp,
map->flags);
(*pr)("\tpmap=%p(resident=%ld, wired=%ld)\n", map->pmap,
pmap_resident_count(map->pmap), pmap_wired_count(map->pmap));
if (!full)
return;
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
(*pr)(" - %p: %#lx->%#lx: obj=%p/%#llx, amap=%p/%d\n",
entry, entry->start, entry->end, entry->object.uvm_obj,
(long long)entry->offset, entry->aref.ar_amap,
entry->aref.ar_pageoff);
(*pr)(
"\tsubmap=%c, cow=%c, nc=%c, prot(max)=%d/%d, inh=%d, "
"wc=%d, adv=%d%s\n",
(entry->etype & UVM_ET_SUBMAP) ? 'T' : 'F',
(entry->etype & UVM_ET_COPYONWRITE) ? 'T' : 'F',
(entry->etype & UVM_ET_NEEDSCOPY) ? 'T' : 'F',
entry->protection, entry->max_protection,
entry->inheritance, entry->wired_count, entry->advice,
entry == map->first_free ? " (first_free)" : "");
}
}
void
uvm_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
struct vm_map *map;
for (map = kernel_map;;) {
struct vm_map_entry *entry;
if (!uvm_map_lookup_entry_bytree(map, (vaddr_t)addr, &entry)) {
break;
}
(*pr)("%p is %p+%zu from VMMAP %p\n",
(void *)addr, (void *)entry->start,
(size_t)(addr - (uintptr_t)entry->start), map);
if (!UVM_ET_ISSUBMAP(entry)) {
break;
}
map = entry->object.sub_map;
}
}
#endif /* DDB || DEBUGPRINT */
#ifndef __USER_VA0_IS_SAFE
static int
sysctl_user_va0_disable(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int t, error;
node = *rnode;
node.sysctl_data = &t;
t = user_va0_disable;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return (error);
if (!t && user_va0_disable &&
kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MAP_VA_ZERO, 0,
NULL, NULL, NULL))
return EPERM;
user_va0_disable = !!t;
return 0;
}
#endif
static int
fill_vmentry(struct lwp *l, struct proc *p, struct kinfo_vmentry *kve,
struct vm_map *m, struct vm_map_entry *e)
{
#ifndef _RUMPKERNEL
int error;
memset(kve, 0, sizeof(*kve));
KASSERT(e != NULL);
if (UVM_ET_ISOBJ(e)) {
struct uvm_object *uobj = e->object.uvm_obj;
KASSERT(uobj != NULL);
kve->kve_ref_count = uobj->uo_refs;
kve->kve_count = uobj->uo_npages;
if (UVM_OBJ_IS_VNODE(uobj)) {
struct vattr va;
struct vnode *vp = (struct vnode *)uobj;
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, l->l_cred);
VOP_UNLOCK(vp);
kve->kve_type = KVME_TYPE_VNODE;
if (error == 0) {
kve->kve_vn_size = vp->v_size;
kve->kve_vn_type = (int)vp->v_type;
kve->kve_vn_mode = va.va_mode;
kve->kve_vn_rdev = va.va_rdev;
kve->kve_vn_fileid = va.va_fileid;
kve->kve_vn_fsid = va.va_fsid;
error = vnode_to_path(kve->kve_path,
sizeof(kve->kve_path) / 2, vp, l, p);
}
} else if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
kve->kve_type = KVME_TYPE_KERN;
} else if (UVM_OBJ_IS_DEVICE(uobj)) {
kve->kve_type = KVME_TYPE_DEVICE;
} else if (UVM_OBJ_IS_AOBJ(uobj)) {
kve->kve_type = KVME_TYPE_ANON;
} else {
kve->kve_type = KVME_TYPE_OBJECT;
}
} else if (UVM_ET_ISSUBMAP(e)) {
struct vm_map *map = e->object.sub_map;
KASSERT(map != NULL);
kve->kve_ref_count = map->ref_count;
kve->kve_count = map->nentries;
kve->kve_type = KVME_TYPE_SUBMAP;
} else
kve->kve_type = KVME_TYPE_UNKNOWN;
kve->kve_start = e->start;
kve->kve_end = e->end;
kve->kve_offset = e->offset;
kve->kve_wired_count = e->wired_count;
kve->kve_inheritance = e->inheritance;
kve->kve_attributes = 0; /* unused */
kve->kve_advice = e->advice;
#define PROT(p) (((p) & VM_PROT_READ) ? KVME_PROT_READ : 0) | \
(((p) & VM_PROT_WRITE) ? KVME_PROT_WRITE : 0) | \
(((p) & VM_PROT_EXECUTE) ? KVME_PROT_EXEC : 0)
kve->kve_protection = PROT(e->protection);
kve->kve_max_protection = PROT(e->max_protection);
kve->kve_flags |= (e->etype & UVM_ET_COPYONWRITE)
? KVME_FLAG_COW : 0;
kve->kve_flags |= (e->etype & UVM_ET_NEEDSCOPY)
? KVME_FLAG_NEEDS_COPY : 0;
kve->kve_flags |= (m->flags & VM_MAP_TOPDOWN)
? KVME_FLAG_GROWS_DOWN : KVME_FLAG_GROWS_UP;
kve->kve_flags |= (m->flags & VM_MAP_PAGEABLE)
? KVME_FLAG_PAGEABLE : 0;
#endif
return 0;
}
static int
fill_vmentries(struct lwp *l, pid_t pid, u_int elem_size, void *oldp,
size_t *oldlenp)
{
int error;
struct proc *p;
struct kinfo_vmentry *vme;
struct vmspace *vm;
struct vm_map *map;
struct vm_map_entry *entry;
char *dp;
size_t count, vmesize;
if (elem_size == 0 || elem_size > 2 * sizeof(*vme))
return EINVAL;
if (oldp) {
if (*oldlenp > 10UL * 1024UL * 1024UL)
return E2BIG;
count = *oldlenp / elem_size;
if (count == 0)
return ENOMEM;
vmesize = count * sizeof(*vme);
} else
vmesize = 0;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
vme = NULL;
count = 0;
if ((error = proc_vmspace_getref(p, &vm)) != 0)
goto out;
map = &vm->vm_map;
vm_map_lock_read(map);
dp = oldp;
if (oldp)
vme = kmem_alloc(vmesize, KM_SLEEP);
for (entry = map->header.next; entry != &map->header;
entry = entry->next) {
if (oldp && (dp - (char *)oldp) < vmesize) {
error = fill_vmentry(l, p, &vme[count], map, entry);
if (error)
goto out;
dp += elem_size;
}
count++;
}
vm_map_unlock_read(map);
uvmspace_free(vm);
out:
if (pid != -1)
mutex_exit(p->p_lock);
if (error == 0) {
const u_int esize = uimin(sizeof(*vme), elem_size);
dp = oldp;
for (size_t i = 0; i < count; i++) {
if (oldp && (dp - (char *)oldp) < vmesize) {
error = sysctl_copyout(l, &vme[i], dp, esize);
if (error)
break;
dp += elem_size;
} else
break;
}
count *= elem_size;
if (oldp != NULL && *oldlenp < count)
error = ENOSPC;
*oldlenp = count;
}
if (vme)
kmem_free(vme, vmesize);
return error;
}
static int
sysctl_vmproc(SYSCTLFN_ARGS)
{
int error;
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (namelen == 0)
return EINVAL;
switch (name[0]) {
case VM_PROC_MAP:
if (namelen != 3)
return EINVAL;
sysctl_unlock();
error = fill_vmentries(l, name[1], name[2], oldp, oldlenp);
sysctl_relock();
return error;
default:
return EINVAL;
}
}
SYSCTL_SETUP(sysctl_uvmmap_setup, "sysctl uvmmap setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "proc",
SYSCTL_DESCR("Process vm information"),
sysctl_vmproc, 0, NULL, 0,
CTL_VM, VM_PROC, CTL_EOL);
#ifndef __USER_VA0_IS_SAFE
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "user_va0_disable",
SYSCTL_DESCR("Disable VA 0"),
sysctl_user_va0_disable, 0, &user_va0_disable, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
#endif
}
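/*
 * Illustrative sketch, compiled out: reading the vm.proc.map node
 * created above from userland with sysctl(3).  Judging from
 * sysctl_vmproc(), the full MIB is { CTL_VM, VM_PROC, VM_PROC_MAP,
 * pid, elem_size }; treat this as a sketch of the intended usage, not
 * an interface specification.
 */
#if 0	/* example, userland code */
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdlib.h>

static struct kinfo_vmentry *
get_vmmap(pid_t pid, size_t *nentp)
{
	int mib[5] = { CTL_VM, VM_PROC, VM_PROC_MAP, (int)pid,
	    (int)sizeof(struct kinfo_vmentry) };
	size_t len = 0;
	void *buf;

	/* First call sizes the buffer, second call fills it. */
	if (sysctl(mib, 5, NULL, &len, NULL, 0) == -1)
		return NULL;
	if ((buf = malloc(len)) == NULL)
		return NULL;
	if (sysctl(mib, 5, buf, &len, NULL, 0) == -1) {
		free(buf);
		return NULL;
	}
	*nentp = len / sizeof(struct kinfo_vmentry);
	return buf;
}
#endif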
/* $NetBSD: pmap_pvt.c,v 1.15 2022/05/08 22:03:02 rin Exp $ */
/*-
* Copyright (c) 2014, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__RCSID("$NetBSD: pmap_pvt.c,v 1.15 2022/05/08 22:03:02 rin Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/pserialize.h>
#include <uvm/uvm.h>
#include <uvm/pmap/pmap_pvt.h>
#if !defined(PMAP_PV_TRACK_ONLY_STUBS)
/*
* unmanaged pv-tracked ranges
*
* This is a linear list for now because the only users are the DRM
* graphics drivers, with a single tracked range per device, for the
* graphics aperture, so there are expected to be few of them.
*
* This is used only after the VM system is initialized well enough
* that we can use kmem_alloc.
*/
struct pv_track {
paddr_t pvt_start;
psize_t pvt_size;
struct pv_track *pvt_next;
struct pmap_page pvt_pages[];
};
static struct {
kmutex_t lock;
pserialize_t psz;
struct pv_track *list;
} pv_unmanaged __cacheline_aligned;
void
pmap_pv_init(void)
{
mutex_init(&pv_unmanaged.lock, MUTEX_DEFAULT, IPL_NONE);
pv_unmanaged.psz = pserialize_create();
pv_unmanaged.list = NULL;
}
void
pmap_pv_track(paddr_t start, psize_t size)
{
struct pv_track *pvt;
size_t npages;
KASSERT(start == trunc_page(start));
KASSERT(size == trunc_page(size));
/* We may sleep for allocation. */
ASSERT_SLEEPABLE();
npages = size >> PAGE_SHIFT;
pvt = kmem_zalloc(offsetof(struct pv_track, pvt_pages[npages]),
KM_SLEEP);
pvt->pvt_start = start;
pvt->pvt_size = size;
#ifdef PMAP_PAGE_INIT
for (size_t i = 0; i < npages; i++)
PMAP_PAGE_INIT(&pvt->pvt_pages[i]);
#endif
mutex_enter(&pv_unmanaged.lock);
pvt->pvt_next = pv_unmanaged.list;
atomic_store_release(&pv_unmanaged.list, pvt);
mutex_exit(&pv_unmanaged.lock);
}
void
pmap_pv_untrack(paddr_t start, psize_t size)
{
struct pv_track **pvtp, *pvt;
size_t npages;
KASSERT(start == trunc_page(start));
KASSERT(size == trunc_page(size));
/* We may sleep for pserialize_perform. */
ASSERT_SLEEPABLE();
mutex_enter(&pv_unmanaged.lock);
for (pvtp = &pv_unmanaged.list;
(pvt = *pvtp) != NULL;
pvtp = &pvt->pvt_next) {
if (pvt->pvt_start != start)
continue;
if (pvt->pvt_size != size)
panic("pmap_pv_untrack: pv-tracking at 0x%"PRIxPADDR
": 0x%"PRIxPSIZE" bytes, not 0x%"PRIxPSIZE" bytes",
pvt->pvt_start, pvt->pvt_size, size);
/*
* Remove from list. Readers can safely see the old
* and new states of the list.
*/
atomic_store_relaxed(pvtp, pvt->pvt_next);
/* Wait for readers who can see the old state to finish. */
pserialize_perform(pv_unmanaged.psz);
/*
* We now have exclusive access to pvt and can destroy
* it. Poison it to catch bugs.
*/
explicit_memset(&pvt->pvt_next, 0x1a, sizeof pvt->pvt_next);
goto out;
}
panic("pmap_pv_untrack: pages not pv-tracked at 0x%"PRIxPADDR
" (0x%"PRIxPSIZE" bytes)",
start, size);
out: mutex_exit(&pv_unmanaged.lock);
npages = size >> PAGE_SHIFT;
kmem_free(pvt, offsetof(struct pv_track, pvt_pages[npages]));
}
struct pmap_page *
pmap_pv_tracked(paddr_t pa)
{
struct pv_track *pvt;
size_t pgno;
int s;
KASSERT(pa == trunc_page(pa));
s = pserialize_read_enter();
for (pvt = atomic_load_consume(&pv_unmanaged.list);
pvt != NULL;
pvt = pvt->pvt_next) {
if ((pvt->pvt_start <= pa) &&
((pa - pvt->pvt_start) < pvt->pvt_size))
break;
}
pserialize_read_exit(s);
if (pvt == NULL)
return NULL;
	KASSERT(pvt->pvt_start <= pa);
	KASSERT((pa - pvt->pvt_start) < pvt->pvt_size);
pgno = (pa - pvt->pvt_start) >> PAGE_SHIFT;
return &pvt->pvt_pages[pgno];
}
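/*
 * Illustrative sketch, compiled out: how a driver with an unmanaged
 * aperture would use the hooks above.  The attach/detach functions and
 * the aperture addresses are hypothetical; the real requirements are
 * only that the range be page-aligned and that track/untrack bracket
 * the time during which the pages may be entered into pmaps.
 */
#if 0	/* example */
static void
example_aperture_attach(paddr_t aperture_base, psize_t aperture_size)
{

	/* Register the range so pmaps can track P->V mappings of it. */
	pmap_pv_track(aperture_base, aperture_size);
}

static void
example_aperture_detach(paddr_t aperture_base, psize_t aperture_size)
{

	/*
	 * All mappings must have been removed by now; this frees the
	 * pmap_page structures allocated by pmap_pv_track().
	 */
	pmap_pv_untrack(aperture_base, aperture_size);
}
#endif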
#else /* PMAP_PV_TRACK_ONLY_STUBS */
/*
* Provide empty stubs just for MODULAR kernels.
*/
void
pmap_pv_init(void)
{
}
struct pmap_page *
pmap_pv_tracked(paddr_t pa)
{
return NULL;
}
#if notdef
/*
* pmap_pv_{,un}track() are intentionally commented out.  If modules
* called these functions, the result would be an inconsistent state.
*
* Such modules require real PV-tracking support.  Let us leave the
* two symbols undefined, and prevent such modules from being loaded.
*/
void
pmap_pv_track(paddr_t start, psize_t size)
{
panic("PV-tracking not supported");
}
void
pmap_pv_untrack(paddr_t start, psize_t size)
{
panic("PV-tracking not supported");
}
#endif /* notdef */
#endif /* PMAP_PV_TRACK_ONLY_STUBS */
/* $NetBSD: kern_time_60.c,v 1.3 2020/01/29 15:47:51 ad Exp $ */
/*-
* Copyright (c) 2013, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_time_60.c,v 1.3 2020/01/29 15:47:51 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lwp.h>
#include <sys/time.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/common/compat_util.h>
#include <compat/common/compat_mod.h>
static const struct syscall_package compat_60_syscalls[] = {
{ SYS_compat_60__lwp_park, 0, (sy_call_t *)compat_60_sys__lwp_park },
{ 0, 0, NULL }
};
int
compat_60_sys__lwp_park(struct lwp *l,
const struct compat_60_sys__lwp_park_args *uap, register_t *retval)
{
/* {
syscallarg(const struct timespec *) ts;
syscallarg(lwpid_t) unpark;
syscallarg(const void *) hint;
syscallarg(const void *) unparkhint;
} */
int error;
struct timespec ts, *tsp;
if (SCARG(uap, ts) == NULL)
tsp = NULL;
else {
error = copyin(SCARG(uap, ts), &ts, sizeof(ts));
if (error != 0)
return error;
tsp = &ts;
}
if (SCARG(uap, unpark) != 0) {
error = lwp_unpark(&SCARG(uap, unpark), 1);
if (error != 0)
return error;
}
return lwp_park(CLOCK_REALTIME, TIMER_ABSTIME, tsp);
}
int
kern_time_60_init(void)
{
return syscall_establish(NULL, compat_60_syscalls);
}
int
kern_time_60_fini(void)
{
return syscall_disestablish(NULL, compat_60_syscalls);
}
/* $NetBSD: vm_43.c,v 1.21 2019/01/27 02:08:39 pgoyette Exp $ */
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
*
* @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94
*/
/*
* Mapped file (mmap) interface to VM
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm_43.c,v 1.21 2019/01/27 02:08:39 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <miscfs/specfs/specdev.h>
#include <compat/common/compat_mod.h>
static struct syscall_package vm_43_syscalls[] = {
{ SYS_compat_43_ogetpagesize, 0,
(sy_call_t *)compat_43_sys_getpagesize },
{ SYS_compat_43_ommap, 0, (sy_call_t *)compat_43_sys_mmap },
{ 0, 0, NULL }
};
/* ARGSUSED */
int
compat_43_sys_getpagesize(struct lwp *l, const void *v, register_t *retval)
{
*retval = PAGE_SIZE;
return (0);
}
int
compat_43_sys_mmap(struct lwp *l, const struct compat_43_sys_mmap_args *uap, register_t *retval)
{
/* {
syscallarg(void *) addr;
syscallarg(size_t) len;
syscallarg(int) prot;
syscallarg(int) flags;
syscallarg(int) fd;
syscallarg(long) pos;
} */
struct sys_mmap_args /* {
syscallarg(void *) addr;
syscallarg(size_t) len;
syscallarg(int) prot;
syscallarg(int) flags;
syscallarg(int) fd;
syscallarg(long) pad;
syscallarg(off_t) pos;
} */ nargs;
static const char cvtbsdprot[8] = {
0,
PROT_EXEC,
PROT_WRITE,
PROT_EXEC|PROT_WRITE,
PROT_READ,
PROT_EXEC|PROT_READ,
PROT_WRITE|PROT_READ,
PROT_EXEC|PROT_WRITE|PROT_READ,
};
#define OMAP_ANON 0x0002
#define OMAP_COPY 0x0020
#define OMAP_SHARED 0x0010
#define OMAP_FIXED 0x0100
#define OMAP_INHERIT 0x0800
SCARG(&nargs, addr) = SCARG(uap, addr);
SCARG(&nargs, len) = SCARG(uap, len);
/* Note: index using prot is sign-safe due to mask */
SCARG(&nargs, prot) = cvtbsdprot[SCARG(uap, prot)&0x7];
SCARG(&nargs, flags) = 0;
if (SCARG(uap, flags) & OMAP_ANON)
SCARG(&nargs, flags) |= MAP_ANON;
if (SCARG(uap, flags) & OMAP_SHARED)
SCARG(&nargs, flags) |= MAP_SHARED;
else
SCARG(&nargs, flags) |= MAP_PRIVATE;
if (SCARG(uap, flags) & OMAP_COPY) {
SCARG(&nargs, flags) |= MAP_PRIVATE;
#if defined(COMPAT_10) && defined(__i386__)
/*
* Ancient kernel on x86 did not obey PROT_EXEC on i386 at least
* and ld.so did not turn it on. We take care of this on amd64
* in compat32.
*/
SCARG(&nargs, prot) |= PROT_EXEC;
#endif
}
if (SCARG(uap, flags) & OMAP_FIXED)
SCARG(&nargs, flags) |= MAP_FIXED;
if (SCARG(uap, flags) & OMAP_INHERIT)
SCARG(&nargs, flags) |= MAP_INHERIT;
SCARG(&nargs, fd) = SCARG(uap, fd);
SCARG(&nargs, pos) = SCARG(uap, pos);
return (sys_mmap(l, &nargs, retval));
}
int
vm_43_init(void)
{
return syscall_establish(NULL, vm_43_syscalls);
}
int
vm_43_fini(void)
{
return syscall_disestablish(NULL, vm_43_syscalls);
}
/* $NetBSD: uvm_glue.c,v 1.182 2023/10/04 20:34:19 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_glue.c 8.6 (Berkeley) 1/5/94
* from: Id: uvm_glue.c,v 1.1.2.8 1998/02/07 01:16:54 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_glue.c,v 1.182 2023/10/04 20:34:19 ad Exp $");
#include "opt_kgdb.h"
#include "opt_kstack.h"
#include "opt_uvmhist.h"
/*
* uvm_glue.c: glue functions
*/
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/syncobj.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/lwp.h>
#include <sys/asan.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>
/*
* uvm_kernacc: test if kernel can access a memory region.
*
* => Currently used only by /dev/kmem driver (dev/mm.c).
*/
bool
uvm_kernacc(void *addr, size_t len, vm_prot_t prot)
{
vaddr_t saddr = trunc_page((vaddr_t)addr);
vaddr_t eaddr = round_page(saddr + len);
bool rv;
vm_map_lock_read(kernel_map);
rv = uvm_map_checkprot(kernel_map, saddr, eaddr, prot);
vm_map_unlock_read(kernel_map);
return rv;
}
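/*
 * Illustrative sketch, compiled out: the kind of check a /dev/kmem
 * style consumer performs before touching an arbitrary kernel address
 * range.  The function and buffer below are hypothetical.
 */
#if 0	/* example */
static int
example_kmem_read(void *kaddr, void *buf, size_t len)
{

	/* Refuse the access unless the whole range is readable kernel VA. */
	if (!uvm_kernacc(kaddr, len, VM_PROT_READ))
		return EFAULT;
	memcpy(buf, kaddr, len);
	return 0;
}
#endif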
#ifdef KGDB
/*
* Change protections on kernel pages from addr to addr+len
* (presumably so debugger can plant a breakpoint).
*
* We force the protection change at the pmap level. If we were
* to use vm_map_protect(), a change to allow writing would be lazily
* applied, meaning we would still take a protection fault, something
* we really don't want to do. It would also fragment the kernel
* map unnecessarily. We cannot use pmap_protect since it also won't
* enforce a write-enable request. Using pmap_enter is the only way
* we can ensure the change takes place properly.
*/
void
uvm_chgkprot(void *addr, size_t len, int rw)
{
vm_prot_t prot;
paddr_t pa;
vaddr_t sva, eva;
prot = rw == B_READ ? VM_PROT_READ : VM_PROT_READ|VM_PROT_WRITE;
eva = round_page((vaddr_t)addr + len);
for (sva = trunc_page((vaddr_t)addr); sva < eva; sva += PAGE_SIZE) {
/*
* Extract physical address for the page.
*/
if (pmap_extract(pmap_kernel(), sva, &pa) == false)
panic("%s: invalid page", __func__);
pmap_enter(pmap_kernel(), sva, pa, prot, PMAP_WIRED);
}
pmap_update(pmap_kernel());
}
#endif
/*
* uvm_vslock: wire user memory for I/O
*
* - called from physio and sys___sysctl
* - XXXCDC: consider nuking this (or making it a macro?)
*/
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access_type)
{
struct vm_map *map;
vaddr_t start, end;
int error;
map = &vs->vm_map;
start = trunc_page((vaddr_t)addr);
end = round_page((vaddr_t)addr + len);
error = uvm_fault_wire(map, start, end, access_type, 0);
return error;
}
/*
* uvm_vsunlock: unwire user memory wired by uvm_vslock()
*
* - called from physio and sys___sysctl
* - XXXCDC: consider nuking this (or making it a macro?)
*/
void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{
uvm_fault_unwire(&vs->vm_map, trunc_page((vaddr_t)addr),
round_page((vaddr_t)addr + len));
}
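/*
 * Illustrative sketch, compiled out: the wire/access/unwire pattern
 * that physio-like callers wrap around uvm_vslock()/uvm_vsunlock().
 * The copyin() stands in for whatever access is performed while the
 * pages are guaranteed resident; the helper itself is hypothetical.
 */
#if 0	/* example */
static int
example_wired_copyin(struct vmspace *vs, const void *uaddr, void *kbuf,
    size_t len)
{
	int error;

	/* Fault in and wire the user pages before touching them. */
	error = uvm_vslock(vs, __UNCONST(uaddr), len, VM_PROT_READ);
	if (error)
		return error;

	error = copyin(uaddr, kbuf, len);

	/* Always undo the wiring, even if the copy failed. */
	uvm_vsunlock(vs, __UNCONST(uaddr), len);
	return error;
}
#endif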
/*
* uvm_proc_fork: fork a virtual address space
*
* - the address space is copied as per parent map's inherit values
*/
void
uvm_proc_fork(struct proc *p1, struct proc *p2, bool shared)
{
if (shared == true) {
p2->p_vmspace = NULL;
uvmspace_share(p1, p2);
} else {
p2->p_vmspace = uvmspace_fork(p1->p_vmspace);
}
cpu_proc_fork(p1, p2);
}
/*
* uvm_lwp_fork: fork a thread
*
* - a new PCB structure is allocated for the child process,
* and filled in by MD layer
* - if specified, the child gets a new user stack described by
* stack and stacksize
* - NOTE: the kernel stack may be at a different location in the child
* process, and thus addresses of automatic variables may be invalid
* after cpu_lwp_fork returns in the child process. We do nothing here
* after cpu_lwp_fork returns.
*/
void
uvm_lwp_fork(struct lwp *l1, struct lwp *l2, void *stack, size_t stacksize,
void (*func)(void *), void *arg)
{
/* Fill stack with magic number. */
kstack_setup_magic(l2);
/*
* cpu_lwp_fork() copies and updates the pcb, and makes the child ready
* to run. If this is a normal user fork, the child will exit
* directly to user mode via child_return() on its first time
* slice and will not return here. If this is a kernel thread,
* the specified entry point will be executed.
*/
cpu_lwp_fork(l1, l2, stack, stacksize, func, arg);
}
#ifndef USPACE_ALIGN
#define USPACE_ALIGN 0
#endif
static pool_cache_t uvm_uarea_cache;
#if defined(__HAVE_CPU_UAREA_ROUTINES)
static pool_cache_t uvm_uarea_system_cache;
#else
#define uvm_uarea_system_cache uvm_uarea_cache
#endif
static void *
uarea_poolpage_alloc(struct pool *pp, int flags)
{

	KASSERT((flags & PR_WAITOK) != 0);
#if defined(PMAP_MAP_POOLPAGE)
while (USPACE == PAGE_SIZE &&
(USPACE_ALIGN == 0 || USPACE_ALIGN == PAGE_SIZE)) {
struct vm_page *pg;
vaddr_t va;
#if defined(PMAP_ALLOC_POOLPAGE)
pg = PMAP_ALLOC_POOLPAGE(0);
#else
pg = uvm_pagealloc(NULL, 0, NULL, 0);
#endif
if (pg == NULL) {
uvm_wait("uarea");
continue;
}
va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg));
KASSERT(va != 0);
return (void *)va;
}
#endif
#if defined(__HAVE_CPU_UAREA_ROUTINES)
void *va = cpu_uarea_alloc(false);
if (va)
return (void *)va;
#endif
return (void *)uvm_km_alloc(kernel_map, pp->pr_alloc->pa_pagesz,
USPACE_ALIGN, UVM_KMF_WIRED | UVM_KMF_WAITVA);
}
static void
uarea_poolpage_free(struct pool *pp, void *addr)
{
#if defined(PMAP_MAP_POOLPAGE)
if (USPACE == PAGE_SIZE &&
(USPACE_ALIGN == 0 || USPACE_ALIGN == PAGE_SIZE)) {
paddr_t pa;
pa = PMAP_UNMAP_POOLPAGE((vaddr_t) addr);
KASSERT(pa != 0);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
return;
}
#endif
#if defined(__HAVE_CPU_UAREA_ROUTINES)
if (cpu_uarea_free(addr))
return;
#endif
uvm_km_free(kernel_map, (vaddr_t)addr, pp->pr_alloc->pa_pagesz,
UVM_KMF_WIRED);
}
static struct pool_allocator uvm_uarea_allocator = {
.pa_alloc = uarea_poolpage_alloc,
.pa_free = uarea_poolpage_free,
.pa_pagesz = USPACE,
};
#if defined(__HAVE_CPU_UAREA_ROUTINES)
static void *
uarea_system_poolpage_alloc(struct pool *pp, int flags)
{
void * const va = cpu_uarea_alloc(true);
if (va != NULL)
return va;
return (void *)uvm_km_alloc(kernel_map, pp->pr_alloc->pa_pagesz,
USPACE_ALIGN, UVM_KMF_WIRED |
((flags & PR_WAITOK) ? UVM_KMF_WAITVA :
(UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)));
}
static void
uarea_system_poolpage_free(struct pool *pp, void *addr)
{
if (cpu_uarea_free(addr))
return;
uvm_km_free(kernel_map, (vaddr_t)addr, pp->pr_alloc->pa_pagesz,
UVM_KMF_WIRED);
}
static struct pool_allocator uvm_uarea_system_allocator = {
.pa_alloc = uarea_system_poolpage_alloc,
.pa_free = uarea_system_poolpage_free,
.pa_pagesz = USPACE,
};
#endif /* __HAVE_CPU_UAREA_ROUTINES */
void
uvm_uarea_init(void)
{
int flags = PR_NOTOUCH;
/*
* specify PR_NOALIGN unless the alignment provided by
* the backend (USPACE_ALIGN) is sufficient to provide
* pool page size (USPACE) alignment.
*/
if ((USPACE_ALIGN == 0 && USPACE != PAGE_SIZE) ||
(USPACE_ALIGN % USPACE) != 0) {
flags |= PR_NOALIGN;
}
uvm_uarea_cache = pool_cache_init(USPACE, USPACE_ALIGN, 0, flags,
"uarea", &uvm_uarea_allocator, IPL_NONE, NULL, NULL, NULL);
#if defined(__HAVE_CPU_UAREA_ROUTINES)
uvm_uarea_system_cache = pool_cache_init(USPACE, USPACE_ALIGN,
0, flags, "uareasys", &uvm_uarea_system_allocator,
IPL_NONE, NULL, NULL, NULL);
#endif
}
/*
* uvm_uarea_alloc: allocate a u-area
*/
vaddr_t
uvm_uarea_alloc(void)
{
return (vaddr_t)pool_cache_get(uvm_uarea_cache, PR_WAITOK);
}
vaddr_t
uvm_uarea_system_alloc(struct cpu_info *ci)
{
#ifdef __HAVE_CPU_UAREA_ALLOC_IDLELWP
if (__predict_false(ci != NULL))
return cpu_uarea_alloc_idlelwp(ci);
#endif
return (vaddr_t)pool_cache_get(uvm_uarea_system_cache, PR_WAITOK);
}
/*
* uvm_uarea_free: free a u-area
*/
void
uvm_uarea_free(vaddr_t uaddr)
{
kasan_mark((void *)uaddr, USPACE, USPACE, 0);
pool_cache_put(uvm_uarea_cache, (void *)uaddr);
}
void
uvm_uarea_system_free(vaddr_t uaddr)
{
kasan_mark((void *)uaddr, USPACE, USPACE, 0);
pool_cache_put(uvm_uarea_system_cache, (void *)uaddr);
}
vaddr_t
uvm_lwp_getuarea(lwp_t *l)
{
return (vaddr_t)l->l_addr - UAREA_PCB_OFFSET;
}
void
uvm_lwp_setuarea(lwp_t *l, vaddr_t addr)
{
l->l_addr = (void *)(addr + UAREA_PCB_OFFSET);
}
/*
* uvm_proc_exit: exit a virtual address space
*
* - borrow proc0's address space because freeing the vmspace
* of the dead process may block.
*/
void
uvm_proc_exit(struct proc *p)
{
struct lwp *l = curlwp; /* XXX */
struct vmspace *ovm;
KASSERT(p == l->l_proc);
ovm = p->p_vmspace;
KASSERT(ovm != NULL);
if (__predict_false(ovm == proc0.p_vmspace))
return;
/*
* borrow proc0's address space.
*/
kpreempt_disable();
pmap_deactivate(l);
p->p_vmspace = proc0.p_vmspace;
pmap_activate(l);
kpreempt_enable();
uvmspace_free(ovm);
}
void
uvm_lwp_exit(struct lwp *l)
{
vaddr_t va = uvm_lwp_getuarea(l);
bool system = (l->l_flag & LW_SYSTEM) != 0;
if (system)
uvm_uarea_system_free(va);
else
uvm_uarea_free(va);
#ifdef DIAGNOSTIC
uvm_lwp_setuarea(l, (vaddr_t)NULL);
#endif
}
/*
* uvm_init_limits: init per-process VM limits
*
* - called for process 0 and then inherited by all others.
*/
void
uvm_init_limits(struct proc *p)
{
/*
* Set up the initial limits on process VM. Set the maximum
* resident set size to be all of (reasonably) available memory.
* This causes any single, large process to start random page
* replacement once it fills memory.
*/
p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
p->p_rlimit[RLIMIT_STACK].rlim_max = maxsmap;
p->p_rlimit[RLIMIT_DATA].rlim_cur = DFLDSIZ;
p->p_rlimit[RLIMIT_DATA].rlim_max = maxdmap;
p->p_rlimit[RLIMIT_AS].rlim_cur = RLIM_INFINITY;
p->p_rlimit[RLIMIT_AS].rlim_max = RLIM_INFINITY;
p->p_rlimit[RLIMIT_RSS].rlim_cur = MIN(VM_MAXUSER_ADDRESS,
ctob((rlim_t)uvm_availmem(false)));
}
/*
* uvm_scheduler: process zero main loop.
*/
extern struct loadavg averunnable;
void
uvm_scheduler(void)
{
lwp_t *l = curlwp;
lwp_lock(l);
l->l_class = SCHED_FIFO;
lwp_changepri(l, PRI_VM);
lwp_unlock(l);
/* Start the freelist cache. */
uvm_pgflcache_start();
for (;;) {
/* Update legacy stats for post-mortem debugging. */
uvm_update_uvmexp();
/* See if the pagedaemon needs to generate some free pages. */
uvm_kick_pdaemon();
/* Calculate process statistics. */
sched_pstats();
(void)kpause("uvm", false, hz, NULL);
}
}
/*
* uvm_idle: called from the idle loop.
*/
void
uvm_idle(void)
{
struct cpu_info *ci = curcpu();
struct uvm_cpu *ucpu = ci->ci_data.cpu_uvm;
KASSERT(kpreempt_disabled());
uvmpdpol_idle(ucpu);
}
/* $NetBSD: wsevent.c,v 1.47 2021/09/26 01:16:10 thorpej Exp $ */
/*-
* Copyright (c) 2006, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996, 1997 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)event.c 8.1 (Berkeley) 6/11/93
*/
/*
* Internal "wscons_event" queue interface for the keyboard and mouse drivers.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: wsevent.c,v 1.47 2021/09/26 01:16:10 thorpej Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_modular.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/select.h>
#include <sys/poll.h>
#include <sys/compat_stub.h>
#include <sys/sysctl.h>
#include <dev/wscons/wsconsio.h>
#include <dev/wscons/wseventvar.h>
/*
* Size of a wsevent queue (measured in number of events).
* Should be a power of two so that `%' is fast.
* At the moment, the value below makes the queues use 2 Kbytes each; this
* value may need tuning.
*/
#define WSEVENT_QSIZE 256
#define EVSIZE(ver) ((ver) == WSEVENT_VERSION ? \
sizeof(struct wscons_event) : \
sizeof(struct owscons_event))
#define EVARRAY(ev, idx) (&(ev)->q[(idx)])
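/*
* Note: because WSEVENT_QSIZE is a power of two, the ring-buffer index
* arithmetic used throughout this file, e.g.
*
*	ev->put = (ev->put + 1) % WSEVENT_QSIZE;
*
* is, for the non-negative indices used here, equivalent to the cheap
* bit mask
*
*	ev->put = (ev->put + 1) & (WSEVENT_QSIZE - 1);
*/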
static int wsevent_default_version = WSEVENT_VERSION;
/*
* Priority of code managing wsevent queues. PWSEVENT is set just above
* PSOCK, which is just above TTIPRI, on the theory that mouse and keyboard
* `user' input should be quick.
*/
#define PWSEVENT 23
#define splwsevent() spltty()
static void wsevent_intr(void *);
/*
* Initialize a wscons_event queue.
*/
void
wsevent_init(struct wseventvar *ev, struct proc *p)
{
if (ev->q != NULL) {
#ifdef DIAGNOSTIC
printf("wsevent_init: already init\n");
#endif
return;
}
/*
* For binary compatibility, set the default version; either build with
* COMPAT_50 or load the COMPAT_50 module to include the compatibility
* code.
*/
if (wsevent_default_version >= 0 &&
wsevent_default_version < WSEVENT_VERSION)
ev->version = wsevent_default_version;
else
ev->version = WSEVENT_VERSION;
ev->get = ev->put = 0;
ev->q = kmem_alloc(WSEVENT_QSIZE * sizeof(*ev->q), KM_SLEEP);
selinit(&ev->sel);
ev->io = p;
ev->sih = softint_establish(SOFTINT_MPSAFE | SOFTINT_CLOCK,
wsevent_intr, ev);
}
/*
* Tear down a wscons_event queue.
*/
void
wsevent_fini(struct wseventvar *ev)
{
if (ev->q == NULL) {
#ifdef DIAGNOSTIC
printf("wsevent_fini: already fini\n");
#endif
return;
}
seldestroy(&ev->sel);
kmem_free(ev->q, WSEVENT_QSIZE * sizeof(*ev->q));
ev->q = NULL;
softint_disestablish(ev->sih);
}
static int
wsevent_copyout_events(const struct wscons_event *events, int cnt,
struct uio *uio, int ver)
{
int error;
switch (ver) {
case 0:
MODULE_HOOK_CALL(wscons_copyout_events_50_hook,
(events, cnt, uio), enosys(), error);
if (error == ENOSYS)
error = EINVAL;
return error;
case WSEVENT_VERSION:
return uiomove(__UNCONST(events), cnt * sizeof(*events), uio);
default:
panic("%s: unknown version %d", __func__, ver);
}
}
/*
* User-level interface: read, poll.
* (User cannot write an event queue.)
*/
int
wsevent_read(struct wseventvar *ev, struct uio *uio, int flags)
{
int s, n, cnt, error;
const int ver = ev->version;
const size_t evsize = EVSIZE(ver);
/*
* Make sure we can return at least 1.
*/
if (uio->uio_resid < evsize)
return (EMSGSIZE); /* ??? */
s = splwsevent();
while (ev->get == ev->put) {
if (flags & IO_NDELAY) {
splx(s);
return (EWOULDBLOCK);
}
ev->wanted = 1;
error = tsleep(ev, PWSEVENT | PCATCH, "wsevent_read", 0);
if (error) {
splx(s);
return (error);
}
}
/*
* Move wscons_event from tail end of queue (there is at least one
* there).
*/
if (ev->put < ev->get)
cnt = WSEVENT_QSIZE - ev->get; /* events in [get..QSIZE) */
else
cnt = ev->put - ev->get; /* events in [get..put) */
splx(s);
n = howmany(uio->uio_resid, evsize);
if (cnt > n)
cnt = n;
error = wsevent_copyout_events(EVARRAY(ev, ev->get), cnt, uio, ver);
n -= cnt;
/*
* If we do not wrap to 0, used up all our space, or had an error,
* stop. Otherwise move from front of queue to put index, if there
* is anything there to move.
*/
if ((ev->get = (ev->get + cnt) % WSEVENT_QSIZE) != 0 ||
n == 0 || error || (cnt = ev->put) == 0)
return (error);
if (cnt > n)
cnt = n;
error = wsevent_copyout_events(EVARRAY(ev, 0), cnt, uio, ver);
ev->get = cnt;
return (error);
}
int
wsevent_poll(struct wseventvar *ev, int events, struct lwp *l)
{
int revents = 0;
int s = splwsevent();
if (events & (POLLIN | POLLRDNORM)) {
if (ev->get != ev->put)
revents |= events & (POLLIN | POLLRDNORM);
else
selrecord(l, &ev->sel);
}
splx(s);
return (revents);
}
static void
filt_wseventrdetach(struct knote *kn)
{
struct wseventvar *ev = kn->kn_hook;
int s;
s = splwsevent();
selremove_knote(&ev->sel, kn);
splx(s);
}
static int
filt_wseventread(struct knote *kn, long hint)
{
struct wseventvar *ev = kn->kn_hook;
if (ev->get == ev->put)
return (0);
if (ev->get < ev->put)
kn->kn_data = ev->put - ev->get;
else
kn->kn_data = (WSEVENT_QSIZE - ev->get) + ev->put;
kn->kn_data *= EVSIZE(ev->version);
return (1);
}
static const struct filterops wsevent_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = NULL,
.f_detach = filt_wseventrdetach,
.f_event = filt_wseventread,
};
int
wsevent_kqfilter(struct wseventvar *ev, struct knote *kn)
{
int s;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &wsevent_filtops;
break;
default:
return (EINVAL);
}
kn->kn_hook = ev;
s = splwsevent();
selrecord_knote(&ev->sel, kn);
splx(s);
return (0);
}
/*
* Wakes up all listeners of the 'ev' queue.
*/
void
wsevent_wakeup(struct wseventvar *ev)
{
selnotify(&ev->sel, 0, 0);
if (ev->wanted) {
ev->wanted = 0;
wakeup(ev);
}
if (ev->async) {
softint_schedule(ev->sih);
}
}
/*
* Soft interrupt handler: sends signal to async proc.
*/
static void
wsevent_intr(void *cookie)
{
struct wseventvar *ev;
ev = cookie;
if (ev->async) {
mutex_enter(&proc_lock);
psignal(ev->io, SIGIO);
mutex_exit(&proc_lock);
}
}
/*
* Injects the set of events given in 'events', whose size is 'nevents',
* into the 'ev' queue. If there is not enough free space to inject them
* all, returns ENOSPC and the queue is left intact; otherwise returns 0
* and wakes up all listeners.
*/
int
wsevent_inject(struct wseventvar *ev, struct wscons_event *events,
size_t nevents)
{
size_t avail, i;
struct timespec t;
/* Calculate number of free slots in the queue. */
if (ev->put < ev->get)
avail = ev->get - ev->put;
else
avail = WSEVENT_QSIZE - (ev->put - ev->get);
KASSERT(avail <= WSEVENT_QSIZE);
/* Fail if all the events will not fit in the queue. */
if (avail < nevents)
return ENOSPC;
/* Use the current time for all events. */
getnanotime(&t);
/* Inject the events. */
for (i = 0; i < nevents; i++) {
struct wscons_event *we;
we = EVARRAY(ev, ev->put);
we->type = events[i].type;
we->value = events[i].value;
we->time = t;
ev->put = (ev->put + 1) % WSEVENT_QSIZE;
}
wsevent_wakeup(ev);
return 0;
}
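/*
* Usage sketch (illustrative only): a keyboard driver that keeps its
* event queue in a hypothetical "sc->sc_events" member could deliver a
* single key press roughly like this; wsevent_inject() timestamps the
* event and wakes any waiting readers:
*
*	struct wscons_event we;
*
*	we.type = WSCONS_EVENT_KEY_DOWN;
*	we.value = keycode;
*	(void)wsevent_inject(&sc->sc_events, &we, 1);
*/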
int
wsevent_setversion(struct wseventvar *ev, int vers)
{
if (ev == NULL)
return EINVAL;
switch (vers) {
case 0:
case WSEVENT_VERSION:
break;
default:
return EINVAL;
}
if (vers == ev->version)
return 0;
ev->get = ev->put = 0;
ev->version = vers;
return 0;
}
SYSCTL_SETUP(sysctl_wsevent_setup, "sysctl hw.wsevent subtree setup")
{
const struct sysctlnode *node = NULL;
if (sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "wsevent", NULL,
NULL, 0, NULL, 0,
CTL_HW, CTL_CREATE, CTL_EOL) != 0)
return;
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_READWRITE,
CTLTYPE_INT, "default_version",
SYSCTL_DESCR("Set default event version for compatibility"),
NULL, 0, &wsevent_default_version, 0,
CTL_CREATE, CTL_EOL);
}
/* $NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $ */
/*-
* Copyright (c) 2006, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 2006 YAMAMOTO Takashi.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_specificdata.c,v 1.14 2017/06/01 02:45:13 chs Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/specificdata.h>
#include <sys/queue.h>
#include <sys/mutex.h>
/*
* Locking notes:
*
* The specdataref_container pointer in the specificdata_reference
* is volatile. To read it, you must hold EITHER the domain lock
* or the ref lock. To write it, you must hold BOTH the domain lock
* and the ref lock. The locks must be acquired in the following
* order:
* domain -> ref
*/
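/*
* For example, a writer that replaces the container pointer must take
* both locks in the order given above (sketch, using the fields defined
* below), whereas a reader may hold either lock alone:
*
*	mutex_enter(&sd->sd_lock);		(domain lock first)
*	mutex_enter(&ref->specdataref_lock);	(then the ref lock)
*	ref->specdataref_container = newsc;
*	mutex_exit(&ref->specdataref_lock);
*	mutex_exit(&sd->sd_lock);
*/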
typedef struct {
specificdata_dtor_t ski_dtor;
} specificdata_key_impl;
struct specificdata_container {
size_t sc_nkey;
LIST_ENTRY(specificdata_container) sc_list;
void * sc_data[]; /* variable length */
};
#define SPECIFICDATA_CONTAINER_BYTESIZE(n) \
(sizeof(struct specificdata_container) + ((n) * sizeof(void *)))
struct specificdata_domain {
kmutex_t sd_lock;
unsigned int sd_nkey;
LIST_HEAD(, specificdata_container) sd_list;
specificdata_key_impl *sd_keys;
};
static void
specificdata_container_link(specificdata_domain_t sd,
specificdata_container_t sc)
{
LIST_INSERT_HEAD(&sd->sd_list, sc, sc_list);
}
static void
specificdata_container_unlink(specificdata_domain_t sd,
specificdata_container_t sc)
{
LIST_REMOVE(sc, sc_list);
}
static void
specificdata_destroy_datum(specificdata_domain_t sd,
specificdata_container_t sc, specificdata_key_t key)
{
specificdata_dtor_t dtor;
void *data;
if (key >= sc->sc_nkey)
return;
KASSERT(key < sd->sd_nkey);
data = sc->sc_data[key];
dtor = sd->sd_keys[key].ski_dtor;
if (dtor != NULL) {
if (data != NULL) {
sc->sc_data[key] = NULL;
(*dtor)(data);
}
} else {
KASSERT(data == NULL);
}
}
static void
specificdata_noop_dtor(void *data)
{
/* nothing */
}
/*
* specificdata_domain_create --
* Create a specificdata domain.
*/
specificdata_domain_t
specificdata_domain_create(void)
{
specificdata_domain_t sd;
sd = kmem_zalloc(sizeof(*sd), KM_SLEEP);
mutex_init(&sd->sd_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&sd->sd_list);
return (sd);
}
/*
* specificdata_domain_delete --
* Destroy a specificdata domain.
*/
void
specificdata_domain_delete(specificdata_domain_t sd)
{
panic("specificdata_domain_delete: not implemented");
}
/*
* specificdata_key_create --
* Create a specificdata key for a domain.
*
* Note: This is a rare operation.
*/
int
specificdata_key_create(specificdata_domain_t sd, specificdata_key_t *keyp,
specificdata_dtor_t dtor)
{
specificdata_key_impl *newkeys;
specificdata_key_t key = 0;
size_t nsz;
ASSERT_SLEEPABLE();
if (dtor == NULL)
dtor = specificdata_noop_dtor;
mutex_enter(&sd->sd_lock);
if (sd->sd_keys == NULL)
goto needalloc;
for (; key < sd->sd_nkey; key++) {
if (sd->sd_keys[key].ski_dtor == NULL)
goto gotit;
}
needalloc:
nsz = (sd->sd_nkey + 1) * sizeof(*newkeys);
/* XXXSMP allocating memory while holding a lock. */
newkeys = kmem_zalloc(nsz, KM_SLEEP);
if (sd->sd_keys != NULL) {
size_t osz = sd->sd_nkey * sizeof(*newkeys);
memcpy(newkeys, sd->sd_keys, osz);
kmem_free(sd->sd_keys, osz);
}
sd->sd_keys = newkeys;
sd->sd_nkey++;
gotit:
sd->sd_keys[key].ski_dtor = dtor;
mutex_exit(&sd->sd_lock);
*keyp = key;
return (0);
}
/*
* specificdata_key_delete --
* Destroy a specificdata key for a domain.
*
* Note: This is a rare operation.
*/
void
specificdata_key_delete(specificdata_domain_t sd, specificdata_key_t key)
{
specificdata_container_t sc;
mutex_enter(&sd->sd_lock);
if (key >= sd->sd_nkey)
goto out;
/*
* Traverse all of the specificdata containers in the domain
* and destroy the datum for the dying key.
*/
LIST_FOREACH(sc, &sd->sd_list, sc_list) {
specificdata_destroy_datum(sd, sc, key);
}
sd->sd_keys[key].ski_dtor = NULL;
out:
mutex_exit(&sd->sd_lock);
}
/*
* specificdata_init --
* Initialize a specificdata container for operation in the
* specified domain.
*/
int
specificdata_init(specificdata_domain_t sd, specificdata_reference *ref)
{
/*
* Just NULL-out the container pointer; we'll allocate the
* container the first time specificdata is put into it.
*/
ref->specdataref_container = NULL;
mutex_init(&ref->specdataref_lock, MUTEX_DEFAULT, IPL_NONE);
return (0);
}
/*
* specificdata_fini --
* Destroy a specificdata container. We destroy all of the datums
* stuffed into the container just as if the key were destroyed.
*/
void
specificdata_fini(specificdata_domain_t sd, specificdata_reference *ref)
{
specificdata_container_t sc;
specificdata_key_t key;
ASSERT_SLEEPABLE();
mutex_destroy(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc == NULL)
return;
ref->specdataref_container = NULL;
mutex_enter(&sd->sd_lock);
specificdata_container_unlink(sd, sc);
for (key = 0; key < sc->sc_nkey; key++) {
specificdata_destroy_datum(sd, sc, key);
}
mutex_exit(&sd->sd_lock);
kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey));
}
/*
* specificdata_getspecific --
* Get a datum from a container.
*/
void *
specificdata_getspecific(specificdata_domain_t sd, specificdata_reference *ref,
specificdata_key_t key)
{
specificdata_container_t sc;
void *data = NULL;
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc != NULL && key < sc->sc_nkey)
data = sc->sc_data[key];
mutex_exit(&ref->specdataref_lock);
return (data);
}
/*
* specificdata_getspecific_unlocked --
* Get a datum from a container in a lockless fashion.
*
* Note: When using this routine, care must be taken to ensure
* that no other thread could cause the specificdata_reference
* to become invalid (i.e. point at the wrong container) by
* issuing a setspecific call or destroying the container.
*/
void *
specificdata_getspecific_unlocked(specificdata_domain_t sd,
specificdata_reference *ref,
specificdata_key_t key)
{
specificdata_container_t sc;
sc = ref->specdataref_container;
if (sc != NULL && key < sc->sc_nkey)
return (sc->sc_data[key]);
return (NULL);
}
/*
* specificdata_setspecific --
* Put a datum into a container.
*/
void
specificdata_setspecific(specificdata_domain_t sd,
specificdata_reference *ref,
specificdata_key_t key, void *data)
{
specificdata_container_t sc, newsc;
size_t newnkey, sz;
ASSERT_SLEEPABLE();
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (__predict_true(sc != NULL && key < sc->sc_nkey)) {
sc->sc_data[key] = data;
mutex_exit(&ref->specdataref_lock);
return;
}
mutex_exit(&ref->specdataref_lock);
/*
* Slow path: need to resize.
*/
mutex_enter(&sd->sd_lock);
newnkey = sd->sd_nkey;
if (key >= newnkey) {
mutex_exit(&sd->sd_lock);
panic("specificdata_setspecific");
}
sz = SPECIFICDATA_CONTAINER_BYTESIZE(newnkey);
newsc = kmem_zalloc(sz, KM_SLEEP);
newsc->sc_nkey = newnkey;
mutex_enter(&ref->specdataref_lock);
sc = ref->specdataref_container;
if (sc != NULL) {
if (key < sc->sc_nkey) {
/*
* Someone beat us to the punch. Unwind and put
* the object into the now large enough container.
*/
sc->sc_data[key] = data;
mutex_exit(&ref->specdataref_lock);
mutex_exit(&sd->sd_lock);
kmem_free(newsc, sz);
return;
}
specificdata_container_unlink(sd, sc);
memcpy(newsc->sc_data, sc->sc_data,
sc->sc_nkey * sizeof(void *));
}
newsc->sc_data[key] = data;
specificdata_container_link(sd, newsc);
ref->specdataref_container = newsc;
mutex_exit(&ref->specdataref_lock);
mutex_exit(&sd->sd_lock);
if (sc != NULL)
kmem_free(sc, SPECIFICDATA_CONTAINER_BYTESIZE(sc->sc_nkey));
}
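/*
* Usage sketch (illustrative only): a subsystem that wants to attach
* private data to some object would typically do something like the
* following, where "my_domain", "my_key", "my_dtor", "obj" and its
* "o_specdataref" member are hypothetical names:
*
*	specificdata_key_t my_key;
*
*	(void)specificdata_key_create(my_domain, &my_key, my_dtor);
*	...
*	specificdata_setspecific(my_domain, &obj->o_specdataref, my_key,
*	    my_data);
*	...
*	my_data = specificdata_getspecific(my_domain, &obj->o_specdataref,
*	    my_key);
*/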
/* $NetBSD: in6_cksum.c,v 1.28 2011/04/25 22:05:05 yamt Exp $ */
/*-
* Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_cksum.c,v 1.28 2011/04/25 22:05:05 yamt Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
/*
* Checksum of the IPv6 pseudo header.
*
* off is the offset of the upper-layer payload, i.e. the size of the
* skipped IPv6 header plus any extension headers; len is the payload size.
*/
int
in6_cksum(struct mbuf *m, u_int8_t nxt, uint32_t off, uint32_t len)
{
union {
uint16_t words[16];
struct {
struct in6_addr ip6_src;
struct in6_addr ip6_dst;
} addrs;
} u;
const struct in6_addr *in6_src;
const struct in6_addr *in6_dst;
const struct ip6_hdr *ip6;
uint32_t sum;
const uint16_t *w;
const char *cp;
if (nxt == 0)
return cpu_in_cksum(m, len, off, 0);
if (__predict_false(off < sizeof(struct ip6_hdr)))
panic("in6_cksum: offset too short for IPv6 header");
if (__predict_false(m->m_len < sizeof(struct ip6_hdr)))
panic("in6_cksum: mbuf too short for IPv6 header");
/*
* Compute the equivalent of:
* struct ip6_hdr_pseudo ip6;
*
* bzero(&ip6, sizeof(ip6));
* ip6.ip6ph_nxt = nxt;
* ip6.ip6ph_len = htonl(len);
* ip6.ip6ph_src = mtod(m, struct ip6_hdr *)->ip6_src;
* in6_clearscope(&ip6.ip6ph_src);
* ip6.ip6ph_dst = mtod(m, struct ip6_hdr *)->ip6_dst;
* in6_clearscope(&ip6.ip6ph_dst);
* sum = one_add(&ip6);
*/
#if BYTE_ORDER == LITTLE_ENDIAN
sum = ((len & 0xffff) + ((len >> 16) & 0xffff) + nxt) << 8;
#else
sum = (len & 0xffff) + ((len >> 16) & 0xffff) + nxt;
#endif
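/*
* Note on the little-endian case above: cpu_in_cksum() folds 16-bit
* words as they sit in memory, i.e. byte-swapped relative to network
* order on a little-endian host.  Under one's-complement (end-around
* carry) folding, a left shift by 8 acts as a 16-bit byte swap, so the
* host-order pseudo-header terms (payload length and next header) are
* pre-swapped here to match the byte order of the address words summed
* below.
*/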
cp = mtod(m, const char *);
w = (const uint16_t *)(cp + offsetof(struct ip6_hdr, ip6_src));
ip6 = (const void *)cp;
if (__predict_true((uintptr_t)w % 2 == 0)) {
in6_src = &ip6->ip6_src;
in6_dst = &ip6->ip6_dst;
} else {
memcpy(&u, &ip6->ip6_src, 32);
w = u.words;
in6_src = &u.addrs.ip6_src;
in6_dst = &u.addrs.ip6_dst;
}
sum += w[0];
if (!IN6_IS_SCOPE_EMBEDDABLE(in6_src))
sum += w[1];
sum += w[2];
sum += w[3];
sum += w[4];
sum += w[5];
sum += w[6];
sum += w[7];
w += 8;
sum += w[0];
if (!IN6_IS_SCOPE_EMBEDDABLE(in6_dst))
sum += w[1];
sum += w[2];
sum += w[3];
sum += w[4];
sum += w[5];
sum += w[6];
sum += w[7];
return cpu_in_cksum(m, len, off, sum);
}
/* $NetBSD: exec_subr.c,v 1.88 2023/11/21 14:35:36 riastradh Exp $ */
/*
* Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_subr.c,v 1.88 2023/11/21 14:35:36 riastradh Exp $");
#include "opt_pax.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/filedesc.h>
#include <sys/exec.h>
#include <sys/mman.h>
#include <sys/resourcevar.h>
#include <sys/device.h>
#include <sys/pax.h>
#include <uvm/uvm_extern.h>
#define VMCMD_EVCNT_DECL(name) \
static struct evcnt vmcmd_ev_##name = \
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "vmcmd", #name); \
EVCNT_ATTACH_STATIC(vmcmd_ev_##name)
#define VMCMD_EVCNT_INCR(name) \
vmcmd_ev_##name.ev_count++
VMCMD_EVCNT_DECL(calls);
VMCMD_EVCNT_DECL(extends);
VMCMD_EVCNT_DECL(kills);
#ifdef DEBUG_STACK
#define DPRINTF(a) uprintf a
#else
#define DPRINTF(a)
#endif
unsigned int user_stack_guard_size = 1024 * 1024;
unsigned int user_thread_stack_guard_size = 64 * 1024;
/*
* new_vmcmd():
* create a new vmcmd structure and fill in its fields based
* on function call arguments. make sure objects ref'd by
* the vmcmd are 'held'.
*/
void
new_vmcmd(struct exec_vmcmd_set *evsp,
int (*proc)(struct lwp * l, struct exec_vmcmd *),
vsize_t len, vaddr_t addr, struct vnode *vp, u_long offset,
u_int prot, int flags)
{
struct exec_vmcmd *vcp;
VMCMD_EVCNT_INCR(calls);
KASSERT(proc != vmcmd_map_pagedvn || (vp->v_iflag & VI_TEXT));
KASSERT(vp == NULL || vrefcnt(vp) > 0);
if (evsp->evs_used >= evsp->evs_cnt)
vmcmdset_extend(evsp);
vcp = &evsp->evs_cmds[evsp->evs_used++];
vcp->ev_proc = proc;
vcp->ev_len = len;
vcp->ev_addr = addr;
if ((vcp->ev_vp = vp) != NULL)
vref(vp);
vcp->ev_offset = offset;
vcp->ev_prot = prot;
vcp->ev_flags = flags;
}
void
vmcmdset_extend(struct exec_vmcmd_set *evsp)
{
struct exec_vmcmd *nvcp;
u_int ocnt;
#ifdef DIAGNOSTIC
if (evsp->evs_used < evsp->evs_cnt)
panic("vmcmdset_extend: not necessary");
#endif
/* figure out number of entries in new set */
if ((ocnt = evsp->evs_cnt) != 0) {
evsp->evs_cnt += ocnt;
VMCMD_EVCNT_INCR(extends);
} else
evsp->evs_cnt = EXEC_DEFAULT_VMCMD_SETSIZE;
/* allocate it */
nvcp = kmem_alloc(evsp->evs_cnt * sizeof(struct exec_vmcmd), KM_SLEEP);
/* free the old struct, if there was one, and record the new one */
if (ocnt) {
memcpy(nvcp, evsp->evs_cmds,
(ocnt * sizeof(struct exec_vmcmd)));
kmem_free(evsp->evs_cmds, ocnt * sizeof(struct exec_vmcmd));
}
evsp->evs_cmds = nvcp;
}
void
kill_vmcmds(struct exec_vmcmd_set *evsp)
{
struct exec_vmcmd *vcp;
u_int i;
VMCMD_EVCNT_INCR(kills);
if (evsp->evs_cnt == 0)
return;
for (i = 0; i < evsp->evs_used; i++) {
vcp = &evsp->evs_cmds[i];
if (vcp->ev_vp != NULL)
vrele(vcp->ev_vp);
}
kmem_free(evsp->evs_cmds, evsp->evs_cnt * sizeof(struct exec_vmcmd));
evsp->evs_used = evsp->evs_cnt = 0;
}
/*
* vmcmd_map_pagedvn():
* handle vmcmd which specifies that a vnode should be mmap'd.
* appropriate for handling demand-paged text and data segments.
*/
static int
vmcmd_get_prot(struct lwp *l, const struct exec_vmcmd *cmd, vm_prot_t *prot,
vm_prot_t *maxprot)
{
vm_prot_t extraprot = PROT_MPROTECT_EXTRACT(cmd->ev_prot);
*prot = cmd->ev_prot & UVM_PROT_ALL;
*maxprot = PAX_MPROTECT_MAXPROTECT(l, *prot, extraprot, UVM_PROT_ALL);
if ((*prot & *maxprot) != *prot)
return EACCES;
return PAX_MPROTECT_VALIDATE(l, *prot);
}
int
vmcmd_map_pagedvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct uvm_object *uobj;
struct vnode *vp = cmd->ev_vp;
struct proc *p = l->l_proc;
int error;
vm_prot_t prot, maxprot;
KASSERT(vp->v_iflag & VI_TEXT);
/*
* map the vnode in using uvm_map.
*/
if (cmd->ev_len == 0)
return 0;
if (cmd->ev_offset & PAGE_MASK)
return EINVAL;
if (cmd->ev_addr & PAGE_MASK)
return EINVAL;
if (cmd->ev_len & PAGE_MASK)
return EINVAL;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
/*
* check the file system's opinion about mmapping the file
*/
error = VOP_MMAP(vp, prot, l->l_cred);
if (error)
return error;
if ((vp->v_vflag & VV_MAPPED) == 0) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_MAPPED;
VOP_UNLOCK(vp);
}
/*
* do the map, reference the object for this map entry
*/
uobj = &vp->v_uobj;
vref(vp);
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr, cmd->ev_len,
uobj, cmd->ev_offset, 0,
UVM_MAPFLAG(prot, maxprot, UVM_INH_COPY,
UVM_ADV_NORMAL, UVM_FLAG_COPYONW|UVM_FLAG_FIXED));
if (error) {
uobj->pgops->pgo_detach(uobj);
}
return error;
}
/*
* vmcmd_map_readvn():
* handle vmcmd which specifies that a vnode should be read from.
* appropriate for non-demand-paged text/data segments, i.e. impure
* objects (a la OMAGIC and NMAGIC).
*/
int
vmcmd_map_readvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
long diff;
if (cmd->ev_len == 0)
return 0;
diff = cmd->ev_addr - trunc_page(cmd->ev_addr);
cmd->ev_addr -= diff; /* required by uvm_map */
cmd->ev_offset -= diff;
cmd->ev_len += diff;
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr,
round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_COPY,
UVM_ADV_NORMAL,
UVM_FLAG_FIXED|UVM_FLAG_OVERLAY|UVM_FLAG_COPYONW));
if (error)
return error;
return vmcmd_readvn(l, cmd);
}
int
vmcmd_readvn(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
vm_prot_t prot, maxprot;
error = vn_rdwr(UIO_READ, cmd->ev_vp, (void *)cmd->ev_addr,
cmd->ev_len, cmd->ev_offset, UIO_USERSPACE, IO_UNIT,
l->l_cred, NULL, l);
if (error)
return error;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
#ifdef PMAP_NEED_PROCWR
/*
* we had to write the process, make sure the pages are synched
* with the instruction cache.
*/
if (prot & VM_PROT_EXECUTE)
pmap_procwr(p, cmd->ev_addr, cmd->ev_len);
#endif
/*
* we had to map in the area at PROT_ALL so that vn_rdwr()
* could write to it. however, the caller seems to want
* it mapped read-only, so now we are going to have to call
* uvm_map_protect() to fix up the protection. ICK.
*/
if (maxprot != VM_PROT_ALL) {
error = uvm_map_protect(&p->p_vmspace->vm_map,
trunc_page(cmd->ev_addr),
round_page(cmd->ev_addr + cmd->ev_len),
maxprot, true);
if (error)
return error;
}
if (prot != maxprot) {
error = uvm_map_protect(&p->p_vmspace->vm_map,
trunc_page(cmd->ev_addr),
round_page(cmd->ev_addr + cmd->ev_len),
prot, false);
if (error)
return error;
}
return 0;
}
/*
* vmcmd_map_zero():
* handle vmcmd which specifies a zero-filled address space region. The
* address range must be first allocated, then protected appropriately.
*/
int
vmcmd_map_zero(struct lwp *l, struct exec_vmcmd *cmd)
{
struct proc *p = l->l_proc;
int error;
long diff;
vm_prot_t prot, maxprot;
diff = cmd->ev_addr - trunc_page(cmd->ev_addr);
cmd->ev_addr -= diff; /* required by uvm_map */
cmd->ev_len += diff;
if ((error = vmcmd_get_prot(l, cmd, &prot, &maxprot)) != 0)
return error;
error = uvm_map(&p->p_vmspace->vm_map, &cmd->ev_addr,
round_page(cmd->ev_len), NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(prot, maxprot, UVM_INH_COPY,
UVM_ADV_NORMAL,
UVM_FLAG_FIXED|UVM_FLAG_COPYONW));
if (cmd->ev_flags & VMCMD_STACK)
curproc->p_vmspace->vm_issize += atop(round_page(cmd->ev_len));
return error;
}
/*
* exec_read():
*
* Read from vnode into buffer at offset.
*/
int
exec_read(struct lwp *l, struct vnode *vp, u_long off, void *bf, size_t size,
int ioflg)
{
int error;
size_t resid;
KASSERT((ioflg & IO_NODELOCKED) == 0 || VOP_ISLOCKED(vp) != LK_NONE);
if ((error = vn_rdwr(UIO_READ, vp, bf, size, off, UIO_SYSSPACE,
ioflg, l->l_cred, &resid, NULL)) != 0)
return error;
/*
* See if we got all of it
*/
if (resid != 0)
return ENOEXEC;
return 0;
}
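/*
* Usage sketch (illustrative only): an exec format handler would read
* the on-disk header of the image roughly like this, where "hdr" is a
* hypothetical header structure, epp->ep_vp is the executable's vnode,
* and the vnode is already locked as IO_NODELOCKED requires:
*
*	error = exec_read(l, epp->ep_vp, 0, &hdr, sizeof(hdr),
*	    IO_NODELOCKED);
*	if (error)
*		return error;
*/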
/*
* exec_setup_stack(): Set up the stack segment for an elf
* executable.
*
* Note that the ep_ssize parameter must be set to be the current stack
* limit; this is adjusted in the body of execve() to yield the
* appropriate stack segment usage once the argument length is
* calculated.
*
* This function returns an int for uniformity with other (future) formats'
* stack setup functions. They might have errors to return.
*/
int
exec_setup_stack(struct lwp *l, struct exec_package *epp)
{
vsize_t max_stack_size;
vaddr_t access_linear_min;
vsize_t access_size;
vaddr_t noaccess_linear_min;
vsize_t noaccess_size;
#ifndef USRSTACK32
#define USRSTACK32 (0x00000000ffffffffL&~PGOFSET)
#endif
#ifndef MAXSSIZ32
#define MAXSSIZ32 (MAXSSIZ >> 2)
#endif
if (epp->ep_flags & EXEC_32) {
epp->ep_minsaddr = USRSTACK32;
max_stack_size = MAXSSIZ32;
} else {
epp->ep_minsaddr = USRSTACK;
max_stack_size = MAXSSIZ;
}
DPRINTF(("ep_minsaddr=%#jx max_stack_size=%#jx\n",
(uintmax_t)epp->ep_minsaddr, (uintmax_t)max_stack_size));
pax_aslr_stack(epp, &max_stack_size);
DPRINTF(("[RLIMIT_STACK].lim_cur=%#jx max_stack_size=%#jx\n",
(uintmax_t)l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur,
(uintmax_t)max_stack_size));
epp->ep_ssize = MIN(l->l_proc->p_rlimit[RLIMIT_STACK].rlim_cur,
max_stack_size);
l->l_proc->p_stackbase = epp->ep_minsaddr;
epp->ep_maxsaddr = (vaddr_t)STACK_GROW(epp->ep_minsaddr,
max_stack_size);
DPRINTF(("ep_ssize=%#jx ep_minsaddr=%#jx ep_maxsaddr=%#jx\n",
(uintmax_t)epp->ep_ssize, (uintmax_t)epp->ep_minsaddr,
(uintmax_t)epp->ep_maxsaddr));
/*
* set up commands for stack. note that this takes *two*, one to
* map the part of the stack which we can access, and one to map
* the part which we can't.
*
* arguably, it could be made into one, but that would require the
* addition of another mapping proc, which is unnecessary
*/
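/*
* For the common stack-grows-down case, the commands built below lay
* the region out roughly like this (illustrative sketch; addresses
* grow to the right):
*
*	... [ guard ][ noaccess (PROT_NONE) ][ accessible stack (rw) ]
*	             ^ep_maxsaddr                                    ^ep_minsaddr
*	                                      |<----- ep_ssize ----->|
*
* The noaccess part is mapped VM_PROT_NONE but with
* PROT_MPROTECT(VM_PROT_READ | VM_PROT_WRITE), so a later increase of
* the stack limit can make it usable without remapping.
*/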
access_size = epp->ep_ssize;
access_linear_min = (vaddr_t)STACK_ALLOC(epp->ep_minsaddr, access_size);
noaccess_size = max_stack_size - access_size;
noaccess_linear_min = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
access_size), noaccess_size);
DPRINTF(("access_size=%#jx, access_linear_min=%#jx, "
"noaccess_size=%#jx, noaccess_linear_min=%#jx\n",
(uintmax_t)access_size, (uintmax_t)access_linear_min,
(uintmax_t)noaccess_size, (uintmax_t)noaccess_linear_min));
if (user_stack_guard_size > 0) {
#ifdef __MACHINE_STACK_GROWS_UP
vsize_t guard_size = MIN(VM_MAXUSER_ADDRESS - epp->ep_maxsaddr, user_stack_guard_size);
if (guard_size > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, guard_size,
epp->ep_maxsaddr, NULL, 0, VM_PROT_NONE);
#else
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, user_stack_guard_size,
epp->ep_maxsaddr - user_stack_guard_size, NULL, 0, VM_PROT_NONE);
#endif
}
if (noaccess_size > 0 && noaccess_size <= MAXSSIZ) {
NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, noaccess_size,
noaccess_linear_min, NULL, 0,
VM_PROT_NONE | PROT_MPROTECT(VM_PROT_READ | VM_PROT_WRITE),
VMCMD_STACK);
}
KASSERT(access_size > 0);
KASSERT(access_size <= MAXSSIZ);
NEW_VMCMD2(&epp->ep_vmcmds, vmcmd_map_zero, access_size,
access_linear_min, NULL, 0, VM_PROT_READ | VM_PROT_WRITE,
VMCMD_STACK);
return 0;
}
/* $NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_vfsops.c,v 1.11 2022/07/08 07:42:06 hannken Exp $");
#include <sys/types.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/statvfs.h>
#include <sys/vnode.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
int
genfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
sbp->f_bsize = DEV_BSIZE;
sbp->f_frsize = DEV_BSIZE;
sbp->f_iosize = DEV_BSIZE;
sbp->f_blocks = 2; /* 1k to keep df happy */
sbp->f_bfree = 0;
sbp->f_bavail = 0;
sbp->f_bresvd = 0;
sbp->f_files = 0;
sbp->f_ffree = 0;
sbp->f_favail = 0;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, mp);
return 0;
}
int
genfs_renamelock_enter(struct mount *mp)
{
mutex_enter(mp->mnt_renamelock);
/* Preserve possible error return in case we become interruptible. */
return 0;
}
void
genfs_renamelock_exit(struct mount *mp)
{
mutex_exit(mp->mnt_renamelock);
}
int
genfs_suspendctl(struct mount *mp, int cmd)
{
int error;
switch (cmd) {
case SUSPEND_SUSPEND:
error = fstrans_setstate(mp, FSTRANS_SUSPENDING);
if (error)
return error;
error = fstrans_setstate(mp, FSTRANS_SUSPENDED);
return error;
case SUSPEND_RESUME:
error = fstrans_setstate(mp, FSTRANS_NORMAL);
KASSERT(error == 0);
return 0;
default:
panic("%s: bogus command %d", __func__, cmd);
}
}
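/*
* Usage sketch (illustrative only): file systems without their own
* suspend logic can point their VFS suspendctl operation here, so that
* a caller doing roughly
*
*	error = vfs_suspend(mp, 0);
*	...work on the quiesced file system...
*	vfs_resume(mp);
*
* ends up in genfs_suspendctl(mp, SUSPEND_SUSPEND) and
* genfs_suspendctl(mp, SUSPEND_RESUME) respectively.
*/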
/* $NetBSD: sys_generic.c,v 1.134 2022/07/10 23:12:12 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
*/
/*
* System calls relating to files.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_generic.c,v 1.134 2022/07/10 23:12:12 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/ktrace.h>
#include <sys/atomic.h>
#include <sys/disklabel.h>
/*
* Read system call.
*/
/* ARGSUSED */
int
sys_read(struct lwp *l, const struct sys_read_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(void *) buf;
syscallarg(size_t) nbyte;
} */
file_t *fp;
int fd;
fd = SCARG(uap, fd);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
if ((fp->f_flag & FREAD) == 0) {
fd_putfile(fd);
return (EBADF);
}
/* dofileread() will unuse the descriptor for us */
return (dofileread(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
&fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
int
dofileread(int fd, struct file *fp, void *buf, size_t nbyte,
off_t *offset, int flags, register_t *retval)
{
struct iovec aiov;
struct uio auio;
size_t cnt;
int error;
lwp_t *l;
l = curlwp;
aiov.iov_base = (void *)buf;
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = nbyte;
auio.uio_rw = UIO_READ;
auio.uio_vmspace = l->l_proc->p_vmspace;
/*
* Reads return ssize_t because -1 is returned on error. Therefore
* we must restrict the length to SSIZE_MAX to avoid garbage return
* values.
*/
if (auio.uio_resid > SSIZE_MAX) {
error = EINVAL;
goto out;
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
if (error)
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
cnt -= auio.uio_resid;
ktrgenio(fd, UIO_READ, buf, cnt, error);
*retval = cnt;
out:
fd_putfile(fd);
return (error);
}
/*
* Scatter read system call.
*/
int
sys_readv(struct lwp *l, const struct sys_readv_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct iovec *) iovp;
syscallarg(int) iovcnt;
} */
return do_filereadv(SCARG(uap, fd), SCARG(uap, iovp),
SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}
int
do_filereadv(int fd, const struct iovec *iovp, int iovcnt,
off_t *offset, int flags, register_t *retval)
{
struct uio auio;
struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
int i, error;
size_t cnt;
u_int iovlen;
struct file *fp;
struct iovec *ktriov = NULL;
if (iovcnt == 0)
return EINVAL;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((fp->f_flag & FREAD) == 0) {
fd_putfile(fd);
return EBADF;
}
if (offset == NULL)
offset = &fp->f_offset;
else {
/*
* Caller must not specify &fp->f_offset -- we can't
* safely dereference it for the call to fo_seek
* without holding some underlying object lock.
*/
KASSERT(offset != &fp->f_offset);
if (fp->f_ops->fo_seek == NULL) {
error = ESPIPE;
goto out;
}
error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL,
0);
if (error != 0)
goto out;
}
iovlen = iovcnt * sizeof(struct iovec);
if (flags & FOF_IOV_SYSSPACE)
iov = __UNCONST(iovp);
else {
iov = aiov;
if ((u_int)iovcnt > UIO_SMALLIOV) {
if ((u_int)iovcnt > IOV_MAX) {
error = EINVAL;
goto out;
}
iov = kmem_alloc(iovlen, KM_SLEEP);
needfree = iov;
}
error = copyin(iovp, iov, iovlen);
if (error)
goto done;
}
auio.uio_iov = iov;
auio.uio_iovcnt = iovcnt;
auio.uio_rw = UIO_READ;
auio.uio_vmspace = curproc->p_vmspace;
auio.uio_resid = 0;
for (i = 0; i < iovcnt; i++, iov++) {
auio.uio_resid += iov->iov_len;
/*
* Reads return ssize_t because -1 is returned on error.
* Therefore we must restrict the length to SSIZE_MAX to
* avoid garbage return values.
*/
if (iov->iov_len > SSIZE_MAX ||
auio.uio_resid > SSIZE_MAX - iov->iov_len) {
error = EINVAL;
goto done;
}
}
/*
* if tracing, save a copy of iovec
*/
if (ktrpoint(KTR_GENIO)) {
ktriov = kmem_alloc(iovlen, KM_SLEEP);
memcpy(ktriov, auio.uio_iov, iovlen);
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_read)(fp, offset, &auio, fp->f_cred, flags);
if (error)
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
cnt -= auio.uio_resid;
*retval = cnt;
if (ktriov != NULL) {
ktrgeniov(fd, UIO_READ, ktriov, cnt, error);
kmem_free(ktriov, iovlen);
}
done:
if (needfree)
kmem_free(needfree, iovlen);
out:
fd_putfile(fd);
return (error);
}
/*
* Write system call
*/
int
sys_write(struct lwp *l, const struct sys_write_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const void *) buf;
syscallarg(size_t) nbyte;
} */
file_t *fp;
int fd;
fd = SCARG(uap, fd);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
if ((fp->f_flag & FWRITE) == 0) {
fd_putfile(fd);
return (EBADF);
}
/* dofilewrite() will unuse the descriptor for us */
return (dofilewrite(fd, fp, SCARG(uap, buf), SCARG(uap, nbyte),
&fp->f_offset, FOF_UPDATE_OFFSET, retval));
}
int
dofilewrite(int fd, struct file *fp, const void *buf,
size_t nbyte, off_t *offset, int flags, register_t *retval)
{
struct iovec aiov;
struct uio auio;
size_t cnt;
int error;
aiov.iov_base = __UNCONST(buf); /* XXXUNCONST kills const */
aiov.iov_len = nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = nbyte;
auio.uio_rw = UIO_WRITE;
auio.uio_vmspace = curproc->p_vmspace;
/*
* Writes return ssize_t because -1 is returned on error. Therefore
* we must restrict the length to SSIZE_MAX to avoid garbage return
* values.
*/
if (auio.uio_resid > SSIZE_MAX) {
error = EINVAL;
goto out;
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
if (error == EPIPE && !(fp->f_flag & FNOSIGPIPE)) {
mutex_enter(&proc_lock);
psignal(curproc, SIGPIPE);
mutex_exit(&proc_lock);
}
}
cnt -= auio.uio_resid;
ktrgenio(fd, UIO_WRITE, buf, cnt, error);
*retval = cnt;
out:
fd_putfile(fd);
return (error);
}
/*
* Gather write system call
*/
int
sys_writev(struct lwp *l, const struct sys_writev_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct iovec *) iovp;
syscallarg(int) iovcnt;
} */
return do_filewritev(SCARG(uap, fd), SCARG(uap, iovp),
SCARG(uap, iovcnt), NULL, FOF_UPDATE_OFFSET, retval);
}
int
do_filewritev(int fd, const struct iovec *iovp, int iovcnt,
off_t *offset, int flags, register_t *retval)
{
struct uio auio;
struct iovec *iov, *needfree = NULL, aiov[UIO_SMALLIOV];
int i, error;
size_t cnt;
u_int iovlen;
struct file *fp;
struct iovec *ktriov = NULL;
if (iovcnt == 0)
return EINVAL;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((fp->f_flag & FWRITE) == 0) {
fd_putfile(fd);
return EBADF;
}
if (offset == NULL)
offset = &fp->f_offset;
else {
/*
* Caller must not specify &fp->f_offset -- we can't
* safely dereference it for the call to fo_seek
* without holding some underlying object lock.
*/
KASSERT(offset != &fp->f_offset);
if (fp->f_ops->fo_seek == NULL) {
error = ESPIPE;
goto out;
}
error = (*fp->f_ops->fo_seek)(fp, *offset, SEEK_SET, NULL,
0);
if (error != 0)
goto out;
}
iovlen = iovcnt * sizeof(struct iovec);
if (flags & FOF_IOV_SYSSPACE)
iov = __UNCONST(iovp);
else {
iov = aiov;
if ((u_int)iovcnt > UIO_SMALLIOV) {
if ((u_int)iovcnt > IOV_MAX) {
error = EINVAL;
goto out;
}
iov = kmem_alloc(iovlen, KM_SLEEP);
needfree = iov;
}
error = copyin(iovp, iov, iovlen);
if (error)
goto done;
}
auio.uio_iov = iov;
auio.uio_iovcnt = iovcnt;
auio.uio_rw = UIO_WRITE;
auio.uio_vmspace = curproc->p_vmspace;
auio.uio_resid = 0;
for (i = 0; i < iovcnt; i++, iov++) {
auio.uio_resid += iov->iov_len;
/*
* Writes return ssize_t because -1 is returned on error.
* Therefore we must restrict the length to SSIZE_MAX to
* avoid garbage return values.
*/
if (iov->iov_len > SSIZE_MAX ||
auio.uio_resid > SSIZE_MAX - iov->iov_len) {
error = EINVAL;
goto done;
}
}
/*
* if tracing, save a copy of iovec
*/
if (ktrpoint(KTR_GENIO)) {
ktriov = kmem_alloc(iovlen, KM_SLEEP);
memcpy(ktriov, auio.uio_iov, iovlen);
}
cnt = auio.uio_resid;
error = (*fp->f_ops->fo_write)(fp, offset, &auio, fp->f_cred, flags);
if (error) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
if (error == EPIPE && !(fp->f_flag & FNOSIGPIPE)) {
mutex_enter(&proc_lock);
psignal(curproc, SIGPIPE);
mutex_exit(&proc_lock);
}
}
cnt -= auio.uio_resid;
*retval = cnt;
if (ktriov != NULL) {
ktrgeniov(fd, UIO_WRITE, ktriov, cnt, error);
kmem_free(ktriov, iovlen);
}
done:
if (needfree)
kmem_free(needfree, iovlen);
out:
fd_putfile(fd);
return (error);
}
/*
* Ioctl system call
*/
/* ARGSUSED */
int
sys_ioctl(struct lwp *l, const struct sys_ioctl_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(u_long) com;
syscallarg(void *) data;
} */
struct file *fp;
proc_t *p;
u_long com;
int error;
size_t size, alloc_size;
void *data, *memp;
#define STK_PARAMS 128
u_long stkbuf[STK_PARAMS/sizeof(u_long)];
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
size_t zero_last = 0;
#define zero_size(SZ) ((SZ)+zero_last)
#else
#define zero_size(SZ) (SZ)
#endif
memp = NULL;
alloc_size = 0;
error = 0;
p = l->l_proc;
if ((fp = fd_getfile(SCARG(uap, fd))) == NULL)
return (EBADF);
if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
error = EBADF;
com = 0;
goto out;
}
switch (com = SCARG(uap, com)) {
case FIONCLEX:
case FIOCLEX:
fd_set_exclose(l, SCARG(uap, fd), com == FIOCLEX);
goto out;
}
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
size = IOCPARM_LEN(com);
alloc_size = size;
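/*
* For example (illustrative): FIONREAD is defined as _IOR('f', 127, int),
* so IOCPARM_LEN(FIONREAD) is sizeof(int) and that many bytes will be
* copied back out to the user below.
*/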
/*
* The disklabel is now padded to a multiple of 8 bytes; however, the old
* disklabel on 32-bit platforms wasn't. This leaves a difference in size
* of 4 bytes between the two, but they are otherwise identical.
* To deal with this, we allocate enough space for the new disklabel
* but only copyin/out the smaller amount.
*/
if (IOCGROUP(com) == 'd') {
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
u_long ocom = com;
#endif
u_long ncom = com ^ (DIOCGDINFO ^ DIOCGDINFO32);
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
/*
* Userland might use a struct disklabel that is bigger than the
* kernel version (historic accident) - allocate the userland
* size and zero the unused part on copyout.
*/
#define DISKLABELLENDIFF (sizeof(struct partition) \
*(__TMPBIGMAXPARTITIONS-MAXPARTITIONS))
#define IOCFIXUP(NIOC) ((NIOC&~(IOCPARM_MASK<<IOCPARM_SHIFT)) | \
(IOCPARM_LEN(NIOC)-DISKLABELLENDIFF)<<IOCPARM_SHIFT)
switch (IOCFIXUP(ocom)) {
case DIOCGDINFO:
case DIOCWDINFO:
case DIOCSDINFO:
case DIOCGDEFLABEL:
com = ncom = IOCFIXUP(ocom);
zero_last = DISKLABELLENDIFF;
size -= DISKLABELLENDIFF;
goto done;
}
#endif
switch (ncom) {
case DIOCGDINFO:
case DIOCWDINFO:
case DIOCSDINFO:
case DIOCGDEFLABEL:
com = ncom;
if (IOCPARM_LEN(DIOCGDINFO32) < IOCPARM_LEN(DIOCGDINFO))
alloc_size = IOCPARM_LEN(DIOCGDINFO);
break;
}
#if __TMPBIGMAXPARTITIONS > MAXPARTITIONS
done: ;
#endif
}
if (size > IOCPARM_MAX) {
error = ENOTTY;
goto out;
}
memp = NULL;
if ((com >> IOCPARM_SHIFT) == 0) {
/* UNIX-style ioctl. */
data = SCARG(uap, data);
} else {
if (alloc_size > sizeof(stkbuf)) {
memp = kmem_alloc(alloc_size, KM_SLEEP);
data = memp;
} else {
data = (void *)stkbuf;
}
if (com&IOC_IN) {
if (size) {
error = copyin(SCARG(uap, data), data, size);
if (error) {
goto out;
}
/*
* The data between size and alloc_size has
* not been overwritten. It shouldn't matter
* but let's clear that anyway.
*/
if (__predict_false(size < alloc_size)) {
memset((char *)data+size, 0,
alloc_size - size);
}
ktrgenio(SCARG(uap, fd), UIO_WRITE, SCARG(uap, data), size, 0);
} else {
*(void **)data = SCARG(uap, data);
}
} else if ((com&IOC_OUT) && size) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
memset(data, 0, zero_size(size));
} else if (com&IOC_VOID) {
*(void **)data = SCARG(uap, data);
}
}
switch (com) {
case FIONBIO:
/* XXX Code block is not atomic */
if (*(int *)data != 0)
atomic_or_uint(&fp->f_flag, FNONBLOCK);
else
atomic_and_uint(&fp->f_flag, ~FNONBLOCK);
error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, data);
break;
case FIOASYNC:
/* XXX Code block is not atomic */
if (*(int *)data != 0)
atomic_or_uint(&fp->f_flag, FASYNC);
else
atomic_and_uint(&fp->f_flag, ~FASYNC);
error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, data);
break;
default:
error = (*fp->f_ops->fo_ioctl)(fp, com, data);
/*
* Copy any data to user, size was
* already set and checked above.
*/
if (error == 0 && (com&IOC_OUT) && size) {
error = copyout(data, SCARG(uap, data),
zero_size(size));
ktrgenio(SCARG(uap, fd), UIO_READ, SCARG(uap, data),
size, error);
}
break;
}
out:
if (memp)
kmem_free(memp, alloc_size);
fd_putfile(SCARG(uap, fd));
switch (error) {
case -1:
printf("sys_ioctl: _IO%s%s('%c', %lu, %lu) returned -1: "
"pid=%d comm=%s\n",
(com & IOC_IN) ? "W" : "", (com & IOC_OUT) ? "R" : "",
(char)IOCGROUP(com), (com & 0xff), IOCPARM_LEN(com),
p->p_pid, p->p_comm);
/* FALLTHROUGH */
case EPASSTHROUGH:
error = ENOTTY;
/* FALLTHROUGH */
default:
return (error);
}
}
/* $NetBSD: vnode.h,v 1.304 2022/10/26 23:40:30 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vnode.h 8.17 (Berkeley) 5/20/95
*/
#ifndef _SYS_VNODE_H_
#define _SYS_VNODE_H_
#include <sys/event.h>
#include <sys/queue.h>
#include <sys/condvar.h>
#include <sys/rwlock.h>
#include <sys/mutex.h>
#include <sys/time.h>
#include <sys/acl.h>
/* XXX: clean up includes later */
#include <uvm/uvm_param.h> /* XXX */
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <uvm/uvm_pglist.h> /* XXX */
#include <uvm/uvm_object.h> /* XXX */
#include <uvm/uvm_extern.h> /* XXX */
struct uvm_ractx;
#endif
/*
* The vnode is the focus of all file activity in UNIX. There is a
* unique vnode allocated for each active file, each current directory,
* each mounted-on file, text file, and the root.
*/
/*
* Vnode types. VNON means no type.
*/
enum vtype { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VSOCK, VFIFO, VBAD };
#define VNODE_TYPES \
"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"
/*
* Vnode tag types.
* These are for the benefit of external programs only (e.g., pstat)
* and should NEVER be inspected by the kernel.
*/
enum vtagtype {
VT_NON, VT_UFS, VT_NFS, VT_MFS, VT_MSDOSFS, VT_LFS, VT_LOFS,
VT_FDESC, VT_PORTAL, VT_NULL, VT_UMAP, VT_KERNFS, VT_PROCFS,
VT_AFS, VT_ISOFS, VT_UNION, VT_ADOSFS, VT_EXT2FS, VT_CODA,
VT_FILECORE, VT_NTFS, VT_VFS, VT_OVERLAY, VT_SMBFS, VT_PTYFS,
VT_TMPFS, VT_UDF, VT_SYSVBFS, VT_PUFFS, VT_HFS, VT_EFS, VT_ZFS,
VT_RUMP, VT_NILFS, VT_V7FS, VT_CHFS, VT_AUTOFS
};
#define VNODE_TAGS \
"VT_NON", "VT_UFS", "VT_NFS", "VT_MFS", "VT_MSDOSFS", "VT_LFS", "VT_LOFS", \
"VT_FDESC", "VT_PORTAL", "VT_NULL", "VT_UMAP", "VT_KERNFS", "VT_PROCFS", \
"VT_AFS", "VT_ISOFS", "VT_UNION", "VT_ADOSFS", "VT_EXT2FS", "VT_CODA", \
"VT_FILECORE", "VT_NTFS", "VT_VFS", "VT_OVERLAY", "VT_SMBFS", "VT_PTYFS", \
"VT_TMPFS", "VT_UDF", "VT_SYSVBFS", "VT_PUFFS", "VT_HFS", "VT_EFS", \
"VT_ZFS", "VT_RUMP", "VT_NILFS", "VT_V7FS", "VT_CHFS", "VT_AUTOFS"
#if defined(_KERNEL) || defined(_KMEMUSER)
struct vnode;
struct buf;
LIST_HEAD(buflists, buf);
/*
* Reading or writing any of these items requires holding the appropriate
* lock. Field markings and the corresponding locks:
*
* - stable, reference to the vnode is required
* b bufcache_lock
* e exec_lock
* f vnode_free_list_lock, or vrele_lock for vrele_list
* i v_interlock
* i+b v_interlock + bufcache_lock to modify, either to inspect
* i+u v_interlock + v_uobj.vmobjlock to modify, either to inspect
* k locked by underlying filesystem (maybe kernel_lock)
* u v_uobj.vmobjlock
* v vnode lock
*
* Each underlying filesystem allocates its own private area and hangs
* it from v_data.
*/
struct vnode {
/*
* VM system related items.
*/
struct uvm_object v_uobj; /* u the VM object */
voff_t v_size; /* i+u size of file */
voff_t v_writesize; /* i+u new size after write */
/*
* Unstable items get their own cache line.
* On _LP64 this fills the space nicely.
*/
kcondvar_t v_cv /* i synchronization */
__aligned(COHERENCY_UNIT);
int v_iflag; /* i+u VI_* flags */
int v_uflag; /* k VU_* flags */
int v_usecount; /* i reference count */
int v_numoutput; /* i # of pending writes */
int v_writecount; /* i ref count of writers */
int v_holdcnt; /* i page & buffer refs */
struct buflists v_cleanblkhd; /* i+b clean blocklist head */
struct buflists v_dirtyblkhd; /* i+b dirty blocklist head */
/*
* The remaining items are largely stable.
*/
int v_vflag /* v VV_* flags */
__aligned(COHERENCY_UNIT);
kmutex_t *v_interlock; /* - vnode interlock */
struct mount *v_mount; /* v ptr to vfs we are in */
int (**v_op)(void *); /* : vnode operations vector */
union {
struct mount *vu_mountedhere;/* v ptr to vfs (VDIR) */
struct socket *vu_socket; /* v unix ipc (VSOCK) */
struct specnode *vu_specnode; /* v device (VCHR, VBLK) */
struct fifoinfo *vu_fifoinfo; /* v fifo (VFIFO) */
struct uvm_ractx *vu_ractx; /* u read-ahead ctx (VREG) */
} v_un;
enum vtype v_type; /* - vnode type */
enum vtagtype v_tag; /* - type of underlying data */
void *v_data; /* - private data for fs */
struct vnode_klist *v_klist; /* i kevent / knote info */
void *v_segvguard; /* e for PAX_SEGVGUARD */
};
#define v_mountedhere v_un.vu_mountedhere
#define v_socket v_un.vu_socket
#define v_specnode v_un.vu_specnode
#define v_fifoinfo v_un.vu_fifoinfo
#define v_ractx v_un.vu_ractx
typedef struct vnode vnode_t;
/*
* Structure that encompasses the kevent state for a vnode. This is
* carved out as a separate structure because some vnodes may share
* this state with one another.
*
* N.B. if two vnodes share a vnode_klist, then they must also share
* v_interlock.
*/
struct vnode_klist {
struct klist vk_klist; /* i notes attached to vnode */
long vk_interest; /* i what the notes are interested in */
};
#endif
/*
* Vnode flags. The first set are locked by vnode lock or are stable.
* VSYSTEM is only used to skip vflush()ing quota files. VISTTY is used
* when reading dead vnodes.
*/
#define VV_ROOT 0x00000001 /* root of its file system */
#define VV_SYSTEM 0x00000002 /* vnode being used by kernel */
#define VV_ISTTY 0x00000004 /* vnode represents a tty */
#define VV_MAPPED 0x00000008 /* vnode might have user mappings */
#define VV_MPSAFE 0x00000010 /* file system code is MP safe */
/*
* The second set are locked by vp->v_interlock. VI_TEXT and VI_EXECMAP are
* typically updated with vp->v_uobj.vmobjlock also held as the VM system
* uses them for accounting purposes.
*/
#define VI_TEXT 0x00000100 /* vnode is a pure text prototype */
#define VI_EXECMAP 0x00000200 /* might have PROT_EXEC mappings */
#define VI_WRMAP 0x00000400 /* might have PROT_WRITE u. mappings */
#define VI_PAGES 0x00000800 /* UVM object has >0 pages */
#define VI_ONWORKLST 0x00004000 /* On syncer work-list */
#define VI_DEADCHECK 0x00008000 /* UVM: need to call vdead_check() */
/*
* The third set are locked by the underlying file system.
*/
#define VU_DIROP 0x01000000 /* LFS: involved in a directory op */
#define VNODE_FLAGBITS \
"\20\1ROOT\2SYSTEM\3ISTTY\4MAPPED\5MPSAFE\11TEXT\12EXECMAP" \
"\13WRMAP\14PAGES\17ONWORKLST\20DEADCHECK\31DIROP"
#define VSIZENOTSET ((voff_t)-1)
/*
* vnode lock flags
*/
#define LK_NONE 0x00000000 /* no lock - for VOP_ISLOCKED() */
#define LK_SHARED 0x00000001 /* shared lock */
#define LK_EXCLUSIVE 0x00000002 /* exclusive lock */
#define LK_UPGRADE 0x00000010 /* upgrade shared -> exclusive */
#define LK_DOWNGRADE 0x00000020 /* downgrade exclusive -> shared */
#define LK_NOWAIT 0x00000100 /* do not sleep to await lock */
#define LK_RETRY 0x00000200 /* vn_lock: retry until locked */
/*
* Vnode attributes. A field value of VNOVAL represents a field whose value
* is unavailable (getattr) or which is not to be changed (setattr).
*/
struct vattr {
enum vtype va_type; /* vnode type (for create) */
mode_t va_mode; /* files access mode and type */
nlink_t va_nlink; /* number of references to file */
uid_t va_uid; /* owner user id */
gid_t va_gid; /* owner group id */
dev_t va_fsid; /* file system id (dev for now) */
ino_t va_fileid; /* file id */
u_quad_t va_size; /* file size in bytes */
long va_blocksize; /* blocksize preferred for i/o */
struct timespec va_atime; /* time of last access */
struct timespec va_mtime; /* time of last modification */
struct timespec va_ctime; /* time file changed */
struct timespec va_birthtime; /* time file created */
u_long va_gen; /* generation number of file */
u_long va_flags; /* flags defined for file */
dev_t va_rdev; /* device the special file represents */
u_quad_t va_bytes; /* bytes of disk space held by file */
u_quad_t va_filerev; /* file modification number */
unsigned int va_vaflags; /* operations flags, see below */
long va_spare; /* remain quad aligned */
};
/*
* Flags for va_vaflags.
*/
#define VA_UTIMES_NULL 0x01 /* utimes argument was NULL */
#define VA_EXCLUSIVE 0x02 /* exclusive create request */
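/*
 * Illustrative sketch: a setattr request is normally built by
 * initializing every field to VNOVAL with vattr_null() and then
 * filling in only the attributes to change, e.g. truncating a file
 * to zero length (error handling omitted; "cred" is assumed to be
 * the caller's credentials):
 *
 *	struct vattr va;
 *
 *	vattr_null(&va);
 *	va.va_size = 0;
 *	error = VOP_SETATTR(vp, &va, cred);
 */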
#ifdef _KERNEL
/*
* Flags for ioflag.
*/
#define IO_UNIT 0x00010 /* do I/O as atomic unit */
#define IO_APPEND 0x00020 /* append write to end */
#define IO_SYNC (0x40|IO_DSYNC) /* sync I/O file integrity completion */
#define IO_NODELOCKED 0x00080 /* underlying node already locked */
#define IO_NDELAY 0x00100 /* FNDELAY flag set in file table */
#define IO_DSYNC 0x00200 /* sync I/O data integrity completion */
#define IO_ALTSEMANTICS 0x00400 /* use alternate i/o semantics */
#define IO_NORMAL 0x00800 /* operate on regular data */
#define IO_EXT 0x01000 /* operate on extended attributes */
#define IO_DIRECT 0x02000 /* direct I/O hint */
#define IO_JOURNALLOCKED 0x04000 /* journal is already locked */
#define IO_ADV_MASK 0x00003 /* access pattern hint */
#define IO_ADV_SHIFT 0
#define IO_ADV_ENCODE(adv) (((adv) << IO_ADV_SHIFT) & IO_ADV_MASK)
#define IO_ADV_DECODE(ioflag) (((ioflag) & IO_ADV_MASK) >> IO_ADV_SHIFT)
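/*
 * Illustrative sketch: the access-pattern advice (POSIX_FADV_NORMAL,
 * POSIX_FADV_RANDOM or POSIX_FADV_SEQUENTIAL) rides in the low bits of
 * the ioflag word; a caller encodes it and the file system decodes it
 * roughly as follows:
 *
 *	ioflag |= IO_ADV_ENCODE(POSIX_FADV_SEQUENTIAL);
 *	...
 *	advice = IO_ADV_DECODE(ioflag);
 */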
/*
* Flags for accmode_t.
*/
#define VEXEC 000000000100 /* execute/search permission */
#define VWRITE 000000000200 /* write permission */
#define VREAD 000000000400 /* read permission */
#define VADMIN 000000010000 /* being the file owner */
#define VAPPEND 000000040000 /* permission to write/append */
/*
* VEXPLICIT_DENY makes VOP_ACCESSX(9) return EPERM or EACCES only
* if permission was denied explicitly, by a "deny" rule in NFSv4 ACL,
* and 0 otherwise. This never happens with ordinary unix access rights
* or POSIX.1e ACLs. Obviously, VEXPLICIT_DENY must be OR-ed with
* some other V* constant.
*/
#define VEXPLICIT_DENY 000000100000
#define VREAD_NAMED_ATTRS 000000200000 /* not used */
#define VWRITE_NAMED_ATTRS 000000400000 /* not used */
#define VDELETE_CHILD 000001000000
#define VREAD_ATTRIBUTES 000002000000 /* permission to stat(2) */
#define VWRITE_ATTRIBUTES 000004000000 /* change {m,c,a}time */
#define VDELETE 000010000000
#define VREAD_ACL 000020000000 /* read ACL and file mode */
#define VWRITE_ACL 000040000000 /* change ACL and/or file mode */
#define VWRITE_OWNER 000100000000 /* change file owner */
#define VSYNCHRONIZE 000200000000 /* not used */
#define VCREAT 000400000000 /* creating new file */
#define VVERIFY 001000000000 /* verification required */
#define __VNODE_PERM_BITS \
"\10" \
"\07VEXEC" \
"\10VWRITE" \
"\11VREAD" \
"\15VADMIN" \
"\17VAPPEND" \
"\20VEXPLICIT_DENY" \
"\21VREAD_NAMED_ATTRS" \
"\22VWRITE_NAMED_ATTRS" \
"\23VDELETE_CHILD" \
"\24VREAD_ATTRIBUTES" \
"\25VWRITE_ATTRIBUTES" \
"\26VDELETE" \
"\27VREAD_ACL" \
"\30VWRITE_ACL" \
"\31VWRITE_OWNER" \
"\32VSYNCHRONIZE" \
"\33VCREAT" \
"\34VVERIFY"
/*
* Permissions that were traditionally granted only to the file owner.
*/
#define VADMIN_PERMS (VADMIN | VWRITE_ATTRIBUTES | VWRITE_ACL | \
VWRITE_OWNER)
/*
* Permissions that were traditionally granted to everyone.
*/
#define VSTAT_PERMS (VREAD_ATTRIBUTES | VREAD_ACL)
/*
* Permissions that allow changing the state of the file in any way.
*/
#define VMODIFY_PERMS (VWRITE | VAPPEND | VADMIN_PERMS | VDELETE_CHILD | \
VDELETE)
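/*
 * Illustrative sketch: callers compose an accmode_t by OR-ing the V*
 * bits above, e.g. checking whether a vnode may be read and written:
 *
 *	error = VOP_ACCESS(vp, VREAD | VWRITE, cred);
 *
 * The finer-grained NFSv4-style bits are intended for VOP_ACCESSX(9);
 * vfs_unixify_accmode() (declared below) can be used to map them onto
 * the traditional bits for file systems without ACL support.
 */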
/*
* Token indicating no attribute value yet assigned.
*/
#define VNOVAL (-1)
#define VNOVALSIZE ((u_quad_t)-1)
#define VNOVALFLAGS ((u_long)-1)
/*
* Convert between vnode types and inode formats (since POSIX.1
* defines mode word of stat structure in terms of inode formats).
*/
extern const enum vtype iftovt_tab[];
extern const int vttoif_tab[];
#define IFTOVT(mode) (iftovt_tab[((mode) & S_IFMT) >> 12])
#define VTTOIF(indx) (vttoif_tab[(int)(indx)])
#define MAKEIMODE(indx, mode) (int)(VTTOIF(indx) | (mode))
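/*
 * Illustrative sketch: the conversion macros map between the st_mode
 * S_IF* encoding and enum vtype, for example:
 *
 *	enum vtype vt = IFTOVT(S_IFDIR | 0755);	yields VDIR
 *	int imode = MAKEIMODE(VREG, 0644);	yields S_IFREG | 0644
 */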
/*
* Flags to various vnode functions.
*/
#define SKIPSYSTEM 0x0001 /* vflush: skip vnodes marked VSYSTEM */
#define FORCECLOSE 0x0002 /* vflush: force file closure */
#define WRITECLOSE 0x0004 /* vflush: only close writable files */
#define V_SAVE 0x0001 /* vinvalbuf: sync file first */
/*
* Flags to various vnode operations.
*/
#define REVOKEALL 0x0001 /* revoke: revoke all aliases */
#define FSYNC_WAIT 0x0001 /* fsync: wait for completion */
#define FSYNC_DATAONLY 0x0002 /* fsync: hint: sync file data only */
#define FSYNC_RECLAIM 0x0004 /* fsync: hint: vnode is being reclaimed */
#define FSYNC_LAZY 0x0008 /* fsync: lazy sync (trickle) */
#define FSYNC_NOLOG 0x0010 /* fsync: do not flush the log */
#define FSYNC_CACHE 0x0100 /* fsync: flush disk caches too */
#define UPDATE_WAIT 0x0001 /* update: wait for completion */
#define UPDATE_DIROP 0x0002 /* update: hint to fs to wait or not */
#define UPDATE_CLOSE 0x0004 /* update: clean up on close */
#define VDEAD_NOWAIT 0x0001 /* vdead_check: do not sleep */
void holdrelel(struct vnode *);
void holdrele(struct vnode *);
void vholdl(struct vnode *);
void vhold(struct vnode *);
void vref(struct vnode *);
#define NULLVP ((struct vnode *)NULL)
/*
* Macro to determine kevent interest on a vnode.
*/
#define _VN_KEVENT_INTEREST(vp, n) \
(((vp)->v_klist->vk_interest & (n)) != 0)
static inline bool
VN_KEVENT_INTEREST(struct vnode *vp, long hint)
{
mutex_enter(vp->v_interlock);
bool rv = _VN_KEVENT_INTEREST(vp, hint);
mutex_exit(vp->v_interlock);
return rv;
}
static inline void
VN_KNOTE(struct vnode *vp, long hint)
{
mutex_enter(vp->v_interlock);
if (__predict_false(_VN_KEVENT_INTEREST(vp, hint))) {
knote(&vp->v_klist->vk_klist, hint);
}
mutex_exit(vp->v_interlock);
}
void vn_knote_attach(struct vnode *, struct knote *);
void vn_knote_detach(struct vnode *, struct knote *);
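/*
 * Illustrative sketch: file systems post kevent notifications after
 * changing a vnode, e.g. following a successful write that extended
 * the file:
 *
 *	VN_KNOTE(vp, NOTE_WRITE | NOTE_EXTEND);
 *
 * VN_KNOTE takes v_interlock itself, and the knote() call is skipped
 * entirely when no attached knote has expressed interest in the hint.
 */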
/*
* Global vnode data.
*/
extern struct vnode *rootvnode; /* root (i.e. "/") vnode */
extern int desiredvnodes; /* number of vnodes desired */
extern unsigned int numvnodes; /* current number of vnodes */
#endif /* _KERNEL */
/*
* Mods for extensibility.
*/
/*
* Flags for vdesc_flags:
*/
#define VDESC_MAX_VPS 8
/* Low order 16 flag bits are reserved for willrele flags for vp arguments. */
#define VDESC_VP0_WILLRELE 0x00000001
#define VDESC_VP1_WILLRELE 0x00000002
#define VDESC_VP2_WILLRELE 0x00000004
#define VDESC_VP3_WILLRELE 0x00000008
#define VDESC_VP0_WILLPUT 0x00000101
#define VDESC_VP1_WILLPUT 0x00000202
#define VDESC_VP2_WILLPUT 0x00000404
#define VDESC_VP3_WILLPUT 0x00000808
/*
* VDESC_NO_OFFSET is used to identify the end of the offset list
* and in places where no such field exists.
*/
#define VDESC_NO_OFFSET -1
/*
* This structure describes the vnode operation taking place.
*/
struct vnodeop_desc {
int vdesc_offset; /* offset in vector--first for speed */
const char *vdesc_name; /* a readable name for debugging */
int vdesc_flags; /* VDESC_* flags */
/*
* These ops are used by bypass routines to map and locate arguments.
* Creds and procs are not needed in bypass routines, but sometimes
* they are useful to (for example) transport layers.
* Nameidata is useful because it has a cred in it.
*/
const int *vdesc_vp_offsets; /* list ended by VDESC_NO_OFFSET */
int vdesc_vpp_offset; /* return vpp location */
int vdesc_cred_offset; /* cred location, if any */
int vdesc_componentname_offset; /* if any */
};
#ifdef _KERNEL
extern const struct vnodeop_desc * const vfs_op_descs[];
/*
* Union filesystem hook for vn_readdir().
*/
extern int (*vn_union_readdir_hook) (struct vnode **, struct file *, struct lwp *);
/*
* Macros for offsets in the vdesc struct.
*/
#define VOPARG_OFFSETOF(type, member) offsetof(type, member)
#define VOPARG_OFFSETTO(type,offset,sp) ((type)(((char *)(sp)) + (offset)))
/*
* This structure is used to configure the new vnodeops vector.
*/
struct vnodeopv_entry_desc {
const struct vnodeop_desc *opve_op; /* which operation this is */
int (*opve_impl)(void *); /* code implementing this operation */
};
struct vnodeopv_desc {
/* ptr to the ptr to the vector where op should go */
int (***opv_desc_vector_p)(void *);
const struct vnodeopv_entry_desc *opv_desc_ops; /* null terminated list */
};
/*
* A default routine which just returns an error.
*/
int vn_default_error(void *);
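/*
 * Illustrative sketch ("foofs" is a hypothetical file system): the
 * operations vector is described by a null-terminated table, usually
 * with vn_default_error as the fallback for unimplemented operations:
 *
 *	int (**foofs_vnodeop_p)(void *);
 *	const struct vnodeopv_entry_desc foofs_vnodeop_entries[] = {
 *		{ &vop_default_desc, vn_default_error },
 *		{ &vop_lookup_desc, foofs_lookup },
 *		{ &vop_getattr_desc, foofs_getattr },
 *		{ NULL, NULL }
 *	};
 *	const struct vnodeopv_desc foofs_vnodeop_opv_desc =
 *		{ &foofs_vnodeop_p, foofs_vnodeop_entries };
 */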
/*
* A generic structure.
* This can be used by bypass routines to identify generic arguments.
*/
struct vop_generic_args {
struct vnodeop_desc *a_desc;
/* other random data follows, presumably */
};
/*
* VOCALL calls an op given an ops vector. We break it out because BSD's
* vclean changes the ops vector and then wants to call ops with the old
* vector.
*/
/*
* actually, vclean doesn't use it anymore, but nfs does,
* for device specials and fifos.
*/
#define VOCALL(OPSV,OFF,AP) (( *((OPSV)[(OFF)])) (AP))
/*
* This call works for vnodes in the kernel.
*/
#define VCALL(VP,OFF,AP) VOCALL((VP)->v_op,(OFF),(AP))
#define VDESC(OP) (& __CONCAT(OP,_desc))
#define VOFFSET(OP) (VDESC(OP)->vdesc_offset)
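/*
 * Illustrative sketch: the generated VOP_*() wrappers from vnode_if
 * dispatch through these macros, roughly:
 *
 *	struct vop_fsync_args a;
 *
 *	a.a_desc = VDESC(vop_fsync);
 *	a.a_vp = vp;
 *	...
 *	error = VCALL(vp, VOFFSET(vop_fsync), &a);
 */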
/* XXX This include should go away */
#include <sys/mount.h>
/*
* Finally, include the default set of vnode operations.
*/
#include <sys/vnode_if.h>
/*
* Public vnode manipulation functions.
*/
struct file;
struct filedesc;
struct nameidata;
struct pathbuf;
struct proc;
struct stat;
struct uio;
struct vattr;
struct vnode;
/* see vnode(9) */
void vfs_vnode_sysinit(void);
int bdevvp(dev_t, struct vnode **);
int cdevvp(dev_t, struct vnode **);
void vattr_null(struct vattr *);
void vdevgone(int, int, int, enum vtype);
int vfinddev(dev_t, enum vtype, struct vnode **);
int vflush(struct mount *, struct vnode *, int);
int vflushbuf(struct vnode *, int);
void vgone(struct vnode *);
int vinvalbuf(struct vnode *, int, kauth_cred_t, struct lwp *, bool, int);
void vprint(const char *, struct vnode *);
void vput(struct vnode *);
bool vrecycle(struct vnode *);
void vrele(struct vnode *);
void vrele_async(struct vnode *);
void vrele_flush(struct mount *);
int vtruncbuf(struct vnode *, daddr_t, bool, int);
void vwakeup(struct buf *);
int vdead_check(struct vnode *, int);
void vrevoke(struct vnode *);
void vremfree(struct vnode *);
void vshareilock(struct vnode *, struct vnode *);
void vshareklist(struct vnode *, struct vnode *);
int vrefcnt(struct vnode *);
int vcache_get(struct mount *, const void *, size_t, struct vnode **);
int vcache_new(struct mount *, struct vnode *,
struct vattr *, kauth_cred_t, void *, struct vnode **);
int vcache_rekey_enter(struct mount *, struct vnode *,
const void *, size_t, const void *, size_t);
void vcache_rekey_exit(struct mount *, struct vnode *,
const void *, size_t, const void *, size_t);
/* see vnsubr(9) */
int vn_bwrite(void *);
int vn_close(struct vnode *, int, kauth_cred_t);
int vn_isunder(struct vnode *, struct vnode *, struct lwp *);
int vn_lock(struct vnode *, int);
void vn_markexec(struct vnode *);
int vn_marktext(struct vnode *);
int vn_open(struct vnode *, struct pathbuf *, int, int, int,
struct vnode **, bool *, int *);
int vn_rdwr(enum uio_rw, struct vnode *, void *, int, off_t, enum uio_seg,
int, kauth_cred_t, size_t *, struct lwp *);
int vn_readdir(struct file *, char *, int, unsigned int, int *,
struct lwp *, off_t **, int *);
int vn_stat(struct vnode *, struct stat *);
int vn_kqfilter(struct file *, struct knote *);
int vn_writechk(struct vnode *);
int vn_openchk(struct vnode *, kauth_cred_t, int);
int vn_extattr_get(struct vnode *, int, int, const char *, size_t *,
void *, struct lwp *);
int vn_extattr_set(struct vnode *, int, int, const char *, size_t,
const void *, struct lwp *);
int vn_extattr_rm(struct vnode *, int, int, const char *, struct lwp *);
int vn_fifo_bypass(void *);
int vn_bdev_open(dev_t, struct vnode **, struct lwp *);
int vn_bdev_openpath(struct pathbuf *pb, struct vnode **, struct lwp *);
/* initialise global vnode management */
void vntblinit(void);
/* misc stuff */
void sched_sync(void *);
void vn_syncer_add_to_worklist(struct vnode *, int);
void vn_syncer_remove_from_worklist(struct vnode *);
int dorevoke(struct vnode *, kauth_cred_t);
int rawdev_mounted(struct vnode *, struct vnode **);
uint8_t vtype2dt(enum vtype);
/* see vfssubr(9) */
int vfs_unixify_accmode(accmode_t *);
void vfs_getnewfsid(struct mount *);
void vfs_timestamp(struct timespec *);
#if defined(DDB) || defined(DEBUGPRINT)
void vfs_vnode_print(struct vnode *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_vnode_lock_print(void *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_mount_print(struct mount *, int, void (*)(const char *, ...)
__printflike(1, 2));
void vfs_mount_print_all(int, void (*)(const char *, ...)
__printflike(1, 2));
#endif /* DDB || DEBUGPRINT */
#endif /* _KERNEL */
#endif /* !_SYS_VNODE_H_ */
/* $NetBSD: subr_localcount.c,v 1.7 2017/11/17 09:26:36 ozaki-r Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* CPU-local reference counts
*
* localcount(9) is a reference-counting scheme that involves no
* interprocessor synchronization most of the time, at the cost of
* eight bytes of memory per CPU per object and at the cost of
* expensive interprocessor synchronization to drain references.
*
* localcount(9) references may be held across sleeps, may be
* transferred from CPU to CPU or thread to thread: they behave
* semantically like typical reference counts, with different
* pragmatic performance characteristics.
*/
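/*
 * Illustrative usage sketch ("frotz", frotz_lock and frotz_cv are
 * hypothetical names for the caller's object, interlock and condvar):
 *
 *	Setup:
 *		localcount_init(&f->f_localcount);
 *		... publish f in some lookup structure ...
 *
 *	Readers:
 *		... look up f, e.g. under a pserialize(9) read section ...
 *		localcount_acquire(&f->f_localcount);
 *		... use f, possibly sleeping ...
 *		localcount_release(&f->f_localcount, &frotz_cv, &frotz_lock);
 *
 *	Teardown:
 *		... unpublish f so no new references can be acquired ...
 *		mutex_enter(&frotz_lock);
 *		localcount_drain(&f->f_localcount, &frotz_cv, &frotz_lock);
 *		mutex_exit(&frotz_lock);
 *		localcount_fini(&f->f_localcount);
 */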
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_localcount.c,v 1.7 2017/11/17 09:26:36 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/localcount.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/errno.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/xcall.h>
#if defined(DEBUG) && defined(LOCKDEBUG)
#include <sys/atomic.h>
#endif
static void localcount_xc(void *, void *);
/*
* localcount_init(lc)
*
* Initialize a localcount object. Per-CPU counter storage is
* allocated with percpu(9); this may sleep.
*
* The caller must call localcount_drain and then localcount_fini
* when done with lc.
*/
void
localcount_init(struct localcount *lc)
{
lc->lc_totalp = NULL;
lc->lc_percpu = percpu_alloc(sizeof(int64_t));
}
/*
* localcount_drain(lc, cv, interlock)
*
* Wait for all acquired references to lc to drain. Caller must
* hold interlock; localcount_drain releases it during cross-calls
* and waits on cv. The cv and interlock passed here must be the
* same as are passed to localcount_release for this lc.
*
* Caller must guarantee that no new references can be acquired
* with localcount_acquire before calling localcount_drain. For
* example, any object that may be found in a list and acquired
* must be removed from the list before localcount_drain.
*
* The localcount object lc may be used only with localcount_fini
* after this, unless reinitialized after localcount_fini with
* localcount_init.
*/
void
localcount_drain(struct localcount *lc, kcondvar_t *cv, kmutex_t *interlock)
{
int64_t total = 0;
KASSERT(mutex_owned(interlock));
KASSERT(lc->lc_totalp == NULL);
/* Mark it draining. */
lc->lc_totalp = &total;
/*
* Count up all references on all CPUs.
*
* This serves as a global memory barrier: after xc_wait, all
* CPUs will have witnessed the nonnull value of lc->lc_totalp,
* so that it is safe to wait on the cv for them.
*/
mutex_exit(interlock);
xc_wait(xc_broadcast(0, &localcount_xc, lc, interlock));
mutex_enter(interlock);
/* Wait for remaining references to drain. */
while (total != 0) {
/*
* At this point, now that we have added up all
* references on all CPUs, the total had better be
* nonnegative.
*/
KASSERTMSG((0 < total),
"negatively referenced localcount: %p, %"PRId64,
lc, total);
cv_wait(cv, interlock);
}
/* Paranoia: Cause any further use of lc->lc_totalp to crash. */
lc->lc_totalp = (void *)(uintptr_t)1;
}
/*
* localcount_fini(lc)
*
* Finalize a localcount object, releasing any memory allocated
* for it. The localcount object must already have been drained.
*/
void
localcount_fini(struct localcount *lc)
{
KASSERT(lc->lc_totalp == (void *)(uintptr_t)1);
percpu_free(lc->lc_percpu, sizeof(uint64_t));
}
/*
* localcount_xc(cookie0, cookie1)
*
* Accumulate and transfer the per-CPU reference counts to a
* global total, resetting the per-CPU counter to zero. Once
* localcount_drain() has started, we only maintain the total
* count in localcount_release().
*/
static void
localcount_xc(void *cookie0, void *cookie1)
{
struct localcount *lc = cookie0;
kmutex_t *interlock = cookie1;
int64_t *localp;
mutex_enter(interlock);
localp = percpu_getref(lc->lc_percpu);
*lc->lc_totalp += *localp;
*localp -= *localp; /* ie, *localp = 0; */
percpu_putref(lc->lc_percpu);
mutex_exit(interlock);
}
/*
* localcount_adjust(lc, delta)
*
* Add delta -- positive or negative -- to the local CPU's count
* for lc.
*/
static void
localcount_adjust(struct localcount *lc, int delta)
{
int64_t *localp;
localp = percpu_getref(lc->lc_percpu);
*localp += delta;
percpu_putref(lc->lc_percpu);
}
/*
* localcount_acquire(lc)
*
* Acquire a reference to lc.
*
* The reference may be held across sleeps and may be migrated
* from CPU to CPU, or even thread to thread -- it is only
* counted, not associated with a particular concrete owner.
*
* Involves no interprocessor synchronization. May be used in any
* context: while a lock is held, within a pserialize(9) read
* section, in hard interrupt context (provided other users block
* hard interrupts), in soft interrupt context, in thread context,
* &c.
*
* Caller must guarantee that there is no concurrent
* localcount_drain. For example, any object that may be found in
* a list and acquired must be removed from the list before
* localcount_drain.
*/
void
localcount_acquire(struct localcount *lc)
{
KASSERT(lc->lc_totalp == NULL);
localcount_adjust(lc, +1);
#if defined(DEBUG) && defined(LOCKDEBUG)
if (atomic_inc_32_nv(&lc->lc_refcnt) == 0)
panic("counter overflow");
#endif
}
/*
* localcount_release(lc, cv, interlock)
*
* Release a reference to lc. If there is a concurrent
* localcount_drain and this may be the last reference, notify
* localcount_drain by acquiring interlock, waking cv, and
* releasing interlock. The cv and interlock passed here must be
* the same as are passed to localcount_drain for this lc.
*
* Involves no interprocessor synchronization unless there is a
* concurrent localcount_drain in progress.
*/
void
localcount_release(struct localcount *lc, kcondvar_t *cv, kmutex_t *interlock)
{
/*
* Block xcall so that if someone begins draining after we see
* lc->lc_totalp as null, then they won't start cv_wait until
* after they have counted this CPU's contributions.
*
* Otherwise, localcount_drain may notice an extant reference
* from this CPU and cv_wait for it, but having seen
* lc->lc_totalp as null, this CPU will not wake
* localcount_drain.
*/
kpreempt_disable();
KDASSERT(mutex_ownable(interlock));
if (__predict_false(lc->lc_totalp != NULL)) {
/*
* Slow path -- wake localcount_drain in case this is
* the last reference.
*/
mutex_enter(interlock);
if (--*lc->lc_totalp == 0)
cv_broadcast(cv);
mutex_exit(interlock);
goto out;
}
localcount_adjust(lc, -1);
#if defined(DEBUG) && defined(LOCKDEBUG)
if (atomic_dec_32_nv(&lc->lc_refcnt) == UINT_MAX)
panic("counter underflow");
#endif
out: kpreempt_enable();
}
/*
* localcount_debug_refcnt(lc)
*
* Return the total reference count of lc. The value is meaningful
* only if DEBUG and LOCKDEBUG are enabled; otherwise it is always 0.
*/
uint32_t
localcount_debug_refcnt(const struct localcount *lc)
{
#if defined(DEBUG) && defined(LOCKDEBUG)
return lc->lc_refcnt;
#else
return 0;
#endif
}
/* $NetBSD: uvm_io.c,v 1.30 2024/05/03 07:09:20 skrll Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_io.c,v 1.1.2.2 1997/12/30 12:02:00 mrg Exp
*/
/*
* uvm_io.c: uvm i/o ops
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_io.c,v 1.30 2024/05/03 07:09:20 skrll Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <uvm/uvm.h>
/*
* functions
*/
/*
* uvm_io: perform I/O on a map
*
* => caller must have a reference to "map" so that it doesn't go away
* while we are working.
*/
int
uvm_io(struct vm_map *map, struct uio *uio, int flags)
{
vaddr_t baseva, endva, pageoffset, kva;
vsize_t chunksz, togo, sz;
struct vm_map_entry *dead_entries;
int error;
/*
* step 0: sanity checks and set up for copy loop. start with a
* large chunk size. if we have trouble finding vm space we will
* reduce it.
*/
if (uio->uio_resid == 0)
return 0;
togo = uio->uio_resid;
baseva = (vaddr_t) uio->uio_offset;
endva = baseva + (togo - 1);
if (endva < baseva) /* wrap around? */
return EIO;
if (baseva >= VM_MAXUSER_ADDRESS)
return 0;
if (endva >= VM_MAXUSER_ADDRESS)
/* EOF truncate */
togo = togo - (endva - VM_MAXUSER_ADDRESS + 1);
pageoffset = baseva & PAGE_MASK;
baseva = trunc_page(baseva);
chunksz = MIN(round_page(togo + pageoffset), trunc_page(MAXPHYS));
error = 0;
flags |= UVM_EXTRACT_QREF | UVM_EXTRACT_CONTIG | UVM_EXTRACT_FIXPROT;
/* XXX cannot use QREF without AMAP_REFALL, and REFALL is unsafe */
flags &= ~UVM_EXTRACT_QREF;
/*
* step 1: main loop... while we've got data to move
*/
for (/*null*/; togo > 0 ; pageoffset = 0) {
/*
* step 2: extract mappings from the map into kernel_map
*/
error = uvm_map_extract(map, baseva, chunksz, kernel_map, &kva,
flags);
if (error) {
/* retry with a smaller chunk... */
if (error == ENOMEM && chunksz > PAGE_SIZE) {
chunksz = trunc_page(chunksz / 2);
if (chunksz < PAGE_SIZE)
chunksz = PAGE_SIZE;
continue;
}
break;
}
/*
* step 3: move a chunk of data
*/
sz = chunksz - pageoffset;
if (sz > togo)
sz = togo;
error = uiomove((void *) (kva + pageoffset), sz, uio);
togo -= sz;
baseva += chunksz;
/*
* step 4: unmap the area of kernel memory
*/
vm_map_lock(kernel_map);
uvm_unmap_remove(kernel_map, kva, kva + chunksz, &dead_entries,
0);
vm_map_unlock(kernel_map);
if (dead_entries != NULL)
uvm_unmap_detach(dead_entries, AMAP_REFALL);
if (error)
break;
}
return error;
}
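/*
 * Illustrative sketch: a typical caller (e.g. a ptrace- or procfs-style
 * memory access path) reads from another process's address space by
 * building a system-space uio and passing that process's map; the
 * variable names and error handling here are assumptions:
 *
 *	struct iovec iov;
 *	struct uio uio;
 *
 *	iov.iov_base = kbuf;
 *	iov.iov_len = len;
 *	uio.uio_iov = &iov;
 *	uio.uio_iovcnt = 1;
 *	uio.uio_offset = (off_t)uva;
 *	uio.uio_resid = len;
 *	uio.uio_rw = UIO_READ;
 *	UIO_SETUP_SYSSPACE(&uio);
 *	error = uvm_io(map, &uio, 0);
 */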
/* $NetBSD: sysv_msg_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $ */
/*-
* Copyright (c) 1999 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_msg_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/proc.h>
#include <sys/msg.h>
#include <compat/sys/msg.h>
#ifndef SYSVMSG
#define SYSVMSG
#endif
#include <sys/syscallargs.h>
int
compat_50_sys___msgctl13(struct lwp *l, const struct compat_50_sys___msgctl13_args *uap, register_t *retval)
{
/* {
syscallarg(int) msqid;
syscallarg(int) cmd;
syscallarg(struct msqid_ds13 *) buf;
} */
struct msqid_ds msqbuf;
struct msqid_ds13 omsqbuf;
int cmd, error;
cmd = SCARG(uap, cmd);
if (cmd == IPC_SET) {
error = copyin(SCARG(uap, buf), &omsqbuf, sizeof(omsqbuf));
if (error)
return (error);
__msqid_ds13_to_native(&omsqbuf, &msqbuf);
}
error = msgctl1(l, SCARG(uap, msqid), cmd,
(cmd == IPC_SET || cmd == IPC_STAT) ? &msqbuf : NULL);
if (error == 0 && cmd == IPC_STAT) {
__native_to_msqid_ds13(&msqbuf, &omsqbuf);
error = copyout(&omsqbuf, SCARG(uap, buf), sizeof(omsqbuf));
}
return (error);
}
/* $NetBSD: scsipi_base.h,v 1.24 2017/02/26 10:58:47 maya Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _DEV_SCSIPI_SCSIPI_BASE_H_
#define _DEV_SCSIPI_SCSIPI_BASE_H_
struct scsipi_xfer *scsipi_get_xs(struct scsipi_periph *, int);
void scsipi_put_xs(struct scsipi_xfer *);
static __inline struct scsipi_xfer *scsipi_make_xs_internal(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
static __inline struct scsipi_xfer *scsipi_make_xs_unlocked(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
static __inline struct scsipi_xfer *scsipi_make_xs_locked(struct scsipi_periph *,
struct scsipi_generic *, int cmdlen, u_char *data_addr,
int datalen, int retries, int timeout, struct buf *,
int flags) __unused;
/*
* Make a scsipi_xfer, and return a pointer to it.
*/
static __inline struct scsipi_xfer *
scsipi_make_xs_internal(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
struct scsipi_xfer *xs;
if ((xs = scsipi_get_xs(periph, flags)) == NULL)
return (NULL);
/*
* Fill out the scsipi_xfer structure. We don't know whose context
* the cmd is in, so copy it.
*/
memcpy(&xs->cmdstore, cmd, cmdlen);
xs->cmd = &xs->cmdstore;
xs->cmdlen = cmdlen;
xs->data = data_addr;
xs->datalen = datalen;
xs->xs_retries = retries;
xs->timeout = timeout;
xs->bp = bp;
return (xs);
}
static __inline struct scsipi_xfer *
scsipi_make_xs_unlocked(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
return scsipi_make_xs_internal(periph, cmd, cmdlen, data_addr,
datalen, retries, timeout, bp, flags & ~XS_CTL_NOSLEEP);
}
static __inline struct scsipi_xfer *
scsipi_make_xs_locked(struct scsipi_periph *periph, struct scsipi_generic *cmd,
int cmdlen, u_char *data_addr, int datalen, int retries, int timeout,
struct buf *bp, int flags)
{
KDASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
return scsipi_make_xs_internal(periph, cmd, cmdlen, data_addr,
datalen, retries, timeout, bp, flags | XS_CTL_NOSLEEP);
}
#endif /* _DEV_SCSIPI_SCSIPI_BASE_H_ */
/* $NetBSD: uvm_vnode.c,v 1.121 2024/04/05 13:05:41 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993
* The Regents of the University of California.
* Copyright (c) 1990 University of Utah.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vnode_pager.c 8.8 (Berkeley) 2/13/94
* from: Id: uvm_vnode.c,v 1.1.2.26 1998/02/02 20:38:07 chuck Exp
*/
/*
* uvm_vnode.c: the vnode pager.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_vnode.c,v 1.121 2024/04/05 13:05:41 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_uvmhist.h"
#endif
#include <sys/atomic.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/disklabel.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/conf.h>
#include <sys/pool.h>
#include <sys/mount.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_page_array.h>
#ifdef UVMHIST
UVMHIST_DEFINE(ubchist);
#endif
/*
* functions
*/
static void uvn_alloc_ractx(struct uvm_object *);
static void uvn_detach(struct uvm_object *);
static int uvn_get(struct uvm_object *, voff_t, struct vm_page **, int *,
int, vm_prot_t, int, int);
static void uvn_markdirty(struct uvm_object *);
static int uvn_put(struct uvm_object *, voff_t, voff_t, int);
static void uvn_reference(struct uvm_object *);
static int uvn_findpage(struct uvm_object *, voff_t, struct vm_page **,
unsigned int, struct uvm_page_array *a,
unsigned int);
/*
* master pager structure
*/
const struct uvm_pagerops uvm_vnodeops = {
.pgo_reference = uvn_reference,
.pgo_detach = uvn_detach,
.pgo_get = uvn_get,
.pgo_put = uvn_put,
.pgo_markdirty = uvn_markdirty,
};
/*
* the ops!
*/
/*
* uvn_reference
*
* duplicate a reference to a VM object. Note that the reference
* count must already be at least one (the passed in reference) so
* there is no chance of the uvn being killed or locked out here.
*
* => caller must call with object unlocked.
* => caller must be using the same accessprot as was used at attach time
*/
static void
uvn_reference(struct uvm_object *uobj)
{
vref((struct vnode *)uobj);
}
/*
* uvn_detach
*
* remove a reference to a VM object.
*
* => caller must call with object unlocked and map locked.
*/
static void
uvn_detach(struct uvm_object *uobj)
{
vrele((struct vnode *)uobj);
}
/*
* uvn_put: flush page data to backing store.
*
* => object must be locked on entry! VOP_PUTPAGES must unlock it.
* => flags: PGO_SYNCIO -- use sync. I/O
*/
static int
uvn_put(struct uvm_object *uobj, voff_t offlo, voff_t offhi, int flags)
{
struct vnode *vp = (struct vnode *)uobj;
int error;
KASSERT(rw_write_held(uobj->vmobjlock));
error = VOP_PUTPAGES(vp, offlo, offhi, flags);
return error;
}
/*
* uvn_get: get pages (synchronously) from backing store
*
* => prefer map unlocked (not required)
* => object must be locked! we will _unlock_ it before starting any I/O.
* => flags: PGO_LOCKED: fault data structures are locked
* => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
* => NOTE: caller must check for released pages!!
*/
static int
uvn_get(struct uvm_object *uobj, voff_t offset,
struct vm_page **pps /* IN/OUT */,
int *npagesp /* IN (OUT if PGO_LOCKED)*/,
int centeridx, vm_prot_t access_type, int advice, int flags)
{
struct vnode *vp = (struct vnode *)uobj;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(ubchist, "vp %#jx off %#jx", (uintptr_t)vp, offset,
0, 0);
if (vp->v_type == VREG && (access_type & VM_PROT_WRITE) == 0
&& (flags & PGO_LOCKED) == 0 && vp->v_tag != VT_TMPFS) {
uvn_alloc_ractx(uobj);
uvm_ra_request(vp->v_ractx, advice, uobj, offset,
*npagesp << PAGE_SHIFT);
}
error = VOP_GETPAGES(vp, offset, pps, npagesp, centeridx,
access_type, advice, flags);
if (flags & PGO_LOCKED)
KASSERT(rw_lock_held(uobj->vmobjlock));
return error;
}
/*
* uvn_markdirty: called when the object gains first dirty page
*
* => uobj must be write locked.
*/
static void
uvn_markdirty(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
KASSERT(rw_write_held(uobj->vmobjlock));
mutex_enter(vp->v_interlock);
if ((vp->v_iflag & VI_ONWORKLST) == 0) {
vn_syncer_add_to_worklist(vp, filedelay);
}
mutex_exit(vp->v_interlock);
}
/*
* uvn_findpages:
* return the page for the uobj and offset requested, allocating if needed.
* => uobj must be locked.
* => returned pages will be BUSY.
*/
int
uvn_findpages(struct uvm_object *uobj, voff_t offset, unsigned int *npagesp,
struct vm_page **pgs, struct uvm_page_array *a, unsigned int flags)
{
unsigned int count, found, npages;
int i, rv;
struct uvm_page_array a_store;
if (a == NULL) {
/*
* XXX fragile API
* note that the array can be the one supplied by the caller of
* uvn_findpages. in that case, fillflags used by the caller
* might not match strictly with ours.
* in particular, the caller might have filled the array
* without DENSE but passed us UFP_DIRTYONLY (thus DENSE).
*/
const unsigned int fillflags =
((flags & UFP_BACKWARD) ? UVM_PAGE_ARRAY_FILL_BACKWARD : 0) |
((flags & UFP_DIRTYONLY) ?
(UVM_PAGE_ARRAY_FILL_DIRTY|UVM_PAGE_ARRAY_FILL_DENSE) : 0);
a = &a_store;
uvm_page_array_init(a, uobj, fillflags);
}
count = found = 0;
npages = *npagesp;
if (flags & UFP_BACKWARD) {
for (i = npages - 1; i >= 0; i--, offset -= PAGE_SIZE) {
rv = uvn_findpage(uobj, offset, &pgs[i], flags, a,
i + 1);
if (rv == 0) {
if (flags & UFP_DIRTYONLY)
break;
} else
found++;
count++;
}
} else {
for (i = 0; i < npages; i++, offset += PAGE_SIZE) {
rv = uvn_findpage(uobj, offset, &pgs[i], flags, a,
npages - i);
if (rv == 0) {
if (flags & UFP_DIRTYONLY)
break;
} else
found++;
count++;
}
}
if (a == &a_store) {
uvm_page_array_fini(a);
}
*npagesp = count;
return (found);
}
/*
* uvn_findpage: find a single page
*
* if a suitable page was found, put it in *pgp and return 1.
* otherwise return 0.
*/
static int
uvn_findpage(struct uvm_object *uobj, voff_t offset, struct vm_page **pgp,
unsigned int flags, struct uvm_page_array *a, unsigned int nleft)
{
struct vm_page *pg;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(ubchist, "vp %#jx off %#jx", (uintptr_t)uobj, offset,
0, 0);
/*
* NOBUSY must come with NOWAIT and NOALLOC. if NOBUSY is
* specified, this may be called with a reader lock.
*/
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT((flags & UFP_NOBUSY) == 0 || (flags & UFP_NOWAIT) != 0);
KASSERT((flags & UFP_NOBUSY) == 0 || (flags & UFP_NOALLOC) != 0);
KASSERT((flags & UFP_NOBUSY) != 0 || rw_write_held(uobj->vmobjlock));
if (*pgp != NULL) {
UVMHIST_LOG(ubchist, "dontcare", 0,0,0,0);
goto skip_offset;
}
for (;;) {
/*
* look for an existing page.
*/
pg = uvm_page_array_fill_and_peek(a, offset, nleft);
if (pg != NULL && pg->offset != offset) {
struct vm_page __diagused *tpg;
KASSERT(
((a->ar_flags & UVM_PAGE_ARRAY_FILL_BACKWARD) != 0)
== (pg->offset < offset));
KASSERT((tpg = uvm_pagelookup(uobj, offset)) == NULL ||
((a->ar_flags & UVM_PAGE_ARRAY_FILL_DIRTY) != 0 &&
!uvm_obj_page_dirty_p(tpg)));
pg = NULL;
if ((a->ar_flags & UVM_PAGE_ARRAY_FILL_DENSE) != 0) {
UVMHIST_LOG(ubchist, "dense", 0,0,0,0);
return 0;
}
}
/* nope? allocate one now */
if (pg == NULL) {
if (flags & UFP_NOALLOC) {
UVMHIST_LOG(ubchist, "noalloc", 0,0,0,0);
return 0;
}
pg = uvm_pagealloc(uobj, offset, NULL,
UVM_FLAG_COLORMATCH);
if (pg == NULL) {
if (flags & UFP_NOWAIT) {
UVMHIST_LOG(ubchist, "nowait",0,0,0,0);
return 0;
}
rw_exit(uobj->vmobjlock);
uvm_wait("uvnfp1");
uvm_page_array_clear(a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
UVMHIST_LOG(ubchist, "alloced %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
KASSERTMSG(uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg);
break;
} else if (flags & UFP_NOCACHE) {
UVMHIST_LOG(ubchist, "nocache",0,0,0,0);
goto skip;
}
/* page is there, see if we need to wait on it */
if ((pg->flags & PG_BUSY) != 0) {
if (flags & UFP_NOWAIT) {
UVMHIST_LOG(ubchist, "nowait",0,0,0,0);
goto skip;
}
UVMHIST_LOG(ubchist, "wait %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
uvm_pagewait(pg, uobj->vmobjlock, "uvnfp2");
uvm_page_array_clear(a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
/* skip PG_RDONLY pages if requested */
if ((flags & UFP_NORDONLY) && (pg->flags & PG_RDONLY)) {
UVMHIST_LOG(ubchist, "nordonly",0,0,0,0);
goto skip;
}
/* stop on clean pages if requested */
if (flags & UFP_DIRTYONLY) {
const bool dirty = uvm_pagecheckdirty(pg, false);
if (!dirty) {
UVMHIST_LOG(ubchist, "dirtonly", 0,0,0,0);
return 0;
}
}
/* mark the page BUSY and we're done. */
if ((flags & UFP_NOBUSY) == 0) {
pg->flags |= PG_BUSY;
UVM_PAGE_OWN(pg, "uvn_findpage");
}
UVMHIST_LOG(ubchist, "found %#jx (color %ju)",
(uintptr_t)pg, VM_PGCOLOR(pg), 0, 0);
uvm_page_array_advance(a);
break;
}
*pgp = pg;
return 1;
skip_offset:
/*
* skip this offset
*/
pg = uvm_page_array_peek(a);
if (pg != NULL) {
if (pg->offset == offset) {
uvm_page_array_advance(a);
} else {
KASSERT((a->ar_flags & UVM_PAGE_ARRAY_FILL_DENSE) == 0);
}
}
return 0;
skip:
/*
* skip this page
*/
KASSERT(pg != NULL);
uvm_page_array_advance(a);
return 0;
}
/*
* uvm_vnp_setsize: grow or shrink a vnode uobj
*
* grow => just update size value
* shrink => toss un-needed pages
*
* => we assume that the caller has a reference of some sort to the
* vnode in question so that it will not be yanked out from under
* us.
*/
void
uvm_vnp_setsize(struct vnode *vp, voff_t newsize)
{
struct uvm_object *uobj = &vp->v_uobj;
voff_t pgend = round_page(newsize);
voff_t oldsize;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
rw_enter(uobj->vmobjlock, RW_WRITER);
UVMHIST_LOG(ubchist, "vp %#jx old %#jx new %#jx",
(uintptr_t)vp, vp->v_size, newsize, 0);
/*
* now check if the size has changed: if we shrink we had better
* toss some pages...
*/
KASSERT(newsize != VSIZENOTSET);
KASSERT(newsize >= 0);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize);
KASSERTMSG((vp->v_size == vp->v_writesize ||
newsize == vp->v_writesize || newsize <= vp->v_size),
"vp=%p v_size=0x%llx v_writesize=0x%llx newsize=0x%llx",
vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
oldsize = vp->v_writesize;
/*
* check whether size shrinks
* if old size hasn't been set, there are no pages to drop
* if there was an integer overflow in pgend, then this is no shrink
*/
if (oldsize > pgend && oldsize != VSIZENOTSET && pgend >= 0) {
(void) uvn_put(uobj, pgend, 0, PGO_FREE | PGO_SYNCIO);
rw_enter(uobj->vmobjlock, RW_WRITER);
}
mutex_enter(vp->v_interlock);
vp->v_size = vp->v_writesize = newsize;
mutex_exit(vp->v_interlock);
rw_exit(uobj->vmobjlock);
}
void
uvm_vnp_setwritesize(struct vnode *vp, voff_t newsize)
{
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
KASSERT(newsize != VSIZENOTSET);
KASSERT(newsize >= 0);
KASSERT(vp->v_size != VSIZENOTSET);
KASSERT(vp->v_writesize != VSIZENOTSET);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
KASSERTMSG(vp->v_size <= newsize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx newsize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize,
(unsigned long long)newsize);
mutex_enter(vp->v_interlock);
vp->v_writesize = newsize;
mutex_exit(vp->v_interlock);
rw_exit(vp->v_uobj.vmobjlock);
}
bool
uvn_text_p(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
int iflag;
/*
* v_interlock is not held here, but VI_EXECMAP is only ever changed
* with the vmobjlock held too.
*/
iflag = atomic_load_relaxed(&vp->v_iflag);
return (iflag & VI_EXECMAP) != 0;
}
static void
uvn_alloc_ractx(struct uvm_object *uobj)
{
struct vnode *vp = (struct vnode *)uobj;
struct uvm_ractx *ra = NULL;
KASSERT(rw_write_held(uobj->vmobjlock));
if (vp->v_type != VREG) {
return;
}
if (vp->v_ractx != NULL) {
return;
}
if (vp->v_ractx == NULL) {
rw_exit(uobj->vmobjlock);
ra = uvm_ra_allocctx();
rw_enter(uobj->vmobjlock, RW_WRITER);
if (ra != NULL && vp->v_ractx == NULL) {
vp->v_ractx = ra;
ra = NULL;
}
}
if (ra != NULL) {
uvm_ra_freectx(ra);
}
}
/* $NetBSD: pci_usrreq.c,v 1.31 2021/09/05 03:47:24 mrg Exp $ */
/*
* Copyright 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Jason R. Thorpe for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* User -> kernel interface for PCI bus access.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pci_usrreq.c,v 1.31 2021/09/05 03:47:24 mrg Exp $");
#ifdef _KERNEL_OPT
#include "opt_pci.h"
#endif
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
#include <sys/kauth.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
#include <dev/pci/pciio.h>
static int
pciopen(dev_t dev, int flags, int mode, struct lwp *l)
{
device_t dv;
dv = device_lookup(&pci_cd, minor(dev));
if (dv == NULL)
return ENXIO;
return 0;
}
static int
pciioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
struct pci_softc *sc = device_lookup_private(&pci_cd, minor(dev));
struct pci_child *child;
struct pciio_bdf_cfgreg *bdfr;
struct pciio_businfo *binfo;
struct pciio_drvname *dname;
struct pciio_drvnameonbus *dnameonbus;
pcitag_t tag;
switch (cmd) {
case PCI_IOC_BDF_CFGREAD:
case PCI_IOC_BDF_CFGWRITE:
bdfr = data;
if (bdfr->bus > 255 || bdfr->device >= sc->sc_maxndevs || bdfr->function > 7 || ISSET(bdfr->cfgreg.reg, 3))
return EINVAL;
tag = pci_make_tag(sc->sc_pc, bdfr->bus, bdfr->device,
bdfr->function);
if (cmd == PCI_IOC_BDF_CFGREAD) {
bdfr->cfgreg.val = pci_conf_read(sc->sc_pc, tag,
bdfr->cfgreg.reg);
} else {
if ((flag & FWRITE) == 0)
return EBADF;
pci_conf_write(sc->sc_pc, tag, bdfr->cfgreg.reg,
bdfr->cfgreg.val);
}
return 0;
case PCI_IOC_BUSINFO:
binfo = data;
binfo->busno = sc->sc_bus;
binfo->maxdevs = sc->sc_maxndevs;
return 0;
case PCI_IOC_DRVNAME:
dname = data;
if (dname->device >= sc->sc_maxndevs || dname->function > 7)
return EINVAL;
child = &sc->PCI_SC_DEVICESC(dname->device, dname->function);
if (!child->c_dev)
return ENXIO;
strlcpy(dname->name, device_xname(child->c_dev),
sizeof dname->name);
return 0;
case PCI_IOC_DRVNAMEONBUS:
dnameonbus = data;
int i;
for (i = 0; i < pci_cd.cd_ndevs; i++) {
sc = device_lookup_private(&pci_cd, i);
if (sc == NULL)
continue;
if (sc->sc_bus == dnameonbus->bus)
break; /* found the right bus */
}
if (i == pci_cd.cd_ndevs || sc == NULL)
return ENXIO;
if (dnameonbus->device >= sc->sc_maxndevs ||
dnameonbus->function > 7)
return EINVAL;
child = &sc->PCI_SC_DEVICESC(dnameonbus->device,
dnameonbus->function);
if (!child->c_dev)
return ENXIO;
strlcpy(dnameonbus->name, device_xname(child->c_dev),
sizeof dnameonbus->name);
return 0;
default:
return ENOTTY;
}
}
static paddr_t
pcimmap(dev_t dev, off_t offset, int prot)
{
struct pci_softc *sc = device_lookup_private(&pci_cd, minor(dev));
struct pci_child *c;
struct pci_range *r;
int flags = 0;
int device, range;
if (kauth_authorize_machdep(kauth_cred_get(), KAUTH_MACHDEP_UNMANAGEDMEM,
NULL, NULL, NULL, NULL) != 0) {
return -1;
}
/*
* Since we allow mapping of the entire bus, we
* take the offset to be the address on the bus,
* and pass 0 as the offset into that range.
*
* XXX Need a way to deal with linear/etc.
*
* XXX we rely on MD mmap() methods to enforce limits since these
* are hidden in *_tag_t structs if they exist at all
*/
#ifdef PCI_MAGIC_IO_RANGE
/*
* first, check if someone's trying to map the IO range
* XXX this assumes 64kB IO space even though some machines can have
* significantly more than that - macppc's bandit host bridge allows
* 8MB IO space and sparc64 may have the entire 4GB available. The
* firmware on both tries to use the lower 64kB first though and
* exhausting it is pretty difficult so we should be safe
*/
if ((offset >= PCI_MAGIC_IO_RANGE) &&
(offset < (PCI_MAGIC_IO_RANGE + 0x10000))) {
return bus_space_mmap(sc->sc_iot, offset - PCI_MAGIC_IO_RANGE,
0, prot, 0);
}
#endif /* PCI_MAGIC_IO_RANGE */
for (device = 0; device < __arraycount(sc->sc_devices); device++) {
c = &sc->sc_devices[device];
if (c->c_dev == NULL)
continue;
for (range = 0; range < __arraycount(c->c_range); range++) {
r = &c->c_range[range];
if (r->r_size == 0)
break;
if (offset >= r->r_offset &&
offset < r->r_offset + r->r_size) {
flags = r->r_flags;
break;
}
}
}
return bus_space_mmap(sc->sc_memt, offset, 0, prot, flags);
}
const struct cdevsw pci_cdevsw = {
.d_open = pciopen,
.d_close = nullclose,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = pciioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = pcimmap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER
};
/*
* pci_devioctl:
*
* PCI ioctls that can be performed on devices directly.
*/
int
pci_devioctl(pci_chipset_tag_t pc, pcitag_t tag, u_long cmd, void *data,
int flag, struct lwp *l)
{
struct pciio_cfgreg *r = (void *) data;
switch (cmd) {
case PCI_IOC_CFGREAD:
r->val = pci_conf_read(pc, tag, r->reg);
break;
case PCI_IOC_CFGWRITE:
if ((flag & FWRITE) == 0)
return EBADF;
pci_conf_write(pc, tag, r->reg, r->val);
break;
default:
return EPASSTHROUGH;
}
return 0;
}
/* $NetBSD: dead_vfsops.c,v 1.13 2022/10/26 23:39:43 riastradh Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Juergen Hannken-Illjes.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: dead_vfsops.c,v 1.13 2022/10/26 23:39:43 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/specfs/specdev.h>
VFS_PROTOS(dead);
static void dead_panic(void);
static const struct vnodeopv_desc * const dead_vnodeopv_descs[] = {
&dead_vnodeop_opv_desc,
NULL
};
struct mount *dead_rootmount;
struct vfsops dead_vfsops = {
.vfs_name = "dead",
.vfs_min_mount_data = 0,
.vfs_mount = (void *)dead_panic,
.vfs_start = (void *)dead_panic,
.vfs_unmount = (void *)dead_panic,
.vfs_root = (void *)dead_panic,
.vfs_quotactl = (void *)dead_panic,
.vfs_statvfs = (void *)eopnotsupp,
.vfs_sync = (void *)dead_panic,
.vfs_vget = (void *)dead_panic,
.vfs_loadvnode = (void *)dead_panic,
.vfs_newvnode = dead_newvnode,
.vfs_fhtovp = (void *)dead_panic,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = (void *)dead_panic,
.vfs_reinit = (void *)dead_panic,
.vfs_done = (void *)dead_panic,
.vfs_mountroot = (void *)dead_panic,
.vfs_snapshot = (void *)dead_panic,
.vfs_extattrctl = (void *)dead_panic,
.vfs_suspendctl = (void *)dead_panic,
.vfs_renamelock_enter = (void *)dead_panic,
.vfs_renamelock_exit = (void *)dead_panic,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = dead_vnodeopv_descs
};
static void
dead_panic(void)
{
panic("dead fs operation used");
}
/*
* Create a new anonymous device vnode.
*/
int
dead_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
struct vattr *vap, kauth_cred_t cred, void *extra,
size_t *key_len, const void **new_key)
{
	KASSERT(mp == dead_rootmount);
	KASSERT(dvp == NULL);
	KASSERT(vap->va_type == VCHR || vap->va_type == VBLK);
	KASSERT(vap->va_rdev != VNOVAL);
vp->v_tag = VT_NON;
vp->v_type = vap->va_type;
vp->v_op = spec_vnodeop_p;
vp->v_vflag |= VV_MPSAFE;
uvm_vnp_setsize(vp, 0);
spec_node_init(vp, vap->va_rdev);
*key_len = 0;
*new_key = NULL;
return 0;
}
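/*
 * Illustrative sketch (hedged): roughly how a device vnode ends up here.
 * A caller fills in a struct vattr with the device type and dev_t and
 * asks the vnode cache for a new vnode on dead_rootmount, which invokes
 * dead_newvnode() above.  The vcache_new() signature is assumed to match
 * the vfs_newvnode hook; "dev" is a placeholder dev_t.
 */
#if 0
	struct vattr va;
	struct vnode *vp;
	int error;

	vattr_null(&va);
	va.va_type = VCHR;	/* or VBLK for a block device */
	va.va_rdev = dev;	/* the dev_t to wrap */
	error = vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, &vp);
#endif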
/* $NetBSD: uvm_physseg.c,v 1.20 2024/01/13 09:44:42 tnn Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.h 7.3 (Berkeley) 4/21/91
* from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* Consolidated API from uvm_page.c and others.
* Consolidated and designed by Cherry G. Mathew <cherry@zyx.in>
* rbtree(3) backing implementation by:
* Santhosh N. Raju <santhosh.raju@gmail.com>
*/
#ifdef _KERNEL_OPT
#include "opt_uvm.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/extent.h>
#include <sys/kmem.h>
#include <uvm/uvm.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_param.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_physseg.h>
/*
* uvm_physseg: describes one segment of physical memory
*/
struct uvm_physseg {
/* used during RB tree lookup for PHYS_TO_VM_PAGE(). */
#if defined(UVM_HOTPLUG)
struct rb_node rb_node; /* tree information */
#endif
paddr_t start; /* PF# of first page in segment */
paddr_t end; /* (PF# of last page in segment) + 1 */
struct vm_page *pgs; /* vm_page structures (from start) */
/* less performance sensitive fields. */
paddr_t avail_start; /* PF# of first free page in segment */
paddr_t avail_end; /* (PF# of last free page in segment) +1 */
struct extent *ext; /* extent(9) structure to manage pgs[] */
int free_list; /* which free list they belong on */
u_long start_hint; /* start looking for free pages here */
#ifdef __HAVE_PMAP_PHYSSEG
struct pmap_physseg pmseg; /* pmap specific (MD) data */
#endif
};
/*
* These functions are reserved for uvm(9) internal use and are not
* exported in the header file uvm_physseg.h
*
* Thus they are redefined here.
*/
void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
/* returns a pgs array */
struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
#if defined(UVM_HOTPLUG) /* rbtree implementation */
#define HANDLE_TO_PHYSSEG_NODE(h) ((struct uvm_physseg *)(h))
#define PHYSSEG_NODE_TO_HANDLE(u) ((uvm_physseg_t)(u))
struct uvm_physseg_graph {
struct rb_tree rb_tree; /* Tree for entries */
int nentries; /* Number of entries */
} __aligned(COHERENCY_UNIT);
static struct uvm_physseg_graph uvm_physseg_graph __read_mostly;
/*
* Note on kmem(9) allocator usage:
* We take the conservative approach that plug/unplug are allowed to
* fail in high memory stress situations.
*
* We want to avoid re-entrant situations in which one plug/unplug
* operation is waiting on a previous one to complete, since this
* makes the design more complicated than necessary.
*
* We may review this and change its behaviour, once the use cases
* become more obvious.
*/
/*
* Special alloc()/free() functions for boot time support:
* We assume that alloc() at boot time is only for new 'vm_physseg's
* This allows us to use a static array for memory allocation at boot
* time. Thus we avoid using kmem(9) which is not ready at this point
* in boot.
*
* After kmem(9) is ready, we use it. We currently discard any free()s
* to this static array, since the size is small enough to be a
* trivial waste on all architectures we run on.
*/
static size_t nseg = 0;
static struct uvm_physseg uvm_physseg[VM_PHYSSEG_MAX];
static void *
uvm_physseg_alloc(size_t sz)
{
/*
* During boot time, we only support allocating vm_physseg
* entries from the static array.
* We need to assert for this.
*/
if (__predict_false(uvm.page_init_done == false)) {
if (sz % sizeof(struct uvm_physseg))
panic("%s: tried to alloc size other than multiple"
" of struct uvm_physseg at boot\n", __func__);
size_t n = sz / sizeof(struct uvm_physseg);
nseg += n;
KASSERT(nseg > 0);
KASSERT(nseg <= VM_PHYSSEG_MAX);
return &uvm_physseg[nseg - n];
}
return kmem_zalloc(sz, KM_NOSLEEP);
}
static void
uvm_physseg_free(void *p, size_t sz)
{
	/*
	 * This is a bit tricky.  We do allow simulation of free()
	 * during boot (for example when MD code is "steal"ing memory,
	 * and the segment has been exhausted and thus needs to be
	 * free()-ed).
	 * free() also complicates things because we leak the
	 * free()-ed entry; therefore calling code can't assume that
	 * free()-ed memory is available for alloc() again at boot time.
	 *
	 * Thus we can't explicitly disallow free()s during
	 * boot time.  However, the same restriction as for alloc()
	 * applies to free(): we only allow uvm_physseg related free()s
	 * via this function during boot time.
	 */
if (__predict_false(uvm.page_init_done == false)) {
if (sz % sizeof(struct uvm_physseg))
			panic("%s: tried to free size other than multiple"
			    " of struct uvm_physseg at boot\n", __func__);
}
/*
* Could have been in a single if(){} block - split for
* clarity
*/
if ((struct uvm_physseg *)p >= uvm_physseg &&
(struct uvm_physseg *)p < (uvm_physseg + VM_PHYSSEG_MAX)) {
if (sz % sizeof(struct uvm_physseg))
panic("%s: tried to free() other than struct uvm_physseg"
" from static array\n", __func__);
if ((sz / sizeof(struct uvm_physseg)) >= VM_PHYSSEG_MAX)
panic("%s: tried to free() the entire static array!", __func__);
return; /* Nothing to free */
}
kmem_free(p, sz);
}
/* XXX: Multi page size */
bool
uvm_physseg_plug(paddr_t pfn, size_t pages, uvm_physseg_t *psp)
{
int preload;
size_t slabpages;
struct uvm_physseg *ps, *current_ps = NULL;
struct vm_page *slab = NULL, *pgs = NULL;
#ifdef DEBUG
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pfn, &off);
ps = HANDLE_TO_PHYSSEG_NODE(upm);
if (ps != NULL) /* XXX; do we allow "update" plugs ? */
return false;
#endif
/*
* do we have room?
*/
ps = uvm_physseg_alloc(sizeof (struct uvm_physseg));
if (ps == NULL) {
printf("uvm_page_physload: unable to load physical memory "
"segment\n");
printf("\t%d segments allocated, ignoring 0x%"PRIxPADDR" -> 0x%"PRIxPADDR"\n",
VM_PHYSSEG_MAX, pfn, pfn + pages + 1);
printf("\tincrease VM_PHYSSEG_MAX\n");
return false;
}
/* span init */
ps->start = pfn;
ps->end = pfn + pages;
/*
* XXX: Ugly hack because uvmexp.npages accounts for only
* those pages in the segment included below as well - this
* should be legacy and removed.
*/
ps->avail_start = ps->start;
ps->avail_end = ps->end;
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
preload = 1; /* We are going to assume it is a preload */
RB_TREE_FOREACH(current_ps, &(uvm_physseg_graph.rb_tree)) {
/* If there are non NULL pages then we are not in a preload */
if (current_ps->pgs != NULL) {
preload = 0;
/* Try to scavenge from earlier unplug()s. */
pgs = uvm_physseg_seg_alloc_from_slab(current_ps, pages);
if (pgs != NULL) {
break;
}
}
}
/*
* if VM is already running, attempt to kmem_alloc vm_page structures
*/
if (!preload) {
if (pgs == NULL) { /* Brand new */
/* Iteratively try alloc down from uvmexp.npages */
for (slabpages = (size_t) uvmexp.npages; slabpages >= pages; slabpages--) {
slab = kmem_zalloc(sizeof *pgs * (long unsigned int)slabpages, KM_NOSLEEP);
if (slab != NULL)
break;
}
if (slab == NULL) {
uvm_physseg_free(ps, sizeof(struct uvm_physseg));
return false;
}
uvm_physseg_seg_chomp_slab(ps, slab, (size_t) slabpages);
/* We allocate enough for this plug */
pgs = uvm_physseg_seg_alloc_from_slab(ps, pages);
if (pgs == NULL) {
printf("unable to uvm_physseg_seg_alloc_from_slab() from backend\n");
return false;
}
} else {
/* Reuse scavenged extent */
ps->ext = current_ps->ext;
}
physmem += pages;
uvmpdpol_reinit();
} else { /* Boot time - see uvm_page.c:uvm_page_init() */
pgs = NULL;
ps->pgs = pgs;
}
/*
* now insert us in the proper place in uvm_physseg_graph.rb_tree
*/
current_ps = rb_tree_insert_node(&(uvm_physseg_graph.rb_tree), ps);
if (current_ps != ps) {
panic("uvm_page_physload: Duplicate address range detected!");
}
uvm_physseg_graph.nentries++;
/*
* uvm_pagefree() requires the PHYS_TO_VM_PAGE(pgs[i]) on the
* newly allocated pgs[] to return the correct value. This is
* a bit of a chicken and egg problem, since it needs
* uvm_physseg_find() to succeed. For this, the node needs to
* be inserted *before* uvm_physseg_init_seg() happens.
*
* During boot, this happens anyway, since
* uvm_physseg_init_seg() is called later on and separately
* from uvm_page.c:uvm_page_init().
* In the case of hotplug we need to ensure this.
*/
if (__predict_true(!preload))
uvm_physseg_init_seg(ps, pgs);
if (psp != NULL)
*psp = ps;
return true;
}
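/*
 * Illustrative sketch (hedged): how a hypothetical memory hotplug driver
 * might hand a newly discovered range of RAM to uvm via the routine
 * above.  "paddr" and "size" are byte quantities invented for the
 * example; uvm_physseg_plug() takes page frame numbers, hence atop().
 */
#if 0
	uvm_physseg_t upm;

	if (uvm_physseg_plug(atop(paddr), atop(size), &upm) == false)
		return ENOMEM;	/* kmem(9) or VM_PHYSSEG_MAX exhausted */
#endif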
static int
uvm_physseg_compare_nodes(void *ctx, const void *nnode1, const void *nnode2)
{
const struct uvm_physseg *enode1 = nnode1;
const struct uvm_physseg *enode2 = nnode2;
KASSERT(enode1->start < enode2->start || enode1->start >= enode2->end);
KASSERT(enode2->start < enode1->start || enode2->start >= enode1->end);
if (enode1->start < enode2->start)
return -1;
if (enode1->start >= enode2->end)
return 1;
return 0;
}
static int
uvm_physseg_compare_key(void *ctx, const void *nnode, const void *pkey)
{
const struct uvm_physseg *enode = nnode;
const paddr_t pa = *(const paddr_t *) pkey;
	if (enode->start <= pa && pa < enode->end)
return 0;
if (enode->start < pa)
return -1;
if (enode->end > pa)
return 1;
return 0;
}
static const rb_tree_ops_t uvm_physseg_tree_ops = {
.rbto_compare_nodes = uvm_physseg_compare_nodes,
.rbto_compare_key = uvm_physseg_compare_key,
.rbto_node_offset = offsetof(struct uvm_physseg, rb_node),
.rbto_context = NULL
};
/*
* uvm_physseg_init: init the physmem
*
* => physmem unit should not be in use at this point
*/
void
uvm_physseg_init(void)
{
rb_tree_init(&(uvm_physseg_graph.rb_tree), &uvm_physseg_tree_ops);
uvm_physseg_graph.nentries = 0;
}
uvm_physseg_t
uvm_physseg_get_next(uvm_physseg_t upm)
{
/* next of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(upm) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (uvm_physseg_t) rb_tree_iterate(&(uvm_physseg_graph.rb_tree), upm,
RB_DIR_RIGHT);
}
uvm_physseg_t
uvm_physseg_get_prev(uvm_physseg_t upm)
{
/* prev of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(upm) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (uvm_physseg_t) rb_tree_iterate(&(uvm_physseg_graph.rb_tree), upm,
RB_DIR_LEFT);
}
uvm_physseg_t
uvm_physseg_get_last(void)
{
return (uvm_physseg_t) RB_TREE_MAX(&(uvm_physseg_graph.rb_tree));
}
uvm_physseg_t
uvm_physseg_get_first(void)
{
return (uvm_physseg_t) RB_TREE_MIN(&(uvm_physseg_graph.rb_tree));
}
paddr_t
uvm_physseg_get_highest_frame(void)
{
struct uvm_physseg *ps =
(uvm_physseg_t) RB_TREE_MAX(&(uvm_physseg_graph.rb_tree));
return ps->end - 1;
}
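/*
 * Illustrative sketch: the usual way callers walk all segments with the
 * accessors above - start at the first segment and follow get_next()
 * until the handle is no longer valid.
 */
#if 0
	uvm_physseg_t bank;

	for (bank = uvm_physseg_get_first();
	    uvm_physseg_valid_p(bank);
	    bank = uvm_physseg_get_next(bank)) {
		/* e.g. inspect uvm_physseg_get_start(bank) ... */
	}
#endif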
/*
* uvm_page_physunload: unload physical memory and return it to
* caller.
*/
bool
uvm_page_physunload(uvm_physseg_t upm, int freelist, paddr_t *paddrp)
{
struct uvm_physseg *seg;
if (__predict_true(uvm.page_init_done == true))
panic("%s: unload attempted after uvm_page_init()\n", __func__);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (seg->free_list != freelist) {
return false;
}
/*
* During cold boot, what we're about to unplug hasn't been
* put on the uvm freelist, nor has uvmexp.npages been
* updated. (This happens in uvm_page.c:uvm_page_init())
*
* For hotplug, we assume here that the pages being unloaded
* here are completely out of sight of uvm (ie; not on any uvm
* lists), and that uvmexp.npages has been suitably
* decremented before we're called.
*
* XXX: will avail_end == start if avail_start < avail_end?
*/
/* try from front */
if (seg->avail_start == seg->start &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_start);
return uvm_physseg_unplug(seg->avail_start, 1);
}
/* try from rear */
if (seg->avail_end == seg->end &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_end - 1);
return uvm_physseg_unplug(seg->avail_end - 1, 1);
}
return false;
}
bool
uvm_page_physunload_force(uvm_physseg_t upm, int freelist, paddr_t *paddrp)
{
struct uvm_physseg *seg;
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (__predict_true(uvm.page_init_done == true))
panic("%s: unload attempted after uvm_page_init()\n", __func__);
/* any room in this bank? */
if (seg->avail_start >= seg->avail_end) {
return false; /* nope */
}
*paddrp = ctob(seg->avail_start);
/* Always unplug from front */
return uvm_physseg_unplug(seg->avail_start, 1);
}
/*
* vm_physseg_find: find vm_physseg structure that belongs to a PA
*/
uvm_physseg_t
uvm_physseg_find(paddr_t pframe, psize_t *offp)
{
struct uvm_physseg * ps = NULL;
ps = rb_tree_find_node(&(uvm_physseg_graph.rb_tree), &pframe);
	if (ps != NULL && offp != NULL)
*offp = pframe - ps->start;
return ps;
}
#else /* UVM_HOTPLUG */
/*
* physical memory config is stored in vm_physmem.
*/
#define VM_PHYSMEM_PTR(i) (&vm_physmem[i])
#if VM_PHYSSEG_MAX == 1
#define VM_PHYSMEM_PTR_SWAP(i, j) /* impossible */
#else
#define VM_PHYSMEM_PTR_SWAP(i, j) \
do { vm_physmem[(i)] = vm_physmem[(j)]; } while (0)
#endif
#define HANDLE_TO_PHYSSEG_NODE(h) (VM_PHYSMEM_PTR((int)h))
#define PHYSSEG_NODE_TO_HANDLE(u) ((int)((vsize_t) (u - vm_physmem) / sizeof(struct uvm_physseg)))
/* XXXCDC: uvm.physmem */
static struct uvm_physseg vm_physmem[VM_PHYSSEG_MAX] __read_mostly;
/* XXXCDC: uvm.nphysseg */
static int vm_nphysseg __read_mostly = 0;
#define vm_nphysmem vm_nphysseg
void
uvm_physseg_init(void)
{
/* XXX: Provisioning for rb_tree related init(s) */
return;
}
int
uvm_physseg_get_next(uvm_physseg_t lcv)
{
/* next of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(lcv) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (lcv + 1);
}
int
uvm_physseg_get_prev(uvm_physseg_t lcv)
{
/* prev of invalid is invalid, not fatal */
if (uvm_physseg_valid_p(lcv) == false)
return UVM_PHYSSEG_TYPE_INVALID;
return (lcv - 1);
}
int
uvm_physseg_get_last(void)
{
return (vm_nphysseg - 1);
}
int
uvm_physseg_get_first(void)
{
return 0;
}
paddr_t
uvm_physseg_get_highest_frame(void)
{
int lcv;
paddr_t last = 0;
struct uvm_physseg *ps;
for (lcv = 0; lcv < vm_nphysseg; lcv++) {
ps = VM_PHYSMEM_PTR(lcv);
if (last < ps->end)
last = ps->end;
}
return last;
}
static struct vm_page *
uvm_post_preload_check(void)
{
int preload, lcv;
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++) {
if (VM_PHYSMEM_PTR(lcv)->pgs)
break;
}
preload = (lcv == vm_nphysmem);
/*
* if VM is already running, attempt to kmem_alloc vm_page structures
*/
if (!preload) {
panic("Tried to add RAM after uvm_page_init");
}
return NULL;
}
/*
* uvm_page_physunload: unload physical memory and return it to
* caller.
*/
bool
uvm_page_physunload(uvm_physseg_t psi, int freelist, paddr_t *paddrp)
{
int x;
struct uvm_physseg *seg;
uvm_post_preload_check();
seg = VM_PHYSMEM_PTR(psi);
if (seg->free_list != freelist) {
return false;
}
/* try from front */
if (seg->avail_start == seg->start &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_start);
seg->avail_start++;
seg->start++;
/* nothing left? nuke it */
if (seg->avail_start == seg->end) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
/* try from rear */
if (seg->avail_end == seg->end &&
seg->avail_start < seg->avail_end) {
*paddrp = ctob(seg->avail_end - 1);
seg->avail_end--;
seg->end--;
/* nothing left? nuke it */
if (seg->avail_end == seg->start) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
return false;
}
bool
uvm_page_physunload_force(uvm_physseg_t psi, int freelist, paddr_t *paddrp)
{
int x;
struct uvm_physseg *seg;
uvm_post_preload_check();
seg = VM_PHYSMEM_PTR(psi);
/* any room in this bank? */
if (seg->avail_start >= seg->avail_end) {
return false; /* nope */
}
*paddrp = ctob(seg->avail_start);
seg->avail_start++;
/* truncate! */
seg->start = seg->avail_start;
/* nothing left? nuke it */
if (seg->avail_start == seg->end) {
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = psi ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
}
return (true);
}
bool
uvm_physseg_plug(paddr_t pfn, size_t pages, uvm_physseg_t *psp)
{
int lcv;
struct vm_page *pgs;
struct uvm_physseg *ps;
#ifdef DEBUG
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pfn, &off);
if (uvm_physseg_valid_p(upm)) /* XXX; do we allow "update" plugs ? */
return false;
#endif
paddr_t start = pfn;
paddr_t end = pfn + pages;
paddr_t avail_start = start;
paddr_t avail_end = end;
if (uvmexp.pagesize == 0)
panic("uvm_page_physload: page size not set!");
/*
* do we have room?
*/
if (vm_nphysmem == VM_PHYSSEG_MAX) {
printf("uvm_page_physload: unable to load physical memory "
"segment\n");
printf("\t%d segments allocated, ignoring 0x%llx -> 0x%llx\n",
VM_PHYSSEG_MAX, (long long)start, (long long)end);
printf("\tincrease VM_PHYSSEG_MAX\n");
if (psp != NULL)
*psp = UVM_PHYSSEG_TYPE_INVALID_OVERFLOW;
return false;
}
/*
* check to see if this is a "preload" (i.e. uvm_page_init hasn't been
* called yet, so kmem is not available).
*/
pgs = uvm_post_preload_check();
/*
* now insert us in the proper place in vm_physmem[]
*/
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_RANDOM)
/* random: put it at the end (easy!) */
ps = VM_PHYSMEM_PTR(vm_nphysmem);
lcv = vm_nphysmem;
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
{
int x;
/* sort by address for binary search */
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++)
if (start < VM_PHYSMEM_PTR(lcv)->start)
break;
ps = VM_PHYSMEM_PTR(lcv);
/* move back other entries, if necessary ... */
for (x = vm_nphysmem ; x > lcv ; x--)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x - 1);
}
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
{
int x;
/* sort by largest segment first */
for (lcv = 0 ; lcv < vm_nphysmem ; lcv++)
if ((end - start) >
(VM_PHYSMEM_PTR(lcv)->end - VM_PHYSMEM_PTR(lcv)->start))
break;
ps = VM_PHYSMEM_PTR(lcv);
/* move back other entries, if necessary ... */
for (x = vm_nphysmem ; x > lcv ; x--)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x - 1);
}
#else
panic("uvm_page_physload: unknown physseg strategy selected!");
#endif
ps->start = start;
ps->end = end;
ps->avail_start = avail_start;
ps->avail_end = avail_end;
ps->pgs = pgs;
vm_nphysmem++;
if (psp != NULL)
*psp = lcv;
return true;
}
/*
* when VM_PHYSSEG_MAX is 1, we can simplify these functions
*/
#if VM_PHYSSEG_MAX == 1
static inline int vm_physseg_find_contig(struct uvm_physseg *, int, paddr_t, psize_t *);
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
static inline int vm_physseg_find_bsearch(struct uvm_physseg *, int, paddr_t, psize_t *);
#else
static inline int vm_physseg_find_linear(struct uvm_physseg *, int, paddr_t, psize_t *);
#endif
/*
* vm_physseg_find: find vm_physseg structure that belongs to a PA
*/
inline int
uvm_physseg_find(paddr_t pframe, psize_t *offp)
{
#if VM_PHYSSEG_MAX == 1
return vm_physseg_find_contig(vm_physmem, vm_nphysseg, pframe, offp);
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
return vm_physseg_find_bsearch(vm_physmem, vm_nphysseg, pframe, offp);
#else
return vm_physseg_find_linear(vm_physmem, vm_nphysseg, pframe, offp);
#endif
}
#if VM_PHYSSEG_MAX == 1
static inline int
vm_physseg_find_contig(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* 'contig' case */
if (pframe >= segs[0].start && pframe < segs[0].end) {
if (offp)
*offp = pframe - segs[0].start;
return(0);
}
return(-1);
}
#elif (VM_PHYSSEG_STRAT == VM_PSTRAT_BSEARCH)
static inline int
vm_physseg_find_bsearch(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* binary search for it */
int start, len, guess;
/*
* if try is too large (thus target is less than try) we reduce
* the length to trunc(len/2) [i.e. everything smaller than "try"]
*
* if the try is too small (thus target is greater than try) then
* we set the new start to be (try + 1). this means we need to
* reduce the length to (round(len/2) - 1).
*
* note "adjust" below which takes advantage of the fact that
* (round(len/2) - 1) == trunc((len - 1) / 2)
* for any value of len we may have
*/
for (start = 0, len = nsegs ; len != 0 ; len = len / 2) {
guess = start + (len / 2); /* try in the middle */
/* start past our try? */
if (pframe >= segs[guess].start) {
/* was try correct? */
if (pframe < segs[guess].end) {
if (offp)
*offp = pframe - segs[guess].start;
return guess; /* got it */
}
start = guess + 1; /* next time, start here */
len--; /* "adjust" */
} else {
/*
* pframe before try, just reduce length of
* region, done in "for" loop
*/
}
}
return(-1);
}
#else
static inline int
vm_physseg_find_linear(struct uvm_physseg *segs, int nsegs, paddr_t pframe, psize_t *offp)
{
/* linear search for it */
int lcv;
	for (lcv = 0; lcv < nsegs; lcv++) {
		if (pframe >= segs[lcv].start &&
		    pframe < segs[lcv].end) {
			if (offp)
				*offp = pframe - segs[lcv].start;
			return(lcv);	/* got it */
		}
}
return(-1);
}
#endif
#endif /* UVM_HOTPLUG */
/*
* PHYS_TO_VM_PAGE: find vm_page for a PA. used by MI code to get vm_pages
* back from an I/O mapping (ugh!). used in some MD code as well. it can
* be prominent in flamegraphs, so optimise it and try to make it easy for
* the compiler by including next to the inline lookup routines.
*/
struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{
#if VM_PHYSSEG_STRAT != VM_PSTRAT_BSEARCH
/* 'contig' and linear cases */
KASSERT(vm_nphysseg > 0);
struct uvm_physseg *ps = &vm_physmem[0];
struct uvm_physseg *end = &vm_physmem[vm_nphysseg];
paddr_t pframe = atop(pa);
do {
		if (pframe >= ps->start && pframe < ps->end) {
			return &ps->pgs[pframe - ps->start];
		}
} while (VM_PHYSSEG_MAX > 1 && __predict_false(++ps < end));
return NULL;
#else
/* binary search for it */
paddr_t pf = atop(pa);
paddr_t off;
uvm_physseg_t upm;
upm = uvm_physseg_find(pf, &off);
if (upm != UVM_PHYSSEG_TYPE_INVALID)
return uvm_physseg_get_pg(upm, off);
return(NULL);
#endif
}
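/*
 * Illustrative sketch: typical use of the lookup above (usually via the
 * PHYS_TO_VM_PAGE() macro) - convert a physical address, e.g. one taken
 * from a PTE, back into its vm_page, treating NULL as unmanaged memory.
 * "pa" is a placeholder physical address.
 */
#if 0
	struct vm_page *pg;

	pg = uvm_phys_to_vm_page(pa);
	if (pg == NULL)
		return;		/* unmanaged (e.g. device) memory */
#endif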
bool
uvm_physseg_valid_p(uvm_physseg_t upm)
{
struct uvm_physseg *ps;
if (upm == UVM_PHYSSEG_TYPE_INVALID ||
upm == UVM_PHYSSEG_TYPE_INVALID_EMPTY ||
upm == UVM_PHYSSEG_TYPE_INVALID_OVERFLOW)
return false;
	/*
	 * This is the delicate init dance: during boot, before
	 * uvm_page_init() has completed, segments may be only partially
	 * initialised (e.g. pgs == NULL) and must still be reported as
	 * valid.  The extra checks below apply only once init is done.
	 */
if (uvm.page_init_done != true)
return true;
ps = HANDLE_TO_PHYSSEG_NODE(upm);
/* Extra checks needed only post uvm_page_init() */
if (ps->pgs == NULL)
return false;
/* XXX: etc. */
return true;
}
/*
* Boot protocol dictates that these must be able to return partially
* initialised segments.
*/
paddr_t
uvm_physseg_get_start(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->start;
}
paddr_t
uvm_physseg_get_end(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->end;
}
paddr_t
uvm_physseg_get_avail_start(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->avail_start;
}
#if defined(UVM_PHYSSEG_LEGACY)
void
uvm_physseg_set_avail_start(uvm_physseg_t upm, paddr_t avail_start)
{
struct uvm_physseg *ps = HANDLE_TO_PHYSSEG_NODE(upm);
#if defined(DIAGNOSTIC)
paddr_t avail_end;
avail_end = uvm_physseg_get_avail_end(upm);
KASSERT(uvm_physseg_valid_p(upm));
KASSERT(avail_start < avail_end);
KASSERT(avail_start >= ps->start);
#endif
ps->avail_start = avail_start;
}
void
uvm_physseg_set_avail_end(uvm_physseg_t upm, paddr_t avail_end)
{
struct uvm_physseg *ps = HANDLE_TO_PHYSSEG_NODE(upm);
#if defined(DIAGNOSTIC)
paddr_t avail_start;
avail_start = uvm_physseg_get_avail_start(upm);
KASSERT(uvm_physseg_valid_p(upm));
KASSERT(avail_end > avail_start);
KASSERT(avail_end <= ps->end);
#endif
ps->avail_end = avail_end;
}
#endif /* UVM_PHYSSEG_LEGACY */
paddr_t
uvm_physseg_get_avail_end(uvm_physseg_t upm)
{
if (uvm_physseg_valid_p(upm) == false)
return (paddr_t) -1;
return HANDLE_TO_PHYSSEG_NODE(upm)->avail_end;
}
inline struct vm_page *
uvm_physseg_get_pg(uvm_physseg_t upm, paddr_t idx)
{
KASSERT(uvm_physseg_valid_p(upm));
return &HANDLE_TO_PHYSSEG_NODE(upm)->pgs[idx];
}
#ifdef __HAVE_PMAP_PHYSSEG
struct pmap_physseg *
uvm_physseg_get_pmseg(uvm_physseg_t upm)
{
KASSERT(uvm_physseg_valid_p(upm));
return &(HANDLE_TO_PHYSSEG_NODE(upm)->pmseg);
}
#endif
int
uvm_physseg_get_free_list(uvm_physseg_t upm)
{
	KASSERT(uvm_physseg_valid_p(upm));
return HANDLE_TO_PHYSSEG_NODE(upm)->free_list;
}
u_long
uvm_physseg_get_start_hint(uvm_physseg_t upm)
{
KASSERT(uvm_physseg_valid_p(upm));
return HANDLE_TO_PHYSSEG_NODE(upm)->start_hint;
}
bool
uvm_physseg_set_start_hint(uvm_physseg_t upm, u_long start_hint)
{
if (uvm_physseg_valid_p(upm) == false)
return false;
HANDLE_TO_PHYSSEG_NODE(upm)->start_hint = start_hint;
return true;
}
void
uvm_physseg_init_seg(uvm_physseg_t upm, struct vm_page *pgs)
{
psize_t i;
psize_t n;
paddr_t paddr;
struct uvm_physseg *seg;
struct vm_page *pg;
KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
KASSERT(pgs != NULL);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
KASSERT(seg != NULL);
KASSERT(seg->pgs == NULL);
n = seg->end - seg->start;
seg->pgs = pgs;
/* init and free vm_pages (we've already zeroed them) */
paddr = ctob(seg->start);
for (i = 0 ; i < n ; i++, paddr += PAGE_SIZE) {
pg = &seg->pgs[i];
pg->phys_addr = paddr;
#ifdef __HAVE_VM_PAGE_MD
VM_MDPAGE_INIT(pg);
#endif
if (atop(paddr) >= seg->avail_start &&
atop(paddr) < seg->avail_end) {
uvmexp.npages++;
/* add page to free pool */
uvm_page_set_freelist(pg,
uvm_page_lookup_freelist(pg));
/* Disable LOCKDEBUG: too many and too early. */
mutex_init(&pg->interlock, MUTEX_NODEBUG, IPL_NONE);
uvm_pagefree(pg);
}
}
}
void
uvm_physseg_seg_chomp_slab(uvm_physseg_t upm, struct vm_page *pgs, size_t n)
{
struct uvm_physseg *seg = HANDLE_TO_PHYSSEG_NODE(upm);
/* max number of pre-boot unplug()s allowed */
#define UVM_PHYSSEG_BOOT_UNPLUG_MAX VM_PHYSSEG_MAX
static char btslab_ex_storage[EXTENT_FIXED_STORAGE_SIZE(UVM_PHYSSEG_BOOT_UNPLUG_MAX)];
if (__predict_false(uvm.page_init_done == false)) {
seg->ext = extent_create("Boot time slab", (u_long) pgs, (u_long) (pgs + n),
(void *)btslab_ex_storage, sizeof(btslab_ex_storage), 0);
} else {
seg->ext = extent_create("Hotplug slab", (u_long) pgs, (u_long) (pgs + n), NULL, 0, 0);
}
KASSERT(seg->ext != NULL);
}
struct vm_page *
uvm_physseg_seg_alloc_from_slab(uvm_physseg_t upm, size_t pages)
{
int err;
struct uvm_physseg *seg;
struct vm_page *pgs = NULL;
KASSERT(pages > 0);
seg = HANDLE_TO_PHYSSEG_NODE(upm);
if (__predict_false(seg->ext == NULL)) {
		/*
		 * This is a situation unique to boot time.
		 * It shouldn't happen at any point other than from
		 * the first uvm_page.c:uvm_page_init() call.
		 * Since that call iterates over the segments, the
		 * previous segment's extent has already been set up
		 * and can be borrowed below.
		 */
KASSERT(uvm.page_init_done != true);
uvm_physseg_t upmp = uvm_physseg_get_prev(upm);
KASSERT(upmp != UVM_PHYSSEG_TYPE_INVALID);
seg->ext = HANDLE_TO_PHYSSEG_NODE(upmp)->ext;
KASSERT(seg->ext != NULL);
}
/* We allocate enough for this segment */
err = extent_alloc(seg->ext, sizeof(*pgs) * pages, 1, 0, EX_BOUNDZERO, (u_long *)&pgs);
if (err != 0) {
#ifdef DEBUG
printf("%s: extent_alloc failed with error: %d \n",
__func__, err);
#endif
}
return pgs;
}
/*
* uvm_page_physload: load physical memory into VM system
*
* => all args are PFs
* => all pages in start/end get vm_page structures
* => areas marked by avail_start/avail_end get added to the free page pool
* => we are limited to VM_PHYSSEG_MAX physical memory segments
*/
uvm_physseg_t
uvm_page_physload(paddr_t start, paddr_t end, paddr_t avail_start,
paddr_t avail_end, int free_list)
{
struct uvm_physseg *ps;
uvm_physseg_t upm;
if (__predict_true(uvm.page_init_done == true))
		panic("%s: load attempted after uvm_page_init()\n", __func__);
if (uvmexp.pagesize == 0)
panic("uvm_page_physload: page size not set!");
if (free_list >= VM_NFREELIST || free_list < VM_FREELIST_DEFAULT)
panic("uvm_page_physload: bad free list %d", free_list);
if (start >= end)
panic("uvm_page_physload: start[%" PRIxPADDR "] >= end[%"
PRIxPADDR "]", start, end);
if (uvm_physseg_plug(start, end - start, &upm) == false) {
panic("uvm_physseg_plug() failed at boot.");
/* NOTREACHED */
return UVM_PHYSSEG_TYPE_INVALID; /* XXX: correct type */
}
ps = HANDLE_TO_PHYSSEG_NODE(upm);
/* Legacy */
ps->avail_start = avail_start;
ps->avail_end = avail_end;
ps->free_list = free_list; /* XXX: */
return upm;
}
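/*
 * Illustrative sketch (hedged): how MD bootstrap code typically loads a
 * memory bank found in the firmware memory map.  The *_start/*_end names
 * are byte addresses invented for the example; uvm_page_physload() takes
 * page frame numbers, hence atop().
 */
#if 0
	uvm_page_physload(atop(seg_start), atop(seg_end),
	    atop(avail_start), atop(avail_end), VM_FREELIST_DEFAULT);
#endif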
bool
uvm_physseg_unplug(paddr_t pfn, size_t pages)
{
uvm_physseg_t upm;
paddr_t off = 0, start __diagused, end;
struct uvm_physseg *seg;
upm = uvm_physseg_find(pfn, &off);
if (!uvm_physseg_valid_p(upm)) {
printf("%s: Tried to unplug from unknown offset\n", __func__);
return false;
}
seg = HANDLE_TO_PHYSSEG_NODE(upm);
start = uvm_physseg_get_start(upm);
end = uvm_physseg_get_end(upm);
if (end < (pfn + pages)) {
printf("%s: Tried to unplug oversized span \n", __func__);
return false;
}
KASSERT(pfn == start + off); /* sanity */
if (__predict_true(uvm.page_init_done == true)) {
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
if (extent_free(seg->ext, (u_long)(seg->pgs + off), sizeof(struct vm_page) * pages, EX_MALLOCOK | EX_NOWAIT) != 0)
return false;
}
if (off == 0 && (pfn + pages) == end) {
#if defined(UVM_HOTPLUG) /* rbtree implementation */
int segcount = 0;
struct uvm_physseg *current_ps;
/* Complete segment */
if (uvm_physseg_graph.nentries == 1)
panic("%s: out of memory!", __func__);
if (__predict_true(uvm.page_init_done == true)) {
RB_TREE_FOREACH(current_ps, &(uvm_physseg_graph.rb_tree)) {
if (seg->ext == current_ps->ext)
segcount++;
}
KASSERT(segcount > 0);
if (segcount == 1) {
extent_destroy(seg->ext);
}
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
rb_tree_remove_node(&(uvm_physseg_graph.rb_tree), upm);
memset(seg, 0, sizeof(struct uvm_physseg));
uvm_physseg_free(seg, sizeof(struct uvm_physseg));
uvm_physseg_graph.nentries--;
#else /* UVM_HOTPLUG */
int x;
if (vm_nphysmem == 1)
panic("uvm_page_physget: out of memory!");
vm_nphysmem--;
for (x = upm ; x < vm_nphysmem ; x++)
/* structure copy */
VM_PHYSMEM_PTR_SWAP(x, x + 1);
#endif /* UVM_HOTPLUG */
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
return true;
}
if (off > 0 &&
(pfn + pages) < end) {
#if defined(UVM_HOTPLUG) /* rbtree implementation */
/* middle chunk - need a new segment */
struct uvm_physseg *ps, *current_ps;
ps = uvm_physseg_alloc(sizeof (struct uvm_physseg));
if (ps == NULL) {
			printf("%s: Unable to allocate new fragment vm_physseg\n",
__func__);
return false;
}
/* Remove middle chunk */
if (__predict_true(uvm.page_init_done == true)) {
KASSERT(seg->ext != NULL);
ps->ext = seg->ext;
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
ps->start = pfn + pages;
ps->avail_start = ps->start; /* XXX: Legacy */
ps->end = seg->end;
ps->avail_end = ps->end; /* XXX: Legacy */
seg->end = pfn;
seg->avail_end = seg->end; /* XXX: Legacy */
/*
* The new pgs array points to the beginning of the
* tail fragment.
*/
if (__predict_true(uvm.page_init_done == true))
ps->pgs = seg->pgs + off + pages;
current_ps = rb_tree_insert_node(&(uvm_physseg_graph.rb_tree), ps);
if (current_ps != ps) {
panic("uvm_page_physload: Duplicate address range detected!");
}
uvm_physseg_graph.nentries++;
#else /* UVM_HOTPLUG */
panic("%s: can't unplug() from the middle of a segment without"
" UVM_HOTPLUG\n", __func__);
/* NOTREACHED */
#endif /* UVM_HOTPLUG */
return true;
}
if (off == 0 && (pfn + pages) < end) {
/* Remove front chunk */
if (__predict_true(uvm.page_init_done == true)) {
/* XXX: KASSERT() that seg->pgs[] are not on any uvm lists */
/*
* We assume that the unplug will succeed from
* this point onwards
*/
uvmexp.npages -= (int) pages;
}
/* Truncate */
seg->start = pfn + pages;
seg->avail_start = seg->start; /* XXX: Legacy */
/*
* Move the pgs array start to the beginning of the
* tail end.
*/
if (__predict_true(uvm.page_init_done == true))
seg->pgs += pages;
return true;
}
if (off > 0 && (pfn + pages) == end) {
/* back chunk */
/* Truncate! */
seg->end = pfn;
seg->avail_end = seg->end; /* XXX: Legacy */
uvmexp.npages -= (int) pages;
return true;
}
printf("%s: Tried to unplug unknown range \n", __func__);
return false;
}
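/*
 * Illustrative sketch (hedged): the counterpart of the plug example
 * earlier - a hypothetical hotplug driver removing the same range again,
 * expressed in page frame numbers.
 */
#if 0
	if (uvm_physseg_unplug(atop(paddr), atop(size)) == false)
		return EBUSY;	/* unknown range, or pages still in use */
#endif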
/* $NetBSD: uvm_page.h,v 1.109 2020/12/20 16:38:26 skrll Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.h 7.3 (Berkeley) 4/21/91
* from: Id: uvm_page.h,v 1.1.2.6 1998/02/04 02:31:42 chuck Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#ifndef _UVM_UVM_PAGE_H_
#define _UVM_UVM_PAGE_H_
#ifdef _KERNEL_OPT
#include "opt_uvm_page_trkown.h"
#endif
#include <sys/rwlock.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_pglist.h>
/*
* Management of resident (logical) pages.
*
* Each resident page has a vm_page structure, indexed by page number.
* There are several lists in the structure:
*
* - A red-black tree rooted with the containing object is used to
* quickly perform object+offset lookups.
* - A list of all pages for a given object, for a quick deactivation
* at a time of deallocation.
* - An ordered list of pages due for pageout.
*
* In addition, the structure contains the object and offset to which
* this page belongs (for pageout) and sundry status bits.
*
* Note that the page structure has no lock of its own. The page is
* generally protected by its owner's lock (UVM object or amap/anon).
* It should be noted that UVM has to serialize pmap(9) operations on
* the managed pages, e.g. for pmap_enter() calls. Hence, the lock
* order is as follows:
*
* [vmpage-owner-lock] ->
* any pmap locks (e.g. PV hash lock)
*
* Since the kernel is always self-consistent, no serialization is
* required for unmanaged mappings, e.g. for pmap_kenter_pa() calls.
*
* Field markings and the corresponding locks:
*
* f: free page queue lock, uvm_fpageqlock
* o: page owner (uvm_object::vmobjlock, vm_amap::am_lock, vm_anon::an_lock)
* i: vm_page::interlock
* => flags set and cleared only with o&i held can
* safely be tested for with only o held.
* o,i: o|i for read, o&i for write (depends on context - if could be loaned)
* => see uvm_loan.c
* w: wired page queue or uvm_pglistalloc:
* => wired page queue: o&i to change, stable from wire to unwire
* XXX What about concurrent or nested wire?
* => uvm_pglistalloc: owned by caller
* ?: locked by pmap or assumed page owner's lock
* p: locked by pagedaemon policy module (pdpolicy)
* c: cpu private
* s: stable, does not change
*
* UVM and pmap(9) may use uvm_page_owner_locked_p() to assert whether the
* page owner's lock is acquired.
*
* A page can have one of four identities:
*
* o free
* => pageq.list is entry on global free page queue
* => uanon is unused (or (void *)0xdeadbeef for DEBUG)
* => uobject is unused (or (void *)0xdeadbeef for DEBUG)
* => PG_FREE is set in flags
* o owned by a uvm_object
* => pageq.queue is entry on wired page queue, if any
* => uanon is NULL or the vm_anon to which it has been O->A loaned
* => uobject is owner
* o owned by a vm_anon
* => pageq is unused (XXX correct?)
* => uanon is owner
* => uobject is NULL
* => PG_ANON is set in flags
* o allocated by uvm_pglistalloc
* => pageq.queue is entry on resulting pglist, owned by caller
* => uanon is unused
* => uobject is unused
*
* The following transitions are allowed:
*
* - uvm_pagealloc: free -> owned by a uvm_object/vm_anon
* - uvm_pagefree: owned by a uvm_object/vm_anon -> free
* - uvm_pglistalloc: free -> allocated by uvm_pglistalloc
* - uvm_pglistfree: allocated by uvm_pglistalloc -> free
*
* On the ordering of fields:
*
* The fields most heavily used during fault processing are clustered
* together at the start of the structure to reduce cache misses.
* XXX This entire thing should be shrunk to fit in one cache line.
*/
struct vm_page {
/* _LP64: first cache line */
union {
TAILQ_ENTRY(vm_page) queue; /* w: wired page queue
* or uvm_pglistalloc output */
LIST_ENTRY(vm_page) list; /* f: global free page queue */
} pageq;
uint32_t pqflags; /* i: pagedaemon flags */
uint32_t flags; /* o: object flags */
paddr_t phys_addr; /* o: physical address of pg */
uint32_t loan_count; /* o,i: num. active loans */
uint32_t wire_count; /* o,i: wired down map refs */
struct vm_anon *uanon; /* o,i: anon */
struct uvm_object *uobject; /* o,i: object */
voff_t offset; /* o: offset into object */
/* _LP64: second cache line */
kmutex_t interlock; /* s: lock on identity */
TAILQ_ENTRY(vm_page) pdqueue; /* p: pagedaemon queue */
#ifdef __HAVE_VM_PAGE_MD
struct vm_page_md mdpage; /* ?: pmap-specific data */
#endif
#if defined(UVM_PAGE_TRKOWN)
/* debugging fields to track page ownership */
pid_t owner; /* proc that set PG_BUSY */
lwpid_t lowner; /* lwp that set PG_BUSY */
const char *owner_tag; /* why it was set busy */
#endif
};
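/*
 * Illustrative sketch (hedged) of the lock ordering described above:
 * take the page owner's lock ("o") before the page interlock ("i").
 * Assumes "uobj" owns "pg"; this mirrors the common pattern in fault and
 * pager code and is not a complete code path.
 */
#if 0
	rw_enter(uobj->vmobjlock, RW_WRITER);	/* "o": page owner's lock */
	uvm_pagelock(pg);			/* "i": pg->interlock */
	uvm_pageactivate(pg);			/* pdpolicy intent wants "i" */
	uvm_pageunlock(pg);
	rw_exit(uobj->vmobjlock);
#endif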
/*
* Overview of UVM page flags, stored in pg->flags.
*
* Locking notes:
*
* PG_, struct vm_page::flags => locked by owner
* PG_AOBJ => additionally locked by vm_page::interlock
* PG_ANON => additionally locked by vm_page::interlock
* PG_FREE => additionally locked by uvm_fpageqlock
* for uvm_pglistalloc()
*
* Flag descriptions:
*
* PG_CLEAN:
* Page is known clean.
* The contents of the page is consistent with its backing store.
*
* PG_DIRTY:
* Page is known dirty.
* To avoid losing data, the contents of the page should be written
* back to the backing store before freeing the page.
*
* PG_BUSY:
 *	Page is long-term locked, usually because I/O (transfer between
 *	the page and its backing store) is in progress.  An LWP attempting
 *	to access the page shall set PQ_WANTED and wait.  PG_BUSY may only
 *	be set with a write lock held on the object.
*
* PG_PAGEOUT:
* Indicates that the page is being paged-out in preparation for
* being freed.
*
* PG_RELEASED:
* Indicates that the page, which is currently PG_BUSY, should be freed
 *	after the release of the long-term lock.  It is the responsibility
 *	of the owning LWP (i.e. the one which set PG_BUSY) to do it.
*
* PG_FAKE:
* Page has been allocated, but not yet initialised. The flag is used
* to avoid overwriting of valid data, e.g. to prevent read from the
* backing store when in-core data is newer.
*
* PG_RDONLY:
* Indicates that the page must be mapped read-only.
*
* PG_MARKER:
* Dummy marker page, generally used for list traversal.
*/
/*
* if you want to renumber PG_CLEAN and PG_DIRTY, check __CTASSERTs in
* uvm_page_status.c first.
*/
#define PG_CLEAN 0x00000001 /* page is known clean */
#define PG_DIRTY 0x00000002 /* page is known dirty */
#define PG_BUSY 0x00000004 /* page is locked */
#define PG_PAGEOUT 0x00000010 /* page to be freed for pagedaemon */
#define PG_RELEASED 0x00000020 /* page to be freed when unbusied */
#define PG_FAKE 0x00000040 /* page is not yet initialized */
#define PG_RDONLY 0x00000080 /* page must be mapped read-only */
#define PG_TABLED 0x00000200 /* page is tabled in object */
#define PG_AOBJ 0x00000400 /* page is part of an anonymous
uvm_object */
#define PG_ANON 0x00000800 /* page is part of an anon, rather
than an uvm_object */
#define PG_FILE 0x00001000 /* file backed (non-anonymous) */
#define PG_READAHEAD 0x00002000 /* read-ahead but not "hit" yet */
#define PG_FREE 0x00004000 /* page is on free list */
#define PG_MARKER 0x00008000 /* dummy marker page */
#define PG_PAGER1 0x00010000 /* pager-specific flag */
#define PG_PGLCA 0x00020000 /* allocated by uvm_pglistalloc_contig */
#define PG_STAT (PG_ANON|PG_AOBJ|PG_FILE)
#define PG_SWAPBACKED (PG_ANON|PG_AOBJ)
#define UVM_PGFLAGBITS \
"\20\1CLEAN\2DIRTY\3BUSY" \
"\5PAGEOUT\6RELEASED\7FAKE\10RDONLY" \
"\11ZERO\12TABLED\13AOBJ\14ANON" \
"\15FILE\16READAHEAD\17FREE\20MARKER" \
"\21PAGER1\22PGLCA"
/*
* Flags stored in pg->pqflags, which is protected by pg->interlock.
*
* PQ_PRIVATE:
* ... is for uvmpdpol to do whatever it wants with.
*
* PQ_INTENT_SET:
* Indicates that the intent set on the page has not yet been realized.
*
* PQ_INTENT_QUEUED:
* Indicates that the page is, or will soon be, on a per-CPU queue for
* the intent to be realized.
*
* PQ_WANTED:
* Indicates that the page, which is currently PG_BUSY, is wanted by
* some other LWP. The page owner (i.e. LWP which set PG_BUSY) is
* responsible to clear both flags and wake up any waiters once it has
* released the long-term lock (PG_BUSY).
*/
#define PQ_INTENT_A 0x00000000 /* intend activation */
#define PQ_INTENT_I 0x00000001 /* intend deactivation */
#define PQ_INTENT_E 0x00000002 /* intend enqueue */
#define PQ_INTENT_D 0x00000003 /* intend dequeue */
#define PQ_INTENT_MASK 0x00000003 /* mask of intended state */
#define PQ_INTENT_SET 0x00000004 /* not realized yet */
#define PQ_INTENT_QUEUED 0x00000008 /* queued for processing */
#define PQ_PRIVATE 0x00000ff0 /* private for pdpolicy */
#define PQ_WANTED 0x00001000 /* someone is waiting for page */
#define UVM_PQFLAGBITS \
"\20\1INTENT_0\2INTENT_1\3INTENT_SET\4INTENT_QUEUED" \
"\5PRIVATE1\6PRIVATE2\7PRIVATE3\10PRIVATE4" \
"\11PRIVATE5\12PRIVATE6\13PRIVATE7\14PRIVATE8" \
"\15WANTED"
/*
* physical memory layout structure
*
* MD vmparam.h must #define:
 *   VM_PHYSSEG_MAX = max number of physical memory segments we support
 *		(if this is "1" then we revert to a "contig" case)
 *   VM_PHYSSEG_STRAT: memory sort/search options (for VM_PHYSSEG_MAX > 1)
* - VM_PSTRAT_RANDOM: linear search (random order)
* - VM_PSTRAT_BSEARCH: binary search (sorted by address)
* - VM_PSTRAT_BIGFIRST: linear search (sorted by largest segment first)
* - others?
* XXXCDC: eventually we should purge all left-over global variables...
*/
#define VM_PSTRAT_RANDOM 1
#define VM_PSTRAT_BSEARCH 2
#define VM_PSTRAT_BIGFIRST 3
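/*
 * Illustrative sketch: UVM_PGFLAGBITS and UVM_PQFLAGBITS above are
 * snprintb(9) bit-format strings, so flag words can be decoded for
 * debugging output along these lines (buffer size is arbitrary).
 */
#if 0
	char bits[128];

	snprintb(bits, sizeof(bits), UVM_PGFLAGBITS, pg->flags);
	printf("page %p flags %s\n", pg, bits);
#endif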
#ifdef _KERNEL
/*
* prototypes: the following prototypes define the interface to pages
*/
void uvm_page_init(vaddr_t *, vaddr_t *);
void uvm_pglistalloc_init(void);
#if defined(UVM_PAGE_TRKOWN)
void uvm_page_own(struct vm_page *, const char *);
#endif
#if !defined(PMAP_STEAL_MEMORY)
bool uvm_page_physget(paddr_t *);
#endif
void uvm_page_recolor(int);
void uvm_page_rebucket(void);
void uvm_pageactivate(struct vm_page *);
vaddr_t uvm_pageboot_alloc(vsize_t);
void uvm_pagecopy(struct vm_page *, struct vm_page *);
void uvm_pagedeactivate(struct vm_page *);
void uvm_pagedequeue(struct vm_page *);
void uvm_pageenqueue(struct vm_page *);
void uvm_pagefree(struct vm_page *);
void uvm_pagelock(struct vm_page *);
void uvm_pagelock2(struct vm_page *, struct vm_page *);
void uvm_pageunlock(struct vm_page *);
void uvm_pageunlock2(struct vm_page *, struct vm_page *);
void uvm_page_unbusy(struct vm_page **, int);
struct vm_page *uvm_pagelookup(struct uvm_object *, voff_t);
void uvm_pageunwire(struct vm_page *);
void uvm_pagewire(struct vm_page *);
void uvm_pagezero(struct vm_page *);
bool uvm_pageismanaged(paddr_t);
bool uvm_page_owner_locked_p(struct vm_page *, bool);
void uvm_pgfl_lock(void);
void uvm_pgfl_unlock(void);
unsigned int uvm_pagegetdirty(struct vm_page *);
void uvm_pagemarkdirty(struct vm_page *, unsigned int);
bool uvm_pagecheckdirty(struct vm_page *, bool);
bool uvm_pagereadonly_p(struct vm_page *);
bool uvm_page_locked_p(struct vm_page *);
void uvm_pagewakeup(struct vm_page *);
bool uvm_pagewanted_p(struct vm_page *);
void uvm_pagewait(struct vm_page *, krwlock_t *, const char *);
int uvm_page_lookup_freelist(struct vm_page *);
struct vm_page *uvm_phys_to_vm_page(paddr_t);
paddr_t uvm_vm_page_to_phys(const struct vm_page *);
#if defined(PMAP_DIRECT)
extern bool ubc_direct;
int uvm_direct_process(struct vm_page **, u_int, voff_t, vsize_t,
int (*)(void *, size_t, void *), void *);
#endif
/*
* page dirtiness status for uvm_pagegetdirty and uvm_pagemarkdirty
*
* UNKNOWN means that we need to consult pmap to know if the page is
* dirty or not.
* basically, UVM_PAGE_STATUS_CLEAN implies that the page has no writable
* mapping.
*
* if you want to renumber these, check __CTASSERTs in
* uvm_page_status.c first.
*/
#define UVM_PAGE_STATUS_UNKNOWN 0
#define UVM_PAGE_STATUS_CLEAN 1
#define UVM_PAGE_STATUS_DIRTY 2
#define UVM_PAGE_NUM_STATUS 3
/*
* macros
*/
#define VM_PAGE_TO_PHYS(entry) uvm_vm_page_to_phys(entry)
#ifdef __HAVE_VM_PAGE_MD
#define VM_PAGE_TO_MD(pg) (&(pg)->mdpage)
#define VM_MD_TO_PAGE(md) (container_of((md), struct vm_page, mdpage))
#endif
/*
* Compute the page color for a given page.
*/
#define VM_PGCOLOR(pg) \
(atop(VM_PAGE_TO_PHYS((pg))) & uvmexp.colormask)
#define PHYS_TO_VM_PAGE(pa) uvm_phys_to_vm_page(pa)
/*
* VM_PAGE_IS_FREE() can't tell if the page is on global free list, or a
* per-CPU cache. If you need to be certain, pause caching.
*/
#define VM_PAGE_IS_FREE(entry) ((entry)->flags & PG_FREE)
/*
 * Use the lower 10 bits of pg->phys_addr to cache some locators for
* the page. This implies that the smallest possible page size is 1kB, and
* that nobody should use pg->phys_addr directly (use VM_PAGE_TO_PHYS()).
*
* - 5 bits for the freelist index, because uvm_page_lookup_freelist()
* traverses an rbtree and therefore features prominently in traces
* captured during performance test. It would probably be more useful to
* cache physseg index here because freelist can be inferred from physseg,
* but it requires changes to allocation for UVM_HOTPLUG, so for now we'll
* go with freelist.
*
* - 5 bits for "bucket", a way for us to categorise pages further as
* needed (e.g. NUMA node).
*
* None of this is set in stone; it can be adjusted as needed.
*/
#define UVM_PHYSADDR_FREELIST __BITS(0,4)
#define UVM_PHYSADDR_BUCKET __BITS(5,9)
static inline unsigned
uvm_page_get_freelist(struct vm_page *pg)
{
	unsigned fl = __SHIFTOUT(pg->phys_addr, UVM_PHYSADDR_FREELIST);

	KASSERT(fl == (unsigned)uvm_page_lookup_freelist(pg));
return fl;
}
static inline unsigned
uvm_page_get_bucket(struct vm_page *pg)
{
return __SHIFTOUT(pg->phys_addr, UVM_PHYSADDR_BUCKET);
}
static inline void
uvm_page_set_freelist(struct vm_page *pg, unsigned fl)
{
KASSERT(fl < 32);
pg->phys_addr &= ~UVM_PHYSADDR_FREELIST;
pg->phys_addr |= __SHIFTIN(fl, UVM_PHYSADDR_FREELIST);
}
static inline void
uvm_page_set_bucket(struct vm_page *pg, unsigned b)
{
KASSERT(b < 32);
pg->phys_addr &= ~UVM_PHYSADDR_BUCKET;
pg->phys_addr |= __SHIFTIN(b, UVM_PHYSADDR_BUCKET);
}
#endif /* _KERNEL */
#endif /* _UVM_UVM_PAGE_H_ */
/* $NetBSD: rtbl.c,v 1.7 2017/06/01 02:45:14 chs Exp $ */
/*-
* Copyright (c) 1998, 2008, 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Kevin M. Lahey of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1980, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)route.c 8.3 (Berkeley) 1/9/95
*/
#if defined(_KERNEL) && defined(_KERNEL_OPT)
#include "opt_route.h"
#endif /* _KERNEL && _KERNEL_OPT */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtbl.c,v 1.7 2017/06/01 02:45:14 chs Exp $");
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/pool.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/raw_cb.h>
static rtbl_t *rt_tables[AF_MAX+1];
int
rt_inithead(rtbl_t **tp, int off)
{
rtbl_t *t;
if (*tp != NULL)
return 1;
t = kmem_alloc(sizeof(*t), KM_SLEEP);
*tp = t;
return rn_inithead0(&t->t_rnh, off);
}
struct rtentry *
rt_matchaddr(rtbl_t *t, const struct sockaddr *dst)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_matchaddr(dst, rnh);
if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
return NULL;
return (struct rtentry *)rn;
}
int
rt_addaddr(rtbl_t *t, struct rtentry *rt, const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_addaddr(rt_getkey(rt), netmask, rnh, rt->rt_nodes);
return (rn == NULL) ? EEXIST : 0;
}
struct rtentry *
rt_lookup(rtbl_t *t, const struct sockaddr *dst, const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
rn = rnh->rnh_lookup(dst, netmask, rnh);
if (rn == NULL || (rn->rn_flags & RNF_ROOT) != 0)
return NULL;
return (struct rtentry *)rn;
}
struct rtentry *
rt_deladdr(rtbl_t *t, const struct sockaddr *dst,
const struct sockaddr *netmask)
{
struct radix_node_head *rnh = &t->t_rnh;
struct radix_node *rn;
if ((rn = rnh->rnh_deladdr(dst, netmask, rnh)) == NULL)
return NULL;
if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("%s", __func__);
return (struct rtentry *)rn;
}
static int
rt_walktree_visitor(struct radix_node *rn, void *v)
{
struct rtwalk *rw = (struct rtwalk *)v;
return (*rw->rw_f)((struct rtentry *)rn, rw->rw_v);
}
int
rtbl_walktree(sa_family_t family, int (*f)(struct rtentry *, void *), void *v)
{
rtbl_t *t = rt_tables[family];
struct rtwalk rw;
if (t == NULL)
return 0;
rw.rw_f = f;
rw.rw_v = v;
return rn_walktree(&t->t_rnh, rt_walktree_visitor, &rw);
}
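/*
* Illustrative sketch (added for exposition, not part of the original
* source): a caller supplies a visitor of the form below; a non-zero
* return from the visitor typically stops the walk and is propagated
* back to the caller.  The names example_visit and count are
* hypothetical.
*
*     static int
*     example_visit(struct rtentry *rt, void *arg)
*     {
*             int *count = arg;
*
*             (*count)++;
*             return 0;
*     }
*
*     int count = 0;
*     (void)rtbl_walktree(AF_INET, example_visit, &count);
*/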
struct rtentry *
rtbl_search_matched_entry(sa_family_t family,
int (*f)(struct rtentry *, void *), void *v)
{
rtbl_t *t = rt_tables[family];
struct rtwalk rw;
if (t == NULL)
return 0;
rw.rw_f = f;
rw.rw_v = v;
return (struct rtentry *)
rn_search_matched(&t->t_rnh, rt_walktree_visitor, &rw);
}
rtbl_t *
rt_gettable(sa_family_t af)
{
if (af >= __arraycount(rt_tables))
return NULL;
return rt_tables[af];
}
void
rtbl_init(void)
{
struct domain *dom;
DOMAIN_FOREACH(dom)
if (dom->dom_rtattach)
dom->dom_rtattach(&rt_tables[dom->dom_family],
dom->dom_rtoffset);
}
void
rt_assert_inactive(const struct rtentry *rt)
{
if (rt->rt_nodes->rn_flags & (RNF_ACTIVE | RNF_ROOT))
panic("rtfree 2");
}
int
rt_refines(const struct sockaddr *m_sa, const struct sockaddr *n_sa)
{
return rn_refines(m_sa, n_sa);
}
/* $NetBSD: kern_sig.c,v 1.409 2024/02/10 09:24:18 andvar Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.14 (Berkeley) 5/14/95
*/
/*
* Signal subsystem.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sig.c,v 1.409 2024/02/10 09:24:18 andvar Exp $");
#include "opt_execfmt.h"
#include "opt_ptrace.h"
#include "opt_dtrace.h"
#include "opt_compat_sunos.h"
#include "opt_compat_netbsd.h"
#include "opt_compat_netbsd32.h"
#include "opt_pax.h"
#define SIGPROP /* include signal properties table */
#include <sys/param.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/systm.h>
#include <sys/wait.h>
#include <sys/ktrace.h>
#include <sys/syslog.h>
#include <sys/filedesc.h>
#include <sys/file.h>
#include <sys/pool.h>
#include <sys/ucontext.h>
#include <sys/exec.h>
#include <sys/kauth.h>
#include <sys/acct.h>
#include <sys/callout.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/exec_elf.h>
#include <sys/compat_stub.h>
#ifdef PAX_SEGVGUARD
#include <sys/pax.h>
#endif /* PAX_SEGVGUARD */
#include <uvm/uvm_extern.h>
/* Many hard-coded assumptions that there are <= 4 x 32-bit signal mask bits */
__CTASSERT(NSIG <= 128);
#define SIGQUEUE_MAX 32
static pool_cache_t sigacts_cache __read_mostly;
static pool_cache_t ksiginfo_cache __read_mostly;
static callout_t proc_stop_ch __cacheline_aligned;
sigset_t contsigmask __cacheline_aligned;
sigset_t stopsigmask __cacheline_aligned;
static sigset_t vforksigmask __cacheline_aligned;
sigset_t sigcantmask __cacheline_aligned;
static void ksiginfo_exechook(struct proc *, void *);
static void proc_stop(struct proc *, int);
static void proc_stop_done(struct proc *, int);
static void proc_stop_callout(void *);
static int sigchecktrace(void);
static int sigpost(struct lwp *, sig_t, int, int);
static int sigput(sigpend_t *, struct proc *, ksiginfo_t *);
static int sigunwait(struct proc *, const ksiginfo_t *);
static void sigswitch(int, int, bool);
static void sigswitch_unlock_and_switch_away(struct lwp *);
static void sigacts_poolpage_free(struct pool *, void *);
static void *sigacts_poolpage_alloc(struct pool *, int);
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE3(proc, kernel, , signal__send,
"struct lwp *", /* target thread */
"struct proc *", /* target process */
"int"); /* signal */
SDT_PROBE_DEFINE3(proc, kernel, , signal__discard,
"struct lwp *", /* target thread */
"struct proc *", /* target process */
"int"); /* signal */
SDT_PROBE_DEFINE3(proc, kernel, , signal__handle,
"int", /* signal */
"ksiginfo_t *", /* signal info */
"void (*)(void)"); /* handler address */
static struct pool_allocator sigactspool_allocator = {
.pa_alloc = sigacts_poolpage_alloc,
.pa_free = sigacts_poolpage_free
};
#ifdef DEBUG
int kern_logsigexit = 1;
#else
int kern_logsigexit = 0;
#endif
static const char logcoredump[] =
"pid %d (%s), uid %d: exited on signal %d (core dumped)\n";
static const char lognocoredump[] =
"pid %d (%s), uid %d: exited on signal %d (core not dumped, err = %d)\n";
static kauth_listener_t signal_listener;
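/*
* signal_listener_cb:
*
* kauth(9) listener for KAUTH_PROCESS_SIGNAL.  Allow the signal when the
* sender's credentials match the target process, or when SIGCONT is sent
* within the same session; otherwise defer the decision to other
* listeners.
*/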
static int
signal_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result, signum;
result = KAUTH_RESULT_DEFER;
p = arg0;
signum = (int)(unsigned long)arg1;
if (action != KAUTH_PROCESS_SIGNAL)
return result;
if (kauth_cred_uidmatch(cred, p->p_cred) ||
    (signum == SIGCONT && (curproc->p_session == p->p_session)))
result = KAUTH_RESULT_ALLOW;
return result;
}
static int
sigacts_ctor(void *arg __unused, void *obj, int flags __unused)
{
memset(obj, 0, sizeof(struct sigacts));
return 0;
}
/*
* signal_init:
*
* Initialize global signal-related data structures.
*/
void
signal_init(void)
{
sigactspool_allocator.pa_pagesz = (PAGE_SIZE)*2;
sigacts_cache = pool_cache_init(sizeof(struct sigacts), 0, 0, 0,
"sigacts", sizeof(struct sigacts) > PAGE_SIZE ?
&sigactspool_allocator : NULL, IPL_NONE, sigacts_ctor, NULL, NULL);
ksiginfo_cache = pool_cache_init(sizeof(ksiginfo_t), 0, 0, 0,
"ksiginfo", NULL, IPL_VM, NULL, NULL, NULL);
exechook_establish(ksiginfo_exechook, NULL);
callout_init(&proc_stop_ch, CALLOUT_MPSAFE);
callout_setfunc(&proc_stop_ch, proc_stop_callout, NULL);
signal_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
signal_listener_cb, NULL);
}
/*
* sigacts_poolpage_alloc:
*
* Allocate a page for the sigacts memory pool.
*/
static void *
sigacts_poolpage_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(kernel_map,
PAGE_SIZE * 2, PAGE_SIZE * 2,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK)
| UVM_KMF_WIRED);
}
/*
* sigacts_poolpage_free:
*
* Free a page on behalf of the sigacts memory pool.
*/
static void
sigacts_poolpage_free(struct pool *pp, void *v)
{
uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * 2, UVM_KMF_WIRED);
}
/*
* sigactsinit:
*
* Create an initial sigacts structure, using the same signal state
* as the specified process. If 'share' is set, share the sigacts by
* holding a reference; otherwise just copy it from the parent.
*/
struct sigacts *
sigactsinit(struct proc *pp, int share)
{
struct sigacts *ps = pp->p_sigacts, *ps2;
if (__predict_false(share)) {
atomic_inc_uint(&ps->sa_refcnt);
return ps;
}
ps2 = pool_cache_get(sigacts_cache, PR_WAITOK);
mutex_init(&ps2->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
ps2->sa_refcnt = 1;
mutex_enter(&ps->sa_mutex);
memcpy(ps2->sa_sigdesc, ps->sa_sigdesc, sizeof(ps2->sa_sigdesc));
mutex_exit(&ps->sa_mutex);
return ps2;
}
/*
* sigactsunshare:
*
* Make this process not share its sigacts, maintaining all signal state.
*/
void
sigactsunshare(struct proc *p)
{
struct sigacts *ps, *oldps = p->p_sigacts;
if (__predict_true(oldps->sa_refcnt == 1))
return;
ps = pool_cache_get(sigacts_cache, PR_WAITOK);
mutex_init(&ps->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
memcpy(ps->sa_sigdesc, oldps->sa_sigdesc, sizeof(ps->sa_sigdesc));
ps->sa_refcnt = 1;
p->p_sigacts = ps;
sigactsfree(oldps);
}
/*
* sigactsfree:
*
* Release a sigacts structure.
*/
void
sigactsfree(struct sigacts *ps)
{
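/*
* Drop a reference; the release/acquire barrier pair ensures that all
* prior accesses to the sigacts happen before the final teardown once
* the count reaches zero.
*/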
membar_release();
if (atomic_dec_uint_nv(&ps->sa_refcnt) == 0) {
membar_acquire();
mutex_destroy(&ps->sa_mutex);
pool_cache_put(sigacts_cache, ps);
}
}
/*
* siginit:
*
* Initialize signal state for process 0; set to ignore signals that
* are ignored by default and disable the signal stack. Locking not
* required as the system is still cold.
*/
void
siginit(struct proc *p)
{
struct lwp *l;
struct sigacts *ps;
int signo, prop;
ps = p->p_sigacts;
sigemptyset(&contsigmask);
sigemptyset(&stopsigmask);
sigemptyset(&vforksigmask);
sigemptyset(&sigcantmask);
for (signo = 1; signo < NSIG; signo++) {
prop = sigprop[signo];
if (prop & SA_CONT)
sigaddset(&contsigmask, signo);
if (prop & SA_STOP)
sigaddset(&stopsigmask, signo);
if (prop & SA_STOP && signo != SIGSTOP)
sigaddset(&vforksigmask, signo);
if (prop & SA_CANTMASK)
sigaddset(&sigcantmask, signo);
if (prop & SA_IGNORE && signo != SIGCONT)
sigaddset(&p->p_sigctx.ps_sigignore, signo);
sigemptyset(&SIGACTION_PS(ps, signo).sa_mask);
SIGACTION_PS(ps, signo).sa_flags = SA_RESTART;
}
sigemptyset(&p->p_sigctx.ps_sigcatch);
p->p_sflag &= ~PS_NOCLDSTOP;
ksiginfo_queue_init(&p->p_sigpend.sp_info);
sigemptyset(&p->p_sigpend.sp_set);
/*
* Reset per LWP state.
*/
l = LIST_FIRST(&p->p_lwps);
l->l_sigwaited = NULL;
l->l_sigstk = SS_INIT;
ksiginfo_queue_init(&l->l_sigpend.sp_info);
sigemptyset(&l->l_sigpend.sp_set);
/* One reference. */
ps->sa_refcnt = 1;
}
/*
* execsigs:
*
* Reset signals for an exec of the specified process.
*/
void
execsigs(struct proc *p)
{
struct sigacts *ps;
struct lwp *l;
int signo, prop;
sigset_t tset;
ksiginfoq_t kq;
KASSERT(p->p_nlwps == 1);
sigactsunshare(p);
ps = p->p_sigacts;
/*
* Reset caught signals. Held signals remain held through
* l->l_sigmask (unless they were caught, and are now ignored
* by default).
*
* No need to lock yet, the process has only one LWP and
* at this point the sigacts are private to the process.
*/
sigemptyset(&tset);
for (signo = 1; signo < NSIG; signo++) {
if (sigismember(&p->p_sigctx.ps_sigcatch, signo)) {
prop = sigprop[signo];
if (prop & SA_IGNORE) {
if ((prop & SA_CONT) == 0)
sigaddset(&p->p_sigctx.ps_sigignore,
signo);
sigaddset(&tset, signo);
}
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
}
sigemptyset(&SIGACTION_PS(ps, signo).sa_mask);
SIGACTION_PS(ps, signo).sa_flags = SA_RESTART;
}
ksiginfo_queue_init(&kq);
mutex_enter(p->p_lock);
sigclearall(p, &tset, &kq);
sigemptyset(&p->p_sigctx.ps_sigcatch);
/*
* Reset the "no zombies if child dies" flag, as Solaris does.
*/
p->p_flag &= ~(PK_NOCLDWAIT | PK_CLDSIGIGN);
if (SIGACTION_PS(ps, SIGCHLD).sa_handler == SIG_IGN)
SIGACTION_PS(ps, SIGCHLD).sa_handler = SIG_DFL;
/*
* Reset per-LWP state.
*/
l = LIST_FIRST(&p->p_lwps);
l->l_sigwaited = NULL;
l->l_sigstk = SS_INIT;
ksiginfo_queue_init(&l->l_sigpend.sp_info);
sigemptyset(&l->l_sigpend.sp_set);
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
}
/*
* ksiginfo_exechook:
*
* Free all pending ksiginfo entries from a process on exec.
* Additionally, drain any unused ksiginfo structures in the
* system back to the pool.
*
* XXX This should not be a hook, every process has signals.
*/
static void
ksiginfo_exechook(struct proc *p, void *v)
{
ksiginfoq_t kq;
ksiginfo_queue_init(&kq);
mutex_enter(p->p_lock);
sigclearall(p, NULL, &kq);
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
}
/*
* ksiginfo_alloc:
*
* Allocate a new ksiginfo structure from the pool, and optionally copy
* an existing one. If the existing ksiginfo_t is from the pool, and
* has not been queued somewhere, then just return it. Additionally,
* if the existing ksiginfo_t does not contain any information beyond
* the signal number, then just return it.
*/
ksiginfo_t *
ksiginfo_alloc(struct proc *p, ksiginfo_t *ok, int flags)
{
ksiginfo_t *kp;
if (ok != NULL) {
if ((ok->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) ==
KSI_FROMPOOL)
return ok;
if (KSI_EMPTY_P(ok))
return ok;
}
kp = pool_cache_get(ksiginfo_cache, flags);
if (kp == NULL) {
#ifdef DIAGNOSTIC
printf("Out of memory allocating ksiginfo for pid %d\n",
p->p_pid);
#endif
return NULL;
}
if (ok != NULL) {
memcpy(kp, ok, sizeof(*kp));
kp->ksi_flags &= ~KSI_QUEUED;
} else
KSI_INIT_EMPTY(kp);
kp->ksi_flags |= KSI_FROMPOOL;
return kp;
}
/*
* ksiginfo_free:
*
* If the given ksiginfo_t is from the pool and has not been queued,
* then free it.
*/
void
ksiginfo_free(ksiginfo_t *kp)
{
if ((kp->ksi_flags & (KSI_QUEUED | KSI_FROMPOOL)) != KSI_FROMPOOL)
return;
pool_cache_put(ksiginfo_cache, kp);
}
/*
* ksiginfo_queue_drain:
*
* Drain a non-empty ksiginfo_t queue.
*/
void
ksiginfo_queue_drain0(ksiginfoq_t *kq)
{
ksiginfo_t *ksi;
KASSERT(!TAILQ_EMPTY(kq));
while (!TAILQ_EMPTY(kq)) {
ksi = TAILQ_FIRST(kq);
TAILQ_REMOVE(kq, ksi, ksi_list);
pool_cache_put(ksiginfo_cache, ksi);
}
}
static int
siggetinfo(sigpend_t *sp, ksiginfo_t *out, int signo)
{
ksiginfo_t *ksi, *nksi;
if (sp == NULL)
goto out;
/* Find siginfo and copy it out. */
int count = 0;
TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, nksi) {
if (ksi->ksi_signo != signo)
continue;
if (count++ > 0) /* Only remove the first, count all of them */
continue;
TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list);
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0);
ksi->ksi_flags &= ~KSI_QUEUED;
if (out != NULL) {
memcpy(out, ksi, sizeof(*out));
out->ksi_flags &= ~(KSI_FROMPOOL | KSI_QUEUED);
}
ksiginfo_free(ksi);
}
if (count)
return count;
out:
/* If there is no siginfo, then manufacture it. */
if (out != NULL) {
KSI_INIT(out);
out->ksi_info._signo = signo;
out->ksi_info._code = SI_NOINFO;
}
return 0;
}
/*
* sigget:
*
* Fetch the first pending signal from a set. Optionally, also fetch
* or manufacture a ksiginfo element. Returns the number of the first
* pending signal, or zero.
*/
int
sigget(sigpend_t *sp, ksiginfo_t *out, int signo, const sigset_t *mask)
{
sigset_t tset;
int count;
/* If there's no pending set, the signal is from the debugger. */
if (sp == NULL)
goto out;
/* Construct mask from signo, and 'mask'. */
if (signo == 0) {
if (mask != NULL) {
tset = *mask;
__sigandset(&sp->sp_set, &tset);
} else
tset = sp->sp_set;
/* If there are no signals pending - return. */
if ((signo = firstsig(&tset)) == 0)
goto out;
} else {
KASSERT(sigismember(&sp->sp_set, signo));
}
sigdelset(&sp->sp_set, signo);
out:
count = siggetinfo(sp, out, signo);
if (count > 1)
sigaddset(&sp->sp_set, signo);
return signo;
}
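/*
* Illustrative sketch (added for exposition, not part of the original
* source): two patterns used later in this file are discarding a signal
* that was decided against, and fetching a specific signal together with
* its siginfo:
*
*     (void)sigget(sp, NULL, signo, NULL);
*     sigget(l->l_sigpendset, &ksi, signo, NULL);
*/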
/*
* sigput:
*
* Append a new ksiginfo element to the list of pending ksiginfo's.
*/
static int
sigput(sigpend_t *sp, struct proc *p, ksiginfo_t *ksi)
{
ksiginfo_t *kp;
KASSERT(mutex_owned(p->p_lock));
KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0);
sigaddset(&sp->sp_set, ksi->ksi_signo);
/*
* If there is no siginfo, we are done.
*/
if (KSI_EMPTY_P(ksi))
return 0;
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
size_t count = 0;
TAILQ_FOREACH(kp, &sp->sp_info, ksi_list) {
count++;
if (ksi->ksi_signo >= SIGRTMIN && ksi->ksi_signo <= SIGRTMAX)
continue;
if (kp->ksi_signo == ksi->ksi_signo) {
KSI_COPY(ksi, kp);
kp->ksi_flags |= KSI_QUEUED;
return 0;
}
}
if (count >= SIGQUEUE_MAX) {
#ifdef DIAGNOSTIC
printf("%s(%d): Signal queue is full signal=%d\n",
p->p_comm, p->p_pid, ksi->ksi_signo);
#endif
return EAGAIN;
}
ksi->ksi_flags |= KSI_QUEUED;
TAILQ_INSERT_TAIL(&sp->sp_info, ksi, ksi_list);
return 0;
}
/*
* sigclear:
*
* Clear all pending signals in the specified set.
*/
void
sigclear(sigpend_t *sp, const sigset_t *mask, ksiginfoq_t *kq)
{
ksiginfo_t *ksi, *next;
if (mask == NULL)
sigemptyset(&sp->sp_set);
else
sigminusset(mask, &sp->sp_set);
TAILQ_FOREACH_SAFE(ksi, &sp->sp_info, ksi_list, next) {
if (mask == NULL || sigismember(mask, ksi->ksi_signo)) {
TAILQ_REMOVE(&sp->sp_info, ksi, ksi_list);
KASSERT((ksi->ksi_flags & KSI_FROMPOOL) != 0);
KASSERT((ksi->ksi_flags & KSI_QUEUED) != 0);
TAILQ_INSERT_TAIL(kq, ksi, ksi_list);
}
}
}
/*
* sigclearall:
*
* Clear all pending signals in the specified set from a process and
* its LWPs.
*/
void
sigclearall(struct proc *p, const sigset_t *mask, ksiginfoq_t *kq)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
sigclear(&p->p_sigpend, mask, kq);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
sigclear(&l->l_sigpend, mask, kq);
}
}
/*
* sigispending:
*
* Return the first signal number if there are pending signals for the
* current LWP. May be called unlocked provided that LW_PENDSIG is set,
* and that the signal has been posted to the appropriate queue before
* LW_PENDSIG is set.
*
* This should only ever be called with (l == curlwp), unless the
* result does not matter (procfs, sysctl).
*/
int
sigispending(struct lwp *l, int signo)
{
struct proc *p = l->l_proc;
sigset_t tset;
membar_consumer();
tset = l->l_sigpend.sp_set;
sigplusset(&p->p_sigpend.sp_set, &tset);
sigminusset(&p->p_sigctx.ps_sigignore, &tset);
sigminusset(&l->l_sigmask, &tset);
if (signo == 0) {
return firstsig(&tset);
}
return sigismember(&tset, signo) ? signo : 0;
}
void
getucontext(struct lwp *l, ucontext_t *ucp)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
ucp->uc_flags = 0;
ucp->uc_link = l->l_ctxlink;
ucp->uc_sigmask = l->l_sigmask;
ucp->uc_flags |= _UC_SIGMASK;
/*
* The (unsupplied) definition of the `current execution stack'
* in the System V Interface Definition appears to allow returning
* the main context stack.
*/
if ((l->l_sigstk.ss_flags & SS_ONSTACK) == 0) {
ucp->uc_stack.ss_sp = (void *)l->l_proc->p_stackbase;
ucp->uc_stack.ss_size = ctob(l->l_proc->p_vmspace->vm_ssize);
ucp->uc_stack.ss_flags = 0; /* XXX, def. is Very Fishy */
} else {
/* Simply copy alternate signal execution stack. */
ucp->uc_stack = l->l_sigstk;
}
ucp->uc_flags |= _UC_STACK;
mutex_exit(p->p_lock);
cpu_getmcontext(l, &ucp->uc_mcontext, &ucp->uc_flags);
mutex_enter(p->p_lock);
}
int
setucontext(struct lwp *l, const ucontext_t *ucp)
{
struct proc *p = l->l_proc;
int error;
KASSERT(mutex_owned(p->p_lock));
if ((ucp->uc_flags & _UC_SIGMASK) != 0) {
error = sigprocmask1(l, SIG_SETMASK, &ucp->uc_sigmask, NULL);
if (error != 0)
return error;
}
mutex_exit(p->p_lock);
error = cpu_setmcontext(l, &ucp->uc_mcontext, ucp->uc_flags);
mutex_enter(p->p_lock);
if (error != 0)
return (error);
l->l_ctxlink = ucp->uc_link;
/*
* If there was stack information, update whether or not we are
* still running on an alternate signal stack.
*/
if ((ucp->uc_flags & _UC_STACK) != 0) {
if (ucp->uc_stack.ss_flags & SS_ONSTACK)
l->l_sigstk.ss_flags |= SS_ONSTACK;
else
l->l_sigstk.ss_flags &= ~SS_ONSTACK;
}
return 0;
}
/*
* killpg1: common code for kill process group/broadcast kill.
*/
int
killpg1(struct lwp *l, ksiginfo_t *ksi, int pgid, int all)
{
struct proc *p, *cp;
kauth_cred_t pc;
struct pgrp *pgrp;
int nfound;
int signo = ksi->ksi_signo;
cp = l->l_proc;
pc = l->l_cred;
nfound = 0;
mutex_enter(&proc_lock);
if (all) {
/*
* Broadcast.
*/
PROCLIST_FOREACH(p, &allproc) {
if (p->p_pid <= 1 || p == cp ||
(p->p_flag & PK_SYSTEM) != 0)
continue;
mutex_enter(p->p_lock);
if (kauth_authorize_process(pc,
KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(signo), NULL,
NULL) == 0) {
nfound++;
if (signo)
kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
}
} else {
if (pgid == 0)
/* Zero pgid means send to my process group. */
pgrp = cp->p_pgrp;
else {
pgrp = pgrp_find(pgid);
if (pgrp == NULL)
goto out;
}
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
if (p->p_pid <= 1 || p->p_flag & PK_SYSTEM)
continue;
mutex_enter(p->p_lock);
if (kauth_authorize_process(pc, KAUTH_PROCESS_SIGNAL,
p, KAUTH_ARG(signo), NULL, NULL) == 0) {
nfound++;
if (signo && P_ZOMBIE(p) == 0)
kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
}
}
out:
mutex_exit(&proc_lock);
return nfound ? 0 : ESRCH;
}
/*
* Send a signal to a process group. If checktty is set, limit to members
* which have a controlling terminal.
*/
void
pgsignal(struct pgrp *pgrp, int sig, int checkctty)
{
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KSI_INIT_EMPTY(&ksi);
ksi.ksi_signo = sig;
kpgsignal(pgrp, &ksi, NULL, checkctty);
}
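/*
* As pgsignal(), but with caller-supplied siginfo and an optional data
* pointer that kpsignal() uses to record the matching file descriptor
* in ksi_fd.
*/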
void
kpgsignal(struct pgrp *pgrp, ksiginfo_t *ksi, void *data, int checkctty)
{
struct proc *p;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KASSERT(pgrp != NULL);
LIST_FOREACH(p, &pgrp->pg_members, p_pglist)
if (checkctty == 0 || p->p_lflag & PL_CONTROLT)
kpsignal(p, ksi, data);
}
/*
* Send a signal caused by a trap to the current LWP. If it will be caught
* immediately, deliver it with correct code. Otherwise, post it normally.
*/
void
trapsignal(struct lwp *l, ksiginfo_t *ksi)
{
struct proc *p;
struct sigacts *ps;
int signo = ksi->ksi_signo;
sigset_t *mask;
sig_t action;
KASSERT(KSI_TRAP_P(ksi));
ksi->ksi_lid = l->l_lid;
p = l->l_proc;
KASSERT(!cpu_intr_p());
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
repeat:
/*
* If we are exiting, demise now.
*
* This avoids notifying tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
lwp_exit(l);
panic("trapsignal");
/* NOTREACHED */
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
mutex_exit(&proc_lock);
sigswitch_unlock_and_switch_away(l);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
goto repeat;
}
mask = &l->l_sigmask;
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
if (ISSET(p->p_slflag, PSL_TRACED) &&
    !(p->p_pptr == p->p_opptr && ISSET(p->p_lflag, PL_PPWAIT)) &&
    p->p_xsig != SIGKILL &&
!sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
p->p_xsig = signo;
p->p_sigctx.ps_faked = true;
p->p_sigctx.ps_lwp = ksi->ksi_lid;
p->p_sigctx.ps_info = ksi->ksi_info;
sigswitch(0, signo, true);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, ksi);
else
ktrpsig(signo, action, mask, ksi);
}
return;
}
const bool caught = sigismember(&p->p_sigctx.ps_sigcatch, signo);
const bool masked = sigismember(mask, signo);
if (caught && !masked) {
mutex_exit(&proc_lock);
l->l_ru.ru_nsignals++;
kpsendsig(l, ksi, mask);
mutex_exit(p->p_lock);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, ksi);
else
ktrpsig(signo, action, mask, ksi);
}
return;
}
/*
* If the signal is masked or ignored, then unmask it and
* reset it to the default action so that the process or
* its tracer will be notified.
*/
const bool ignored = action == SIG_IGN;
if (masked || ignored) {
mutex_enter(&ps->sa_mutex);
sigdelset(mask, signo);
sigdelset(&p->p_sigctx.ps_sigcatch, signo);
sigdelset(&p->p_sigctx.ps_sigignore, signo);
sigdelset(&SIGACTION_PS(ps, signo).sa_mask, signo);
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
mutex_exit(&ps->sa_mutex);
}
kpsignal2(p, ksi);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
}
/*
* Fill in signal information and signal the parent for a child status change.
*/
void
child_psignal(struct proc *p, int mask)
{
ksiginfo_t ksi;
struct proc *q;
int xsig;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
xsig = p->p_xsig;
KSI_INIT(&ksi);
ksi.ksi_signo = SIGCHLD;
ksi.ksi_code = (xsig == SIGCONT ? CLD_CONTINUED : CLD_STOPPED);
ksi.ksi_pid = p->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(p->p_cred);
ksi.ksi_status = xsig;
ksi.ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec;
ksi.ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec;
q = p->p_pptr;
mutex_exit(p->p_lock);
mutex_enter(q->p_lock);
if ((q->p_sflag & mask) == 0)
kpsignal2(q, &ksi);
mutex_exit(q->p_lock);
mutex_enter(p->p_lock);
}
void
psignal(struct proc *p, int signo)
{
ksiginfo_t ksi;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KSI_INIT_EMPTY(&ksi);
ksi.ksi_signo = signo;
mutex_enter(p->p_lock);
kpsignal2(p, &ksi);
mutex_exit(p->p_lock);
}
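/*
* As psignal(), but with caller-supplied siginfo.  If 'data' matches a
* file's f_data in the target's descriptor table, the descriptor number
* is recorded in ksi_fd before delivery.
*/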
void
kpsignal(struct proc *p, ksiginfo_t *ksi, void *data)
{
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
if ((p->p_sflag & PS_WEXIT) == 0 && data) {
size_t fd;
filedesc_t *fdp = p->p_fd;
/* XXXSMP locking */
ksi->ksi_fd = -1;
dt = atomic_load_consume(&fdp->fd_dt);
for (fd = 0; fd < dt->dt_nfiles; fd++) {
if ((ff = dt->dt_ff[fd]) == NULL)
continue;
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL)
continue;
if (fp->f_data == data) {
ksi->ksi_fd = fd;
break;
}
}
}
mutex_enter(p->p_lock);
kpsignal2(p, ksi);
mutex_exit(p->p_lock);
}
/*
* sigismasked:
*
* Returns true if signal is ignored or masked for the specified LWP.
*/
int
sigismasked(struct lwp *l, int sig)
{
struct proc *p = l->l_proc;
return sigismember(&p->p_sigctx.ps_sigignore, sig) ||
sigismember(&l->l_sigmask, sig);
}
/*
* sigpost:
*
* Post a pending signal to an LWP. Returns non-zero if the LWP may
* be able to take the signal.
*/
static int
sigpost(struct lwp *l, sig_t action, int prop, int sig)
{
int rv, masked;
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
/*
* If the LWP is on the way out, sigclear() will be busy draining all
* pending signals. Don't give it more.
*/
if (l->l_stat == LSZOMB)
return 0;
SDT_PROBE(proc, kernel, , signal__send, l, p, sig, 0, 0);
lwp_lock(l);
if (__predict_false((l->l_flag & LW_DBGSUSPEND) != 0)) {
if ((prop & SA_KILL) != 0)
l->l_flag &= ~LW_DBGSUSPEND;
else {
lwp_unlock(l);
return 0;
}
}
/*
* Have the LWP check for signals. This ensures that even if no LWP
* is found to take the signal immediately, it should be taken soon.
*/
signotify(l);
/*
* SIGCONT can be masked, but if LWP is stopped, it needs restart.
* Note: SIGKILL and SIGSTOP cannot be masked.
*/
masked = sigismember(&l->l_sigmask, sig);
if (masked && ((prop & SA_CONT) == 0 || l->l_stat != LSSTOP)) {
lwp_unlock(l);
return 0;
}
/*
* If killing the process, make it run fast.
*/
if (__predict_false((prop & SA_KILL) != 0) && action == SIG_DFL &&
    l->l_priority < MAXPRI_USER) {
KASSERT(l->l_class == SCHED_OTHER);
lwp_changepri(l, MAXPRI_USER);
}
/*
* If the LWP is running or on a run queue, then we win. If it's
* sleeping interruptibly, wake it and make it take the signal. If
* the sleep isn't interruptible, then the chances are it will get
* to see the signal soon anyhow. If suspended, it can't take the
* signal right now. If it's LWP private or for all LWPs, save it
* for later; otherwise punt.
*/
rv = 0;
switch (l->l_stat) {
case LSRUN:
case LSONPROC:
rv = 1;
break;
case LSSLEEP:
if ((l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
}
break;
case LSSUSPENDED:
if ((prop & SA_KILL) != 0 && (l->l_flag & LW_WCORE) != 0) {
/* lwp_continue() will release the lock. */
lwp_continue(l);
return 1;
}
break;
case LSSTOP:
if ((prop & SA_STOP) != 0)
break;
/*
* If the LWP is stopped and we are sending a continue
* signal, then start it again.
*/
if ((prop & SA_CONT) != 0) {
if (l->l_wchan != NULL) {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
rv = 1;
break;
}
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
} else if (l->l_wchan == NULL || (l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
return 1;
}
break;
default:
break;
}
lwp_unlock(l);
return rv;
}
/*
* Notify an LWP that it has a pending signal.
*/
void
signotify(struct lwp *l)
{
KASSERT(lwp_locked(l, NULL));
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
}
/*
* Find an LWP within process p that is waiting on signal ksi, and hand
* it on.
*/
static int
sigunwait(struct proc *p, const ksiginfo_t *ksi)
{
struct lwp *l;
int signo;
KASSERT(mutex_owned(p->p_lock));
signo = ksi->ksi_signo;
if (ksi->ksi_lid != 0) {
/*
* Signal came via _lwp_kill(). Find the LWP and see if
* it's interested.
*/
if ((l = lwp_find(p, ksi->ksi_lid)) == NULL)
return 0;
if (l->l_sigwaited == NULL ||
!sigismember(&l->l_sigwaitset, signo))
return 0;
} else {
/*
* Look for any LWP that may be interested.
*/
LIST_FOREACH(l, &p->p_sigwaiters, l_sigwaiter) {
KASSERT(l->l_sigwaited != NULL);
if (sigismember(&l->l_sigwaitset, signo))
break;
}
}
if (l != NULL) {
l->l_sigwaited->ksi_info = ksi->ksi_info;
l->l_sigwaited = NULL;
LIST_REMOVE(l, l_sigwaiter);
cv_signal(&l->l_sigcv);
return 1;
}
return 0;
}
/*
* Send the signal to the process. If the signal has an action, the action
* is usually performed by the target process rather than the caller; we add
* the signal to the set of pending signals for the process.
*
* Exceptions:
* o When a stop signal is sent to a sleeping process that takes the
* default action, the process is stopped without awakening it.
* o SIGCONT restarts stopped processes (or puts them back to sleep)
* regardless of the signal action (eg, blocked or ignored).
*
* Other ignored signals are discarded immediately.
*/
int
kpsignal2(struct proc *p, ksiginfo_t *ksi)
{
int prop, signo = ksi->ksi_signo;
struct lwp *l = NULL;
ksiginfo_t *kp;
lwpid_t lid;
sig_t action;
bool toall;
bool traced;
int error = 0;
KASSERT(!cpu_intr_p());
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT((ksi->ksi_flags & KSI_QUEUED) == 0);
KASSERT(signo > 0);
KASSERT(signo < NSIG);
/*
* If the process is being created by fork, is a zombie or is
* exiting, then just drop the signal here and bail out.
*/
if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
return 0;
/*
* Notify any interested parties of the signal.
*/
KNOTE(&p->p_klist, NOTE_SIGNAL | signo);
/*
* Some signals including SIGKILL must act on the entire process.
*/
kp = NULL;
prop = sigprop[signo];
toall = ((prop & SA_TOALL) != 0);
lid = toall ? 0 : ksi->ksi_lid;
traced = ISSET(p->p_slflag, PSL_TRACED) &&
!sigismember(&p->p_sigctx.ps_sigpass, signo);
/*
* If proc is traced, always give parent a chance.
*/
if (traced) {
action = SIG_DFL;
if (lid == 0) {
/*
* If the process is being traced and the signal
* is being caught, make sure to save any ksiginfo.
*/
if ((kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL)
goto discard;
if ((error = sigput(&p->p_sigpend, p, kp)) != 0)
goto out;
}
} else {
/*
* If the signal is being ignored, then drop it. Note: we
* don't set SIGCONT in ps_sigignore, and if it is set to
* SIG_IGN, action will be SIG_DFL here.
*/
if (sigismember(&p->p_sigctx.ps_sigignore, signo))
goto discard;
else if (sigismember(&p->p_sigctx.ps_sigcatch, signo))
action = SIG_CATCH;
else {
action = SIG_DFL;
/*
* If sending a tty stop signal to a member of an
* orphaned process group, discard the signal here if
* the action is default; don't stop the process below
* if sleeping, and don't clear any pending SIGCONT.
*/
if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0)
goto discard;
if (prop & SA_KILL && p->p_nice > NZERO)
p->p_nice = NZERO;
}
}
/*
* If stopping or continuing a process, discard any pending
* signals that would do the inverse.
*/
if ((prop & (SA_CONT | SA_STOP)) != 0) {
ksiginfoq_t kq;
ksiginfo_queue_init(&kq);
if ((prop & SA_CONT) != 0)
sigclear(&p->p_sigpend, &stopsigmask, &kq);
if ((prop & SA_STOP) != 0)
sigclear(&p->p_sigpend, &contsigmask, &kq);
ksiginfo_queue_drain(&kq); /* XXXSMP */
}
/*
* If the signal doesn't have SA_CANTMASK (no override for SIGKILL,
* please!), check if any LWPs are waiting on it. If yes, pass on
* the signal info. The signal won't be processed further here.
*/
if ((prop & SA_CANTMASK) == 0 && !LIST_EMPTY(&p->p_sigwaiters) &&
    p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0 &&
    sigunwait(p, ksi))
goto discard;
/*
* XXXSMP Should be allocated by the caller, we're holding locks
* here.
*/
if (kp == NULL && (kp = ksiginfo_alloc(p, ksi, PR_NOWAIT)) == NULL)
goto discard;
/*
* LWP private signals are easy - just find the LWP and post
* the signal to it.
*/
if (lid != 0) {
l = lwp_find(p, lid);
if (l != NULL) {
if ((error = sigput(&l->l_sigpend, p, kp)) != 0)
goto out;
membar_producer();
if (sigpost(l, action, prop, kp->ksi_signo) != 0)
signo = -1;
}
goto out;
}
/*
* Some signals go to all LWPs, even if posted with _lwp_kill()
* or for an SA process.
*/
if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) {
if (traced)
goto deliver;
/*
* If SIGCONT is default (or ignored) and process is
* asleep, we are finished; the process should not
* be awakened.
*/
if ((prop & SA_CONT) != 0 && action == SIG_DFL)
goto out;
} else {
/*
* Process is stopped or stopping.
* - If traced, then no action is needed, unless killing.
* - Run the process only if sending SIGCONT or SIGKILL.
*/
if (traced && signo != SIGKILL) {
goto out;
}
if ((prop & SA_CONT) != 0 || signo == SIGKILL) {
/*
* Re-adjust p_nstopchild if the process was
* stopped but not yet collected by its parent.
*/
if (p->p_stat == SSTOP && !p->p_waited)
p->p_pptr->p_nstopchild--;
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
if (traced) {
KASSERT(signo == SIGKILL);
goto deliver;
}
/*
* Do not make signal pending if SIGCONT is default.
*
* If the process catches SIGCONT, let it handle the
* signal itself (if waiting on event - process runs,
* otherwise continues sleeping).
*/
if ((prop & SA_CONT) != 0) {
p->p_xsig = SIGCONT;
p->p_sflag |= PS_CONTINUED;
child_psignal(p, 0);
if (action == SIG_DFL) {
KASSERT(signo != SIGKILL);
goto deliver;
}
}
} else if ((prop & SA_STOP) != 0) {
/*
* Already stopped, don't need to stop again.
* (If we did, the shell could get confused.)
*/
goto out;
}
}
/*
* Make signal pending.
*/
KASSERT(!traced);
if ((error = sigput(&p->p_sigpend, p, kp)) != 0)
goto out;
deliver:
/*
* Before we set LW_PENDSIG on any LWP, ensure that the signal is
* visible on the per process list (for sigispending()). This
* is unlikely to be needed in practice, but...
*/
membar_producer();
/*
* Try to find an LWP that can take the signal.
*/
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (sigpost(l, action, prop, kp->ksi_signo) && !toall)
break;
}
signo = -1;
out:
/*
* If the ksiginfo wasn't used, then bin it. XXXSMP freeing memory
* with locks held. The caller should take care of this.
*/
ksiginfo_free(kp);
if (signo == -1)
return error;
discard:
SDT_PROBE(proc, kernel, , signal__discard, l, p, signo, 0, 0);
return error;
}
void
kpsendsig(struct lwp *l, const ksiginfo_t *ksi, const sigset_t *mask)
{
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
(*p->p_emul->e_sendsig)(ksi, mask);
}
/*
* Stop any LWPs sleeping interruptibly.
*/
static void
proc_stop_lwps(struct proc *p)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
KASSERT((p->p_sflag & PS_STOPPING) != 0);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
if (l->l_stat == LSSLEEP && (l->l_flag & LW_SINTR) != 0) {
l->l_stat = LSSTOP;
p->p_nrlwps--;
}
lwp_unlock(l);
}
}
/*
* Finish stopping of a process. Mark it stopped and notify the parent.
*
* Drop p_lock briefly if ppsig is true.
*/
static void
proc_stop_done(struct proc *p, int ppmask)
{
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT((p->p_sflag & PS_STOPPING) != 0);
KASSERT(p->p_nrlwps == 0 || p->p_nrlwps == 1);
KASSERT(p->p_nrlwps == 0 || p == curproc);
p->p_sflag &= ~PS_STOPPING;
p->p_stat = SSTOP;
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
/* child_psignal drops p_lock briefly. */
child_psignal(p, ppmask);
cv_broadcast(&p->p_pptr->p_waitcv);
}
/*
* Stop the current process and switch away to the debugger notifying
* an event specific to a traced process only.
*/
void
eventswitch(int code, int pe_report_event, int entity)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
struct sigacts *ps;
sigset_t *mask;
sig_t action;
ksiginfo_t ksi;
const int signo = SIGTRAP;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
KASSERT(p->p_pptr != initproc);
KASSERT(l->l_stat == LSONPROC);
KASSERT(ISSET(p->p_slflag, PSL_TRACED));
KASSERT(!ISSET(l->l_flag, LW_SYSTEM));
KASSERT(p->p_nrlwps > 0);
KASSERT((code == TRAP_CHLD) || (code == TRAP_LWP) ||
(code == TRAP_EXEC));
KASSERT((code != TRAP_CHLD) || (entity > 1)); /* prevent pid1 */
KASSERT((code != TRAP_LWP) || (entity > 0));
repeat:
/*
* If we are exiting, demise now.
*
* This avoids notifying tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
if (pe_report_event == PTRACE_LWP_EXIT) {
/* Avoid double lwp_exit() and panic. */
return;
}
lwp_exit(l);
panic("eventswitch");
/* NOTREACHED */
}
/*
* If we are no longer traced, abandon this event signal.
*
* This avoids killing a process after detaching the debugger.
*/
if (__predict_false(!ISSET(p->p_slflag, PSL_TRACED))) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
/*
* If there's a pending SIGKILL, process it immediately.
*/
if (p->p_xsig == SIGKILL ||
sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
mutex_exit(&proc_lock);
sigswitch_unlock_and_switch_away(l);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
goto repeat;
}
KSI_INIT_TRAP(&ksi);
ksi.ksi_lid = l->l_lid;
ksi.ksi_signo = signo;
ksi.ksi_code = code;
ksi.ksi_pe_report_event = pe_report_event;
CTASSERT(sizeof(ksi.ksi_pe_other_pid) == sizeof(ksi.ksi_pe_lwp));
ksi.ksi_pe_other_pid = entity;
/* Needed for ktrace */
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
mask = &l->l_sigmask;
p->p_xsig = signo;
p->p_sigctx.ps_faked = true;
p->p_sigctx.ps_lwp = ksi.ksi_lid;
p->p_sigctx.ps_info = ksi.ksi_info;
sigswitch(0, signo, true);
if (code == TRAP_CHLD) {
mutex_enter(&proc_lock);
while (l->l_vforkwaiting)
cv_wait(&l->l_waitcv, &proc_lock);
mutex_exit(&proc_lock);
}
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, &ksi);
else
ktrpsig(signo, action, mask, &ksi);
}
}
void
eventswitchchild(struct proc *p, int code, int pe_report_event)
{
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) !=
(PSL_TRACED|PSL_TRACEDCHILD)) {
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return;
}
eventswitch(code, pe_report_event, p->p_oppid);
}
/*
* Stop the current process and switch away when being stopped or traced.
*/
static void
sigswitch(int ppmask, int signo, bool proc_lock_held)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_stat == LSONPROC);
KASSERT(p->p_nrlwps > 0);
if (proc_lock_held) {
KASSERT(mutex_owned(&proc_lock));
} else {
KASSERT(!mutex_owned(&proc_lock));
}
/*
* On entry we know that the process needs to stop. If it's
* the result of a 'sideways' stop signal that has been sourced
* through issignal(), then stop other LWPs in the process too.
*/
if (p->p_stat == SACTIVE && (p->p_sflag & PS_STOPPING) == 0) {
KASSERT(signo != 0);
proc_stop(p, signo);
KASSERT(p->p_nrlwps > 0);
}
/*
* If we are the last live LWP, and the stop was a result of
* a new signal, then signal the parent.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
if (!proc_lock_held && !mutex_tryenter(&proc_lock)) {
mutex_exit(p->p_lock);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
}
if (p->p_nrlwps == 1 && (p->p_sflag & PS_STOPPING) != 0) {
/*
* Note that proc_stop_done() can drop
* p->p_lock briefly.
*/
proc_stop_done(p, ppmask);
}
mutex_exit(&proc_lock);
}
sigswitch_unlock_and_switch_away(l);
}
/*
* Unlock and switch away.
*/
static void
sigswitch_unlock_and_switch_away(struct lwp *l)
{
struct proc *p;
p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(!mutex_owned(&proc_lock));
KASSERT(l->l_stat == LSONPROC);
KASSERT(p->p_nrlwps > 0);
KASSERT(l->l_blcnt == 0);
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
p->p_nrlwps--;
lwp_lock(l);
KASSERT(l->l_stat == LSONPROC || l->l_stat == LSSLEEP);
l->l_stat = LSSTOP;
lwp_unlock(l);
}
mutex_exit(p->p_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
}
/*
* Check for a signal from the debugger.
*/
static int
sigchecktrace(void)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int signo;
KASSERT(mutex_owned(p->p_lock));
/* If there's a pending SIGKILL, process it immediately. */
if (sigismember(&p->p_sigpend.sp_set, SIGKILL))
return 0;
/*
* If we are no longer being traced, or the parent didn't
* give us a signal, or we're stopping, look for more signals.
*/
if ((p->p_slflag & PSL_TRACED) == 0 || p->p_xsig == 0 ||
(p->p_sflag & PS_STOPPING) != 0)
return 0;
/*
* If the new signal is being masked, look for other signals.
* `p->p_sigctx.ps_siglist |= mask' is done in setrunnable().
*/
signo = p->p_xsig;
p->p_xsig = 0;
if (sigismember(&l->l_sigmask, signo)) {
signo = 0;
}
return signo;
}
/*
* If the current process has received a signal (should be caught or cause
* termination, should interrupt current syscall), return the signal number.
*
* Stop signals with default action are processed immediately, then cleared;
* they aren't returned. This is checked after each entry to the system for
* a syscall or trap.
*
* We will also return -1 if the process is exiting and the current LWP must
* follow suit.
*/
int
issignal(struct lwp *l)
{
struct proc *p;
int siglwp, signo, prop;
sigpend_t *sp;
sigset_t ss;
bool traced;
p = l->l_proc;
sp = NULL;
signo = 0;
KASSERT(p == curproc);
KASSERT(mutex_owned(p->p_lock));
for (;;) {
/* Discard any signals that we have decided not to take. */
if (signo != 0) {
(void)sigget(sp, NULL, signo, NULL);
}
/*
* If the process is stopped/stopping, then stop ourselves
* now that we're on the kernel/userspace boundary. When
* we awaken, check for a signal from the debugger.
*/
if (p->p_stat == SSTOP || (p->p_sflag & PS_STOPPING) != 0) {
sigswitch_unlock_and_switch_away(l);
mutex_enter(p->p_lock);
continue;
} else if (p->p_stat == SACTIVE)
signo = sigchecktrace();
else
signo = 0;
/* Signals from the debugger are "out of band". */
sp = NULL;
/*
* If the debugger didn't provide a signal, find a pending
* signal from our set. Check per-LWP signals first, and
* then per-process.
*/
if (signo == 0) {
sp = &l->l_sigpend;
ss = sp->sp_set;
siglwp = l->l_lid;
if ((p->p_lflag & PL_PPWAIT) != 0)
sigminusset(&vforksigmask, &ss);
sigminusset(&l->l_sigmask, &ss);
if ((signo = firstsig(&ss)) == 0) {
sp = &p->p_sigpend;
ss = sp->sp_set;
siglwp = 0;
if ((p->p_lflag & PL_PPWAIT) != 0)
sigminusset(&vforksigmask, &ss);
sigminusset(&l->l_sigmask, &ss);
if ((signo = firstsig(&ss)) == 0) {
/*
* No signal pending - clear the
* indicator and bail out.
*/
lwp_lock(l);
l->l_flag &= ~LW_PENDSIG;
lwp_unlock(l);
sp = NULL;
break;
}
}
}
traced = ISSET(p->p_slflag, PSL_TRACED) &&
!sigismember(&p->p_sigctx.ps_sigpass, signo);
if (sp) {
/*
* Overwrite the process' signal context to correspond
* to the currently reported LWP. This is necessary
* for PT_GET_SIGINFO to report the correct signal when
* multiple LWPs have pending signals. We do this only
* when the signal comes from the queue; for signals
* created by the debugger we assume it set the correct
* siginfo.
*/
ksiginfo_t *ksi = TAILQ_FIRST(&sp->sp_info);
if (ksi) {
p->p_sigctx.ps_lwp = ksi->ksi_lid;
p->p_sigctx.ps_info = ksi->ksi_info;
} else {
p->p_sigctx.ps_lwp = siglwp;
memset(&p->p_sigctx.ps_info, 0,
sizeof(p->p_sigctx.ps_info));
p->p_sigctx.ps_info._signo = signo;
p->p_sigctx.ps_info._code = SI_NOINFO;
}
}
/*
* We should see pending but ignored signals only if
* we are being traced.
*/
if (sigismember(&p->p_sigctx.ps_sigignore, signo) &&
!traced) {
/* Discard the signal. */
continue;
}
/*
* If traced, always stop, and stay stopped until released
* by the debugger. If our parent is our debugger waiting
* for us and we vforked, don't hang as we could deadlock.
*/
if (traced && signo != SIGKILL &&
!(ISSET(p->p_lflag, PL_PPWAIT) &&
(p->p_pptr == p->p_opptr))) {
/*
* Take the signal, but don't remove it from the
* siginfo queue, because the debugger can send
* it later.
*/
if (sp)
sigdelset(&sp->sp_set, signo);
p->p_xsig = signo;
/* Handling of signal trace */
sigswitch(0, signo, false);
mutex_enter(p->p_lock);
/* Check for a signal from the debugger. */
if ((signo = sigchecktrace()) == 0)
continue;
/* Signals from the debugger are "out of band". */
sp = NULL;
}
prop = sigprop[signo];
/*
* Decide whether the signal should be returned.
*/
switch ((long)SIGACTION(p, signo).sa_handler) {
case (long)SIG_DFL:
/*
* Don't take default actions on system processes.
*/
if (p->p_pid <= 1) {
#ifdef DIAGNOSTIC
/*
* Are you sure you want to ignore SIGSEGV
* in init? XXX
*/
printf_nolog("Process (pid %d) got sig %d\n",
p->p_pid, signo);
#endif
continue;
}
/*
* If there is a pending stop signal to process with
* default action, stop here, then clear the signal.
* However, if process is member of an orphaned
* process group, ignore tty stop signals.
*/
if (prop & SA_STOP) {
/*
* XXX Don't hold proc_lock for p_lflag,
* but it's not a big deal.
*/
if ((traced &&
!(ISSET(p->p_lflag, PL_PPWAIT) &&
(p->p_pptr == p->p_opptr))) ||
((p->p_lflag & PL_ORPHANPG) != 0 &&
prop & SA_TTYSTOP)) {
/* Ignore the signal. */
continue;
}
/* Take the signal. */
(void)sigget(sp, NULL, signo, NULL);
p->p_xsig = signo;
p->p_sflag &= ~PS_CONTINUED;
signo = 0;
sigswitch(PS_NOCLDSTOP, p->p_xsig, false);
mutex_enter(p->p_lock);
} else if (prop & SA_IGNORE) {
/*
* Except for SIGCONT, shouldn't get here.
* Default action is to ignore; drop it.
*/
continue;
}
break;
case (long)SIG_IGN:
#ifdef DEBUG_ISSIGNAL
/*
* Masking above should prevent us ever trying
* to take action on an ignored signal other
* than SIGCONT, unless process is traced.
*/
if ((prop & SA_CONT) == 0 && !traced)
printf_nolog("issignal\n");
#endif
continue;
default:
/*
* This signal has an action, let postsig() process
* it.
*/
break;
}
break;
}
l->l_sigpendset = sp;
return signo;
}
/*
* Take the action for the specified signal
* from the current set of pending signals.
*/
void
postsig(int signo)
{
struct lwp *l;
struct proc *p;
struct sigacts *ps;
sig_t action;
sigset_t *returnmask;
ksiginfo_t ksi;
l = curlwp;
p = l->l_proc;
ps = p->p_sigacts;
KASSERT(mutex_owned(p->p_lock));
KASSERT(signo > 0);
/*
* Set the new mask value and also defer further occurrences of this
* signal.
*
* Special case: user has done a sigsuspend. Here the current mask is
* not of interest, but rather the mask from before the sigsuspend is
* what we want restored after the signal processing is completed.
*/
if (l->l_sigrestore) {
returnmask = &l->l_sigoldmask;
l->l_sigrestore = 0;
} else
returnmask = &l->l_sigmask;
/*
* Commit to taking the signal before releasing the mutex.
*/
action = SIGACTION_PS(ps, signo).sa_handler;
l->l_ru.ru_nsignals++;
if (l->l_sigpendset == NULL) {
/* From the debugger */
if (p->p_sigctx.ps_faked &&
signo == p->p_sigctx.ps_info._signo) {
KSI_INIT(&ksi);
ksi.ksi_info = p->p_sigctx.ps_info;
ksi.ksi_lid = p->p_sigctx.ps_lwp;
p->p_sigctx.ps_faked = false;
} else {
if (!siggetinfo(&l->l_sigpend, &ksi, signo))
(void)siggetinfo(&p->p_sigpend, &ksi, signo);
}
} else
sigget(l->l_sigpendset, &ksi, signo, NULL);
if (ktrpoint(KTR_PSIG)) {
mutex_exit(p->p_lock);
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action,
returnmask, &ksi);
else
ktrpsig(signo, action, returnmask, &ksi);
mutex_enter(p->p_lock);
}
SDT_PROBE(proc, kernel, , signal__handle, signo, &ksi, action, 0, 0);
if (action == SIG_DFL) {
/*
* Default action, where the default is to kill
* the process. (Other cases were ignored above.)
*/
sigexit(l, signo);
return;
}
/*
* If we get here, the signal must be caught.
*/
#ifdef DIAGNOSTIC
if (action == SIG_IGN || sigismember(&l->l_sigmask, signo))
panic("postsig action");
#endif
kpsendsig(l, &ksi, returnmask);
}
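/*
* Illustrative sketch (added for exposition, not part of the original
* source): issignal() and postsig() are consumed together on the return
* path to user mode, roughly as below; the actual call site lives in the
* LWP userret code, not in this file.
*
*     mutex_enter(p->p_lock);
*     while ((signo = issignal(l)) != 0)
*             postsig(signo);
*     mutex_exit(p->p_lock);
*/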
/*
* sendsig:
*
* Default signal delivery method for NetBSD.
*/
void
sendsig(const struct ksiginfo *ksi, const sigset_t *mask)
{
struct sigacts *sa;
int sig;
sig = ksi->ksi_signo;
sa = curproc->p_sigacts;
switch (sa->sa_sigdesc[sig].sd_vers) {
case __SIGTRAMP_SIGCODE_VERSION:
#ifdef __HAVE_STRUCT_SIGCONTEXT
case __SIGTRAMP_SIGCONTEXT_VERSION_MIN ...
__SIGTRAMP_SIGCONTEXT_VERSION_MAX:
/* Compat for 1.6 and earlier. */
MODULE_HOOK_CALL_VOID(sendsig_sigcontext_16_hook, (ksi, mask),
break);
return;
#endif /* __HAVE_STRUCT_SIGCONTEXT */
case __SIGTRAMP_SIGINFO_VERSION_MIN ...
__SIGTRAMP_SIGINFO_VERSION_MAX:
sendsig_siginfo(ksi, mask);
return;
default:
break;
}
printf("sendsig: bad version %d\n", sa->sa_sigdesc[sig].sd_vers);
sigexit(curlwp, SIGILL);
}
/*
* sendsig_reset:
*
* Reset the signal action. Called from emulation specific sendsig()
* before unlocking to deliver the signal.
*/
void
sendsig_reset(struct lwp *l, int signo)
{
struct proc *p = l->l_proc;
struct sigacts *ps = p->p_sigacts;
KASSERT(mutex_owned(p->p_lock));
p->p_sigctx.ps_lwp = 0;
memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info));
mutex_enter(&ps->sa_mutex);
sigplusset(&SIGACTION_PS(ps, signo).sa_mask, &l->l_sigmask);
if (SIGACTION_PS(ps, signo).sa_flags & SA_RESETHAND) {
sigdelset(&p->p_sigctx.ps_sigcatch, signo);
if (signo != SIGCONT && sigprop[signo] & SA_IGNORE)
sigaddset(&p->p_sigctx.ps_sigignore, signo);
SIGACTION_PS(ps, signo).sa_handler = SIG_DFL;
}
mutex_exit(&ps->sa_mutex);
}
/*
* Kill the current process for stated reason.
*/
void
killproc(struct proc *p, const char *why)
{
KASSERT(mutex_owned(&proc_lock));
log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why);
uprintf_locked("sorry, pid %d was killed: %s\n", p->p_pid, why);
psignal(p, SIGKILL);
}
/*
* Force the current process to exit with the specified signal, dumping core
* if appropriate. We bypass the normal tests for masked and caught
* signals, allowing unrecoverable failures to terminate the process without
* changing signal state. Mark the accounting record with the signal
* termination. If dumping core, save the signal number for the debugger.
* Calls exit and does not return.
*/
void
sigexit(struct lwp *l, int signo)
{
int exitsig, error, docore;
struct proc *p;
struct lwp *t;
p = l->l_proc;
KASSERT(mutex_owned(p->p_lock));
KASSERT(l->l_blcnt == 0);
/*
* Don't permit coredump() multiple times in the same process.
* Call back into sigexit, where we will be suspended until
* the deed is done. Note that this is a recursive call, but
* LW_WCORE will prevent us from coming back this way.
*/
if ((p->p_sflag & PS_WCORE) != 0) {
lwp_lock(l);
l->l_flag |= (LW_WCORE | LW_WEXIT | LW_WSUSPEND);
lwp_need_userret(l);
lwp_unlock(l);
mutex_exit(p->p_lock);
lwp_userret(l);
panic("sigexit 1");
/* NOTREACHED */
}
/* If process is already on the way out, then bail now. */
if ((p->p_sflag & PS_WEXIT) != 0) {
mutex_exit(p->p_lock);
lwp_exit(l);
panic("sigexit 2");
/* NOTREACHED */
}
/*
* Prepare all other LWPs for exit. If dumping core, suspend them
* so that their registers are available long enough to be dumped.
*/
if ((docore = (sigprop[signo] & SA_CORE)) != 0) {
p->p_sflag |= PS_WCORE;
for (;;) {
LIST_FOREACH(t, &p->p_lwps, l_sibling) {
lwp_lock(t);
if (t == l) {
t->l_flag &=
~(LW_WSUSPEND | LW_DBGSUSPEND);
lwp_unlock(t);
continue;
}
t->l_flag |= (LW_WCORE | LW_WEXIT);
lwp_need_userret(t);
lwp_suspend(l, t);
}
if (p->p_nrlwps == 1)
break;
/*
* Kick any LWPs sitting in lwp_wait1(), and wait
* for everyone else to stop before proceeding.
*/
p->p_nlwpwait++;
cv_broadcast(&p->p_lwpcv);
cv_wait(&p->p_lwpcv, p->p_lock);
p->p_nlwpwait--;
}
}
exitsig = signo;
p->p_acflag |= AXSIG;
memset(&p->p_sigctx.ps_info, 0, sizeof(p->p_sigctx.ps_info));
p->p_sigctx.ps_info._signo = signo;
p->p_sigctx.ps_info._code = SI_NOINFO;
if (docore) {
mutex_exit(p->p_lock);
MODULE_HOOK_CALL(coredump_hook, (l, NULL), enosys(), error);
if (kern_logsigexit) {
int uid = l->l_cred ?
(int)kauth_cred_geteuid(l->l_cred) : -1;
if (error)
log(LOG_INFO, lognocoredump, p->p_pid,
p->p_comm, uid, signo, error);
else
log(LOG_INFO, logcoredump, p->p_pid,
p->p_comm, uid, signo);
}
#ifdef PAX_SEGVGUARD
rw_enter(&exec_lock, RW_WRITER);
pax_segvguard(l, p->p_textvp, p->p_comm, true);
rw_exit(&exec_lock);
#endif /* PAX_SEGVGUARD */
/* Acquire the sched state mutex. exit1() will release it. */
mutex_enter(p->p_lock);
if (error == 0)
p->p_sflag |= PS_COREDUMP;
}
/* No longer dumping core. */
p->p_sflag &= ~PS_WCORE;
exit1(l, 0, exitsig);
/* NOTREACHED */
}
/*
* Since the "real" code may (or may not) be present in loadable module,
* we provide routines here which calls the module hooks.
*/
int
coredump_netbsd(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_netbsd_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_netbsd32(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_netbsd32_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_elf32(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_elf32_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
int
coredump_elf64(struct lwp *l, struct coredump_iostate *iocookie)
{
int retval;
MODULE_HOOK_CALL(coredump_elf64_hook, (l, iocookie), ENOSYS, retval);
return retval;
}
/*
* Put process 'p' into the stopped state and optionally notify the parent.
*/
void
proc_stop(struct proc *p, int signo)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
/*
* First off, set the stopping indicator and bring all sleeping
* LWPs to a halt so they are included in p->p_nrlwps. We mustn't
* unlock between here and the p->p_nrlwps check below.
*/
p->p_sflag |= PS_STOPPING;
membar_producer();
proc_stop_lwps(p);
/*
* If there are no LWPs available to take the signal, then we
* signal the parent process immediately. Otherwise, the last
* LWP to stop will take care of it.
*/
if (p->p_nrlwps == 0) {
proc_stop_done(p, PS_NOCLDSTOP);
} else {
/*
* Have the remaining LWPs come to a halt, and trigger
* proc_stop_callout() to ensure that they do.
*/
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
sigpost(l, SIG_DFL, SA_STOP, signo);
}
callout_schedule(&proc_stop_ch, 1);
}
}
/*
* When stopping a process, we do not immediately set sleeping LWPs stopped,
* but wait for them to come to a halt at the kernel-user boundary. This is
* to allow LWPs to release any locks that they may hold before stopping.
*
* Non-interruptible sleeps can be long, and there is the potential for an
* LWP to begin sleeping interruptibly soon after the process has been set
* stopping (PS_STOPPING). These LWPs will not notice that the process is
* stopping, and so complete halt of the process and the return of status
* information to the parent could be delayed indefinitely.
*
* To handle this race, proc_stop_callout() runs once per tick while there
* are stopping processes in the system. It sets LWPs that are sleeping
* interruptibly into the LSSTOP state.
*
* Note that we are not concerned about keeping all LWPs stopped while the
* process is stopped: stopped LWPs can awaken briefly to handle signals.
* What we do need to ensure is that all LWPs in a stopping process have
* stopped at least once, so that notification can be sent to the parent
* process.
*/
static void
proc_stop_callout(void *cookie)
{
bool more, restart;
struct proc *p;
(void)cookie;
do {
restart = false;
more = false;
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
mutex_enter(p->p_lock);
if ((p->p_sflag & PS_STOPPING) == 0) {
mutex_exit(p->p_lock);
continue;
}
/* Stop any LWPs sleeping interruptibly. */
proc_stop_lwps(p);
if (p->p_nrlwps == 0) {
/*
* We brought the process to a halt.
* Mark it as stopped and notify the
* parent.
*
* Note that proc_stop_done() will
* drop p->p_lock briefly.
* Arrange to restart and check
* all processes again.
*/
restart = true;
proc_stop_done(p, PS_NOCLDSTOP);
} else
more = true;
mutex_exit(p->p_lock);
if (restart)
break;
}
mutex_exit(&proc_lock);
} while (restart);
/*
* If we noted processes that are stopping but still have
* running LWPs, then arrange to check again in 1 tick.
*/
if (more)
callout_schedule(&proc_stop_ch, 1);
}
/*
* Given a process in state SSTOP, set the state back to SACTIVE and
* move LSSTOP'd LWPs to LSSLEEP or make them runnable.
*/
void
proc_unstop(struct proc *p)
{
struct lwp *l;
int sig;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
p->p_stat = SACTIVE;
p->p_sflag &= ~PS_STOPPING;
sig = p->p_xsig;
if (!p->p_waited)
	p->p_pptr->p_nstopchild--;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
if (l->l_stat != LSSTOP || (l->l_flag & LW_DBGSUSPEND) != 0) {
lwp_unlock(l);
continue;
}
if (l->l_wchan == NULL) {
setrunnable(l);
continue;
}
if (sig && (l->l_flag & LW_SINTR) != 0) {
setrunnable(l);
sig = 0;
} else {
l->l_stat = LSSLEEP;
p->p_nrlwps++;
lwp_unlock(l);
}
}
}
void
proc_stoptrace(int trapno, int sysnum, const register_t args[],
const register_t *ret, int error)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
struct sigacts *ps;
sigset_t *mask;
sig_t action;
ksiginfo_t ksi;
size_t i, sy_narg;
const int signo = SIGTRAP;
KASSERT((trapno == TRAP_SCE) || (trapno == TRAP_SCX));
KASSERT(p->p_pptr != initproc);
KASSERT(ISSET(p->p_slflag, PSL_TRACED));
KASSERT(ISSET(p->p_slflag, PSL_SYSCALL));
sy_narg = p->p_emul->e_sysent[sysnum].sy_narg;
KSI_INIT_TRAP(&ksi);
ksi.ksi_lid = l->l_lid;
ksi.ksi_signo = signo;
ksi.ksi_code = trapno;
ksi.ksi_sysnum = sysnum;
if (trapno == TRAP_SCE) {
ksi.ksi_retval[0] = 0;
ksi.ksi_retval[1] = 0;
ksi.ksi_error = 0;
} else {
ksi.ksi_retval[0] = ret[0];
ksi.ksi_retval[1] = ret[1];
ksi.ksi_error = error;
}
memset(ksi.ksi_args, 0, sizeof(ksi.ksi_args));
for (i = 0; i < sy_narg; i++)
ksi.ksi_args[i] = args[i];
mutex_enter(p->p_lock);
repeat:
/*
* If we are exiting, demise now.
*
* This avoids notifying tracer and deadlocking.
*/
if (__predict_false(ISSET(p->p_sflag, PS_WEXIT))) {
mutex_exit(p->p_lock);
lwp_exit(l);
panic("proc_stoptrace");
/* NOTREACHED */
}
/*
* If there is a pending SIGKILL, process it immediately.
*/
if (p->p_xsig == SIGKILL ||
sigismember(&p->p_sigpend.sp_set, SIGKILL)) {
mutex_exit(p->p_lock);
return;
}
/*
* If we are no longer traced, abandon this event signal.
*
* This avoids killing a process after detaching the debugger.
*/
if (__predict_false(!ISSET(p->p_slflag, PSL_TRACED))) {
mutex_exit(p->p_lock);
return;
}
/*
* The process is already stopping.
*/
if ((p->p_sflag & PS_STOPPING) != 0) {
sigswitch_unlock_and_switch_away(l);
mutex_enter(p->p_lock);
goto repeat;
}
/* Needed for ktrace */
ps = p->p_sigacts;
action = SIGACTION_PS(ps, signo).sa_handler;
mask = &l->l_sigmask;
p->p_xsig = signo;
p->p_sigctx.ps_lwp = ksi.ksi_lid;
p->p_sigctx.ps_info = ksi.ksi_info;
sigswitch(0, signo, false);
if (ktrpoint(KTR_PSIG)) {
if (p->p_emul->e_ktrpsig)
p->p_emul->e_ktrpsig(signo, action, mask, &ksi);
else
ktrpsig(signo, action, mask, &ksi);
}
}
static int
filt_sigattach(struct knote *kn)
{
struct proc *p = curproc;
kn->kn_obj = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
mutex_enter(p->p_lock);
klist_insert(&p->p_klist, kn);
mutex_exit(p->p_lock);
return 0;
}
static void
filt_sigdetach(struct knote *kn)
{
struct proc *p = kn->kn_obj;
mutex_enter(p->p_lock);
klist_remove(&p->p_klist, kn);
mutex_exit(p->p_lock);
}
/*
* Signal knotes are shared with proc knotes, so we apply a mask to
* the hint in order to differentiate them from process hints. This
* could be avoided by using a signal-specific knote list, but probably
* isn't worth the trouble.
*/
static int
filt_signal(struct knote *kn, long hint)
{
if (hint & NOTE_SIGNAL) {
hint &= ~NOTE_SIGNAL;
if (kn->kn_id == hint)
kn->kn_data++;
}
return (kn->kn_data != 0);
}
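/*
* A minimal userland sketch of how this filter is typically used via
* kqueue(2); SIGUSR1 is just an arbitrary example signal:
*
*	int kq = kqueue();
*	struct kevent ev;
*	EV_SET(&ev, SIGUSR1, EVFILT_SIGNAL, EV_ADD, 0, 0, 0);
*	kevent(kq, &ev, 1, NULL, 0, NULL);
*	...
*	kevent(kq, NULL, 0, &ev, 1, NULL);
*
* The second kevent() call returns with ev.data holding the number of
* deliveries since the last check.  EVFILT_SIGNAL only observes delivery;
* it does not consume the signal, so the signal should still be ignored
* or handled as usual.
*/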
const struct filterops sig_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_sigattach,
.f_detach = filt_sigdetach,
.f_event = filt_signal,
};
/* $NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $ */
/*-
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_prof.c 8.4 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $");
#ifdef _KERNEL_OPT
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/cpu.h>
#ifdef GPROF
#include <sys/malloc.h>
#include <sys/gmon.h>
#include <sys/xcall.h>
MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
static int sysctl_kern_profiling(SYSCTLFN_ARGS);
#ifdef MULTIPROCESSOR
void _gmonparam_merge(struct gmonparam *, struct gmonparam *);
#endif
/*
* froms[] is an array of unsigned shorts indexing into tos[].
*/
struct gmonparam _gmonparam = { .state = GMON_PROF_OFF };
/* Actual start of the kernel text segment. */
extern char kernel_text[];
extern char etext[];
void
kmstartup(void)
{
char *cp;
struct gmonparam *p = &_gmonparam;
unsigned long size;
/*
* Round lowpc and highpc to multiples of the density we're using
* so the rest of the scaling (here and in gprof) stays in ints.
*/
p->lowpc = rounddown(((u_long)kernel_text),
HISTFRACTION * sizeof(HISTCOUNTER));
p->highpc = roundup((u_long)etext,
HISTFRACTION * sizeof(HISTCOUNTER));
p->textsize = p->highpc - p->lowpc;
printf("Profiling kernel, textsize=%ld [%lx..%lx]\n",
p->textsize, p->lowpc, p->highpc);
p->kcountsize = p->textsize / HISTFRACTION;
p->hashfraction = HASHFRACTION;
p->fromssize = p->textsize / HASHFRACTION;
p->tolimit = p->textsize * ARCDENSITY / 100;
if (p->tolimit < MINARCS)
p->tolimit = MINARCS;
else if (p->tolimit > MAXARCS)
p->tolimit = MAXARCS;
p->tossize = p->tolimit * sizeof(struct tostruct);
size = p->kcountsize + p->fromssize + p->tossize;
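/*
* Each profiling buffer is laid out as [tos | kcount | froms], in that
* order.  In the per-CPU (and sysctl merge) case the arrays follow a
* struct gmonparam header in the same allocation; in the uniprocessor
* case the allocation holds just the arrays.  The pointer setup below
* follows that layout.
*/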
#ifdef MULTIPROCESSOR
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
p = malloc(sizeof(struct gmonparam) + size, M_GPROF,
M_NOWAIT | M_ZERO);
if (p == NULL) {
printf("No memory for profiling on %s\n",
cpu_name(ci));
/* cannot profile on this cpu */
continue;
}
memcpy(p, &_gmonparam, sizeof(_gmonparam));
ci->ci_gmon = p;
/*
* To allow profiling to be controlled solely by the global
* _gmonparam.state, set the default state for each CPU to
* GMON_PROF_ON.  If _gmonparam.state is not ON, mcount() will
* not be executed.
* This is for compatibility with the kgmon(8) kmem interface.
*/
p->state = GMON_PROF_ON;
cp = (char *)(p + 1);
p->tos = (struct tostruct *)cp;
p->kcount = (u_short *)(cp + p->tossize);
p->froms = (u_short *)(cp + p->tossize + p->kcountsize);
}
sysctl_createv(NULL, 0, NULL, NULL,
0, CTLTYPE_NODE, "percpu",
SYSCTL_DESCR("per cpu profiling information"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, CTL_EOL);
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
sysctl_createv(NULL, 0, NULL, NULL,
0, CTLTYPE_NODE, cpu_name(ci),
NULL,
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_INT, "state",
SYSCTL_DESCR("Profiling state"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_STATE, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "count",
SYSCTL_DESCR("Array of statistical program counters"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_COUNT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "froms",
SYSCTL_DESCR("Array indexed by program counter of "
"call-from points"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_FROMS, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "tos",
SYSCTL_DESCR("Array of structures describing "
"destination of calls and their counts"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_TOS, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_READWRITE, CTLTYPE_STRUCT, "gmonparam",
SYSCTL_DESCR("Structure giving the sizes of the above "
"arrays"),
sysctl_kern_profiling, 0, (void *)ci, 0,
CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
GPROF_GMONPARAM, CTL_EOL);
}
/*
* For minimal compatibility with the kgmon(8) kmem interface,
* _gmonparam and cpu0's ci_gmon share the same buffers.
*/
p = curcpu()->ci_gmon;
if (p != NULL) {
_gmonparam.tos = p->tos;
_gmonparam.kcount = p->kcount;
_gmonparam.froms = p->froms;
}
#else /* MULTIPROCESSOR */
cp = malloc(size, M_GPROF, M_NOWAIT | M_ZERO);
if (cp == NULL) {
printf("No memory for profiling.\n");
return;
}
p->tos = (struct tostruct *)cp;
cp += p->tossize;
p->kcount = (u_short *)cp;
cp += p->kcountsize;
p->froms = (u_short *)cp;
#endif /* MULTIPROCESSOR */
}
#ifdef MULTIPROCESSOR
static void
prof_set_state_xc(void *arg1, void *arg2 __unused)
{
int state = PTRTOUINT64(arg1);
struct gmonparam *gp = curcpu()->ci_gmon;
if (gp != NULL)
gp->state = state;
}
#endif /* MULTIPROCESSOR */
/*
* Return kernel profiling information.
*
* sysctl helper routine for the kern.profiling subtree.  Enables/disables
* kernel profiling and gives out copies of the profiling data.
*/
static int
sysctl_kern_profiling(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
struct gmonparam *gp;
int error;
#ifdef MULTIPROCESSOR
CPU_INFO_ITERATOR cii;
struct cpu_info *ci, *target_ci;
uint64_t where;
int state;
bool prof_on, do_merge;
target_ci = (struct cpu_info *)rnode->sysctl_data;
do_merge = (oldp != NULL) && (target_ci == NULL) &&
((node.sysctl_num == GPROF_COUNT) ||
(node.sysctl_num == GPROF_FROMS) ||
(node.sysctl_num == GPROF_TOS));
if (do_merge) {
/* kern.profiling.{count,froms,tos} */
unsigned long size;
char *cp;
/* allocate temporary gmonparam, and merge results of all CPU */
size = _gmonparam.kcountsize + _gmonparam.fromssize +
_gmonparam.tossize;
gp = malloc(sizeof(struct gmonparam) + size, M_GPROF,
M_NOWAIT | M_ZERO);
if (gp == NULL)
return ENOMEM;
memcpy(gp, &_gmonparam, sizeof(_gmonparam));
cp = (char *)(gp + 1);
gp->tos = (struct tostruct *)cp;
gp->kcount = (u_short *)(cp + gp->tossize);
gp->froms = (u_short *)(cp + gp->tossize + gp->kcountsize);
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
_gmonparam_merge(gp, ci->ci_gmon);
}
} else if (target_ci != NULL) {
/* kern.profiling.percpu.* */
gp = target_ci->ci_gmon;
} else {
/* kern.profiling.{state,gmonparam} */
gp = &_gmonparam;
}
#else /* MULTIPROCESSOR */
gp = &_gmonparam;
#endif
switch (node.sysctl_num) {
case GPROF_STATE:
#ifdef MULTIPROCESSOR
/*
* if _gmonparam.state is OFF, the state of each CPU is
* considered to be OFF, even if it is actually ON.
*/
if (_gmonparam.state == GMON_PROF_OFF ||
gp->state == GMON_PROF_OFF)
state = GMON_PROF_OFF;
else
state = GMON_PROF_ON;
node.sysctl_data = &state;
#else
node.sysctl_data = &gp->state;
#endif
break;
case GPROF_COUNT:
node.sysctl_data = gp->kcount;
node.sysctl_size = gp->kcountsize;
break;
case GPROF_FROMS:
node.sysctl_data = gp->froms;
node.sysctl_size = gp->fromssize;
break;
case GPROF_TOS:
node.sysctl_data = gp->tos;
node.sysctl_size = gp->tossize;
break;
case GPROF_GMONPARAM:
node.sysctl_data = gp;
node.sysctl_size = sizeof(*gp);
break;
default:
return (EOPNOTSUPP);
}
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
goto done;
#ifdef MULTIPROCESSOR
switch (node.sysctl_num) {
case GPROF_STATE:
if (target_ci != NULL) {
where = xc_unicast(0, prof_set_state_xc,
UINT64TOPTR(state), NULL, target_ci);
xc_wait(where);
/* If even one CPU is being profiled, keep the profiling clock running. */
prof_on = false;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
if (ci->ci_gmon->state != GMON_PROF_OFF) {
prof_on = true;
break;
}
}
mutex_spin_enter(&proc0.p_stmutex);
if (prof_on)
startprofclock(&proc0);
else
stopprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
if (prof_on) {
_gmonparam.state = GMON_PROF_ON;
} else {
_gmonparam.state = GMON_PROF_OFF;
/*
* When _gmonparam.state and every CPU's gmon state
* are OFF, reset the per-CPU states to ON so that
* profiling on all CPUs can again be controlled
* by _gmonparam.state alone.
*/
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
ci->ci_gmon->state = GMON_PROF_ON;
}
}
} else {
_gmonparam.state = state;
where = xc_broadcast(0, prof_set_state_xc,
UINT64TOPTR(state), NULL);
xc_wait(where);
mutex_spin_enter(&proc0.p_stmutex);
if (state == GMON_PROF_OFF)
stopprofclock(&proc0);
else
startprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
}
break;
case GPROF_COUNT:
/*
* If 'kern.profiling.{count,froms,tos}' is written, the same
* data is also written to 'kern.profiling.percpu.cpuN.xxx'.
*/
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->kcount, gp->kcount,
newlen);
}
}
break;
case GPROF_FROMS:
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->froms, gp->froms, newlen);
}
}
break;
case GPROF_TOS:
if (target_ci == NULL) {
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_gmon == NULL)
continue;
memmove(ci->ci_gmon->tos, gp->tos, newlen);
}
}
break;
}
#else
if (node.sysctl_num == GPROF_STATE) {
mutex_spin_enter(&proc0.p_stmutex);
if (gp->state == GMON_PROF_OFF)
stopprofclock(&proc0);
else
startprofclock(&proc0);
mutex_spin_exit(&proc0.p_stmutex);
}
#endif
done:
#ifdef MULTIPROCESSOR
if (do_merge)
free(gp, M_GPROF);
#endif
return error;
}
SYSCTL_SETUP(sysctl_kern_gprof_setup, "sysctl kern.profiling subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "profiling",
SYSCTL_DESCR("Profiling information (available)"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PROF, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "state",
SYSCTL_DESCR("Profiling state"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_STATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "count",
SYSCTL_DESCR("Array of statistical program counters"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_COUNT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "froms",
SYSCTL_DESCR("Array indexed by program counter of "
"call-from points"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_FROMS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "tos",
SYSCTL_DESCR("Array of structures describing "
"destination of calls and their counts"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_TOS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "gmonparam",
SYSCTL_DESCR("Structure giving the sizes of the above "
"arrays"),
sysctl_kern_profiling, 0, NULL, 0,
CTL_KERN, KERN_PROF, GPROF_GMONPARAM, CTL_EOL);
}
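/*
* A minimal userland sketch of driving this subtree, similar in spirit
* to what kgmon(8) does; GMON_PROF_ON comes from <sys/gmon.h>:
*
*	int state = GMON_PROF_ON;
*	sysctlbyname("kern.profiling.state", NULL, NULL,
*	    &state, sizeof(state));
*
* Reading kern.profiling.{count,froms,tos} afterwards returns the
* per-CPU arrays merged as described by kern.profiling.gmonparam.
*/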
#endif /* GPROF */
/*
* Profiling system call.
*
* The scale factor is a fixed point number with 16 bits of fraction, so that
* 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
*/
/* ARGSUSED */
int
sys_profil(struct lwp *l, const struct sys_profil_args *uap, register_t *retval)
{
/* {
syscallarg(char *) samples;
syscallarg(size_t) size;
syscallarg(u_long) offset;
syscallarg(u_int) scale;
} */
struct proc *p = l->l_proc;
struct uprof *upp;
if (SCARG(uap, scale) > (1 << 16))
return (EINVAL);
if (SCARG(uap, scale) == 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
return (0);
}
upp = &p->p_stats->p_prof;
/* Block profile interrupts while changing state. */
mutex_spin_enter(&p->p_stmutex);
upp->pr_off = SCARG(uap, offset);
upp->pr_scale = SCARG(uap, scale);
upp->pr_base = SCARG(uap, samples);
upp->pr_size = SCARG(uap, size);
startprofclock(p);
mutex_spin_exit(&p->p_stmutex);
return (0);
}
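/*
* A minimal userland sketch of the corresponding profil(3) call.  Here
* text_lo and text_hi are placeholders for the bounds of the program's
* text, and a scale of 0x8000 (0.5) gives one 16-bit counter per 4
* bytes of text:
*
*	size_t bufsize = (text_hi - text_lo) / 2;
*	char *buf = calloc(1, bufsize);
*	profil(buf, bufsize, text_lo, 0x8000);
*
* Calling profil() again with a scale of 0 turns profiling back off,
* which is the stopprofclock() path above.
*/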
/*
* Scale is a fixed-point number with the binary point 16 bits
* into the value, and is <= 1.0. pc is at most 32 bits, so the
* intermediate result is at most 48 bits.
*/
#define PC_TO_INDEX(pc, prof) \
((int)(((u_quad_t)((pc) - (prof)->pr_off) * \
(u_quad_t)((prof)->pr_scale)) >> 16) & ~1)
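/*
* Worked example: with pr_scale = 0x8000 (0.5) and pc - pr_off = 0x1000,
* the intermediate product is 0x1000 * 0x8000 = 0x8000000; shifting right
* by 16 gives 0x800, and masking with ~1 keeps the result aligned to the
* 2-byte counters, so the tick lands in the u_short at byte offset 0x800.
* At this scale every 4 bytes of text share one counter.
*/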
/*
* Collect user-level profiling statistics; called on a profiling tick,
* when a process is running in user-mode. This routine may be called
* from an interrupt context. We schedule an AST that will vector us
* to trap() with a context in which copyin and copyout will work.
* Trap will then call addupc_task().
*
* XXX We could use ufetch/ustore here if the profile buffers were
* wired.
*
* Note that we may (rarely) not get around to the AST soon enough, and
* lose profile ticks when the next tick overwrites this one, but in this
* case the system is overloaded and the profile is probably already
* inaccurate.
*/
void
addupc_intr(struct lwp *l, u_long pc)
{
struct uprof *prof;
struct proc *p;
u_int i;
p = l->l_proc;
KASSERT(mutex_owned(&p->p_stmutex));
prof = &p->p_stats->p_prof;
if (pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size)
return; /* out of range; ignore */
mutex_spin_exit(&p->p_stmutex);
/* XXXSMP */
prof->pr_addr = pc;
prof->pr_ticks++;
cpu_need_proftick(l);
mutex_spin_enter(&p->p_stmutex);
}
/*
* Much like before, but we can afford to take faults here. If the
* update fails, we simply turn off profiling.
*/
void
addupc_task(struct lwp *l, u_long pc, u_int ticks)
{
struct uprof *prof;
struct proc *p;
void *addr;
int error;
u_int i;
u_short v;
p = l->l_proc;
if (ticks == 0)
return;
mutex_spin_enter(&p->p_stmutex);
prof = &p->p_stats->p_prof;
/* Testing PST_PROFIL may be unnecessary, but is certainly safe. */
if ((p->p_stflag & PST_PROFIL) == 0 || pc < prof->pr_off ||
(i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
mutex_spin_exit(&p->p_stmutex);
return;
}
addr = prof->pr_base + i;
mutex_spin_exit(&p->p_stmutex);
if ((error = copyin(addr, (void *)&v, sizeof(v))) == 0) {
v += ticks;
error = copyout((void *)&v, addr, sizeof(v));
}
if (error != 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
}
/* $NetBSD: sys_sig.c,v 1.57 2023/10/04 20:42:38 ad Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.14 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_sig.c,v 1.57 2023/10/04 20:42:38 ad Exp $");
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/wait.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/sdt.h>
#include <sys/compat_stub.h>
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE2(proc, kernel, , signal__clear,
"int", /* signal */
"ksiginfo_t *"); /* signal-info */
int
sys___sigaction_sigtramp(struct lwp *l,
const struct sys___sigaction_sigtramp_args *uap, register_t *retval)
{
/* {
syscallarg(int) signum;
syscallarg(const struct sigaction *) nsa;
syscallarg(struct sigaction *) osa;
syscallarg(void *) tramp;
syscallarg(int) vers;
} */
struct sigaction nsa, osa;
int error;
if (SCARG(uap, nsa)) {
error = copyin(SCARG(uap, nsa), &nsa, sizeof(nsa));
if (error)
return (error);
}
error = sigaction1(l, SCARG(uap, signum),
SCARG(uap, nsa) ? &nsa : 0, SCARG(uap, osa) ? &osa : 0,
SCARG(uap, tramp), SCARG(uap, vers));
if (error)
return (error);
if (SCARG(uap, osa)) {
error = copyout(&osa, SCARG(uap, osa), sizeof(osa));
if (error)
return (error);
}
return 0;
}
/*
* Manipulate signal mask. Note that we receive new mask, not pointer, and
* return old mask as return value; the library stub does the rest.
*/
int
sys___sigprocmask14(struct lwp *l, const struct sys___sigprocmask14_args *uap,
register_t *retval)
{
/* {
syscallarg(int) how;
syscallarg(const sigset_t *) set;
syscallarg(sigset_t *) oset;
} */
struct proc *p = l->l_proc;
sigset_t nss, oss;
int error;
if (SCARG(uap, set)) {
error = copyin(SCARG(uap, set), &nss, sizeof(nss));
if (error)
return error;
}
mutex_enter(p->p_lock);
error = sigprocmask1(l, SCARG(uap, how),
SCARG(uap, set) ? &nss : 0, SCARG(uap, oset) ? &oss : 0);
mutex_exit(p->p_lock);
if (error)
return error;
if (SCARG(uap, oset)) {
error = copyout(&oss, SCARG(uap, oset), sizeof(oss));
if (error)
return error;
}
return 0;
}
int
sys___sigpending14(struct lwp *l, const struct sys___sigpending14_args *uap,
register_t *retval)
{
/* {
syscallarg(sigset_t *) set;
} */
sigset_t ss;
sigpending1(l, &ss);
return copyout(&ss, SCARG(uap, set), sizeof(ss));
}
/*
* Suspend process until signal, providing mask to be set in the meantime.
* Note nonstandard calling convention: libc stub passes mask, not pointer,
* to save a copyin.
*/
int
sys___sigsuspend14(struct lwp *l, const struct sys___sigsuspend14_args *uap,
register_t *retval)
{
/* {
syscallarg(const sigset_t *) set;
} */
sigset_t ss;
int error;
if (SCARG(uap, set)) {
error = copyin(SCARG(uap, set), &ss, sizeof(ss));
if (error)
return error;
}
return sigsuspend1(l, SCARG(uap, set) ? &ss : 0);
}
int
sys___sigaltstack14(struct lwp *l, const struct sys___sigaltstack14_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct sigaltstack *) nss;
syscallarg(struct sigaltstack *) oss;
} */
stack_t nss, oss;
int error;
if (SCARG(uap, nss)) {
error = copyin(SCARG(uap, nss), &nss, sizeof(nss));
if (error)
return error;
}
error = sigaltstack1(l,
SCARG(uap, nss) ? &nss : 0, SCARG(uap, oss) ? &oss : 0);
if (error)
return error;
if (SCARG(uap, oss)) {
error = copyout(&oss, SCARG(uap, oss), sizeof(oss));
if (error)
return error;
}
return 0;
}
int
kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval)
{
int error;
struct proc *p;
if ((u_int)ksi->ksi_signo >= NSIG)
return EINVAL;
if (pid != l->l_proc->p_pid) {
if (ksi->ksi_pid != l->l_proc->p_pid)
return EPERM;
if (ksi->ksi_uid != kauth_cred_geteuid(l->l_cred))
return EPERM;
switch (ksi->ksi_code) {
case SI_USER:
case SI_QUEUE:
break;
default:
return EPERM;
}
}
if (pid > 0) {
/* kill single process */
mutex_enter(&proc_lock);
p = proc_find_raw(pid);
if (p == NULL || (p->p_stat != SACTIVE && p->p_stat != SSTOP)) {
mutex_exit(&proc_lock);
/* IEEE Std 1003.1-2001: return success for zombies */
return p ? 0 : ESRCH;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_SIGNAL, p, KAUTH_ARG(ksi->ksi_signo),
NULL, NULL);
if (!error && ksi->ksi_signo) {
error = kpsignal2(p, ksi);
}
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
return error;
}
switch (pid) {
case -1: /* broadcast signal */
return killpg1(l, ksi, 0, 1);
case 0: /* signal own process group */
return killpg1(l, ksi, 0, 0);
default: /* negative explicit process group */
return killpg1(l, ksi, -pid, 0);
}
/* NOTREACHED */
}
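/*
* For reference, the pid encoding handled above matches kill(2):
*
*	kill(1234, SIGTERM);	one process
*	kill(0, SIGTERM);	the caller's own process group
*	kill(-1, SIGTERM);	broadcast to every process we may signal
*	kill(-1234, SIGTERM);	process group 1234
*
* A signal number of 0 performs only the permission checks, which is
* the usual way to probe whether a process exists.
*/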
int
sys_sigqueueinfo(struct lwp *l, const struct sys_sigqueueinfo_args *uap,
register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(const siginfo_t *) info;
} */
ksiginfo_t ksi;
int error;
KSI_INIT(&ksi);
if ((error = copyin(&SCARG(uap, info)->_info, &ksi.ksi_info,
sizeof(ksi.ksi_info))) != 0)
return error;
return kill1(l, SCARG(uap, pid), &ksi, retval);
}
int
sys_kill(struct lwp *l, const struct sys_kill_args *uap, register_t *retval)
{
/* {
syscallarg(pid_t) pid;
syscallarg(int) signum;
} */
ksiginfo_t ksi;
KSI_INIT(&ksi);
ksi.ksi_signo = SCARG(uap, signum);
ksi.ksi_code = SI_USER;
ksi.ksi_pid = l->l_proc->p_pid;
ksi.ksi_uid = kauth_cred_geteuid(l->l_cred);
return kill1(l, SCARG(uap, pid), &ksi, retval);
}
int
sys_getcontext(struct lwp *l, const struct sys_getcontext_args *uap,
register_t *retval)
{
/* {
syscallarg(struct __ucontext *) ucp;
} */
struct proc *p = l->l_proc;
ucontext_t uc;
memset(&uc, 0, sizeof(uc));
mutex_enter(p->p_lock);
getucontext(l, &uc);
mutex_exit(p->p_lock);
return copyout(&uc, SCARG(uap, ucp), sizeof (*SCARG(uap, ucp)));
}
int
sys_setcontext(struct lwp *l, const struct sys_setcontext_args *uap,
register_t *retval)
{
/* {
syscallarg(const ucontext_t *) ucp;
} */
struct proc *p = l->l_proc;
ucontext_t uc;
int error;
error = copyin(SCARG(uap, ucp), &uc, sizeof (uc));
if (error)
return error;
if ((uc.uc_flags & _UC_CPU) == 0)
return EINVAL;
mutex_enter(p->p_lock);
error = setucontext(l, &uc);
mutex_exit(p->p_lock);
if (error)
return error;
return EJUSTRETURN;
}
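/*
* A minimal userland sketch of the matching libc interfaces; getcontext()
* fills in the _UC_CPU machine state that setcontext() above requires:
*
*	ucontext_t uc;
*	volatile int resumed = 0;
*	getcontext(&uc);
*	if (!resumed) {
*		resumed = 1;
*		setcontext(&uc);
*	}
*
* The setcontext() call does not return; execution resumes at the point
* where getcontext() saved the context, with resumed already set.
*/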
/*
* sigtimedwait(2) system call, used also for implementation
* of sigwaitinfo() and sigwait().
*
* This only handles a single LWP in signal wait. libpthread provides
* its own sigtimedwait() wrapper to do the right thing for individual
* threads.
*/
int
sys_____sigtimedwait50(struct lwp *l,
const struct sys_____sigtimedwait50_args *uap, register_t *retval)
{
return sigtimedwait1(l, uap, retval, copyin, copyout, copyin, copyout);
}
int
sigaction1(struct lwp *l, int signum, const struct sigaction *nsa,
struct sigaction *osa, const void *tramp, int vers)
{
struct proc *p;
struct sigacts *ps;
sigset_t tset;
int prop, error;
ksiginfoq_t kq;
static bool v0v1valid;
if (signum <= 0 || signum >= NSIG)
return EINVAL;
p = l->l_proc;
error = 0;
ksiginfo_queue_init(&kq);
/*
* Trampoline ABI version __SIGTRAMP_SIGCODE_VERSION (0) is reserved
* for the legacy kernel provided on-stack trampoline. Conversely,
* if we are using a non-0 ABI version, we must have a trampoline.
* Only validate the vers if a new sigaction was supplied with an
* actual handler specified (not SIG_IGN or SIG_DFL, which don't
* require a trampoline). Emulations use legacy kernel trampolines
* with version 0, so check for that case too.
*
* If version < __SIGTRAMP_SIGINFO_VERSION_MIN (usually 2), we try
* to autoload the compat module. Note that we interlock with the
* unload check in compat_modcmd() using kernconfig_lock. If the
* autoload fails, we don't try it again for this process.
*/
if (nsa != NULL && nsa->sa_handler != SIG_IGN
&& nsa->sa_handler != SIG_DFL) {
if (__predict_false(vers < __SIGTRAMP_SIGINFO_VERSION_MIN)) {
if (vers == __SIGTRAMP_SIGCODE_VERSION &&
p->p_sigctx.ps_sigcode != NULL) {
/*
* if sigcode is used for this emulation,
* version 0 is allowed.
*/
}
#ifdef __HAVE_STRUCT_SIGCONTEXT
else if (p->p_flag & PK_32) {
/*
* The 32-bit compat module will have
* pre-validated this for us.
*/
v0v1valid = true;
} else if ((p->p_lflag & PL_SIGCOMPAT) == 0) {
kernconfig_lock();
(void)module_autoload("compat_16",
MODULE_CLASS_ANY);
if (sendsig_sigcontext_16_hook.hooked) {
/*
* We need to remember if the
* sigcontext method may be useable,
* because libc may use it even
* if siginfo is available.
*/
v0v1valid = true;
}
mutex_enter(&proc_lock);
/*
* Prevent unload of compat module while
* this process remains.
*/
p->p_lflag |= PL_SIGCOMPAT;
mutex_exit(&proc_lock);
kernconfig_unlock();
}
#endif /* __HAVE_STRUCT_SIGCONTEXT */
}
switch (vers) {
case __SIGTRAMP_SIGCODE_VERSION:
/* kernel supplied trampoline. */
if (tramp != NULL ||
(p->p_sigctx.ps_sigcode == NULL && !v0v1valid)) {
return EINVAL;
}
break;
#ifdef __HAVE_STRUCT_SIGCONTEXT
case __SIGTRAMP_SIGCONTEXT_VERSION_MIN ...
__SIGTRAMP_SIGCONTEXT_VERSION_MAX:
/* sigcontext, user supplied trampoline. */
if (tramp == NULL || !v0v1valid) {
return EINVAL;
}
break;
#endif /* __HAVE_STRUCT_SIGCONTEXT */
case __SIGTRAMP_SIGINFO_VERSION_MIN ...
__SIGTRAMP_SIGINFO_VERSION_MAX:
/* siginfo, user supplied trampoline. */
if (tramp == NULL) {
return EINVAL;
}
break;
default:
/* Invalid trampoline version. */
return EINVAL;
}
}
mutex_enter(p->p_lock);
ps = p->p_sigacts;
if (osa)
sigaction_copy(osa, &SIGACTION_PS(ps, signum));
if (!nsa)
goto out;
prop = sigprop[signum];
if ((nsa->sa_flags & ~SA_ALLBITS) || (prop & SA_CANTMASK)) {
error = EINVAL;
goto out;
}
sigaction_copy(&SIGACTION_PS(ps, signum), nsa);
ps->sa_sigdesc[signum].sd_tramp = tramp;
ps->sa_sigdesc[signum].sd_vers = vers;
sigminusset(&sigcantmask, &SIGACTION_PS(ps, signum).sa_mask);
if ((prop & SA_NORESET) != 0)
SIGACTION_PS(ps, signum).sa_flags &= ~SA_RESETHAND;
if (signum == SIGCHLD) {
if (nsa->sa_flags & SA_NOCLDSTOP)
p->p_sflag |= PS_NOCLDSTOP;
else
p->p_sflag &= ~PS_NOCLDSTOP;
if (nsa->sa_flags & SA_NOCLDWAIT) {
/*
* Paranoia: since SA_NOCLDWAIT is implemented by
* reparenting the dying child to PID 1 (and trusting
* it to reap the zombie), PID 1 itself is forbidden
* to set SA_NOCLDWAIT.
*/
if (p->p_pid == 1)
p->p_flag &= ~PK_NOCLDWAIT;
else
p->p_flag |= PK_NOCLDWAIT;
} else
p->p_flag &= ~PK_NOCLDWAIT;
if (nsa->sa_handler == SIG_IGN) {
/*
* Paranoia: same as above.
*/
if (p->p_pid == 1)
p->p_flag &= ~PK_CLDSIGIGN;
else
p->p_flag |= PK_CLDSIGIGN;
} else
p->p_flag &= ~PK_CLDSIGIGN;
}
if ((nsa->sa_flags & SA_NODEFER) == 0)
sigaddset(&SIGACTION_PS(ps, signum).sa_mask, signum);
else
sigdelset(&SIGACTION_PS(ps, signum).sa_mask, signum);
/*
* Set bit in p_sigctx.ps_sigignore for signals that are set to
* SIG_IGN, and for signals set to SIG_DFL where the default is to
* ignore. However, don't put SIGCONT in p_sigctx.ps_sigignore, as
* we have to restart the process.
*/
if (nsa->sa_handler == SIG_IGN ||
(nsa->sa_handler == SIG_DFL && (prop & SA_IGNORE) != 0)) {
/* Never to be seen again. */
sigemptyset(&tset);
sigaddset(&tset, signum);
sigclearall(p, &tset, &kq);
if (signum != SIGCONT) {
/* Easier in psignal */
sigaddset(&p->p_sigctx.ps_sigignore, signum);
}
sigdelset(&p->p_sigctx.ps_sigcatch, signum);
} else {
sigdelset(&p->p_sigctx.ps_sigignore, signum);
if (nsa->sa_handler == SIG_DFL)
sigdelset(&p->p_sigctx.ps_sigcatch, signum);
else
sigaddset(&p->p_sigctx.ps_sigcatch, signum);
}
/*
* Previously held signals may now have become visible. Ensure that
* we check for them before returning to userspace.
*/
if (sigispending(l, 0)) {
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
out:
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
return error;
}
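/*
* A minimal userland sketch of what normally ends up here: installing a
* SA_SIGINFO handler with sigaction(2) causes libc to supply a siginfo
* trampoline and a version in the __SIGTRAMP_SIGINFO range, which the
* validation above accepts.
*
*	static void handler(int sig, siginfo_t *si, void *ctx);
*
*	struct sigaction sa;
*	memset(&sa, 0, sizeof(sa));
*	sa.sa_sigaction = handler;
*	sa.sa_flags = SA_SIGINFO;
*	sigemptyset(&sa.sa_mask);
*	sigaction(SIGUSR1, &sa, NULL);
*/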
int
sigprocmask1(struct lwp *l, int how, const sigset_t *nss, sigset_t *oss)
{
sigset_t *mask = &l->l_sigmask;
bool more;
KASSERT(mutex_owned(l->l_proc->p_lock));
if (oss) {
*oss = *mask;
}
if (nss == NULL) {
return 0;
}
switch (how) {
case SIG_BLOCK:
sigplusset(nss, mask);
more = false;
break;
case SIG_UNBLOCK:
sigminusset(nss, mask);
more = true;
break;
case SIG_SETMASK:
*mask = *nss;
more = true;
break;
default:
return EINVAL;
}
sigminusset(&sigcantmask, mask);
if (more && sigispending(l, 0)) {
/*
* Check for pending signals on return to user.
*/
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
return 0;
}
void
sigpending1(struct lwp *l, sigset_t *ss)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
*ss = l->l_sigpend.sp_set;
sigplusset(&p->p_sigpend.sp_set, ss);
mutex_exit(p->p_lock);
}
void
sigsuspendsetup(struct lwp *l, const sigset_t *ss)
{
struct proc *p = l->l_proc;
/*
* When returning from sigsuspend/pselect/pollts, we want the old
* mask to be restored after the signal handler has finished.
* Thus, we save it here and mark the LWP (l_sigrestore) to
* indicate this.
*/
mutex_enter(p->p_lock);
l->l_sigrestore = 1;
l->l_sigoldmask = l->l_sigmask;
l->l_sigmask = *ss;
sigminusset(&sigcantmask, &l->l_sigmask);
/* Check for pending signals when sleeping. */
if (sigispending(l, 0)) {
	lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
}
mutex_exit(p->p_lock);
}
void
sigsuspendteardown(struct lwp *l)
{
struct proc *p = l->l_proc;
mutex_enter(p->p_lock);
/* Check for pending signals when sleeping. */
if (l->l_sigrestore) {
if (sigispending(l, 0)) {
lwp_lock(l);
l->l_flag |= LW_PENDSIG;
lwp_need_userret(l);
lwp_unlock(l);
} else {
l->l_sigrestore = 0;
l->l_sigmask = l->l_sigoldmask;
}
}
mutex_exit(p->p_lock);
}
int
sigsuspend1(struct lwp *l, const sigset_t *ss)
{
if (ss)
sigsuspendsetup(l, ss);
while (kpause("pause", true, 0, NULL) == 0)
;
/* always return EINTR rather than ERESTART... */
return EINTR;
}
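/*
* The classic race-free wait pattern this implements, sketched from the
* userland side (condition is a placeholder for whatever the handler
* sets): block the signal, test the condition, then atomically restore
* the old mask and sleep in one step.
*
*	sigset_t block, old;
*	sigemptyset(&block);
*	sigaddset(&block, SIGCHLD);
*	sigprocmask(SIG_BLOCK, &block, &old);
*	while (!condition)
*		sigsuspend(&old);
*	sigprocmask(SIG_SETMASK, &old, NULL);
*
* Each sigsuspend() call returns -1 with errno set to EINTR, matching
* the unconditional EINTR above.
*/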
int
sigaltstack1(struct lwp *l, const stack_t *nss, stack_t *oss)
{
struct proc *p = l->l_proc;
int error = 0;
mutex_enter(p->p_lock);
if (oss)
*oss = l->l_sigstk;
if (nss) {
if (nss->ss_flags & ~SS_ALLBITS)
error = EINVAL;
else if (nss->ss_flags & SS_DISABLE) {
if (l->l_sigstk.ss_flags & SS_ONSTACK)
error = EINVAL;
} else if (nss->ss_size < MINSIGSTKSZ)
error = ENOMEM;
if (!error)
l->l_sigstk = *nss;
}
mutex_exit(p->p_lock);
return error;
}
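/*
* A minimal userland sketch of setting up an alternate signal stack,
* which is what the SS_DISABLE/SS_ONSTACK and MINSIGSTKSZ checks above
* police:
*
*	stack_t ss;
*	ss.ss_sp = malloc(SIGSTKSZ);
*	ss.ss_size = SIGSTKSZ;
*	ss.ss_flags = 0;
*	sigaltstack(&ss, NULL);
*
* A handler then runs on that stack if it was installed with SA_ONSTACK.
*/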
int
sigtimedwait1(struct lwp *l, const struct sys_____sigtimedwait50_args *uap,
register_t *retval, copyin_t fetchss, copyout_t storeinf, copyin_t fetchts,
copyout_t storets)
{
/* {
syscallarg(const sigset_t *) set;
syscallarg(siginfo_t *) info;
syscallarg(struct timespec *) timeout;
} */
struct proc *p = l->l_proc;
int error, signum, timo;
struct timespec ts, tsstart, tsnow;
ksiginfo_t ksi;
/*
* Calculate timeout, if it was specified.
*
* NULL pointer means an infinite timeout.
* {.tv_sec = 0, .tv_nsec = 0} means do not block.
*/
if (SCARG(uap, timeout)) {
error = (*fetchts)(SCARG(uap, timeout), &ts, sizeof(ts));
if (error)
return error;
if ((error = itimespecfix(&ts)) != 0)
return error;
timo = tstohz(&ts);
if (timo == 0) {
if (ts.tv_sec == 0 && ts.tv_nsec == 0)
timo = -1; /* do not block */
else
timo = 1; /* the shortest possible timeout */
}
/*
* Remember the current uptime; it will be used in the
* ECANCELED/ERESTART case.
*/
getnanouptime(&tsstart);
} else {
memset(&tsstart, 0, sizeof(tsstart)); /* XXXgcc */
timo = 0; /* infinite timeout */
}
error = (*fetchss)(SCARG(uap, set), &l->l_sigwaitset,
sizeof(l->l_sigwaitset));
if (error)
return error;
/*
* Silently ignore SA_CANTMASK signals. psignal1() would ignore
* SA_CANTMASK signals in the wait set anyway; we strip them here
* only for the siglist check below.
*/
sigminusset(&sigcantmask, &l->l_sigwaitset);
memset(&ksi.ksi_info, 0, sizeof(ksi.ksi_info));
mutex_enter(p->p_lock);
/* Check for pending signals in the process, if no - then in LWP. */
if ((signum = sigget(&p->p_sigpend, &ksi, 0, &l->l_sigwaitset)) == 0)
signum = sigget(&l->l_sigpend, &ksi, 0, &l->l_sigwaitset);
if (signum != 0) {
/* If found a pending signal, just copy it out to the user. */
mutex_exit(p->p_lock);
goto out;
}
if (timo < 0) {
/* If not allowed to block, return an error */
mutex_exit(p->p_lock);
return EAGAIN;
}
/*
* Set up the sigwait list and wait for signal to arrive.
* We can either be woken up or time out.
*/
l->l_sigwaited = &ksi;
LIST_INSERT_HEAD(&p->p_sigwaiters, l, l_sigwaiter);
error = cv_timedwait_sig(&l->l_sigcv, p->p_lock, timo);
/*
* Need to find out if we woke as a result of _lwp_wakeup() or a
* signal outside our wait set.
*/
if (l->l_sigwaited != NULL) {
if (error == EINTR) {
/* Wakeup via _lwp_wakeup(). */
error = ECANCELED;
} else if (!error) {
/* Spurious wakeup - arrange for syscall restart. */
error = ERESTART;
}
l->l_sigwaited = NULL;
LIST_REMOVE(l, l_sigwaiter);
}
mutex_exit(p->p_lock);
/*
* If the sleep was interrupted (either by signal or wakeup), update
* the timeout and copy the new value back out. It will be used when
* the syscall is restarted or called again.
*/
if (timo && (error == ERESTART || error == ECANCELED)) {
getnanouptime(&tsnow);
/* Compute how much time has passed since start. */
timespecsub(&tsnow, &tsstart, &tsnow);
/* Subtract passed time from timeout. */
timespecsub(&ts, &tsnow, &ts);
if (ts.tv_sec < 0)
error = EAGAIN;
else {
/* Copy updated timeout to userland. */
error = (*storets)(&ts, SCARG(uap, timeout),
sizeof(ts));
}
}
out:
/*
* If a signal from the wait set arrived, copy it to userland.
* Copy only the used part of siginfo, the padding part is
* left unchanged (userland is not supposed to touch it anyway).
*/
if (error == 0 && SCARG(uap, info)) {
error = (*storeinf)(&ksi.ksi_info, SCARG(uap, info),
sizeof(ksi.ksi_info));
}
if (error == 0) {
*retval = ksi.ksi_info._signo;
SDT_PROBE(proc, kernel, , signal__clear, *retval,
&ksi, 0, 0, 0);
}
return error;
}
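/*
* A minimal userland sketch of the corresponding sigtimedwait(2) call;
* the signal must be blocked first or it may be delivered to a handler
* instead of being picked off the pending set here:
*
*	sigset_t set;
*	siginfo_t info;
*	struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
*	sigemptyset(&set);
*	sigaddset(&set, SIGUSR1);
*	sigprocmask(SIG_BLOCK, &set, NULL);
*	int sig = sigtimedwait(&set, &info, &ts);
*
* sig is the signal number on success, or -1 with errno set to EAGAIN
* on timeout, matching the EAGAIN path above.
*/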
/* $NetBSD: kern_exec.c,v 1.521 2023/10/08 12:38:58 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (C) 1993, 1994, 1996 Christopher G. Demetriou
* Copyright (C) 1992 Wolfgang Solfrank.
* Copyright (C) 1992 TooLs GmbH.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by TooLs GmbH.
* 4. The name of TooLs GmbH may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY TOOLS GMBH ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL TOOLS GMBH BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_exec.c,v 1.521 2023/10/08 12:38:58 ad Exp $");
#include "opt_exec.h"
#include "opt_execfmt.h"
#include "opt_ktrace.h"
#include "opt_modular.h"
#include "opt_syscall_debug.h"
#include "veriexec.h"
#include "opt_pax.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/mount.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/acct.h>
#include <sys/atomic.h>
#include <sys/exec.h>
#include <sys/futex.h>
#include <sys/ktrace.h>
#include <sys/uidinfo.h>
#include <sys/wait.h>
#include <sys/mman.h>
#include <sys/ras.h>
#include <sys/signalvar.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/kauth.h>
#include <sys/lwpctl.h>
#include <sys/pax.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/vfs_syscalls.h>
#if NVERIEXEC > 0
#include <sys/verified_exec.h>
#endif /* NVERIEXEC > 0 */
#include <sys/sdt.h>
#include <sys/spawn.h>
#include <sys/prot.h>
#include <sys/cprng.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
#include <compat/common/compat_util.h>
#ifndef MD_TOPDOWN_INIT
#ifdef __USE_TOPDOWN_VM
#define MD_TOPDOWN_INIT(epp) (epp)->ep_flags |= EXEC_TOPDOWN_VM
#else
#define MD_TOPDOWN_INIT(epp)
#endif
#endif
struct execve_data;
extern int user_va0_disable;
static size_t calcargs(struct execve_data * restrict, const size_t);
static size_t calcstack(struct execve_data * restrict, const size_t);
static int copyoutargs(struct execve_data * restrict, struct lwp *,
char * const);
static int copyoutpsstrs(struct execve_data * restrict, struct proc *);
static int copyinargs(struct execve_data * restrict, char * const *,
char * const *, execve_fetch_element_t, char **);
static int copyinargstrs(struct execve_data * restrict, char * const *,
execve_fetch_element_t, char **, size_t *, void (*)(const void *, size_t));
static int exec_sigcode_map(struct proc *, const struct emul *);
#if defined(DEBUG) && !defined(DEBUG_EXEC)
#define DEBUG_EXEC
#endif
#ifdef DEBUG_EXEC
#define DPRINTF(a) printf a
#define COPYPRINTF(s, a, b) printf("%s, %d: copyout%s @%p %zu\n", __func__, \
__LINE__, (s), (a), (b))
static void dump_vmcmds(const struct exec_package * const, size_t, int);
#define DUMPVMCMDS(p, x, e) do { dump_vmcmds((p), (x), (e)); } while (0)
#else
#define DPRINTF(a)
#define COPYPRINTF(s, a, b)
#define DUMPVMCMDS(p, x, e) do {} while (0)
#endif /* DEBUG_EXEC */
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , exec, "char *");
SDT_PROBE_DEFINE1(proc, kernel, , exec__success, "char *");
SDT_PROBE_DEFINE1(proc, kernel, , exec__failure, "int");
/*
* Exec function switch:
*
* Note that each makecmds function is responsible for loading the
* exec package with the necessary functions for any exec-type-specific
* handling.
*
* Functions for specific exec types should be defined in their own
* header file.
*/
static const struct execsw **execsw = NULL;
static int nexecs;
u_int exec_maxhdrsz; /* must not be static - used by netbsd32 */
/* list of dynamically loaded execsw entries */
static LIST_HEAD(execlist_head, exec_entry) ex_head =
LIST_HEAD_INITIALIZER(ex_head);
struct exec_entry {
LIST_ENTRY(exec_entry) ex_list;
SLIST_ENTRY(exec_entry) ex_slist;
const struct execsw *ex_sw;
};
#ifndef __HAVE_SYSCALL_INTERN
void syscall(void);
#endif
/* NetBSD autoloadable syscalls */
#ifdef MODULAR
#include <kern/syscalls_autoload.c>
#endif
/* NetBSD emul struct */
struct emul emul_netbsd = {
.e_name = "netbsd",
#ifdef EMUL_NATIVEROOT
.e_path = EMUL_NATIVEROOT,
#else
.e_path = NULL,
#endif
#ifndef __HAVE_MINIMAL_EMUL
.e_flags = EMUL_HAS_SYS___syscall,
.e_errno = NULL,
.e_nosys = SYS_syscall,
.e_nsysent = SYS_NSYSENT,
#endif
#ifdef MODULAR
.e_sc_autoload = netbsd_syscalls_autoload,
#endif
.e_sysent = sysent,
.e_nomodbits = sysent_nomodbits,
#ifdef SYSCALL_DEBUG
.e_syscallnames = syscallnames,
#else
.e_syscallnames = NULL,
#endif
.e_sendsig = sendsig,
.e_trapsignal = trapsignal,
.e_sigcode = NULL,
.e_esigcode = NULL,
.e_sigobject = NULL,
.e_setregs = setregs,
.e_proc_exec = NULL,
.e_proc_fork = NULL,
.e_proc_exit = NULL,
.e_lwp_fork = NULL,
.e_lwp_exit = NULL,
#ifdef __HAVE_SYSCALL_INTERN
.e_syscall_intern = syscall_intern,
#else
.e_syscall = syscall,
#endif
.e_sysctlovly = NULL,
.e_vm_default_addr = uvm_default_mapaddr,
.e_usertrap = NULL,
.e_ucsize = sizeof(ucontext_t),
.e_startlwp = startlwp
};
/*
* Exec lock. Used to control access to execsw[] structures.
* This must not be static so that netbsd32 can access it, too.
*/
krwlock_t exec_lock __cacheline_aligned;
/*
* Data used between a loadvm and execve part of an "exec" operation
*/
struct execve_data {
struct exec_package ed_pack;
struct pathbuf *ed_pathbuf;
struct vattr ed_attr;
struct ps_strings ed_arginfo;
char *ed_argp;
const char *ed_pathstring;
char *ed_resolvedname;
size_t ed_ps_strings_sz;
int ed_szsigcode;
size_t ed_argslen;
long ed_argc;
long ed_envc;
};
/*
* data passed from parent lwp to child during a posix_spawn()
*/
struct spawn_exec_data {
struct execve_data sed_exec;
struct posix_spawn_file_actions
*sed_actions;
struct posix_spawnattr *sed_attrs;
struct proc *sed_parent;
kcondvar_t sed_cv_child_ready;
kmutex_t sed_mtx_child;
int sed_error;
volatile uint32_t sed_refcnt;
};
static struct vm_map *exec_map;
static struct pool exec_pool;
static void *
exec_pool_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(exec_map, NCARGS, 0,
UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
}
static void
exec_pool_free(struct pool *pp, void *addr)
{
uvm_km_free(exec_map, (vaddr_t)addr, NCARGS, UVM_KMF_PAGEABLE);
}
static struct pool_allocator exec_palloc = {
.pa_alloc = exec_pool_alloc,
.pa_free = exec_pool_free,
.pa_pagesz = NCARGS
};
static void
exec_path_free(struct execve_data *data)
{
pathbuf_stringcopy_put(data->ed_pathbuf, data->ed_pathstring);
pathbuf_destroy(data->ed_pathbuf);
if (data->ed_resolvedname)
	PNBUF_PUT(data->ed_resolvedname);
}
static int
exec_resolvename(struct lwp *l, struct exec_package *epp, struct vnode *vp,
char **rpath)
{
int error;
char *p;
KASSERT(rpath != NULL);
*rpath = PNBUF_GET();
error = vnode_to_path(*rpath, MAXPATHLEN, vp, l, l->l_proc);
if (error) {
DPRINTF(("%s: can't resolve name for %s, error %d\n",
__func__, epp->ep_kname, error));
PNBUF_PUT(*rpath);
*rpath = NULL;
return error;
}
epp->ep_resolvedname = *rpath;
if ((p = strrchr(*rpath, '/')) != NULL)
	epp->ep_kname = p + 1;
return 0;
}
/*
* check exec:
* given an "executable" described in the exec package's namei info,
* see what we can do with it.
*
* ON ENTRY:
* exec package with appropriate namei info
* lwp pointer of exec'ing lwp
* NO SELF-LOCKED VNODES
*
* ON EXIT:
* error: nothing held, etc. exec header still allocated.
* ok: filled exec package, executable's vnode (unlocked).
*
* EXEC SWITCH ENTRY:
* Locked vnode to check, exec package, proc.
*
* EXEC SWITCH EXIT:
* ok: return 0, filled exec package, executable's vnode (unlocked).
* error: destructive:
* everything deallocated except the exec header.
* non-destructive:
* error code, executable's vnode (unlocked),
* exec header unmodified.
*/
int
/*ARGSUSED*/
check_exec(struct lwp *l, struct exec_package *epp, struct pathbuf *pb,
char **rpath)
{
int error, i;
struct vnode *vp;
size_t resid;
if (epp->ep_resolvedname) {
struct nameidata nd;
/* grab the absolute pathbuf here before namei() trashes it. */
pathbuf_copystring(pb, epp->ep_resolvedname, PATH_MAX);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
/* first get the vnode */
if ((error = namei(&nd)) != 0)
return error;
epp->ep_vp = vp = nd.ni_vp;
#ifdef DIAGNOSTIC
/* paranoia (take this out once namei stuff stabilizes) */
memset(nd.ni_pnbuf, '~', PATH_MAX);
#endif
} else {
struct file *fp;
if ((error = fd_getvnode(epp->ep_xfd, &fp)) != 0)
return error;
epp->ep_vp = vp = fp->f_vnode;
vref(vp);
fd_putfile(epp->ep_xfd);
if ((error = exec_resolvename(l, epp, vp, rpath)) != 0)
return error;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
}
/* check access and type */
if (vp->v_type != VREG) {
error = EACCES;
goto bad1;
}
if ((error = VOP_ACCESS(vp, VEXEC, l->l_cred)) != 0)
goto bad1;
/* get attributes */
/* XXX VOP_GETATTR is the only thing that needs LK_EXCLUSIVE here */
if ((error = VOP_GETATTR(vp, epp->ep_vap, l->l_cred)) != 0)
goto bad1;
/* Check mount point */
if (vp->v_mount->mnt_flag & MNT_NOEXEC) {
error = EACCES;
goto bad1;
}
if (vp->v_mount->mnt_flag & MNT_NOSUID)
epp->ep_vap->va_mode &= ~(S_ISUID | S_ISGID);
/* try to open it */
if ((error = VOP_OPEN(vp, FREAD, l->l_cred)) != 0)
goto bad1;
/* now we have the file, get the exec header */
error = vn_rdwr(UIO_READ, vp, epp->ep_hdr, epp->ep_hdrlen, 0,
UIO_SYSSPACE, IO_NODELOCKED, l->l_cred, &resid, NULL);
if (error)
goto bad1;
/* unlock vp, since we need it unlocked from here on out. */
VOP_UNLOCK(vp);
#if NVERIEXEC > 0
error = veriexec_verify(l, vp,
epp->ep_resolvedname ? epp->ep_resolvedname : epp->ep_kname,
epp->ep_flags & EXEC_INDIR ? VERIEXEC_INDIRECT : VERIEXEC_DIRECT,
NULL);
if (error)
goto bad2;
#endif /* NVERIEXEC > 0 */
#ifdef PAX_SEGVGUARD
error = pax_segvguard(l, vp, epp->ep_resolvedname, false);
if (error)
goto bad2;
#endif /* PAX_SEGVGUARD */
epp->ep_hdrvalid = epp->ep_hdrlen - resid;
/*
* Set up default address space limits. Can be overridden
* by individual exec packages.
*/
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS;
/*
* set up the vmcmds for creation of the process
* address space
*/
error = ENOEXEC;
for (i = 0; i < nexecs; i++) {
int newerror;
epp->ep_esch = execsw[i];
newerror = (*execsw[i]->es_makecmds)(l, epp);
if (!newerror) {
/* Seems ok: check that entry point is not too high */
if (epp->ep_entry >= epp->ep_vm_maxaddr) {
#ifdef DIAGNOSTIC
printf("%s: rejecting %p due to "
"too high entry address (>= %p)\n",
__func__, (void *)epp->ep_entry,
(void *)epp->ep_vm_maxaddr);
#endif
error = ENOEXEC;
break;
}
/* Seems ok: check that entry point is not too low */
if (epp->ep_entry < epp->ep_vm_minaddr) {
#ifdef DIAGNOSTIC
printf("%s: rejecting %p due to "
"too low entry address (< %p)\n",
__func__, (void *)epp->ep_entry,
(void *)epp->ep_vm_minaddr);
#endif
error = ENOEXEC;
break;
}
/* check limits */
#ifdef DIAGNOSTIC
#define LMSG "%s: rejecting due to %s limit (%ju > %ju)\n"
#endif
#ifdef MAXTSIZ
if (epp->ep_tsize > MAXTSIZ) {
#ifdef DIAGNOSTIC
printf(LMSG, __func__, "text",
(uintmax_t)epp->ep_tsize,
(uintmax_t)MAXTSIZ);
#endif
error = ENOMEM;
break;
}
#endif
vsize_t dlimit =
(vsize_t)l->l_proc->p_rlimit[RLIMIT_DATA].rlim_cur;
if (epp->ep_dsize > dlimit) {
#ifdef DIAGNOSTIC
printf(LMSG, __func__, "data",
(uintmax_t)epp->ep_dsize,
(uintmax_t)dlimit);
#endif
error = ENOMEM;
break;
}
return 0;
}
/*
* Reset all the fields that may have been modified by the
* loader.
*/
KASSERT(epp->ep_emul_arg == NULL);
if (epp->ep_emul_root != NULL) {
vrele(epp->ep_emul_root);
epp->ep_emul_root = NULL;
}
if (epp->ep_interp != NULL) {
vrele(epp->ep_interp);
epp->ep_interp = NULL;
}
epp->ep_pax_flags = 0;
/* make sure the first "interesting" error code is saved. */
if (error == ENOEXEC)
error = newerror;
if (epp->ep_flags & EXEC_DESTR)
/* Error from "#!" code, tidied up by recursive call */
return error;
}
/* not found, error */
/*
* free any vmspace-creation commands,
* and release their references
*/
kill_vmcmds(&epp->ep_vmcmds);
#if NVERIEXEC > 0 || defined(PAX_SEGVGUARD)
bad2:
#endif
/*
* close and release the vnode, restore the old one, free the
* pathname buf, and punt.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(vp, FREAD, l->l_cred);
vput(vp);
return error;
bad1:
/*
* free the namei pathname buffer, and put the vnode
* (which we don't yet have open).
*/
vput(vp); /* was still locked */
return error;
}
#ifdef __MACHINE_STACK_GROWS_UP
#define STACK_PTHREADSPACE NBPG
#else
#define STACK_PTHREADSPACE 0
#endif
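/*
* Default fetch_element hook for execve1()/posix_spawn(): copy in the
* index'th pointer of a user-space argv/envp array.  Callers with a
* different user pointer size (32-bit compat code, for instance) are
* expected to pass their own execve_fetch_element_t instead.
*/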
static int
execve_fetch_element(char * const *array, size_t index, char **value)
{
return copyin(array + index, value, sizeof(*value));
}
/*
* exec system call
*/
int
sys_execve(struct lwp *l, const struct sys_execve_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(char * const *) argp;
syscallarg(char * const *) envp;
} */
return execve1(l, true, SCARG(uap, path), -1, SCARG(uap, argp),
SCARG(uap, envp), execve_fetch_element);
}
int
sys_fexecve(struct lwp *l, const struct sys_fexecve_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(char * const *) argp;
syscallarg(char * const *) envp;
} */
return execve1(l, false, NULL, SCARG(uap, fd), SCARG(uap, argp),
SCARG(uap, envp), execve_fetch_element);
}
/*
* Load modules to try to execute an image that we do not understand.
* If no execsw entries are present, we load those likely to be needed
* in order to run native images only. Otherwise, we autoload all
* possible modules that could let us run the binary. XXX lame
*/
static void
exec_autoload(void)
{
#ifdef MODULAR
static const char * const native[] = {
"exec_elf32",
"exec_elf64",
"exec_script",
NULL
};
static const char * const compat[] = {
"exec_elf32",
"exec_elf64",
"exec_script",
"exec_aout",
"exec_coff",
"exec_ecoff",
"compat_aoutm68k",
"compat_netbsd32",
#if 0
"compat_linux",
"compat_linux32",
#endif
"compat_sunos",
"compat_sunos32",
"compat_ultrix",
NULL
};
char const * const *list;
int i;
list = nexecs == 0 ? native : compat;
for (i = 0; list[i] != NULL; i++) {
if (module_autoload(list[i], MODULE_CLASS_EXEC) != 0) {
continue;
}
yield();
}
#endif
}
/*
* Copy the user- or kernel-supplied upath into the allocated pathbuf *pbp,
* making it absolute in the process by prepending the current working
* directory if it is not already.  If offs is supplied, it is set to the
* offset at which the original copy of upath starts within the result.
*/
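/*
* For example (hypothetical values): with a current working directory of
* "/usr" and upath "bin/ls", *pbp ends up holding "/usr/bin/ls" and, if
* supplied, *offs is set to 5, the offset of the original "bin/ls" within
* the result.  An already-absolute upath is used as is, with *offs set
* to 0.
*/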
int
exec_makepathbuf(struct lwp *l, const char *upath, enum uio_seg seg,
struct pathbuf **pbp, size_t *offs)
{
char *path, *bp;
size_t len, tlen;
int error;
struct cwdinfo *cwdi;
path = PNBUF_GET();
if (seg == UIO_SYSSPACE) {
error = copystr(upath, path, MAXPATHLEN, &len);
} else {
error = copyinstr(upath, path, MAXPATHLEN, &len);
}
if (error)
goto err;
if (path[0] == '/') {
if (offs)
*offs = 0;
goto out;
}
len++;
if (len + 1 >= MAXPATHLEN) {
error = ENAMETOOLONG;
goto err;
}
bp = path + MAXPATHLEN - len;
memmove(bp, path, len);
*(--bp) = '/';
cwdi = l->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path, MAXPATHLEN / 2,
GETCWD_CHECK_ACCESS, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto err;
tlen = path + MAXPATHLEN - bp;
memmove(path, bp, tlen);
path[tlen - 1] = '\0';
if (offs)
*offs = tlen - len;
out:
*pbp = pathbuf_assimilate(path);
return 0;
err:
PNBUF_PUT(path);
return error;
}
vaddr_t
exec_vm_minaddr(vaddr_t va_min)
{
/*
* Increase va_min if we don't want NULL to be mappable by the
* process.
*/
#define VM_MIN_GUARD PAGE_SIZE
if (user_va0_disable && (va_min < VM_MIN_GUARD))
return VM_MIN_GUARD;
return va_min;
}
static int
execve_loadvm(struct lwp *l, bool has_path, const char *path, int fd,
char * const *args, char * const *envs,
execve_fetch_element_t fetch_element,
struct execve_data * restrict data)
{
struct exec_package * const epp = &data->ed_pack;
int error;
struct proc *p;
char *dp;
u_int modgen;
KASSERT(data != NULL);
p = l->l_proc;
modgen = 0;
SDT_PROBE(proc, kernel, , exec, path, 0, 0, 0, 0);
/*
* Check if we have exceeded our number of processes limit.
* This is so that we handle the case where a root daemon
* forked, ran setuid to become the desired user and is trying
* to exec. The obvious place to do the reference counting check
* is setuid(), but we don't do the reference counting check there
* like other OS's do because then all the programs that use setuid()
* must be modified to check the return code of setuid() and exit().
* It is dangerous to make setuid() fail, because it fails open and
* the program will continue to run as root. If we make it succeed
* and return an error code, again we are not enforcing the limit.
* The best place to enforce the limit is here, when the process tries
* to execute a new image, because eventually the process will need
* to call exec in order to do something useful.
*/
retry:
if (p->p_flag & PK_SUGID) {
if (kauth_authorize_process(l->l_cred, KAUTH_PROCESS_RLIMIT,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p->p_rlimit[RLIMIT_NPROC],
KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
chgproccnt(kauth_cred_getuid(l->l_cred), 0) >
p->p_rlimit[RLIMIT_NPROC].rlim_cur)
return EAGAIN;
}
/*
* Drain existing references and forbid new ones. The process
* should be left alone until we're done here. This is necessary
* to avoid race conditions - e.g. in ptrace() - that might allow
* a local user to illicitly obtain elevated privileges.
*/
rw_enter(&p->p_reflock, RW_WRITER);
if (has_path) {
size_t offs;
/*
* Init the namei data to point at the user's program name.
* This is done here rather than in check_exec(), so that these
* settings can be overridden if any of the makecmds/probe
* functions calls check_exec() recursively - for example,
* see exec_script_makecmds().
*/
if ((error = exec_makepathbuf(l, path, UIO_USERSPACE,
&data->ed_pathbuf, &offs)) != 0)
goto clrflg;
data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
epp->ep_kname = data->ed_pathstring + offs;
data->ed_resolvedname = PNBUF_GET();
epp->ep_resolvedname = data->ed_resolvedname;
epp->ep_xfd = -1;
} else {
data->ed_pathbuf = pathbuf_assimilate(strcpy(PNBUF_GET(), "/"));
data->ed_pathstring = pathbuf_stringcopy_get(data->ed_pathbuf);
epp->ep_kname = "*fexecve*";
data->ed_resolvedname = NULL;
epp->ep_resolvedname = NULL;
epp->ep_xfd = fd;
}
/*
* initialize the fields of the exec package.
*/
epp->ep_hdr = kmem_alloc(exec_maxhdrsz, KM_SLEEP);
epp->ep_hdrlen = exec_maxhdrsz;
epp->ep_hdrvalid = 0;
epp->ep_emul_arg = NULL;
epp->ep_emul_arg_free = NULL;
memset(&epp->ep_vmcmds, 0, sizeof(epp->ep_vmcmds));
epp->ep_vap = &data->ed_attr;
epp->ep_flags = (p->p_flag & PK_32) ? EXEC_FROM32 : 0;
MD_TOPDOWN_INIT(epp);
epp->ep_emul_root = NULL;
epp->ep_interp = NULL;
epp->ep_esch = NULL;
epp->ep_pax_flags = 0;
memset(epp->ep_machine_arch, 0, sizeof(epp->ep_machine_arch));
rw_enter(&exec_lock, RW_READER);
/* see if we can run it. */
if ((error = check_exec(l, epp, data->ed_pathbuf,
&data->ed_resolvedname)) != 0) {
if (error != ENOENT && error != EACCES && error != ENOEXEC) {
DPRINTF(("%s: check exec failed for %s, error %d\n",
__func__, epp->ep_kname, error));
}
goto freehdr;
}
/* allocate an argument buffer */
data->ed_argp = pool_get(&exec_pool, PR_WAITOK);
KASSERT(data->ed_argp != NULL);
dp = data->ed_argp;
if ((error = copyinargs(data, args, envs, fetch_element, &dp)) != 0) {
goto bad;
}
/*
* Calculate the new stack size.
*/
#ifdef __MACHINE_STACK_GROWS_UP
/*
* copyargs() fills argc/argv/envp from the lower address even on
* __MACHINE_STACK_GROWS_UP machines. Reserve a few words just below the SP
* so that _rtld() can use it.
*/
#define RTLD_GAP 32
#else
#define RTLD_GAP 0
#endif
const size_t argenvstrlen = (char *)ALIGN(dp) - data->ed_argp;
data->ed_argslen = calcargs(data, argenvstrlen);
const size_t len = calcstack(data, pax_aslr_stack_gap(epp) + RTLD_GAP);
if (len > epp->ep_ssize) {
/* in effect, compare to initial limit */
DPRINTF(("%s: stack limit exceeded %zu\n", __func__, len));
error = ENOMEM;
goto bad;
}
/* adjust "active stack depth" for process VSZ */
epp->ep_ssize = len;
return 0;
bad:
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
/* kill any opened file descriptor, if necessary */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
}
/* close and put the exec'd file */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
vput(epp->ep_vp);
pool_put(&exec_pool, data->ed_argp);
freehdr:
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
rw_exit(&exec_lock);
exec_path_free(data);
clrflg:
rw_exit(&p->p_reflock);
if (modgen != module_gen && error == ENOEXEC) {
modgen = module_gen;
exec_autoload();
goto retry;
}
SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
return error;
}
static int
execve_dovmcmds(struct lwp *l, struct execve_data * restrict data)
{
struct exec_package * const epp = &data->ed_pack;
struct proc *p = l->l_proc;
struct exec_vmcmd *base_vcp;
int error = 0;
size_t i;
/* record proc's vnode, for use by procfs and others */
if (p->p_textvp)
vrele(p->p_textvp);
vref(epp->ep_vp);
p->p_textvp = epp->ep_vp;
/* create the new process's VM space by running the vmcmds */
KASSERTMSG(epp->ep_vmcmds.evs_used != 0, "%s: no vmcmds", __func__);
#ifdef TRACE_EXEC
DUMPVMCMDS(epp, 0, 0);
#endif
base_vcp = NULL;
for (i = 0; i < epp->ep_vmcmds.evs_used && !error; i++) {
struct exec_vmcmd *vcp;
vcp = &epp->ep_vmcmds.evs_cmds[i];
if (vcp->ev_flags & VMCMD_RELATIVE) {
KASSERTMSG(base_vcp != NULL,
"%s: relative vmcmd with no base", __func__);
KASSERTMSG((vcp->ev_flags & VMCMD_BASE) == 0,
"%s: illegal base & relative vmcmd", __func__);
vcp->ev_addr += base_vcp->ev_addr;
}
error = (*vcp->ev_proc)(l, vcp);
if (error)
DUMPVMCMDS(epp, i, error);
if (vcp->ev_flags & VMCMD_BASE)
base_vcp = vcp;
}
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, l->l_cred);
vput(epp->ep_vp);
/* if an error happened, deallocate and punt */
if (error != 0) {
DPRINTF(("%s: vmcmd %zu failed: %d\n", __func__, i - 1, error));
}
return error;
}
static void
execve_free_data(struct execve_data *data)
{
struct exec_package * const epp = &data->ed_pack;
/* free the vmspace-creation commands, and release their references */
kill_vmcmds(&epp->ep_vmcmds);
/* kill any opened file descriptor, if necessary */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
}
/* close and put the exec'd file */
vn_lock(epp->ep_vp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(epp->ep_vp, FREAD, curlwp->l_cred);
vput(epp->ep_vp);
pool_put(&exec_pool, data->ed_argp);
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
exec_path_free(data);
}
static void
pathexec(struct proc *p, const char *resolvedname)
{
/* set command name & other accounting info */
const char *cmdname;
if (resolvedname == NULL) {
cmdname = "*fexecve*";
resolvedname = "/";
} else {
cmdname = strrchr(resolvedname, '/') + 1;
}
KASSERTMSG(resolvedname[0] == '/', "bad resolvedname `%s'",
resolvedname);
strlcpy(p->p_comm, cmdname, sizeof(p->p_comm));
kmem_strfree(p->p_path);
p->p_path = kmem_strdupsize(resolvedname, NULL, KM_SLEEP);
}
/* XXX elsewhere */
static int
credexec(struct lwp *l, struct execve_data *data)
{
struct proc *p = l->l_proc;
struct vattr *attr = &data->ed_attr;
int error;
/*
* Deal with set[ug]id. MNT_NOSUID has already been used to disable
* s[ug]id. It's OK to check for PSL_TRACED here as we have blocked
* out additional references on the process for the moment.
*/
if ((p->p_slflag & PSL_TRACED) == 0 &&
(((attr->va_mode & S_ISUID) != 0 &&
kauth_cred_geteuid(l->l_cred) != attr->va_uid) ||
((attr->va_mode & S_ISGID) != 0 &&
kauth_cred_getegid(l->l_cred) != attr->va_gid))) {
/*
* Mark the process as SUGID before we do
* anything that might block.
*/
proc_crmod_enter();
proc_crmod_leave(NULL, NULL, true);
if (data->ed_argc == 0) {
DPRINTF((
"%s: not executing set[ug]id binary with no args\n",
__func__));
return EINVAL;
}
/* Make sure file descriptors 0..2 are in use. */
if ((error = fd_checkstd()) != 0) {
DPRINTF(("%s: fdcheckstd failed %d\n",
__func__, error));
return error;
}
/*
* Copy the credential so other references don't see our
* changes.
*/
l->l_cred = kauth_cred_copy(l->l_cred);
#ifdef KTRACE
/*
* If the persistent trace flag isn't set, turn off tracing.
*/
if (p->p_tracep) {
mutex_enter(&ktrace_lock);
if (!(p->p_traceflag & KTRFAC_PERSISTENT))
ktrderef(p);
mutex_exit(&ktrace_lock);
}
#endif
if (attr->va_mode & S_ISUID)
kauth_cred_seteuid(l->l_cred, attr->va_uid);
if (attr->va_mode & S_ISGID)
kauth_cred_setegid(l->l_cred, attr->va_gid);
} else {
if (kauth_cred_geteuid(l->l_cred) ==
kauth_cred_getuid(l->l_cred) &&
kauth_cred_getegid(l->l_cred) ==
kauth_cred_getgid(l->l_cred))
p->p_flag &= ~PK_SUGID;
}
/*
* Copy the credential so other references don't see our changes.
* Test to see if this is necessary first, since in the common case
* we won't need a private reference.
*/
if (kauth_cred_geteuid(l->l_cred) != kauth_cred_getsvuid(l->l_cred) ||
kauth_cred_getegid(l->l_cred) != kauth_cred_getsvgid(l->l_cred)) {
l->l_cred = kauth_cred_copy(l->l_cred);
kauth_cred_setsvuid(l->l_cred, kauth_cred_geteuid(l->l_cred));
kauth_cred_setsvgid(l->l_cred, kauth_cred_getegid(l->l_cred));
}
/* Update the master credentials. */
if (l->l_cred != p->p_cred) {
kauth_cred_t ocred;
mutex_enter(p->p_lock);
ocred = p->p_cred;
p->p_cred = kauth_cred_hold(l->l_cred);
mutex_exit(p->p_lock);
kauth_cred_free(ocred);
}
return 0;
}
static void
emulexec(struct lwp *l, struct exec_package *epp)
{
struct proc *p = l->l_proc;
/* The emulation root will usually have been found when we looked
* for the ELF interpreter (or similar); if not, look for it now. */
if (epp->ep_esch->es_emul->e_path != NULL &&
epp->ep_emul_root == NULL)
emul_find_root(l, epp);
/* Any old emulation root got removed by fdcloseexec */
rw_enter(&p->p_cwdi->cwdi_lock, RW_WRITER);
p->p_cwdi->cwdi_edir = epp->ep_emul_root;
rw_exit(&p->p_cwdi->cwdi_lock);
epp->ep_emul_root = NULL;
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
/*
* Call emulation specific exec hook. This can setup per-process
* p->p_emuldata or do any other per-process stuff an emulation needs.
*
* If we are executing a process of a different emulation than the
* original forked process, call e_proc_exit() of the old emulation
* first, then e_proc_exec() of the new emulation. If the emulation
* is the same, the exec hook code should deallocate any old emulation
* resources held previously by this process.
*/
if (p->p_emul && p->p_emul->e_proc_exit
&& p->p_emul != epp->ep_esch->es_emul)
(*p->p_emul->e_proc_exit)(p);
/*
* Call exec hook. Emulation code may NOT store a reference to anything
* from &pack.
*/
if (epp->ep_esch->es_emul->e_proc_exec)
(*epp->ep_esch->es_emul->e_proc_exec)(p, epp);
/* update p_emul, the old value is no longer needed */
p->p_emul = epp->ep_esch->es_emul;
/* ...and the same for p_execsw */
p->p_execsw = epp->ep_esch;
#ifdef __HAVE_SYSCALL_INTERN
(*p->p_emul->e_syscall_intern)(p);
#endif
ktremul();
}
static int
execve_runproc(struct lwp *l, struct execve_data * restrict data,
bool no_local_exec_lock, bool is_spawn)
{
struct exec_package * const epp = &data->ed_pack;
int error = 0;
struct proc *p;
struct vmspace *vm;
/*
* In case of a posix_spawn operation, the child doing the exec
* might not hold the reader lock on exec_lock, but the parent
* will do this instead.
*/
KASSERT(no_local_exec_lock || rw_lock_held(&exec_lock));
KASSERT(!no_local_exec_lock || is_spawn);
KASSERT(data != NULL);
p = l->l_proc;
/* Get rid of other LWPs. */
if (p->p_nlwps > 1) {
mutex_enter(p->p_lock);
exit_lwps(l);
mutex_exit(p->p_lock);
}
KDASSERT(p->p_nlwps == 1);
/*
* All of the other LWPs got rid of their robust futexes
* when they exited above, but we might still have some
* to dispose of. Do that now.
*/
if (__predict_false(l->l_robust_head != 0)) {
futex_release_all_lwp(l);
/*
* Since this LWP will live on with a different
* program image, we need to clear the robust
* futex list pointer here.
*/
l->l_robust_head = 0;
}
/* Destroy any lwpctl info. */
if (p->p_lwpctl != NULL)
lwp_ctl_exit();
/* Remove POSIX timers */
ptimers_free(p, TIMERS_POSIX);
/* Set the PaX flags. */
pax_set_flags(epp, p);
/*
* Do whatever is necessary to prepare the address space
* for remapping. Note that this might replace the current
* vmspace with another!
*
* vfork(): do not touch any user space data in the new child
* until we have awoken the parent below, or it will defeat
* lazy pmap switching (on x86).
*/
if (is_spawn)
uvmspace_spawn(l, epp->ep_vm_minaddr,
epp->ep_vm_maxaddr,
epp->ep_flags & EXEC_TOPDOWN_VM);
else
uvmspace_exec(l, epp->ep_vm_minaddr,
epp->ep_vm_maxaddr,
epp->ep_flags & EXEC_TOPDOWN_VM);
vm = p->p_vmspace;
vm->vm_taddr = (void *)epp->ep_taddr;
vm->vm_tsize = btoc(epp->ep_tsize);
vm->vm_daddr = (void*)epp->ep_daddr;
vm->vm_dsize = btoc(epp->ep_dsize);
vm->vm_ssize = btoc(epp->ep_ssize);
vm->vm_issize = 0;
vm->vm_maxsaddr = (void *)epp->ep_maxsaddr;
vm->vm_minsaddr = (void *)epp->ep_minsaddr;
pax_aslr_init_vm(l, vm, epp);
cwdexec(p);
fd_closeexec(); /* handle close on exec */
if (__predict_false(ktrace_on))
fd_ktrexecfd();
execsigs(p); /* reset caught signals */
mutex_enter(p->p_lock);
l->l_ctxlink = NULL; /* reset ucontext link */
p->p_acflag &= ~AFORK;
p->p_flag |= PK_EXEC;
mutex_exit(p->p_lock);
error = credexec(l, data);
if (error)
goto exec_abort;
#if defined(__HAVE_RAS)
/*
* Remove all RASs from the address space.
*/
ras_purgeall();
#endif
/*
* Stop profiling.
*/
if ((p->p_stflag & PST_PROFIL) != 0) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
/*
* It's OK to test PL_PPWAIT unlocked here, as other LWPs have
* exited and exec()/exit() are the only places it will be cleared.
*
* Once the parent has been awoken, curlwp may teleport to a new CPU
* in sched_vforkexec(), and it's then OK to start messing with user
* data. See comment above.
*/
if ((p->p_lflag & PL_PPWAIT) != 0) {
bool samecpu;
lwp_t *lp;
mutex_enter(&proc_lock);
lp = p->p_vforklwp;
p->p_vforklwp = NULL;
l->l_lwpctl = NULL; /* was on loan from blocked parent */
/* Clear flags after cv_broadcast() (scheduler needs them). */
p->p_lflag &= ~PL_PPWAIT;
lp->l_vforkwaiting = false;
/* If parent is still on same CPU, teleport curlwp elsewhere. */
samecpu = (lp->l_cpu == curlwp->l_cpu);
cv_broadcast(&lp->l_waitcv);
mutex_exit(&proc_lock);
/* Give the parent its CPU back - find a new home. */
KASSERT(!is_spawn);
sched_vforkexec(l, samecpu);
}
/* Now map address space. */
error = execve_dovmcmds(l, data);
if (error != 0)
goto exec_abort;
pathexec(p, epp->ep_resolvedname);
char * const newstack = STACK_GROW(vm->vm_minsaddr, epp->ep_ssize);
error = copyoutargs(data, l, newstack);
if (error != 0)
goto exec_abort;
doexechooks(p);
/*
* Set initial SP at the top of the stack.
*
* Note that on machines where stack grows up (e.g. hppa), SP points to
* the end of arg/env strings. Userland guesses the address of argc
* via ps_strings::ps_argvstr.
*/
/* Setup new registers and do misc. setup. */
(*epp->ep_esch->es_emul->e_setregs)(l, epp, (vaddr_t)newstack);
if (epp->ep_esch->es_setregs)
(*epp->ep_esch->es_setregs)(l, epp, (vaddr_t)newstack);
/* Provide a consistent LWP private setting */
(void)lwp_setprivate(l, NULL);
/* Discard all PCU state; need to start fresh */
pcu_discard_all(l);
/* map the process's signal trampoline code */
if ((error = exec_sigcode_map(p, epp->ep_esch->es_emul)) != 0) {
DPRINTF(("%s: map sigcode failed %d\n", __func__, error));
goto exec_abort;
}
pool_put(&exec_pool, data->ed_argp);
/*
* Notify anyone who might care that we've exec'd.
*
* This is slightly racy; someone could sneak in and
* attach a knote after we've decided not to notify,
* or vice-versa, but that's not particularly bothersome.
* knote_proc_exec() will acquire p->p_lock as needed.
*/
if (!SLIST_EMPTY(&p->p_klist)) {
knote_proc_exec(p);
}
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
SDT_PROBE(proc, kernel, , exec__success, epp->ep_kname, 0, 0, 0, 0);
emulexec(l, epp);
/* Allow new references from the debugger/procfs. */
rw_exit(&p->p_reflock);
if (!no_local_exec_lock)
rw_exit(&exec_lock);
mutex_enter(&proc_lock);
/* posix_spawn(3) reports a single event with implied exec(3) */
if ((p->p_slflag & PSL_TRACED) && !is_spawn) {
mutex_enter(p->p_lock);
eventswitch(TRAP_EXEC, 0, 0);
mutex_enter(&proc_lock);
}
if (p->p_sflag & PS_STOPEXEC) {
ksiginfoq_t kq;
KASSERT(l->l_blcnt == 0);
p->p_pptr->p_nstopchild++;
p->p_waited = 0;
mutex_enter(p->p_lock);
ksiginfo_queue_init(&kq);
sigclearall(p, &contsigmask, &kq);
lwp_lock(l);
l->l_stat = LSSTOP;
p->p_stat = SSTOP;
p->p_nrlwps--;
lwp_unlock(l);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
ksiginfo_queue_drain(&kq);
} else {
mutex_exit(&proc_lock);
}
exec_path_free(data);
#ifdef TRACE_EXEC
DPRINTF(("%s finished\n", __func__));
#endif
return EJUSTRETURN;
exec_abort:
SDT_PROBE(proc, kernel, , exec__failure, error, 0, 0, 0, 0);
rw_exit(&p->p_reflock);
if (!no_local_exec_lock)
rw_exit(&exec_lock);
exec_path_free(data);
/*
* the old process doesn't exist anymore. exit gracefully.
* get rid of the (new) address space we have created, if any, get rid
* of our namei data and vnode, and exit noting failure
*/
if (vm != NULL) {
uvm_deallocate(&vm->vm_map, VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS - VM_MIN_ADDRESS);
}
exec_free_emul_arg(epp);
pool_put(&exec_pool, data->ed_argp);
kmem_free(epp->ep_hdr, epp->ep_hdrlen);
if (epp->ep_emul_root != NULL)
vrele(epp->ep_emul_root);
if (epp->ep_interp != NULL)
vrele(epp->ep_interp);
/* Acquire the sched-state mutex (exit1() will release it). */
if (!is_spawn) {
mutex_enter(p->p_lock);
exit1(l, error, SIGABRT);
}
return error;
}
int
execve1(struct lwp *l, bool has_path, const char *path, int fd,
char * const *args, char * const *envs,
execve_fetch_element_t fetch_element)
{
struct execve_data data;
int error;
error = execve_loadvm(l, has_path, path, fd, args, envs, fetch_element,
&data);
if (error)
return error;
error = execve_runproc(l, &data, false, false);
return error;
}
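/*
* Pointer sizes differ between the process doing the exec and the image
* being exec'd: EXEC_FROM32 describes the caller (and hence the layout of
* the argv/envp arrays we copy in), while EXEC_32 describes the new image
* (and hence the pointers we lay out on its stack).
*/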
static size_t
fromptrsz(const struct exec_package *epp)
{
return (epp->ep_flags & EXEC_FROM32) ? sizeof(int) : sizeof(char *);
}
static size_t
ptrsz(const struct exec_package *epp)
{
return (epp->ep_flags & EXEC_32) ? sizeof(int) : sizeof(char *);
}
static size_t
calcargs(struct execve_data * restrict data, const size_t argenvstrlen)
{
struct exec_package * const epp = &data->ed_pack;
const size_t nargenvptrs =
1 + /* long argc */
data->ed_argc + /* char *argv[] */
1 + /* \0 */
data->ed_envc + /* char *env[] */
1; /* \0 */
return (nargenvptrs * ptrsz(epp)) /* pointers */
+ argenvstrlen /* strings */
+ epp->ep_esch->es_arglen; /* auxinfo */
}
static size_t
calcstack(struct execve_data * restrict data, const size_t gaplen)
{
struct exec_package * const epp = &data->ed_pack;
data->ed_szsigcode = epp->ep_esch->es_emul->e_esigcode -
epp->ep_esch->es_emul->e_sigcode;
data->ed_ps_strings_sz = (epp->ep_flags & EXEC_32) ?
sizeof(struct ps_strings32) : sizeof(struct ps_strings);
const size_t sigcode_psstr_sz =
data->ed_szsigcode + /* sigcode */
data->ed_ps_strings_sz + /* ps_strings */
STACK_PTHREADSPACE; /* pthread space */
const size_t stacklen =
data->ed_argslen +
gaplen +
sigcode_psstr_sz;
/* make the stack "safely" aligned */
return STACK_LEN_ALIGN(stacklen, STACK_ALIGNBYTES);
}
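/*
* The value returned above is the full top-of-stack reservation checked
* against ep_ssize in execve_loadvm(): the argument/environment block
* (ed_argslen), the ASLR/RTLD gap passed in as gaplen, the signal
* trampoline and ps_strings sizes recorded in ed_szsigcode and
* ed_ps_strings_sz, plus STACK_PTHREADSPACE, rounded up to
* STACK_ALIGNBYTES.
*/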
static int
copyoutargs(struct execve_data * restrict data, struct lwp *l,
char * const newstack)
{
struct exec_package * const epp = &data->ed_pack;
struct proc *p = l->l_proc;
int error;
memset(&data->ed_arginfo, 0, sizeof(data->ed_arginfo));
/* remember information about the process */
data->ed_arginfo.ps_nargvstr = data->ed_argc;
data->ed_arginfo.ps_nenvstr = data->ed_envc;
/*
* Allocate the stack address passed to the newly execve()'ed process.
*
* The new stack address will be set to the SP (stack pointer) register
* in setregs().
*/
char *newargs = STACK_ALLOC(
STACK_SHRINK(newstack, data->ed_argslen), data->ed_argslen);
error = (*epp->ep_esch->es_copyargs)(l, epp,
&data->ed_arginfo, &newargs, data->ed_argp);
if (error) {
DPRINTF(("%s: copyargs failed %d\n", __func__, error));
return error;
}
error = copyoutpsstrs(data, p);
if (error != 0)
return error;
return 0;
}
static int
copyoutpsstrs(struct execve_data * restrict data, struct proc *p)
{
struct exec_package * const epp = &data->ed_pack;
struct ps_strings32 arginfo32;
void *aip;
int error;
/* fill process ps_strings info */
p->p_psstrp = (vaddr_t)STACK_ALLOC(STACK_GROW(epp->ep_minsaddr,
STACK_PTHREADSPACE), data->ed_ps_strings_sz);
if (epp->ep_flags & EXEC_32) {
aip = &arginfo32;
arginfo32.ps_argvstr = (vaddr_t)data->ed_arginfo.ps_argvstr;
arginfo32.ps_nargvstr = data->ed_arginfo.ps_nargvstr;
arginfo32.ps_envstr = (vaddr_t)data->ed_arginfo.ps_envstr;
arginfo32.ps_nenvstr = data->ed_arginfo.ps_nenvstr;
} else
aip = &data->ed_arginfo;
/* copy out the process's ps_strings structure */
if ((error = copyout(aip, (void *)p->p_psstrp, data->ed_ps_strings_sz))
!= 0) {
DPRINTF(("%s: ps_strings copyout %p->%p size %zu failed\n",
__func__, aip, (void *)p->p_psstrp, data->ed_ps_strings_sz));
return error;
}
return 0;
}
static int
copyinargs(struct execve_data * restrict data, char * const *args,
char * const *envs, execve_fetch_element_t fetch_element, char **dpp)
{
struct exec_package * const epp = &data->ed_pack;
char *dp;
size_t i;
int error;
dp = *dpp;
data->ed_argc = 0;
/* copy the fake args list, if there's one, freeing it as we go */
if (epp->ep_flags & EXEC_HASARGL) {
struct exec_fakearg *fa = epp->ep_fa;
while (fa->fa_arg != NULL) {
const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
size_t len;
len = strlcpy(dp, fa->fa_arg, maxlen);
/* Count NUL into len. */
if (len < maxlen)
len++;
else {
while (fa->fa_arg != NULL) {
kmem_free(fa->fa_arg, fa->fa_len);
fa++;
}
kmem_free(epp->ep_fa, epp->ep_fa_len);
epp->ep_flags &= ~EXEC_HASARGL;
return E2BIG;
}
ktrexecarg(fa->fa_arg, len - 1);
dp += len;
kmem_free(fa->fa_arg, fa->fa_len);
fa++;
data->ed_argc++;
}
kmem_free(epp->ep_fa, epp->ep_fa_len);
epp->ep_flags &= ~EXEC_HASARGL;
}
/*
* Read and count argument strings from user.
*/
if (args == NULL) {
DPRINTF(("%s: null args\n", __func__));
return EINVAL;
}
if (epp->ep_flags & EXEC_SKIPARG)
args = (const void *)((const char *)args + fromptrsz(epp));
i = 0;
error = copyinargstrs(data, args, fetch_element, &dp, &i, ktr_execarg);
if (error != 0) {
DPRINTF(("%s: copyin arg %d\n", __func__, error));
return error;
}
data->ed_argc += i;
/*
* Read and count environment strings from user.
*/
data->ed_envc = 0;
/* environment need not be there */
if (envs == NULL)
goto done;
i = 0;
error = copyinargstrs(data, envs, fetch_element, &dp, &i, ktr_execenv);
if (error != 0) {
DPRINTF(("%s: copyin env %d\n", __func__, error));
return error;
}
data->ed_envc += i;
done:
*dpp = dp;
return 0;
}
static int
copyinargstrs(struct execve_data * restrict data, char * const *strs,
execve_fetch_element_t fetch_element, char **dpp, size_t *ip,
void (*ktr)(const void *, size_t))
{
char *dp, *sp;
size_t i;
int error;
dp = *dpp;
i = 0;
while (1) {
const size_t maxlen = ARG_MAX - (dp - data->ed_argp);
size_t len;
if ((error = (*fetch_element)(strs, i, &sp)) != 0) {
return error;
}
if (!sp)
break;
if ((error = copyinstr(sp, dp, maxlen, &len)) != 0) {
if (error == ENAMETOOLONG)
error = E2BIG;
return error;
}
if (__predict_false(ktrace_on))
(*ktr)(dp, len - 1);
dp += len;
i++;
}
*dpp = dp;
*ip = i;
return 0;
}
/*
* Copy argv and env strings from kernel buffer (argp) to the new stack.
* Those strings are located just after auxinfo.
*/
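/*
* Resulting layout, from *stackp upward (lower to higher addresses):
*
*	argc | argv[0..argc-1] | NULL | envp[0..envc-1] | NULL |
*	auxinfo (es_arglen bytes) | argv/env strings
*
* On return *stackp points just past the envp terminator; the es_arglen
* bytes reserved there for auxinfo are left untouched here (an
* emulation-specific copyargs wrapper is expected to fill them in).
*/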
int
copyargs(struct lwp *l, struct exec_package *pack, struct ps_strings *arginfo,
char **stackp, void *argp)
{
char **cpp, *dp, *sp;
size_t len;
void *nullp;
long argc, envc;
int error;
cpp = (char **)*stackp;
nullp = NULL;
argc = arginfo->ps_nargvstr;
envc = arginfo->ps_nenvstr;
/* argc on stack is long */
CTASSERT(sizeof(*cpp) == sizeof(argc));
dp = (char *)(cpp +
1 + /* long argc */
argc + /* char *argv[] */
1 + /* \0 */
envc + /* char *env[] */
1) + /* \0 */
pack->ep_esch->es_arglen; /* auxinfo */
sp = argp;
if ((error = copyout(&argc, cpp++, sizeof(argc))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(argc));
return error;
}
/* XXX don't copy them out, remap them! */
arginfo->ps_argvstr = cpp; /* remember location of argv for later */
for (; --argc >= 0; sp += len, dp += len) {
if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(dp));
return error;
}
if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
COPYPRINTF("str", dp, (size_t)ARG_MAX);
return error;
}
}
if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(nullp));
return error;
}
arginfo->ps_envstr = cpp; /* remember location of envp for later */
for (; --envc >= 0; sp += len, dp += len) {
if ((error = copyout(&dp, cpp++, sizeof(dp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(dp));
return error;
}
if ((error = copyoutstr(sp, dp, ARG_MAX, &len)) != 0) {
COPYPRINTF("str", dp, (size_t)ARG_MAX);
return error;
}
}
if ((error = copyout(&nullp, cpp++, sizeof(nullp))) != 0) {
COPYPRINTF("", cpp - 1, sizeof(nullp));
return error;
}
*stackp = (char *)cpp;
return 0;
}
/*
* Add execsw[] entries.
*/
int
exec_add(struct execsw *esp, int count)
{
struct exec_entry *it;
int i, error = 0;
if (count == 0) {
return 0;
}
/* Check for duplicates. */
rw_enter(&exec_lock, RW_WRITER);
for (i = 0; i < count; i++) {
LIST_FOREACH(it, &ex_head, ex_list) {
/* assume unique (makecmds, probe_func, emulation) */
if (it->ex_sw->es_makecmds == esp[i].es_makecmds &&
it->ex_sw->u.elf_probe_func ==
esp[i].u.elf_probe_func &&
it->ex_sw->es_emul == esp[i].es_emul) {
rw_exit(&exec_lock);
return EEXIST;
}
}
}
/* Allocate new entries. */
for (i = 0; i < count; i++) {
it = kmem_alloc(sizeof(*it), KM_SLEEP);
it->ex_sw = &esp[i];
error = exec_sigcode_alloc(it->ex_sw->es_emul);
if (error != 0) {
kmem_free(it, sizeof(*it));
break;
}
LIST_INSERT_HEAD(&ex_head, it, ex_list);
}
/* If even one fails, back them all out. */
if (error != 0) {
for (i--; i >= 0; i--) {
it = LIST_FIRST(&ex_head);
LIST_REMOVE(it, ex_list);
exec_sigcode_free(it->ex_sw->es_emul);
kmem_free(it, sizeof(*it));
}
return error;
}
/* update execsw[] */
exec_init(0);
rw_exit(&exec_lock);
return 0;
}
/*
* Remove execsw[] entry.
*/
int
exec_remove(struct execsw *esp, int count)
{
struct exec_entry *it, *next;
int i;
const struct proclist_desc *pd;
proc_t *p;
if (count == 0) {
return 0;
}
/* Abort if any are busy. */
rw_enter(&exec_lock, RW_WRITER);
for (i = 0; i < count; i++) {
mutex_enter(&proc_lock);
for (pd = proclists; pd->pd_list != NULL; pd++) {
PROCLIST_FOREACH(p, pd->pd_list) {
if (p->p_execsw == &esp[i]) {
mutex_exit(&proc_lock);
rw_exit(&exec_lock);
return EBUSY;
}
}
}
mutex_exit(&proc_lock);
}
/* None are busy, so remove them all. */
for (i = 0; i < count; i++) {
for (it = LIST_FIRST(&ex_head); it != NULL; it = next) {
next = LIST_NEXT(it, ex_list);
if (it->ex_sw == &esp[i]) {
LIST_REMOVE(it, ex_list);
exec_sigcode_free(it->ex_sw->es_emul);
kmem_free(it, sizeof(*it));
break;
}
}
}
/* update execsw[] */
exec_init(0);
rw_exit(&exec_lock);
return 0;
}
/*
* Initialize exec structures. If init_boot is true, also does necessary
* one-time initialization (it's called from main() that way).
* Once the system is multiuser, this should be called with exec_lock held,
* i.e. via exec_{add|remove}().
*/
int
exec_init(int init_boot)
{
const struct execsw **sw;
struct exec_entry *ex;
SLIST_HEAD(,exec_entry) first;
SLIST_HEAD(,exec_entry) any;
SLIST_HEAD(,exec_entry) last;
int i, sz;
if (init_boot) {
/* do one-time initializations */
vaddr_t vmin = 0, vmax;
rw_init(&exec_lock);
exec_map = uvm_km_suballoc(kernel_map, &vmin, &vmax,
maxexec*NCARGS, VM_MAP_PAGEABLE, false, NULL);
pool_init(&exec_pool, NCARGS, 0, 0, PR_NOALIGN|PR_NOTOUCH,
"execargs", &exec_palloc, IPL_NONE);
pool_sethardlimit(&exec_pool, maxexec, "should not happen", 0);
} else {
KASSERT(rw_write_held(&exec_lock));
}
/* Sort each entry onto the appropriate queue. */
SLIST_INIT(&first);
SLIST_INIT(&any);
SLIST_INIT(&last);
sz = 0;
LIST_FOREACH(ex, &ex_head, ex_list) {
switch(ex->ex_sw->es_prio) {
case EXECSW_PRIO_FIRST:
SLIST_INSERT_HEAD(&first, ex, ex_slist);
break;
case EXECSW_PRIO_ANY:
SLIST_INSERT_HEAD(&any, ex, ex_slist);
break;
case EXECSW_PRIO_LAST:
SLIST_INSERT_HEAD(&last, ex, ex_slist);
break;
default:
panic("%s", __func__);
break;
}
sz++;
}
/*
* Create new execsw[]. Ensure we do not try a zero-sized
* allocation.
*/
sw = kmem_alloc(sz * sizeof(struct execsw *) + 1, KM_SLEEP);
i = 0;
SLIST_FOREACH(ex, &first, ex_slist) {
sw[i++] = ex->ex_sw;
}
SLIST_FOREACH(ex, &any, ex_slist) {
sw[i++] = ex->ex_sw;
}
SLIST_FOREACH(ex, &last, ex_slist) {
sw[i++] = ex->ex_sw;
}
/* Replace old execsw[] and free used memory. */
if (execsw != NULL) {
kmem_free(__UNCONST(execsw),
nexecs * sizeof(struct execsw *) + 1);
}
execsw = sw;
nexecs = sz;
/* Figure out the maximum size of an exec header. */
exec_maxhdrsz = sizeof(int);
for (i = 0; i < nexecs; i++) {
if (execsw[i]->es_hdrsz > exec_maxhdrsz)
exec_maxhdrsz = execsw[i]->es_hdrsz;
}
return 0;
}
int
exec_sigcode_alloc(const struct emul *e)
{
vaddr_t va;
vsize_t sz;
int error;
struct uvm_object *uobj;
KASSERT(rw_lock_held(&exec_lock));
if (e == NULL || e->e_sigobject == NULL)
return 0;
sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
if (sz == 0)
return 0;
/*
* Create a sigobject for this emulation.
*
* sigobject is an anonymous memory object (just like SYSV shared
* memory) that we keep a permanent reference to and that we map
* in all processes that need this sigcode. The creation is simple,
* we create an object, add a permanent reference to it, map it in
* kernel space, copy out the sigcode to it and unmap it.
* We map it with PROT_READ|PROT_EXEC into the process just
* the way sys_mmap() would map it.
*/
if (*e->e_sigobject == NULL) {
uobj = uao_create(sz, 0);
(*uobj->pgops->pgo_reference)(uobj);
va = vm_map_min(kernel_map);
if ((error = uvm_map(kernel_map, &va, round_page(sz),
uobj, 0, 0,
UVM_MAPFLAG(UVM_PROT_RW, UVM_PROT_RW,
UVM_INH_SHARE, UVM_ADV_RANDOM, 0)))) {
printf("sigcode kernel mapping failed %d\n", error);
(*uobj->pgops->pgo_detach)(uobj);
return error;
}
memcpy((void *)va, e->e_sigcode, sz);
#ifdef PMAP_NEED_PROCWR
pmap_procwr(&proc0, va, sz);
#endif
uvm_unmap(kernel_map, va, va + round_page(sz));
*e->e_sigobject = uobj;
KASSERT(uobj->uo_refs == 1);
} else {
/* if already created, reference++ */
uobj = *e->e_sigobject;
(*uobj->pgops->pgo_reference)(uobj);
}
return 0;
}
void
exec_sigcode_free(const struct emul *e)
{
struct uvm_object *uobj;
KASSERT(rw_lock_held(&exec_lock));
if (e == NULL || e->e_sigobject == NULL)
return;
uobj = *e->e_sigobject;
if (uobj == NULL)
return;
if (uobj->uo_refs == 1)
*e->e_sigobject = NULL; /* I'm the last person to reference. */
(*uobj->pgops->pgo_detach)(uobj);
}
static int
exec_sigcode_map(struct proc *p, const struct emul *e)
{
vaddr_t va;
vsize_t sz;
int error;
struct uvm_object *uobj;
sz = (vaddr_t)e->e_esigcode - (vaddr_t)e->e_sigcode;
if (e->e_sigobject == NULL || sz == 0)
return 0;
uobj = *e->e_sigobject;
if (uobj == NULL)
return 0;
/* Just a hint to uvm_map where to put it. */
va = e->e_vm_default_addr(p, (vaddr_t)p->p_vmspace->vm_daddr,
round_page(sz), p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
#ifdef __alpha__
/*
* Tru64 puts /sbin/loader at the end of user virtual memory,
* which causes the above calculation to put the sigcode at
* an invalid address. Put it just below the text instead.
*/
if (va == (vaddr_t)vm_map_max(&p->p_vmspace->vm_map)) {
va = (vaddr_t)p->p_vmspace->vm_taddr - round_page(sz);
}
#endif
(*uobj->pgops->pgo_reference)(uobj);
error = uvm_map(&p->p_vmspace->vm_map, &va, round_page(sz),
uobj, 0, 0,
UVM_MAPFLAG(UVM_PROT_RX, UVM_PROT_RX, UVM_INH_SHARE,
UVM_ADV_RANDOM, 0));
if (error) {
DPRINTF(("%s, %d: map %p "
"uvm_map %#"PRIxVSIZE"@%#"PRIxVADDR" failed %d\n",
__func__, __LINE__, &p->p_vmspace->vm_map, round_page(sz),
va, error));
(*uobj->pgops->pgo_detach)(uobj);
return error;
}
p->p_sigctx.ps_sigcode = (void *)va;
return 0;
}
/*
* Release a refcount on spawn_exec_data and destroy memory, if this
* was the last one.
*/
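/*
* The membar_release()/membar_acquire() pair below gives the usual
* release/acquire ordering for reference counting: all stores made by an
* lwp before it drops its reference happen-before the lwp that drops the
* last reference tears the structure down.
*/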
static void
spawn_exec_data_release(struct spawn_exec_data *data)
{
membar_release();
if (atomic_dec_32_nv(&data->sed_refcnt) != 0)
return;
membar_acquire();
cv_destroy(&data->sed_cv_child_ready);
mutex_destroy(&data->sed_mtx_child);
if (data->sed_actions)
posix_spawn_fa_free(data->sed_actions,
data->sed_actions->len);
if (data->sed_attrs)
kmem_free(data->sed_attrs,
sizeof(*data->sed_attrs));
kmem_free(data, sizeof(*data));
}
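/*
* Replay the file actions recorded by the userland
* posix_spawn_file_actions_*() calls, in order, in the context of the
* child: FAE_OPEN closes any descriptor already occupying fae_fildes and
* opens fae_path onto it, FAE_DUP2/FAE_CLOSE mirror dup2()/close(), and
* FAE_CHDIR/FAE_FCHDIR change the working directory.  A typical userland
* sequence (illustrative only) would be
* posix_spawn_file_actions_addopen(&fa, STDOUT_FILENO, "log", ...)
* followed by posix_spawn(..., &fa, ...).
*/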
static int
handle_posix_spawn_file_actions(struct posix_spawn_file_actions *actions)
{
struct lwp *l = curlwp;
register_t retval;
int error, newfd;
if (actions == NULL)
return 0;
for (size_t i = 0; i < actions->len; i++) {
const struct posix_spawn_file_actions_entry *fae =
&actions->fae[i];
switch (fae->fae_action) {
case FAE_OPEN:
if (fd_getfile(fae->fae_fildes) != NULL) {
error = fd_close(fae->fae_fildes);
if (error)
return error;
}
error = fd_open(fae->fae_path, fae->fae_oflag,
fae->fae_mode, &newfd);
if (error)
return error;
if (newfd != fae->fae_fildes) {
error = dodup(l, newfd,
fae->fae_fildes, 0, &retval);
if (fd_getfile(newfd) != NULL)
fd_close(newfd);
}
break;
case FAE_DUP2:
error = dodup(l, fae->fae_fildes,
fae->fae_newfildes, 0, &retval);
break;
case FAE_CLOSE:
if (fd_getfile(fae->fae_fildes) == NULL) {
return EBADF;
}
error = fd_close(fae->fae_fildes);
break;
case FAE_CHDIR:
error = do_sys_chdir(l, fae->fae_chdir_path,
UIO_SYSSPACE, &retval);
break;
case FAE_FCHDIR:
error = do_sys_fchdir(l, fae->fae_fildes, &retval);
break;
}
if (error)
return error;
}
return 0;
}
static int
handle_posix_spawn_attrs(struct posix_spawnattr *attrs, struct proc *parent)
{
struct sigaction sigact;
int error;
struct proc *p = curproc;
struct lwp *l = curlwp;
if (attrs == NULL)
return 0;
memset(&sigact, 0, sizeof(sigact));
sigact._sa_u._sa_handler = SIG_DFL;
sigact.sa_flags = 0;
/*
* set state to SSTOP so that this proc can be found by pid.
* see proc_enterpgrp, do_sched_setparam below
*/
mutex_enter(&proc_lock);
/*
* p_stat should be SACTIVE, so we need to adjust the
* parent's p_nstopchild here. For safety, just make sure
* we're on the good side of SDEAD before we adjust.
*/
int ostat = p->p_stat;
KASSERT(ostat < SSTOP);
p->p_stat = SSTOP;
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
mutex_exit(&proc_lock);
/* Set process group */
if (attrs->sa_flags & POSIX_SPAWN_SETPGROUP) {
pid_t mypid = p->p_pid;
pid_t pgrp = attrs->sa_pgroup;
if (pgrp == 0)
pgrp = mypid;
error = proc_enterpgrp(parent, mypid, pgrp, false);
if (error)
goto out;
}
/* Set scheduler policy */
if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDULER)
error = do_sched_setparam(p->p_pid, 0, attrs->sa_schedpolicy,
&attrs->sa_schedparam);
else if (attrs->sa_flags & POSIX_SPAWN_SETSCHEDPARAM) {
error = do_sched_setparam(parent->p_pid, 0,
SCHED_NONE, &attrs->sa_schedparam);
}
if (error)
goto out;
/* Reset user ID's */
if (attrs->sa_flags & POSIX_SPAWN_RESETIDS) {
error = do_setresgid(l, -1, kauth_cred_getgid(l->l_cred), -1,
ID_E_EQ_R | ID_E_EQ_S);
if (error)
return error;
error = do_setresuid(l, -1, kauth_cred_getuid(l->l_cred), -1,
ID_E_EQ_R | ID_E_EQ_S);
if (error)
goto out;
}
/* Set signal masks/defaults */
if (attrs->sa_flags & POSIX_SPAWN_SETSIGMASK) {
mutex_enter(p->p_lock);
error = sigprocmask1(l, SIG_SETMASK, &attrs->sa_sigmask, NULL);
mutex_exit(p->p_lock);
if (error)
goto out;
}
if (attrs->sa_flags & POSIX_SPAWN_SETSIGDEF) {
/*
* The following sigaction call is using a sigaction
* version 0 trampoline which is in the compatibility
* code only. This is not a problem because for SIG_DFL
* and SIG_IGN, the trampolines are now ignored. If they
* were not, this would be a problem because we are
* holding the exec_lock, and the compat code needs
* to do the same in order to replace the trampoline
* code of the process.
*/
for (int i = 1; i <= NSIG; i++) {
if (sigismember(&attrs->sa_sigdefault, i))
sigaction1(l, i, &sigact, NULL, NULL, 0);
}
}
error = 0;
out:
mutex_enter(&proc_lock);
p->p_stat = ostat;
p->p_pptr->p_nstopchild--;
mutex_exit(&proc_lock);
return error;
}
/*
* A child lwp of a posix_spawn operation starts here and ends up in
* cpu_spawn_return, dealing with all filedescriptor and scheduler
* manipulations in between.
* The parent waits for the child, as it is not clear whether the child
* will be able to acquire its own exec_lock. If it can, the parent can
* be released early and continue running in parallel. If not (or if the
* magic debug flag is passed in the scheduler attribute struct), the
* child rides on the parent's exec lock until it is ready to return
* to userland - and only then releases the parent. This method loses
* concurrency, but improves error reporting.
*/
static void
spawn_return(void *arg)
{
struct spawn_exec_data *spawn_data = arg;
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int error;
bool have_reflock;
bool parent_is_waiting = true;
/*
* Check if we can release parent early.
* We either need to have no sed_attrs, or sed_attrs does not
* have POSIX_SPAWN_RETURNERROR or one of the flags that require
* safe access to the parent proc (passed in sed_parent).
* We then try to get the exec_lock, and only if that works, we can
* release the parent here already.
*/
struct posix_spawnattr *attrs = spawn_data->sed_attrs;
if ((!attrs || (attrs->sa_flags
& (POSIX_SPAWN_RETURNERROR|POSIX_SPAWN_SETPGROUP)) == 0)
&& rw_tryenter(&exec_lock, RW_READER)) {
parent_is_waiting = false;
mutex_enter(&spawn_data->sed_mtx_child);
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
}
/* don't allow debugger access yet */
rw_enter(&p->p_reflock, RW_WRITER);
have_reflock = true;
/* handle posix_spawnattr */
error = handle_posix_spawn_attrs(attrs, spawn_data->sed_parent);
if (error)
goto report_error;
/* handle posix_spawn_file_actions */
error = handle_posix_spawn_file_actions(spawn_data->sed_actions);
if (error)
goto report_error;
/* now do the real exec */
error = execve_runproc(l, &spawn_data->sed_exec, parent_is_waiting,
true);
have_reflock = false;
if (error == EJUSTRETURN)
error = 0;
else if (error)
goto report_error;
if (parent_is_waiting) {
mutex_enter(&spawn_data->sed_mtx_child);
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
}
/* release our refcount on the data */
spawn_exec_data_release(spawn_data);
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) ==
(PSL_TRACED|PSL_TRACEDCHILD)) {
eventswitchchild(p, TRAP_CHLD, PTRACE_POSIX_SPAWN);
}
/* and finally: leave to userland for the first time */
cpu_spawn_return(l);
/* NOTREACHED */
return;
report_error:
if (have_reflock) {
/*
* We have not passed through execve_runproc(),
* which would have released the p_reflock and also
* taken ownership of the sed_exec part of spawn_data,
* so release/free both here.
*/
rw_exit(&p->p_reflock);
execve_free_data(&spawn_data->sed_exec);
}
if (parent_is_waiting) {
/* pass error to parent */
mutex_enter(&spawn_data->sed_mtx_child);
spawn_data->sed_error = error;
cv_signal(&spawn_data->sed_cv_child_ready);
mutex_exit(&spawn_data->sed_mtx_child);
} else {
rw_exit(&exec_lock);
}
/* release our refcount on the data */
spawn_exec_data_release(spawn_data);
/* done, exit */
mutex_enter(p->p_lock);
/*
* Posix explicitly asks for an exit code of 127 if we report
* errors from the child process - so, unfortunately, there
* is no way to report a more exact error code.
* A NetBSD specific workaround is POSIX_SPAWN_RETURNERROR as
* flag bit in the attrp argument to posix_spawn(2), see above.
*/
exit1(l, 127, 0);
}
static __inline char **
posix_spawn_fae_path(struct posix_spawn_file_actions_entry *fae)
{
switch (fae->fae_action) {
case FAE_OPEN:
return &fae->fae_path;
case FAE_CHDIR:
return &fae->fae_chdir_path;
default:
return NULL;
}
}
void
posix_spawn_fa_free(struct posix_spawn_file_actions *fa, size_t len)
{
for (size_t i = 0; i < len; i++) {
char **pathp = posix_spawn_fae_path(&fa->fae[i]);
if (pathp)
kmem_strfree(*pathp);
}
if (fa->len > 0)
kmem_free(fa->fae, sizeof(*fa->fae) * fa->len);
kmem_free(fa, sizeof(*fa));
}
static int
posix_spawn_fa_alloc(struct posix_spawn_file_actions **fap,
const struct posix_spawn_file_actions *ufa, rlim_t lim)
{
struct posix_spawn_file_actions *fa;
struct posix_spawn_file_actions_entry *fae;
char *pbuf = NULL;
int error;
size_t i = 0;
fa = kmem_alloc(sizeof(*fa), KM_SLEEP);
error = copyin(ufa, fa, sizeof(*fa));
if (error || fa->len == 0) {
kmem_free(fa, sizeof(*fa));
return error; /* 0 if not an error, and len == 0 */
}
if (fa->len > lim) {
kmem_free(fa, sizeof(*fa));
return EINVAL;
}
fa->size = fa->len;
size_t fal = fa->len * sizeof(*fae);
fae = fa->fae;
fa->fae = kmem_alloc(fal, KM_SLEEP);
error = copyin(fae, fa->fae, fal);
if (error)
goto out;
pbuf = PNBUF_GET();
for (; i < fa->len; i++) {
char **pathp = posix_spawn_fae_path(&fa->fae[i]);
if (pathp == NULL)
continue;
error = copyinstr(*pathp, pbuf, MAXPATHLEN, &fal);
if (error)
goto out;
*pathp = kmem_alloc(fal, KM_SLEEP);
memcpy(*pathp, pbuf, fal);
}
PNBUF_PUT(pbuf);
*fap = fa;
return 0;
out:
if (pbuf)
PNBUF_PUT(pbuf);
posix_spawn_fa_free(fa, i);
return error;
}
/*
* N.B. increments nprocs upon success. Callers need to drop nprocs if
* they fail for some other reason.
*/
int
check_posix_spawn(struct lwp *l1)
{
int error, tnprocs, count;
uid_t uid;
struct proc *p1;
p1 = l1->l_proc;
uid = kauth_cred_getuid(l1->l_cred);
tnprocs = atomic_inc_uint_nv(&nprocs);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create.
*/
if (__predict_false(tnprocs >= maxproc))
error = -1;
else
error = kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
if (error) {
atomic_dec_uint(&nprocs);
return EAGAIN;
}
/*
* Enforce limits.
*/
count = chgproccnt(uid, 1);
if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0 &&
__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
return EAGAIN;
}
return 0;
}
int
do_posix_spawn(struct lwp *l1, pid_t *pid_res, bool *child_ok, const char *path,
struct posix_spawn_file_actions *fa,
struct posix_spawnattr *sa,
char *const *argv, char *const *envp,
execve_fetch_element_t fetch)
{
struct proc *p1, *p2;
struct lwp *l2;
int error;
struct spawn_exec_data *spawn_data;
vaddr_t uaddr = 0;
pid_t pid;
bool have_exec_lock = false;
p1 = l1->l_proc;
/* Allocate and init spawn_data */
spawn_data = kmem_zalloc(sizeof(*spawn_data), KM_SLEEP);
spawn_data->sed_refcnt = 1; /* only parent so far */
cv_init(&spawn_data->sed_cv_child_ready, "pspawn");
mutex_init(&spawn_data->sed_mtx_child, MUTEX_DEFAULT, IPL_NONE);
mutex_enter(&spawn_data->sed_mtx_child);
/*
* Do the first part of the exec now, collect state
* in spawn_data.
*/
error = execve_loadvm(l1, true, path, -1, argv,
envp, fetch, &spawn_data->sed_exec);
if (error == EJUSTRETURN)
error = 0;
else if (error)
goto error_exit;
have_exec_lock = true;
/*
* Allocate virtual address space for the U-area now, while it
* is still easy to abort the fork operation if we're out of
* kernel virtual address space.
*/
uaddr = uvm_uarea_alloc();
if (__predict_false(uaddr == 0)) {
error = ENOMEM;
goto error_exit;
}
/*
* Allocate new proc. Borrow proc0 vmspace for it, we will
* replace it with its own before returning to userland
* in the child.
*/
p2 = proc_alloc();
if (p2 == NULL) {
/* We were unable to allocate a process ID. */
error = EAGAIN;
goto error_exit;
}
/*
* This is a point of no return, we will have to go through
* the child proc to properly clean it up past this point.
*/
pid = p2->p_pid;
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
memset(&p2->p_startzero, 0,
(unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
memcpy(&p2->p_startcopy, &p1->p_startcopy,
(unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
p2->p_vmspace = proc0.p_vmspace;
TAILQ_INIT(&p2->p_sigpend.sp_info);
LIST_INIT(&p2->p_lwps);
LIST_INIT(&p2->p_sigwaiters);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
* Inherit flags we want to keep. The flags related to SIGCHLD
* handling are important in order to keep a consistent behaviour
* for the child after the fork. If we are a 32-bit process, the
* child will be too.
*/
p2->p_flag =
p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
p2->p_emul = p1->p_emul;
p2->p_execsw = p1->p_execsw;
mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&p2->p_reflock);
cv_init(&p2->p_waitcv, "wait");
cv_init(&p2->p_lwpcv, "lwpwait");
p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
kauth_proc_fork(p1, p2);
p2->p_raslist = NULL;
p2->p_fd = fd_copy();
/* XXX racy */
p2->p_mqueue_cnt = p1->p_mqueue_cnt;
p2->p_cwdi = cwdinit();
/*
* Note: p_limit (rlimit stuff) is copy-on-write, so normally
* we just need to increase pl_refcnt.
*/
if (!p1->p_limit->pl_writeable) {
lim_addref(p1->p_limit);
p2->p_limit = p1->p_limit;
} else {
p2->p_limit = lim_copy(p1->p_limit);
}
p2->p_lflag = 0;
l1->l_vforkwaiting = false;
p2->p_sflag = 0;
p2->p_slflag = 0;
p2->p_pptr = p1;
p2->p_ppid = p1->p_pid;
LIST_INIT(&p2->p_children);
p2->p_aio = NULL;
#ifdef KTRACE
/*
* Copy traceflag and tracefile if enabled.
* If not inherited, these were zeroed above.
*/
if (p1->p_traceflag & KTRFAC_INHERIT) {
mutex_enter(&ktrace_lock);
p2->p_traceflag = p1->p_traceflag;
if ((p2->p_tracep = p1->p_tracep) != NULL)
ktradref(p2);
mutex_exit(&ktrace_lock);
}
#endif
/*
* Create signal actions for the child process.
*/
p2->p_sigacts = sigactsinit(p1, 0);
mutex_enter(p1->p_lock);
p2->p_sflag |=
(p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
sched_proc_fork(p1, p2);
mutex_exit(p1->p_lock);
p2->p_stflag = p1->p_stflag;
/*
* p_stats.
* Copy parts of p_stats, and zero out the rest.
*/
p2->p_stats = pstatscopy(p1->p_stats);
/* copy over machdep flags to the new proc */
cpu_proc_fork(p1, p2);
/*
* Prepare remaining parts of spawn data
*/
spawn_data->sed_actions = fa;
spawn_data->sed_attrs = sa;
spawn_data->sed_parent = p1;
/* create LWP */
lwp_create(l1, p2, uaddr, 0, NULL, 0, spawn_return, spawn_data,
&l2, l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
l2->l_ctxlink = NULL; /* reset ucontext link */
/*
* Copy the credential so other references don't see our changes.
* Test to see if this is necessary first, since in the common case
* we won't need a private reference.
*/
if (kauth_cred_geteuid(l2->l_cred) != kauth_cred_getsvuid(l2->l_cred) ||
kauth_cred_getegid(l2->l_cred) != kauth_cred_getsvgid(l2->l_cred)) {
l2->l_cred = kauth_cred_copy(l2->l_cred);
kauth_cred_setsvuid(l2->l_cred, kauth_cred_geteuid(l2->l_cred));
kauth_cred_setsvgid(l2->l_cred, kauth_cred_getegid(l2->l_cred));
}
/* Update the master credentials. */
if (l2->l_cred != p2->p_cred) {
kauth_cred_t ocred;
mutex_enter(p2->p_lock);
ocred = p2->p_cred;
p2->p_cred = kauth_cred_hold(l2->l_cred);
mutex_exit(p2->p_lock);
kauth_cred_free(ocred);
}
*child_ok = true;
spawn_data->sed_refcnt = 2; /* child gets it as well */
#if 0
l2->l_nopreempt = 1; /* start it non-preemptable */
#endif
/*
* It's now safe for the scheduler and other processes to see the
* child process.
*/
mutex_enter(&proc_lock);
if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
p2->p_lflag |= PL_CONTROLT;
LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
p2->p_exitsig = SIGCHLD; /* signal for parent on exit */
if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) ==
(PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
proc_changeparent(p2, p1->p_pptr);
SET(p2->p_slflag, PSL_TRACEDCHILD);
}
p2->p_oppid = p1->p_pid; /* Remember the original parent id. */
LIST_INSERT_AFTER(p1, p2, p_pglist);
LIST_INSERT_HEAD(&allproc, p2, p_list);
p2->p_trace_enabled = trace_is_enabled(p2);
#ifdef __HAVE_SYSCALL_INTERN
(*p2->p_emul->e_syscall_intern)(p2);
#endif
/*
* Make child runnable, set start time, and add to run queue except
* if the parent requested the child to start in SSTOP state.
*/
mutex_enter(p2->p_lock);
getmicrotime(&p2->p_stats->p_start);
lwp_lock(l2);
KASSERT(p2->p_nrlwps == 1);
KASSERT(l2->l_stat == LSIDL);
p2->p_nrlwps = 1;
p2->p_stat = SACTIVE;
setrunnable(l2);
/* LWP now unlocked */
mutex_exit(p2->p_lock);
mutex_exit(&proc_lock);
cv_wait(&spawn_data->sed_cv_child_ready, &spawn_data->sed_mtx_child);
error = spawn_data->sed_error;
mutex_exit(&spawn_data->sed_mtx_child);
spawn_exec_data_release(spawn_data);
rw_exit(&p1->p_reflock);
rw_exit(&exec_lock);
have_exec_lock = false;
*pid_res = pid;
if (error)
return error;
if (p1->p_slflag & PSL_TRACED) {
/* Paranoid check */
mutex_enter(&proc_lock);
if ((p1->p_slflag & (PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) !=
(PSL_TRACEPOSIX_SPAWN|PSL_TRACED)) {
mutex_exit(&proc_lock);
return 0;
}
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD, PTRACE_POSIX_SPAWN, pid);
}
return 0;
error_exit:
if (have_exec_lock) {
execve_free_data(&spawn_data->sed_exec);
rw_exit(&p1->p_reflock);
rw_exit(&exec_lock);
}
mutex_exit(&spawn_data->sed_mtx_child);
spawn_exec_data_release(spawn_data);
if (uaddr != 0)
    uvm_uarea_free(uaddr);
return error;
}
int
sys_posix_spawn(struct lwp *l1, const struct sys_posix_spawn_args *uap,
register_t *retval)
{
/* {
syscallarg(pid_t *) pid;
syscallarg(const char *) path;
syscallarg(const struct posix_spawn_file_actions *) file_actions;
syscallarg(const struct posix_spawnattr *) attrp;
syscallarg(char *const *) argv;
syscallarg(char *const *) envp;
} */
int error;
struct posix_spawn_file_actions *fa = NULL;
struct posix_spawnattr *sa = NULL;
pid_t pid;
bool child_ok = false;
rlim_t max_fileactions;
proc_t *p = l1->l_proc;
/* check_posix_spawn() increments nprocs for us. */
error = check_posix_spawn(l1);
if (error) {
*retval = error;
return 0;
}
/* copy in file_actions struct */
if (SCARG(uap, file_actions) != NULL) {
max_fileactions = 2 * uimin(p->p_rlimit[RLIMIT_NOFILE].rlim_cur,
maxfiles);
error = posix_spawn_fa_alloc(&fa, SCARG(uap, file_actions),
max_fileactions);
if (error)
goto error_exit;
}
/* copyin posix_spawnattr struct */
if (SCARG(uap, attrp) != NULL) {
sa = kmem_alloc(sizeof(*sa), KM_SLEEP);
error = copyin(SCARG(uap, attrp), sa, sizeof(*sa));
if (error)
goto error_exit;
}
/*
* Do the spawn
*/
error = do_posix_spawn(l1, &pid, &child_ok, SCARG(uap, path), fa, sa,
SCARG(uap, argv), SCARG(uap, envp), execve_fetch_element);
if (error)
goto error_exit;
if (error == 0 && SCARG(uap, pid) != NULL)
    error = copyout(&pid, SCARG(uap, pid), sizeof(pid));
*retval = error;
return 0;
error_exit:
if (!child_ok) {
(void)chgproccnt(kauth_cred_getuid(l1->l_cred), -1);
atomic_dec_uint(&nprocs);
if (sa)
    kmem_free(sa, sizeof(*sa));
if (fa)
    posix_spawn_fa_free(fa, fa->len);
}
*retval = error;
return 0;
}
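/*
 * Illustrative userland sketch (not kernel code, shown only to document how
 * this handler is normally reached): posix_spawn(3) passes its arguments
 * straight through to the syscall arguments listed above.  The path and
 * argument vector below are hypothetical examples.
 *
 *	#include <spawn.h>
 *	#include <sys/wait.h>
 *
 *	extern char **environ;
 *	pid_t pid;
 *	char *argv[] = { "ls", "-l", NULL };
 *
 *	if (posix_spawn(&pid, "/bin/ls", NULL, NULL, argv, environ) == 0)
 *		(void)waitpid(pid, NULL, 0);
 */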
void
exec_free_emul_arg(struct exec_package *epp)
{
if (epp->ep_emul_arg_free != NULL) {
KASSERT(epp->ep_emul_arg != NULL);
(*epp->ep_emul_arg_free)(epp->ep_emul_arg);
epp->ep_emul_arg_free = NULL;
epp->ep_emul_arg = NULL;
} else {
KASSERT(epp->ep_emul_arg == NULL);
}
}
#ifdef DEBUG_EXEC
static void
dump_vmcmds(const struct exec_package * const epp, size_t x, int error)
{
struct exec_vmcmd *vp = &epp->ep_vmcmds.evs_cmds[0];
size_t j;
if (error == 0)
DPRINTF(("vmcmds %u\n", epp->ep_vmcmds.evs_used));
else
DPRINTF(("vmcmds %zu/%u, error %d\n", x,
epp->ep_vmcmds.evs_used, error));
for (j = 0; j < epp->ep_vmcmds.evs_used; j++) {
DPRINTF(("vmcmd[%zu] = vmcmd_map_%s %#"
PRIxVADDR"/%#"PRIxVSIZE" fd@%#"
PRIxVSIZE" prot=0%o flags=%d\n", j,
vp[j].ev_proc == vmcmd_map_pagedvn ?
"pagedvn" :
vp[j].ev_proc == vmcmd_map_readvn ?
"readvn" :
vp[j].ev_proc == vmcmd_map_zero ?
"zero" : "*unknown*",
vp[j].ev_addr, vp[j].ev_len,
vp[j].ev_offset, vp[j].ev_prot,
vp[j].ev_flags));
if (error != 0 && j == x)
DPRINTF((" ^--- failed\n"));
}
}
#endif
/* $NetBSD: kern_module_hook.c,v 1.4 2019/12/13 08:02:53 skrll Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel module support.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_module_hook.c,v 1.4 2019/12/13 08:02:53 skrll Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/module_hook.h>
#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <uvm/uvm_extern.h>
/* Locking/synchronization stuff for module hooks */
static struct {
kmutex_t mtx;
kcondvar_t cv;
pserialize_t psz;
} module_hook __cacheline_aligned;
/*
* We use pserialize_perform() to issue a memory barrier on the current
* CPU and on all other CPUs so that all prior memory operations on the
* current CPU globally happen before all subsequent memory operations
* on the current CPU, as perceived by any other CPU.
*
* pserialize_perform() might be rather heavy-weight here, but it only
* happens during module loading, and it allows MODULE_HOOK_CALL() to
* work without any other memory barriers.
*/
void
module_hook_set(bool *hooked, struct localcount *lc)
{
KASSERT(kernconfig_is_held());
KASSERT(!*hooked);
localcount_init(lc);
/* Wait until setup has been witnessed by all CPUs. */
pserialize_perform(module_hook.psz);
/* Let others use it */
atomic_store_relaxed(hooked, true);
}
void
module_hook_unset(bool *hooked, struct localcount *lc)
{
KASSERT(kernconfig_is_held());
KASSERT(*hooked);
/* Get exclusive with pserialize and localcount. */
mutex_enter(&module_hook.mtx);
/* Prevent new calls to module_hook_tryenter(). */
atomic_store_relaxed(hooked, false);
/* Wait for existing calls to module_hook_tryenter(). */
pserialize_perform(module_hook.psz);
/* Wait for module_hook_exit. */
localcount_drain(lc, &module_hook.cv, &module_hook.mtx);
/* All done! */
mutex_exit(&module_hook.mtx);
localcount_fini(lc);
}
bool
module_hook_tryenter(bool *hooked, struct localcount *lc)
{
bool call_hook;
int s;
s = pserialize_read_enter();
call_hook = atomic_load_relaxed(hooked);
if (call_hook)
    localcount_acquire(lc);
pserialize_read_exit(s);
return call_hook;
}
void
module_hook_exit(struct localcount *lc)
{
localcount_release(lc, &module_hook.cv, &module_hook.mtx);
}
void
module_hook_init(void)
{
mutex_init(&module_hook.mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&module_hook.cv, "mod_hook");
module_hook.psz = pserialize_create();
}
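/*
 * Illustrative usage sketch (an assumption about a typical caller, not code
 * from this file; example_hooked, example_lc and example_fn are hypothetical
 * names):
 *
 *	static bool example_hooked;
 *	static struct localcount example_lc;
 *
 *	At module load, with the kernconfig lock held:
 *		module_hook_set(&example_hooked, &example_lc);
 *
 *	At each call site:
 *		if (module_hook_tryenter(&example_hooked, &example_lc)) {
 *			example_fn();
 *			module_hook_exit(&example_lc);
 *		}
 *
 *	At module unload, with the kernconfig lock held:
 *		module_hook_unset(&example_hooked, &example_lc);
 */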
/* $NetBSD: l2cap_upper.c,v 1.19 2016/12/12 15:58:45 maya Exp $ */
/*-
* Copyright (c) 2005 Iain Hibbert.
* Copyright (c) 2006 Itronix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Itronix Inc. may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: l2cap_upper.c,v 1.19 2016/12/12 15:58:45 maya Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <netbt/bluetooth.h>
#include <netbt/hci.h>
#include <netbt/l2cap.h>
/*******************************************************************************
*
* L2CAP Channel - Upper Protocol API
*/
/*
* l2cap_attach_pcb(handle, btproto, upper)
*
* attach new l2cap_channel to handle, populate
* with reasonable defaults
*/
int
l2cap_attach_pcb(struct l2cap_channel **handle,
const struct btproto *proto, void *upper)
{
struct l2cap_channel *chan;
KASSERT(handle != NULL);
KASSERT(proto != NULL);
KASSERT(upper != NULL);
chan = malloc(sizeof(struct l2cap_channel), M_BLUETOOTH,
M_NOWAIT | M_ZERO);
if (chan == NULL)
return ENOMEM;
chan->lc_proto = proto;
chan->lc_upper = upper;
chan->lc_state = L2CAP_CLOSED;
chan->lc_lcid = L2CAP_NULL_CID;
chan->lc_rcid = L2CAP_NULL_CID;
chan->lc_laddr.bt_len = sizeof(struct sockaddr_bt);
chan->lc_laddr.bt_family = AF_BLUETOOTH;
chan->lc_laddr.bt_psm = L2CAP_PSM_ANY;
chan->lc_raddr.bt_len = sizeof(struct sockaddr_bt);
chan->lc_raddr.bt_family = AF_BLUETOOTH;
chan->lc_raddr.bt_psm = L2CAP_PSM_ANY;
chan->lc_imtu = L2CAP_MTU_DEFAULT;
chan->lc_omtu = L2CAP_MTU_DEFAULT;
chan->lc_flush = L2CAP_FLUSH_TIMO_DEFAULT;
memcpy(&chan->lc_iqos, &l2cap_default_qos, sizeof(l2cap_qos_t));
memcpy(&chan->lc_oqos, &l2cap_default_qos, sizeof(l2cap_qos_t));
MBUFQ_INIT(&chan->lc_txq);
*handle = chan;
return 0;
}
/*
* l2cap_bind_pcb(l2cap_channel, sockaddr)
*
* set local address of channel
*/
int
l2cap_bind_pcb(struct l2cap_channel *chan, struct sockaddr_bt *addr)
{
if (chan->lc_lcid != L2CAP_NULL_CID)
return EINVAL;
memcpy(&chan->lc_laddr, addr, sizeof(struct sockaddr_bt));
return 0;
}
/*
* l2cap_sockaddr_pcb(l2cap_channel, sockaddr)
*
* get local address of channel
*/
int
l2cap_sockaddr_pcb(struct l2cap_channel *chan, struct sockaddr_bt *addr)
{
memcpy(addr, &chan->lc_laddr, sizeof(struct sockaddr_bt));
return 0;
}
/*
* l2cap_connect_pcb(l2cap_channel, sockaddr)
*
* Initiate a connection to destination. This corresponds to
* "Open Channel Request" in the L2CAP specification and will
* result in one of the following:
*
* proto->connected(upper)
* proto->disconnected(upper, error)
*
* and, optionally
* proto->connecting(upper)
*/
int
l2cap_connect_pcb(struct l2cap_channel *chan, struct sockaddr_bt *dest)
{
struct hci_unit *unit;
int err;
memcpy(&chan->lc_raddr, dest, sizeof(struct sockaddr_bt));
if (L2CAP_PSM_INVALID(chan->lc_raddr.bt_psm))
return EINVAL;
if (bdaddr_any(&chan->lc_raddr.bt_bdaddr))
return EDESTADDRREQ;
/* set local address if it needs setting */
if (bdaddr_any(&chan->lc_laddr.bt_bdaddr)) {
err = hci_route_lookup(&chan->lc_laddr.bt_bdaddr,
&chan->lc_raddr.bt_bdaddr);
if (err)
return err;
}
unit = hci_unit_lookup(&chan->lc_laddr.bt_bdaddr);
if (unit == NULL)
return EHOSTUNREACH;
/* attach to active list */
err = l2cap_cid_alloc(chan);
if (err)
return err;
/* open link to remote device */
chan->lc_link = hci_acl_open(unit, &chan->lc_raddr.bt_bdaddr);
if (chan->lc_link == NULL)
return EHOSTUNREACH;
/* set the link mode */
err = l2cap_setmode(chan);
if (err == EINPROGRESS) {
chan->lc_state = L2CAP_WAIT_SEND_CONNECT_REQ;
(*chan->lc_proto->connecting)(chan->lc_upper);
return 0;
}
if (err)
goto fail;
/*
* We can queue a connect request now even though the link may
* not yet be open; our mode setting is assured, and the queue
* will be started automatically at the right time.
*/
chan->lc_state = L2CAP_WAIT_RECV_CONNECT_RSP;
err = l2cap_send_connect_req(chan);
if (err)
goto fail;
return 0;
fail:
chan->lc_state = L2CAP_CLOSED;
hci_acl_close(chan->lc_link, err);
chan->lc_link = NULL;
return err;
}
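/*
 * Illustrative call sequence (a sketch of how an upper layer might drive
 * this API, based on the comments above; my_proto, my_pcb, laddr and raddr
 * are hypothetical):
 *
 *	struct l2cap_channel *chan;
 *
 *	l2cap_attach_pcb(&chan, &my_proto, my_pcb);
 *	l2cap_bind_pcb(chan, &laddr);		(optional)
 *	l2cap_connect_pcb(chan, &raddr);
 *
 * after which progress is reported through the btproto callbacks named
 * above: connecting() if the mode change is still in flight, then
 * connected() on success or disconnected(error) on failure.
 */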
/*
* l2cap_peeraddr_pcb(l2cap_channel, sockaddr)
*
* get remote address of channel
*/
int
l2cap_peeraddr_pcb(struct l2cap_channel *chan, struct sockaddr_bt *addr)
{
memcpy(addr, &chan->lc_raddr, sizeof(struct sockaddr_bt));
return 0;
}
/*
* l2cap_disconnect_pcb(l2cap_channel, linger)
*
* Initiate L2CAP disconnection. This corresponds to
* "Close Channel Request" in the L2CAP specification
* and will result in a call to
*
* proto->disconnected(upper, error)
*
* when the disconnection is complete. If linger is set,
* the call will not be made until data has flushed from
* the queue.
*/
int
l2cap_disconnect_pcb(struct l2cap_channel *chan, int linger)
{
int err = 0;
if (chan->lc_state == L2CAP_CLOSED
|| chan->lc_state == L2CAP_WAIT_DISCONNECT)
return EINVAL;
chan->lc_flags |= L2CAP_SHUTDOWN;
/*
* no need to do anything unless the queue is empty or
* we are not lingering..
*/
if ((MBUFQ_FIRST(&chan->lc_txq) == NULL && chan->lc_pending == 0)
|| linger == 0) {
chan->lc_state = L2CAP_WAIT_DISCONNECT;
err = l2cap_send_disconnect_req(chan);
if (err)
l2cap_close(chan, err);
}
return err;
}
/*
* l2cap_detach_pcb(handle)
*
* Detach l2cap channel from handle & close it down
*/
void
l2cap_detach_pcb(struct l2cap_channel **handle)
{
struct l2cap_channel *chan;
chan = *handle;
*handle = NULL;
if (chan->lc_state != L2CAP_CLOSED)
    l2cap_close(chan, 0);
if (chan->lc_lcid != L2CAP_NULL_CID) {
    LIST_REMOVE(chan, lc_ncid);
chan->lc_lcid = L2CAP_NULL_CID;
}
MBUFQ_DRAIN(&chan->lc_txq);
/*
* Could implement some kind of delayed expunge to make sure that the
* CID is really dead before it becomes available for reuse?
*/
free(chan, M_BLUETOOTH);
}
/*
* l2cap_listen_pcb(l2cap_channel)
*
* Use this channel as a listening post (until detached). This will
* result in calls to:
*
* proto->newconn(upper, laddr, raddr)
*
* for incoming connections matching the psm and local address of
* the channel. NULL address is permitted and matches any device.
* If L2CAP_PSM_ANY is bound the next higher unused value from the
* dynamic range (above 0x1001) will be selected.
*
* The upper layer should create and return a new channel.
*
* You cannot use this channel for anything else subsequent to this call
*/
int
l2cap_listen_pcb(struct l2cap_channel *chan)
{
struct l2cap_channel *used, *prev = NULL;
uint32_t psm;
if (chan->lc_lcid != L2CAP_NULL_CID)
return EINVAL;
/*
* This is simplistic, but it's not really worth spending a
* lot of time looking for an unused PSM.
*/
if (chan->lc_laddr.bt_psm == L2CAP_PSM_ANY) {
psm = 0x1001;
used = LIST_FIRST(&l2cap_listen_list);
if (used != NULL && used->lc_laddr.bt_psm >= psm) {
psm = used->lc_laddr.bt_psm + 0x0002;
if ((psm & 0x0100) != 0)
psm += 0x0100;
if (psm > UINT16_MAX)
return EADDRNOTAVAIL;
}
chan->lc_laddr.bt_psm = psm;
} else if (L2CAP_PSM_INVALID(chan->lc_laddr.bt_psm))
return EINVAL;
/*
* This CID is irrelevant, as the channel is not stored on the active
* list and the socket code does not allow operations on listening
* sockets, but we set it so the detach code knows to LIST_REMOVE the
* channel.
*/
chan->lc_lcid = L2CAP_SIGNAL_CID;
/*
* The list of listening channels is stored in an order such that new
* listeners don't usurp current listeners, but that specific listening
* takes precedence over promiscuous, and the connect request code can
* easily use the first matching entry.
*/
LIST_FOREACH(used, &l2cap_listen_list, lc_ncid) {
    if (used->lc_laddr.bt_psm < chan->lc_laddr.bt_psm)
break;
    if (used->lc_laddr.bt_psm == chan->lc_laddr.bt_psm
        && bdaddr_any(&used->lc_laddr.bt_bdaddr)
        && !bdaddr_any(&chan->lc_laddr.bt_bdaddr))
break;
prev = used;
}
if (prev == NULL)
    LIST_INSERT_HEAD(&l2cap_listen_list, chan, lc_ncid);
else
LIST_INSERT_AFTER(prev, chan, lc_ncid);
return 0;
}
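/*
 * Illustrative sketch (hypothetical scenario): binding L2CAP_PSM_ANY and
 * then listening picks the next unused PSM from the dynamic range.  With
 * one existing listener on PSM 0x1001, the new channel gets 0x1003; the
 * +0x0002 step keeps the low octet odd and the 0x0100 adjustment keeps
 * the low bit of the upper octet clear, as valid PSM values require.
 *
 *	struct sockaddr_bt addr;
 *
 *	memset(&addr, 0, sizeof(addr));
 *	addr.bt_len = sizeof(addr);
 *	addr.bt_family = AF_BLUETOOTH;
 *	addr.bt_psm = L2CAP_PSM_ANY;
 *
 *	l2cap_attach_pcb(&chan, &my_proto, my_pcb);
 *	l2cap_bind_pcb(chan, &addr);
 *	l2cap_listen_pcb(chan);
 *
 * Incoming connections then arrive via the newconn() callback described
 * above, which is expected to return a freshly attached channel.
 */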
/*
* l2cap_send_pcb(l2cap_channel, mbuf)
*
* Output SDU on channel described by channel. This corresponds
* to "Send Data Request" in the L2CAP specification. The upper
* layer will be notified when SDUs have completed sending by a
* call to:
*
* proto->complete(upper, n)
*
* (currently n == 1)
*
* Note: I'm not sure how this will work out, but I think that
* if outgoing Retransmission Mode or Flow Control Mode is
* negotiated then this call will not be made until the SDU has
* been acknowledged by the peer L2CAP entity. For 'Best Effort'
* it will be made when the packet has cleared the controller
* buffers.
*
* We only support Basic mode so far, so encapsulate with a
* B-Frame header and start sending if we are not already
*/
int
l2cap_send_pcb(struct l2cap_channel *chan, struct mbuf *m)
{
l2cap_hdr_t *hdr;
int plen;
if (chan->lc_state == L2CAP_CLOSED) {
m_freem(m);
return ENOTCONN;
}
plen = m->m_pkthdr.len;
DPRINTFN(5, "send %d bytes on CID #%d (pending = %d)\n",
plen, chan->lc_lcid, chan->lc_pending);
/* Encapsulate with B-Frame */
M_PREPEND(m, sizeof(l2cap_hdr_t), M_DONTWAIT);
if (m == NULL)
return ENOMEM;
hdr = mtod(m, l2cap_hdr_t *);
hdr->length = htole16(plen);
hdr->dcid = htole16(chan->lc_rcid);
/* Queue it on our list */
MBUFQ_ENQUEUE(&chan->lc_txq, m);
/* If we are not sending, then start doing so */
if (chan->lc_pending == 0)
return l2cap_start(chan);
return 0;
}
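/*
 * For reference, the Basic mode (B-Frame) encapsulation built above is just
 * the 4-octet L2CAP header prepended to the SDU, both fields little-endian:
 *
 *	+------------+------------+--------------------------+
 *	| length (2) |  dcid (2)  | payload (length octets)  |
 *	+------------+------------+--------------------------+
 *
 * where "length" counts the payload only and "dcid" is the destination
 * channel id on the remote side (lc_rcid).
 */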
/*
* l2cap_setopt(l2cap_channel, sopt)
*
* Apply configuration options to channel. This corresponds to
* "Configure Channel Request" in the L2CAP specification.
*
* for SO_L2CAP_LM, the settings will take effect when the
* channel is established. If the channel is already open,
* a call to
* proto->linkmode(upper, new)
*
* will be made when the change is complete.
*/
int
l2cap_setopt(struct l2cap_channel *chan, const struct sockopt *sopt)
{
int mode, err = 0;
uint16_t mtu;
switch (sopt->sopt_name) {
case SO_L2CAP_IMTU: /* set Incoming MTU */
err = sockopt_get(sopt, &mtu, sizeof(mtu));
if (err)
break;
if (mtu < L2CAP_MTU_MINIMUM)
err = EINVAL;
else if (chan->lc_state == L2CAP_CLOSED)
chan->lc_imtu = mtu;
else
err = EBUSY;
break;
case SO_L2CAP_LM: /* set link mode */
err = sockopt_getint(sopt, &mode);
if (err)
break;
mode &= (L2CAP_LM_SECURE | L2CAP_LM_ENCRYPT | L2CAP_LM_AUTH);
if (mode & L2CAP_LM_SECURE)
mode |= L2CAP_LM_ENCRYPT;
if (mode & L2CAP_LM_ENCRYPT)
mode |= L2CAP_LM_AUTH;
chan->lc_mode = mode;
if (chan->lc_state == L2CAP_OPEN)
err = l2cap_setmode(chan);
break;
case SO_L2CAP_OQOS: /* set Outgoing QoS flow spec */
case SO_L2CAP_FLUSH: /* set Outgoing Flush Timeout */
default:
err = ENOPROTOOPT;
break;
}
return err;
}
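/*
 * Worked example of the link mode promotion above (the scenario is
 * illustrative): a caller that requests only L2CAP_LM_SECURE ends up with
 * lc_mode = L2CAP_LM_SECURE | L2CAP_LM_ENCRYPT | L2CAP_LM_AUTH, since
 * secure mode implies encryption, which in turn implies authentication.
 */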
/*
* l2cap_getopt(l2cap_channel, sopt)
*
* Return configuration parameters.
*/
int
l2cap_getopt(struct l2cap_channel *chan, struct sockopt *sopt)
{
switch (sopt->sopt_name) {
case SO_L2CAP_IMTU: /* get Incoming MTU */
return sockopt_set(sopt, &chan->lc_imtu, sizeof(uint16_t));
case SO_L2CAP_OMTU: /* get Outgoing MTU */
return sockopt_set(sopt, &chan->lc_omtu, sizeof(uint16_t));
case SO_L2CAP_IQOS: /* get Incoming QoS flow spec */
return sockopt_set(sopt, &chan->lc_iqos, sizeof(l2cap_qos_t));
case SO_L2CAP_OQOS: /* get Outgoing QoS flow spec */
return sockopt_set(sopt, &chan->lc_oqos, sizeof(l2cap_qos_t));
case SO_L2CAP_FLUSH: /* get Flush Timeout */
return sockopt_set(sopt, &chan->lc_flush, sizeof(uint16_t));
case SO_L2CAP_LM: /* get link mode */
return sockopt_setint(sopt, chan->lc_mode);
default:
break;
}
return ENOPROTOOPT;
}
/* $NetBSD: in6.c,v 1.292 2024/03/01 23:50:27 riastradh Exp $ */
/* $KAME: in6.c,v 1.198 2001/07/18 09:12:38 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in.c 8.2 (Berkeley) 11/15/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6.c,v 1.292 2024/03/01 23:50:27 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/cprng.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_llatbl.h>
#include <net/if_ether.h>
#include <net/if_dl.h>
#include <net/pfil.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/scope6_var.h>
#include <compat/netinet6/in6_var.h>
#include <compat/netinet6/nd6.h>
MALLOC_DEFINE(M_IP6OPT, "ip6_options", "IPv6 options");
/* enable backward compatibility code for obsoleted ioctls */
#define COMPAT_IN6IFIOCTL
#ifdef IN6_DEBUG
#define IN6_DPRINTF(__fmt, ...) printf(__fmt, __VA_ARGS__)
#else
#define IN6_DPRINTF(__fmt, ...) do { } while (/*CONSTCOND*/0)
#endif /* IN6_DEBUG */
/*
* Definitions of some constant IP6 addresses.
*/
const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT;
const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT;
const struct in6_addr in6addr_nodelocal_allnodes =
IN6ADDR_NODELOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allnodes =
IN6ADDR_LINKLOCAL_ALLNODES_INIT;
const struct in6_addr in6addr_linklocal_allrouters =
IN6ADDR_LINKLOCAL_ALLROUTERS_INIT;
const struct in6_addr in6mask0 = IN6MASK0;
const struct in6_addr in6mask32 = IN6MASK32;
const struct in6_addr in6mask64 = IN6MASK64;
const struct in6_addr in6mask96 = IN6MASK96;
const struct in6_addr in6mask128 = IN6MASK128;
const struct sockaddr_in6 sa6_any = {sizeof(sa6_any), AF_INET6,
0, 0, IN6ADDR_ANY_INIT, 0};
struct pslist_head in6_ifaddr_list;
kmutex_t in6_ifaddr_lock;
static int in6_lifaddr_ioctl(struct socket *, u_long, void *,
struct ifnet *);
static int in6_ifaddprefix(struct in6_ifaddr *);
static int in6_ifremprefix(struct in6_ifaddr *);
static int in6_ifinit(struct ifnet *, struct in6_ifaddr *,
const struct sockaddr_in6 *, int);
static void in6_unlink_ifa(struct in6_ifaddr *, struct ifnet *);
static int in6_update_ifa1(struct ifnet *, struct in6_aliasreq *,
struct in6_ifaddr **, struct psref *, int);
void
in6_init(void)
{
PSLIST_INIT(&in6_ifaddr_list);
mutex_init(&in6_ifaddr_lock, MUTEX_DEFAULT, IPL_NONE);
in6_sysctl_multicast_setup(NULL);
}
/*
* Add ownaddr as loopback rtentry. We previously added the route only if
* necessary (ex. on a p2p link). However, since we now manage addresses
* separately from prefixes, we should always add the route. We can't
* rely on the cloning mechanism from the corresponding interface route
* any more.
*/
void
in6_ifaddlocal(struct ifaddr *ifa)
{
if (IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &in6addr_any) ||
(ifa->ifa_ifp->if_flags & IFF_POINTOPOINT &&
IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), IFA_DSTIN6(ifa))))
{
rt_addrmsg(RTM_NEWADDR, ifa);
return;
}
rt_ifa_addlocal(ifa);
}
/*
* Remove loopback rtentry of ownaddr generated by in6_ifaddlocal(),
* if it exists.
*/
void
in6_ifremlocal(struct ifaddr *ifa)
{
struct in6_ifaddr *ia;
struct ifaddr *alt_ifa = NULL;
int ia_count = 0;
struct psref psref;
int s;
/*
* Some BSD variants do not remove cloned routes
* from an interface direct route, when removing the direct route
* (see comments in net/net_osdep.h). Even for variants that do remove
* cloned routes, they could fail to remove the cloned routes when
* we handle multiple addresses that share a common prefix.
* So, we should remove the route corresponding to the deleted address.
*/
/*
* Delete the entry only if exactly one ifaddr matches the
* address, ifa->ifa_addr.
*
* If more than one ifaddr matches, replace the ifaddr in
* the routing table, rt_ifa, with a different ifaddr than
* the one we are purging, ifa. It is important to do
* this, or else the routing table can accumulate dangling
* pointers rt->rt_ifa->ifa_ifp to destroyed interfaces,
* which will lead to crashes, later. (More than one ifaddr
* can match if we assign the same address to multiple---probably
* p2p---interfaces.)
*
* XXX An old comment at this place said, "we should avoid
* XXX such a configuration [i.e., interfaces with the same
* XXX addresses assigned --ed.] in IPv6...". I do not
* XXX agree, especially now that I have fixed the dangling
* XXX ifp-pointers bug.
*/
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (!IN6_ARE_ADDR_EQUAL(IFA_IN6(ifa), &ia->ia_addr.sin6_addr))
continue;
if (ia->ia_ifp != ifa->ifa_ifp)
alt_ifa = &ia->ia_ifa;
if (++ia_count > 1 && alt_ifa != NULL)
break;
}
if (ia_count > 1 && alt_ifa != NULL)
ifa_acquire(alt_ifa, &psref);
pserialize_read_exit(s);
if (ia_count == 0)
return;
rt_ifa_remlocal(ifa, ia_count == 1 ? NULL : alt_ifa);
if (ia_count > 1 && alt_ifa != NULL)
ifa_release(alt_ifa, &psref);
}
/* Add prefix route for the network. */
static int
in6_ifaddprefix(struct in6_ifaddr *ia)
{
int error, flags = 0;
if (in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL) == 128) {
if (ia->ia_dstaddr.sin6_family != AF_INET6)
/* We don't need to install a host route. */
return 0;
flags |= RTF_HOST;
}
/* Is this a connected route for neighbour discovery? */
if (nd6_need_cache(ia->ia_ifp))
flags |= RTF_CONNECTED;
if ((error = rtinit(&ia->ia_ifa, RTM_ADD, RTF_UP | flags)) == 0)
ia->ia_flags |= IFA_ROUTE;
else if (error == EEXIST)
/* Existence of the route is not an error. */
error = 0;
return error;
}
static int
in6_rt_ifa_matcher(struct rtentry *rt, void *v)
{
struct ifaddr *ifa = v;
if (rt->rt_ifa == ifa)
return 1;
else
return 0;
}
/* Delete network prefix route if present.
* Re-add it to another address if the prefix matches. */
static int
in6_ifremprefix(struct in6_ifaddr *target)
{
int error, s;
struct in6_ifaddr *ia;
if ((target->ia_flags & IFA_ROUTE) == 0)
return 0;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (target->ia_dstaddr.sin6_len) {
if (ia->ia_dstaddr.sin6_len == 0 ||
!IN6_ARE_ADDR_EQUAL(&ia->ia_dstaddr.sin6_addr,
&target->ia_dstaddr.sin6_addr))
continue;
} else {
if (!IN6_ARE_MASKED_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
&target->ia_addr.sin6_addr,
&target->ia_prefixmask.sin6_addr))
continue;
}
/*
* if we got a matching prefix route, move IFA_ROUTE to it
*/
if ((ia->ia_flags & IFA_ROUTE) == 0) {
struct psref psref;
int bound = curlwp_bind();
ia6_acquire(ia, &psref);
pserialize_read_exit(s);
rtinit(&target->ia_ifa, RTM_DELETE, 0);
target->ia_flags &= ~IFA_ROUTE;
error = in6_ifaddprefix(ia);
if (!ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING))
goto skip;
/*
* Replace the rt_ifa of routes that refer to the address being
* removed with the new address.
*/
rt_replace_ifa_matched_entries(AF_INET6,
in6_rt_ifa_matcher, &target->ia_ifa, &ia->ia_ifa);
skip:
ia6_release(ia, &psref);
curlwp_bindx(bound);
return error;
}
}
pserialize_read_exit(s);
/*
* No other address can take over the prefix route; remove it.
*/
rtinit(&target->ia_ifa, RTM_DELETE, 0);
target->ia_flags &= ~IFA_ROUTE;
if (ISSET(target->ia_ifa.ifa_flags, IFA_DESTROYING)) {
/* Remove routes that have the address being removed as rt_ifa. */
rt_delete_matched_entries(AF_INET6, in6_rt_ifa_matcher,
&target->ia_ifa, true);
}
return 0;
}
int
in6_mask2len(struct in6_addr *mask, u_char *lim0)
{
int x = 0, y;
u_char *lim = lim0, *p;
/* ignore the scope_id part */
if (lim0 == NULL || lim0 - (u_char *)mask > sizeof(*mask))
lim = (u_char *)mask + sizeof(*mask);
for (p = (u_char *)mask; p < lim; x++, p++) {
    if (*p != 0xff)
break;
}
y = 0;
if (p < lim) {
for (y = 0; y < NBBY; y++) {
if ((*p & (0x80 >> y)) == 0)
break;
}
}
/*
* when the limit pointer is given, do a stricter check on the
* remaining bits.
*/
if (p < lim) {
    if (y != 0 && (*p & (0x00ff >> y)) != 0)
return -1;
    for (p = p + 1; p < lim; p++)
        if (*p != 0)
return -1;
}
return x * NBBY + y;
}
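/*
 * Worked example (illustrative): for the contiguous mask in6mask64
 * (ffff:ffff:ffff:ffff::), the first loop walks over eight 0xff octets
 * (x = 8), the partial-octet loop finds no set bits (y = 0), and the
 * function returns 8 * NBBY + 0 = 64.  A non-contiguous mask such as
 * ffff::ffff fails the stricter trailing check and yields -1.
 */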
#define ifa2ia6(ifa) ((struct in6_ifaddr *)(ifa))
#define ia62ifa(ia6) (&((ia6)->ia_ifa))
static int
in6_control1(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
struct in6_ifreq *ifr = (struct in6_ifreq *)data;
struct in6_ifaddr *ia = NULL;
struct in6_aliasreq *ifra = (struct in6_aliasreq *)data;
struct sockaddr_in6 *sa6;
int error, bound;
struct psref psref;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
case SIOCDADDRCTL_POLICY:
/* Privileged. */
return in6_src_ioctl(cmd, data);
/*
* XXX: Fix me, once we fix SIOCSIFADDR, SIOCIFDSTADDR, etc.
*/
case SIOCSIFADDR:
case SIOCSIFDSTADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
return EOPNOTSUPP;
case SIOCGETSGCNT_IN6:
case SIOCGETMIFCNT_IN6:
return mrt6_ioctl(cmd, data);
case SIOCGIFADDRPREF:
case SIOCSIFADDRPREF:
if (ifp == NULL)
return EINVAL;
return ifaddrpref_ioctl(so, cmd, data, ifp);
}
if (ifp == NULL)
return EOPNOTSUPP;
switch (cmd) {
#ifdef OSIOCSIFINFO_IN6_90
case OSIOCSIFINFO_FLAGS_90:
case OSIOCSIFINFO_IN6_90:
case OSIOCSDEFIFACE_IN6:
case OSIOCSNDFLUSH_IN6:
case OSIOCSPFXFLUSH_IN6:
case OSIOCSRTRFLUSH_IN6:
#endif
case SIOCSIFINFO_FLAGS:
case SIOCSIFINFO_IN6:
/* Privileged. */
/* FALLTHROUGH */
#ifdef OSIOCGIFINFO_IN6
case OSIOCGIFINFO_IN6:
#endif
#ifdef OSIOCGIFINFO_IN6_90
case OSIOCGDRLST_IN6:
case OSIOCGPRLST_IN6:
case OSIOCGIFINFO_IN6_90:
case OSIOCGDEFIFACE_IN6:
#endif
case SIOCGIFINFO_IN6:
case SIOCGNBRINFO_IN6:
return nd6_ioctl(cmd, data, ifp);
}
switch (cmd) {
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* Privileged. */
/* FALLTHROUGH */
case SIOCGLIFADDR:
return in6_lifaddr_ioctl(so, cmd, data, ifp);
}
/*
* Find address for this interface, if it exists.
*
* In netinet code, we have checked ifra_addr in SIOCSIF*ADDR operation
* only, and used the first interface address as the target of other
* operations (without checking ifra_addr). This was because netinet
* code/API assumed at most 1 interface address per interface.
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, we almost always look and check the
* presence of ifra_addr, and reject invalid ones here.
* It also decreases duplicated code among SIOC*_IN6 operations.
*/
switch (cmd) {
case SIOCAIFADDR_IN6:
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
#endif
#ifdef OSIOCSIFPHYADDR_IN6
case OSIOCSIFPHYADDR_IN6:
#endif
case SIOCSIFPHYADDR_IN6:
sa6 = &ifra->ifra_addr;
break;
case SIOCSIFADDR_IN6:
case SIOCGIFADDR_IN6:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFNETMASK_IN6:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCDIFADDR_IN6:
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFPDSTADDR_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCGIFALIFETIME_IN6:
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
case SIOCGIFSTAT_IN6:
case SIOCGIFSTAT_ICMP6:
sa6 = &ifr->ifr_addr;
break;
default:
sa6 = NULL;
break;
}
error = 0;
bound = curlwp_bind();
if (sa6 && sa6->sin6_family == AF_INET6) {
if (sa6->sin6_scope_id != 0)
error = sa6_embedscope(sa6, 0);
else
    error = in6_setscope(&sa6->sin6_addr, ifp, NULL);
if (error != 0)
goto out;
ia = in6ifa_ifpwithaddr_psref(ifp, &sa6->sin6_addr, &psref);
} else
ia = NULL;
switch (cmd) {
case SIOCSIFADDR_IN6:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFNETMASK_IN6:
/*
* Since IPv6 allows a node to assign multiple addresses
* on a single interface, SIOCSIFxxx ioctls are deprecated.
*/
error = EINVAL;
goto release;
case SIOCDIFADDR_IN6:
/*
* for IPv4, we look for existing in_ifaddr here to allow
* "ifconfig if0 delete" to remove the first IPv4 address on
* the interface. For IPv6, as the spec allows multiple
* interface addresses from day one, we consider "remove the
* first one" semantics undesirable.
*/
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
#ifdef OSIOCAIFADDR_IN6
/* FALLTHROUGH */
case OSIOCAIFADDR_IN6:
#endif
/* FALLTHROUGH */
case SIOCAIFADDR_IN6:
/*
* We always require users to specify a valid IPv6 address for
* the corresponding operation.
*/
if (ifra->ifra_addr.sin6_family != AF_INET6 ||
ifra->ifra_addr.sin6_len != sizeof(struct sockaddr_in6)) {
error = EAFNOSUPPORT;
goto release;
}
/* Privileged. */
break;
case SIOCGIFADDR_IN6:
/* This interface is basically deprecated. use SIOCGIFCONF. */
/* FALLTHROUGH */
case SIOCGIFAFLAG_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFALIFETIME_IN6:
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
/* must think again about its semantics */
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto out;
}
break;
}
switch (cmd) {
case SIOCGIFADDR_IN6:
ifr->ifr_addr = ia->ia_addr;
error = sa6_recoverscope(&ifr->ifr_addr);
break;
case SIOCGIFDSTADDR_IN6:
if ((ifp->if_flags & IFF_POINTOPOINT) == 0) {
error = EINVAL;
break;
}
/*
* XXX: should we check if ifa_dstaddr is NULL and return
* an error?
*/
ifr->ifr_dstaddr = ia->ia_dstaddr;
error = sa6_recoverscope(&ifr->ifr_dstaddr);
break;
case SIOCGIFNETMASK_IN6:
ifr->ifr_addr = ia->ia_prefixmask;
break;
case SIOCGIFAFLAG_IN6:
ifr->ifr_ifru.ifru_flags6 = ia->ia6_flags;
break;
case SIOCGIFSTAT_IN6:
if (ifp == NULL) {
error = EINVAL;
break;
}
memset(&ifr->ifr_ifru.ifru_stat, 0,
sizeof(ifr->ifr_ifru.ifru_stat));
ifr->ifr_ifru.ifru_stat =
*((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->in6_ifstat;
break;
case SIOCGIFSTAT_ICMP6:
if (ifp == NULL) {
error = EINVAL;
break;
}
memset(&ifr->ifr_ifru.ifru_icmp6stat, 0,
sizeof(ifr->ifr_ifru.ifru_icmp6stat));
ifr->ifr_ifru.ifru_icmp6stat =
*((struct in6_ifextra *)ifp->if_afdata[AF_INET6])->icmp6_ifstat;
break;
#ifdef OSIOCGIFALIFETIME_IN6
case OSIOCGIFALIFETIME_IN6:
#endif
case SIOCGIFALIFETIME_IN6:
ifr->ifr_ifru.ifru_lifetime = ia->ia6_lifetime;
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
time_t maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire = ((time_t)~0) &
(time_t)~(1ULL << ((sizeof(maxexpire) * NBBY) - 1));
if (ia->ia6_lifetime.ia6t_vltime <
maxexpire - ia->ia6_updatetime) {
retlt->ia6t_expire = ia->ia6_updatetime +
ia->ia6_lifetime.ia6t_vltime;
retlt->ia6t_expire = retlt->ia6t_expire ?
    time_mono_to_wall(retlt->ia6t_expire) : 0;
} else
retlt->ia6t_expire = maxexpire;
}
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
time_t maxexpire;
struct in6_addrlifetime *retlt =
&ifr->ifr_ifru.ifru_lifetime;
/*
* XXX: adjust expiration time assuming time_t is
* signed.
*/
maxexpire = ((time_t)~0) &
(time_t)~(1ULL << ((sizeof(maxexpire) * NBBY) - 1));
if (ia->ia6_lifetime.ia6t_pltime <
maxexpire - ia->ia6_updatetime) {
retlt->ia6t_preferred = ia->ia6_updatetime +
ia->ia6_lifetime.ia6t_pltime;
retlt->ia6t_preferred = retlt->ia6t_preferred ?
    time_mono_to_wall(retlt->ia6t_preferred) : 0;
} else
retlt->ia6t_preferred = maxexpire;
}
#ifdef OSIOCFIFALIFETIME_IN6
if (cmd == OSIOCFIFALIFETIME_IN6)
in6_addrlifetime_to_in6_addrlifetime50(
&ifr->ifru.ifru_lifetime);
#endif
break;
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
in6_aliasreq50_to_in6_aliasreq(ifra);
#endif
/*FALLTHROUGH*/
case SIOCAIFADDR_IN6:
{
struct in6_addrlifetime *lt;
/* reject read-only flags */
if ((ifra->ifra_flags & IN6_IFF_DUPLICATED) != 0 ||
(ifra->ifra_flags & IN6_IFF_DETACHED) != 0 ||
(ifra->ifra_flags & IN6_IFF_TENTATIVE) != 0 ||
(ifra->ifra_flags & IN6_IFF_NODAD) != 0) {
error = EINVAL;
break;
}
/*
* ia6t_expire and ia6t_preferred won't be used for now,
* so just in case.
*/
lt = &ifra->ifra_lifetime;
if (lt->ia6t_expire != 0)
    lt->ia6t_expire = time_wall_to_mono(lt->ia6t_expire);
if (lt->ia6t_preferred != 0)
lt->ia6t_preferred =
time_wall_to_mono(lt->ia6t_preferred);
/*
* make (ia == NULL) or update (ia != NULL) the interface
* address structure, and link it to the list.
*/
int s = splsoftnet();
error = in6_update_ifa1(ifp, ifra, &ia, &psref, 0);
splx(s);
/*
* in6_update_ifa1 doesn't create the address if its
* valid lifetime (vltime) is zero, since we would just
* delete the address immediately in that case anyway.
* So it may succeed but return null ia. In that case,
* nothing left to do.
*/
if (error || ia == NULL)
break;
pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa);
break;
}
case SIOCDIFADDR_IN6:
ia6_release(ia, &psref);
ifaref(&ia->ia_ifa);
in6_purgeaddr(&ia->ia_ifa);
pfil_run_addrhooks(if_pfil, cmd, &ia->ia_ifa);
ifafree(&ia->ia_ifa);
ia = NULL;
break;
default:
error = ENOTTY;
}
release:
ia6_release(ia, &psref);
out:
curlwp_bindx(bound);
return error;
}
int
in6_control(struct socket *so, u_long cmd, void *data, struct ifnet *ifp)
{
int error, s;
switch (cmd) {
#ifdef OSIOCSIFINFO_IN6_90
case OSIOCSIFINFO_FLAGS_90:
case OSIOCSIFINFO_IN6_90:
case OSIOCSDEFIFACE_IN6:
case OSIOCSNDFLUSH_IN6:
case OSIOCSPFXFLUSH_IN6:
case OSIOCSRTRFLUSH_IN6:
#endif
case SIOCSIFINFO_FLAGS:
case SIOCSIFINFO_IN6:
case SIOCALIFADDR:
case SIOCDLIFADDR:
case SIOCDIFADDR_IN6:
#ifdef OSIOCAIFADDR_IN6
case OSIOCAIFADDR_IN6:
#endif
case SIOCAIFADDR_IN6:
case SIOCAADDRCTL_POLICY:
case SIOCDADDRCTL_POLICY:
if (kauth_authorize_network(kauth_cred_get(),
KAUTH_NETWORK_SOCKET,
KAUTH_REQ_NETWORK_SOCKET_SETPRIV,
so, NULL, NULL))
return EPERM;
break;
}
s = splsoftnet();
#ifndef NET_MPSAFE
KASSERT(KERNEL_LOCKED_P());
#endif
error = in6_control1(so, cmd, data, ifp);
splx(s);
return error;
}
static int
in6_get_llsol_addr(struct in6_addr *llsol, struct ifnet *ifp,
struct in6_addr *ip6)
{
int error;
memset(llsol, 0, sizeof(struct in6_addr));
llsol->s6_addr16[0] = htons(0xff02);
llsol->s6_addr32[1] = 0;
llsol->s6_addr32[2] = htonl(1);
llsol->s6_addr32[3] = ip6->s6_addr32[3];
llsol->s6_addr8[12] = 0xff;
error = in6_setscope(llsol, ifp, NULL);
if (error != 0) {
/* XXX: should not happen */
log(LOG_ERR, "%s: in6_setscope failed\n", __func__);
}
return error;
}
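/*
 * Example (illustrative addresses): for the unicast address
 * 2001:db8::123:4567 the code above produces the solicited-node multicast
 * address ff02::1:ff23:4567 (the low-order 32 bits of the unicast address
 * with octet 12 forced to 0xff), before in6_setscope() embeds the link's
 * zone id.
 */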
static int
in6_join_mcastgroups(struct in6_aliasreq *ifra, struct in6_ifaddr *ia,
struct ifnet *ifp, int flags)
{
int error;
struct sockaddr_in6 mltaddr, mltmask;
struct in6_multi_mship *imm;
struct in6_addr llsol;
struct rtentry *rt;
int dad_delay;
char ip6buf[INET6_ADDRSTRLEN];
/* join solicited multicast addr for new host id */
error = in6_get_llsol_addr(&llsol, ifp, &ifra->ifra_addr.sin6_addr);
if (error != 0)
goto out;
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* We need a random delay for DAD on the address
* being configured. It also means delaying
* transmission of the corresponding MLD report to
* avoid report collision.
* [draft-ietf-ipv6-rfc2462bis-02.txt]
*/
dad_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz);
}
#define MLTMASK_LEN 4 /* mltmask's masklen (=32bit=4octet) */
/* join solicited multicast addr for new host id */
imm = in6_joingroup(ifp, &llsol, &error, dad_delay);
if (!imm) {
nd6log(LOG_ERR,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &llsol), if_name(ifp), error);
goto out;
}
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
sockaddr_in6_init(&mltmask, &in6mask32, 0, 0, 0);
/*
* join link-local all-nodes address
*/
sockaddr_in6_init(&mltaddr, &in6addr_linklocal_allnodes,
0, 0, 0);
if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != 0)
goto out; /* XXX: should not fail */
/*
* XXX: do we really need these automatic routes?
* We should probably reconsider this stuff. Most applications
* actually do not need the routes, since they usually specify
* the outgoing interface.
*/
rt = rtalloc1(sin6tosa(&mltaddr), 0);
if (rt) {
if (memcmp(&mltaddr.sin6_addr,
&satocsin6(rt_getkey(rt))->sin6_addr,
MLTMASK_LEN)) {
rt_unref(rt);
rt = NULL;
} else if (rt->rt_ifp != ifp) {
IN6_DPRINTF("%s: rt_ifp %p -> %p (%s) "
"network %04x:%04x::/32 = %04x:%04x::/32\n",
__func__, rt->rt_ifp, ifp, ifp->if_xname,
ntohs(mltaddr.sin6_addr.s6_addr16[0]),
ntohs(mltaddr.sin6_addr.s6_addr16[1]),
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[0],
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[1]);
#ifdef NET_MPSAFE
error = rt_update_prepare(rt);
if (error == 0) {
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
rt_update_finish(rt);
} else {
/*
* If error != 0, the rtentry is being
* destroyed, so doing nothing doesn't
* matter.
*/
}
#else
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
#endif
}
}
if (!rt) {
struct rt_addrinfo info;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = sin6tosa(&mltaddr);
info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia->ia_addr);
info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask);
info.rti_info[RTAX_IFA] = sin6tosa(&ia->ia_addr);
/* XXX: we need RTF_CONNECTED to fake nd6_rtrequest */
info.rti_flags = RTF_UP | RTF_CONNECTED;
error = rtrequest1(RTM_ADD, &info, NULL);
if (error)
goto out;
} else {
rt_unref(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
if (!imm) {
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
goto out;
}
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
/*
* join node information group address
*/
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
/*
* The spec doesn't say anything about delay for this
* group, but the same logic should apply.
*/
dad_delay = cprng_fast32() % (MAX_RTR_SOLICITATION_DELAY * hz);
}
if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr) != 0)
;
else if ((imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error,
dad_delay)) == NULL) { /* XXX jinmei */
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
/* XXX not very fatal, go on... */
} else {
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
}
/*
* join interface-local all-nodes address.
* (ff01::1%ifN, and ff01::%ifN/32)
*/
mltaddr.sin6_addr = in6addr_nodelocal_allnodes;
if ((error = in6_setscope(&mltaddr.sin6_addr, ifp, NULL)) != 0)
goto out; /* XXX: should not fail */
/* XXX: again, do we really need the route? */
rt = rtalloc1(sin6tosa(&mltaddr), 0);
if (rt) {
/* 32bit came from "mltmask" */
if (memcmp(&mltaddr.sin6_addr,
&satocsin6(rt_getkey(rt))->sin6_addr,
32 / NBBY)) {
rt_unref(rt);
rt = NULL;
} else if (rt->rt_ifp != ifp) {
IN6_DPRINTF("%s: rt_ifp %p -> %p (%s) "
"network %04x:%04x::/32 = %04x:%04x::/32\n",
__func__, rt->rt_ifp, ifp, ifp->if_xname,
ntohs(mltaddr.sin6_addr.s6_addr16[0]),
ntohs(mltaddr.sin6_addr.s6_addr16[1]),
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[0],
satocsin6(rt_getkey(rt))->sin6_addr.s6_addr16[1]);
#ifdef NET_MPSAFE
error = rt_update_prepare(rt);
if (error == 0) {
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
rt_update_finish(rt);
} else {
/*
* If error != 0, the rtentry is being
* destroyed, so doing nothing doesn't
* matter.
*/
}
#else
rt_replace_ifa(rt, &ia->ia_ifa);
rt->rt_ifp = ifp;
#endif
}
}
if (!rt) {
struct rt_addrinfo info;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = sin6tosa(&mltaddr);
info.rti_info[RTAX_GATEWAY] = sin6tosa(&ia->ia_addr);
info.rti_info[RTAX_NETMASK] = sin6tosa(&mltmask);
info.rti_info[RTAX_IFA] = sin6tosa(&ia->ia_addr);
info.rti_flags = RTF_UP | RTF_CONNECTED;
error = rtrequest1(RTM_ADD, &info, NULL);
if (error)
goto out;
#undef MLTMASK_LEN
} else {
rt_unref(rt);
}
imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error, 0);
if (!imm) {
nd6log(LOG_WARNING,
"addmulti failed for %s on %s (errno=%d)\n",
IN6_PRINT(ip6buf, &mltaddr.sin6_addr),
if_name(ifp), error);
goto out;
} else {
mutex_enter(&in6_ifaddr_lock);
LIST_INSERT_HEAD(&ia->ia6_memberships, imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
}
return 0;
out:
KASSERT(error != 0);
return error;
}
/*
* Update parameters of an IPv6 interface address.
* If necessary, a new entry is created and linked into address chains.
* This function is separated from in6_control().
* XXX: should this be performed under splsoftnet()?
*/
static int
in6_update_ifa1(struct ifnet *ifp, struct in6_aliasreq *ifra,
struct in6_ifaddr **iap, struct psref *psref, int flags)
{
int error = 0, hostIsNew = 0, plen = -1;
struct sockaddr_in6 dst6;
struct in6_addrlifetime *lt;
int dad_delay, was_tentative;
struct in6_ifaddr *ia = iap ? *iap : NULL;
char ip6buf[INET6_ADDRSTRLEN];
bool addrmaskNotChanged = false;
bool send_rtm_newaddr = (ip6_param_rt_msg == 1);
int saved_flags = 0;
KASSERT((iap == NULL && psref == NULL) ||
(iap != NULL && psref != NULL));
/* Validate parameters */
if (ifp == NULL || ifra == NULL) /* this may be redundant */
return EINVAL;
/*
* The destination address for a p2p link must have a family
* of AF_UNSPEC or AF_INET6.
*/
if ((ifp->if_flags & IFF_POINTOPOINT) != 0 &&
ifra->ifra_dstaddr.sin6_family != AF_INET6 &&
ifra->ifra_dstaddr.sin6_family != AF_UNSPEC)
return EAFNOSUPPORT;
/*
* validate ifra_prefixmask. don't check sin6_family, netmask
* does not carry fields other than sin6_len.
*/
if (ifra->ifra_prefixmask.sin6_len > sizeof(struct sockaddr_in6))
return EINVAL;
/*
* Because the IPv6 address architecture is classless, we require
* users to specify a (non 0) prefix length (mask) for a new address.
* We also require that the prefix mask (when specified) is valid, and thus
* reject a non-consecutive mask.
*/
if (ia == NULL && ifra->ifra_prefixmask.sin6_len == 0)
return EINVAL;
if (ifra->ifra_prefixmask.sin6_len != 0) {
plen = in6_mask2len(&ifra->ifra_prefixmask.sin6_addr,
(u_char *)&ifra->ifra_prefixmask +
ifra->ifra_prefixmask.sin6_len);
if (plen <= 0)
return EINVAL;
} else {
/*
* In this case, ia must not be NULL. We just use its prefix
* length.
*/
plen = in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
}
/*
* If the destination address on a p2p interface is specified,
* and the address is a scoped one, validate/set the scope
* zone identifier.
*/
dst6 = ifra->ifra_dstaddr;
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) != 0 &&
(dst6.sin6_family == AF_INET6)) {
struct in6_addr in6_tmp;
u_int32_t zoneid;
in6_tmp = dst6.sin6_addr;
if (in6_setscope(&in6_tmp, ifp, &zoneid))
return EINVAL; /* XXX: should be impossible */
if (dst6.sin6_scope_id != 0) {
if (dst6.sin6_scope_id != zoneid)
return EINVAL;
} else /* user omitted to specify the ID. */
dst6.sin6_scope_id = zoneid;
/* convert into the internal form */
if (sa6_embedscope(&dst6, 0))
return EINVAL; /* XXX: should be impossible */
}
/*
* The destination address can be specified only for a p2p or a
* loopback interface. If specified, the corresponding prefix length
* must be 128.
*/
if (ifra->ifra_dstaddr.sin6_family == AF_INET6) {
#ifdef FORCE_P2PPLEN
int i;
#endif
if ((ifp->if_flags & (IFF_POINTOPOINT|IFF_LOOPBACK)) == 0) {
/* XXX: noisy message */
nd6log(LOG_INFO, "a destination can "
"be specified for a p2p or a loopback IF only\n");
return EINVAL;
}
if (plen != 128) {
nd6log(LOG_INFO, "prefixlen should "
"be 128 when dstaddr is specified\n");
#ifdef FORCE_P2PPLEN
/*
* To be compatible with old configurations,
* such as ifconfig gif0 inet6 2001::1 2001::2
* prefixlen 126, we override the specified
* prefixmask as if the prefix length was 128.
*/
ifra->ifra_prefixmask.sin6_len =
sizeof(struct sockaddr_in6);
for (i = 0; i < 4; i++)
ifra->ifra_prefixmask.sin6_addr.s6_addr32[i] =
0xffffffff;
plen = 128;
#else
return EINVAL;
#endif
}
}
/* lifetime consistency check */
lt = &ifra->ifra_lifetime;
if (lt->ia6t_pltime > lt->ia6t_vltime)
return EINVAL;
if (lt->ia6t_vltime == 0) {
/*
* the following log might be noisy, but this is a typical
* configuration mistake or a tool's bug.
*/
nd6log(LOG_INFO, "valid lifetime is 0 for %s\n",
IN6_PRINT(ip6buf, &ifra->ifra_addr.sin6_addr));
if (ia == NULL)
return 0; /* there's nothing to do */
}
#define sin6eq(a, b) \
((a)->sin6_len == sizeof(struct sockaddr_in6) && \
(b)->sin6_len == sizeof(struct sockaddr_in6) && \
IN6_ARE_ADDR_EQUAL(&(a)->sin6_addr, &(b)->sin6_addr))
if (!send_rtm_newaddr) {
if (ia != NULL &&
sin6eq(&ifra->ifra_addr, &ia->ia_addr) &&
sin6eq(&ifra->ifra_prefixmask, &ia->ia_prefixmask)) {
addrmaskNotChanged = true;
saved_flags = ia->ia6_flags; /* check it later */
}
}
#undef sin6eq
/*
* If this is a new address, allocate a new ifaddr and link it
* into chains.
*/
if (ia == NULL) {
hostIsNew = 1;
/*
* When in6_update_ifa() is called while processing a received
* RA, it runs in interrupt context, so we should call malloc
* with M_NOWAIT.
*/
ia = malloc(sizeof(*ia), M_IFADDR, M_NOWAIT|M_ZERO);
if (ia == NULL)
return ENOBUFS;
LIST_INIT(&ia->ia6_memberships);
/* Initialize the address and masks, and put time stamp */
ia->ia_ifa.ifa_addr = sin6tosa(&ia->ia_addr);
ia->ia_addr.sin6_family = AF_INET6;
ia->ia_addr.sin6_len = sizeof(ia->ia_addr);
ia->ia6_createtime = time_uptime;
if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) != 0) {
/*
* XXX: some functions expect that ifa_dstaddr is not
* NULL for p2p interfaces.
*/
ia->ia_ifa.ifa_dstaddr = sin6tosa(&ia->ia_dstaddr);
} else {
ia->ia_ifa.ifa_dstaddr = NULL;
}
ia->ia_ifa.ifa_netmask = sin6tosa(&ia->ia_prefixmask);
ia->ia_ifp = ifp;
IN6_ADDRLIST_ENTRY_INIT(ia);
ifa_psref_init(&ia->ia_ifa);
}
/* update timestamp */
ia->ia6_updatetime = time_uptime;
/* set prefix mask */
if (ifra->ifra_prefixmask.sin6_len) {
if (ia->ia_prefixmask.sin6_len) {
if (!IN6_ARE_ADDR_EQUAL(&ia->ia_prefixmask.sin6_addr,
&ifra->ifra_prefixmask.sin6_addr))
in6_ifremprefix(ia);
}
ia->ia_prefixmask = ifra->ifra_prefixmask;
}
/* Set destination address. */
if (dst6.sin6_family == AF_INET6) {
if (!IN6_ARE_ADDR_EQUAL(&dst6.sin6_addr,
&ia->ia_dstaddr.sin6_addr))
in6_ifremprefix(ia);
ia->ia_dstaddr = dst6;
}
/*
* Set lifetimes. We do not refer to ia6t_expire and ia6t_preferred
* to see if the address is deprecated or invalidated, but initialize
* these members for applications.
*/
ia->ia6_lifetime = ifra->ifra_lifetime;
if (ia->ia6_lifetime.ia6t_vltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_expire =
time_uptime + ia->ia6_lifetime.ia6t_vltime;
} else
ia->ia6_lifetime.ia6t_expire = 0;
if (ia->ia6_lifetime.ia6t_pltime != ND6_INFINITE_LIFETIME) {
ia->ia6_lifetime.ia6t_preferred =
time_uptime + ia->ia6_lifetime.ia6t_pltime;
} else
ia->ia6_lifetime.ia6t_preferred = 0;
/*
* configure address flags.
* We need to preserve tentative state so DAD works if
* something adds the same address before DAD finishes.
*/
was_tentative = ia->ia6_flags & (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED);
ia->ia6_flags = ifra->ifra_flags;
/*
* Make the address tentative before joining multicast addresses,
* so that corresponding MLD responses would not have a tentative
* source address.
*/
ia->ia6_flags &= ~IN6_IFF_DUPLICATED; /* safety */
if (ifp->if_link_state == LINK_STATE_DOWN) {
ia->ia6_flags |= IN6_IFF_DETACHED;
ia->ia6_flags &= ~IN6_IFF_TENTATIVE;
} else if ((hostIsNew || was_tentative) && if_do_dad(ifp) &&
ip6_dad_enabled()) {
ia->ia6_flags |= IN6_IFF_TENTATIVE;
}
/*
* backward compatibility - if IN6_IFF_DEPRECATED is set from the
* userland, make it deprecated.
*/
if ((ifra->ifra_flags & IN6_IFF_DEPRECATED) != 0) {
ia->ia6_lifetime.ia6t_pltime = 0;
ia->ia6_lifetime.ia6t_preferred = time_uptime;
}
if (!send_rtm_newaddr) {
/*
* We will not send RTM_NEWADDR if the only difference between
* ia and ifra is preferred/valid lifetimes, because it is not
* very useful for userland programs to be notified of those
* changes.
*/
if (addrmaskNotChanged && ia->ia6_flags == saved_flags)
return 0;
}
if (hostIsNew) {
/*
* We need a reference to ia before calling in6_ifinit.
* Otherwise ia can be freed in in6_ifinit accidentally.
*/
ifaref(&ia->ia_ifa);
}
/* Must execute in6_ifinit and ifa_insert atomically */
mutex_enter(&in6_ifaddr_lock);
/* reset the interface and routing table appropriately. */
error = in6_ifinit(ifp, ia, &ifra->ifra_addr, hostIsNew);
if (error != 0) {
if (hostIsNew)
free(ia, M_IFADDR);
mutex_exit(&in6_ifaddr_lock);
return error;
}
/*
* We are done if we have simply modified an existing address.
*/
if (!hostIsNew) {
mutex_exit(&in6_ifaddr_lock);
return error;
}
/*
* Insert ia to the global list and ifa to the interface's list.
* A reference to it is already gained above.
*/
IN6_ADDRLIST_WRITER_INSERT_TAIL(ia);
ifa_insert(ifp, &ia->ia_ifa);
mutex_exit(&in6_ifaddr_lock);
/*
* Beyond this point, we should call in6_purgeaddr upon an error,
* not just go to unlink.
*/
/* join necessary multicast groups */
if ((ifp->if_flags & IFF_MULTICAST) != 0) {
error = in6_join_mcastgroups(ifra, ia, ifp, flags);
if (error != 0)
goto cleanup;
}
if (nd6_need_cache(ifp)) {
/* XXX maybe unnecessary */
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
ia->ia_ifa.ifa_flags |= RTF_CONNECTED;
}
/*
* Perform DAD, if needed.
* XXX It may be of use, if we can administratively
* disable DAD.
*/
if (hostIsNew && if_do_dad(ifp) &&
((ifra->ifra_flags & IN6_IFF_NODAD) == 0) &&
(ia->ia6_flags & IN6_IFF_TENTATIVE))
{
int mindelay, maxdelay;
dad_delay = 0;
if ((flags & IN6_IFAUPDATE_DADDELAY)) {
struct in6_addr llsol;
struct in6_multi *in6m_sol = NULL;
/*
* We need to impose a delay before sending an NS
* for DAD. Check if we also needed a delay for the
* corresponding MLD message. If we did, the delay
* should be larger than the MLD delay (this could be
* relaxed a bit, but this simple logic is at least
* safe).
*/
mindelay = 0;
error = in6_get_llsol_addr(&llsol, ifp,
&ifra->ifra_addr.sin6_addr);
in6_multi_lock(RW_READER);
if (error == 0)
in6m_sol = in6_lookup_multi(&llsol, ifp);
if (in6m_sol != NULL &&
in6m_sol->in6m_state == MLD_REPORTPENDING) {
mindelay = in6m_sol->in6m_timer;
}
in6_multi_unlock();
maxdelay = MAX_RTR_SOLICITATION_DELAY * hz;
if (maxdelay - mindelay == 0)
dad_delay = 0;
else {
dad_delay =
(cprng_fast32() % (maxdelay - mindelay)) +
mindelay;
}
}
/* +1 ensures callout is always used */
nd6_dad_start(&ia->ia_ifa, dad_delay + 1);
}
if (iap != NULL) {
*iap = ia;
if (hostIsNew)
ia6_acquire(ia, psref);
}
return 0;
cleanup:
in6_purgeaddr(&ia->ia_ifa);
return error;
}
int
in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq *ifra, int flags)
{
int rc, s;
s = splsoftnet();
rc = in6_update_ifa1(ifp, ifra, NULL, NULL, flags);
splx(s);
return rc;
}
void
in6_purgeaddr(struct ifaddr *ifa)
{
struct ifnet *ifp = ifa->ifa_ifp;
struct in6_ifaddr *ia = (struct in6_ifaddr *) ifa;
struct in6_multi_mship *imm;
/* KASSERT(!ifa_held(ifa)); XXX need ifa_not_held (psref_not_held) */
KASSERT(IFNET_LOCKED(ifp));
ifa->ifa_flags |= IFA_DESTROYING;
/* stop DAD processing */
nd6_dad_stop(ifa);
/* Delete any network route. */
in6_ifremprefix(ia);
/* Remove ownaddr's loopback rtentry, if it exists. */
in6_ifremlocal(&(ia->ia_ifa));
/*
* leave the multicast groups we have joined on the interface
*/
again:
mutex_enter(&in6_ifaddr_lock);
while ((imm = LIST_FIRST(&ia->ia6_memberships)) != NULL) {
struct in6_multi *in6m __diagused = imm->i6mm_maddr;
KASSERTMSG(in6m == NULL || in6m->in6m_ifp == ifp,
"in6m_ifp=%s ifp=%s", in6m ? in6m->in6m_ifp->if_xname : NULL,
ifp->if_xname);
LIST_REMOVE(imm, i6mm_chain);
mutex_exit(&in6_ifaddr_lock);
in6_leavegroup(imm);
goto again;
}
mutex_exit(&in6_ifaddr_lock);
in6_unlink_ifa(ia, ifp);
}
static void
in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
{
int s = splsoftnet();
mutex_enter(&in6_ifaddr_lock);
IN6_ADDRLIST_WRITER_REMOVE(ia);
ifa_remove(ifp, &ia->ia_ifa);
/* Assume ifa_remove called pserialize_perform and psref_destroy */
mutex_exit(&in6_ifaddr_lock);
IN6_ADDRLIST_ENTRY_DESTROY(ia);
/*
* release another refcnt for the link from in6_ifaddr.
* Note that we should decrement the refcnt at least once for all *BSD.
*/
ifafree(&ia->ia_ifa);
splx(s);
}
void
in6_purgeif(struct ifnet *ifp)
{
IFNET_LOCK(ifp);
in6_ifdetach(ifp);
IFNET_UNLOCK(ifp);
}
void
in6_purge_mcast_references(struct in6_multi *in6m)
{
struct in6_ifaddr *ia;
KASSERT(in6_multi_locked(RW_WRITER));
mutex_enter(&in6_ifaddr_lock);
IN6_ADDRLIST_WRITER_FOREACH(ia) {
struct in6_multi_mship *imm;
LIST_FOREACH(imm, &ia->ia6_memberships, i6mm_chain) {
if (imm->i6mm_maddr == in6m)
imm->i6mm_maddr = NULL;
}
}
mutex_exit(&in6_ifaddr_lock);
}
/*
* SIOC[GAD]LIFADDR.
* SIOCGLIFADDR: get first address. (?)
* SIOCGLIFADDR with IFLR_PREFIX:
* get first address that matches the specified prefix.
* SIOCALIFADDR: add the specified address.
* SIOCALIFADDR with IFLR_PREFIX:
* add the specified prefix, filling hostid part from
* the first link-local address. prefixlen must be <= 64.
* SIOCDLIFADDR: delete the specified address.
* SIOCDLIFADDR with IFLR_PREFIX:
* delete the first address that matches the specified prefix.
* return values:
* EINVAL on invalid parameters
* EADDRNOTAVAIL on prefix match failed/specified address not found
* other values may be returned from in6_ioctl()
*
* NOTE: SIOCALIFADDR (with IFLR_PREFIX set) allows a prefixlen of less
* than 64.  This is to accommodate address naming schemes other than
* RFC 2374 in the future.
* RFC 2373 defines the interface ID to be 64 bits, but it allows
* non-RFC 2374 address encoding schemes. (see the figure on page 8)
*/
static int
in6_lifaddr_ioctl(struct socket *so, u_long cmd, void *data,
struct ifnet *ifp)
{
struct in6_ifaddr *ia = NULL; /* XXX gcc 4.8 maybe-uninitialized */
struct if_laddrreq *iflr = (struct if_laddrreq *)data;
struct ifaddr *ifa;
struct sockaddr *sa;
/* sanity checks */
if (!data || !ifp) {
panic("invalid argument to in6_lifaddr_ioctl");
/* NOTREACHED */
}
switch (cmd) {
case SIOCGLIFADDR:
/* address must be specified on GET with IFLR_PREFIX */
if ((iflr->flags & IFLR_PREFIX) == 0)
break;
/* FALLTHROUGH */
case SIOCALIFADDR:
case SIOCDLIFADDR:
/* address must be specified on ADD and DELETE */
sa = (struct sockaddr *)&iflr->addr;
if (sa->sa_family != AF_INET6)
return EINVAL;
if (sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
/* XXX need improvement */
sa = (struct sockaddr *)&iflr->dstaddr;
if (sa->sa_family && sa->sa_family != AF_INET6)
return EINVAL;
if (sa->sa_len && sa->sa_len != sizeof(struct sockaddr_in6))
return EINVAL;
break;
default: /* shouldn't happen */
#if 0
panic("invalid cmd to in6_lifaddr_ioctl");
/* NOTREACHED */
#else
return EOPNOTSUPP;
#endif
}
if (sizeof(struct in6_addr) * NBBY < iflr->prefixlen)
return EINVAL;
switch (cmd) {
case SIOCALIFADDR:
{
struct in6_aliasreq ifra;
struct in6_addr *xhostid = NULL;
int prefixlen;
int bound = curlwp_bind();
struct psref psref;
if ((iflr->flags & IFLR_PREFIX) != 0) {
struct sockaddr_in6 *sin6;
/*
* xhostid is to fill in the hostid part of the
* address. xhostid points to the first link-local
* address attached to the interface.
*/
ia = in6ifa_ifpforlinklocal_psref(ifp, 0, &psref);
if (ia == NULL) {
curlwp_bindx(bound);
return EADDRNOTAVAIL;
}
xhostid = IFA_IN6(&ia->ia_ifa);
/* prefixlen must be <= 64. */
if (64 < iflr->prefixlen) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return EINVAL;
}
prefixlen = iflr->prefixlen;
/* hostid part must be zero. */
sin6 = (struct sockaddr_in6 *)&iflr->addr;
if (sin6->sin6_addr.s6_addr32[2] != 0 ||
sin6->sin6_addr.s6_addr32[3] != 0) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return EINVAL;
}
} else
prefixlen = iflr->prefixlen;
/* copy args to in6_aliasreq, perform ioctl(SIOCAIFADDR_IN6). */
memset(&ifra, 0, sizeof(ifra));
memcpy(ifra.ifra_name, iflr->iflr_name, sizeof(ifra.ifra_name));
memcpy(&ifra.ifra_addr, &iflr->addr,
((struct sockaddr *)&iflr->addr)->sa_len);
if (xhostid) {
/* fill in hostid part */
ifra.ifra_addr.sin6_addr.s6_addr32[2] =
xhostid->s6_addr32[2];
ifra.ifra_addr.sin6_addr.s6_addr32[3] =
xhostid->s6_addr32[3];
}
if (((struct sockaddr *)&iflr->dstaddr)->sa_family) { /* XXX */
memcpy(&ifra.ifra_dstaddr, &iflr->dstaddr,
((struct sockaddr *)&iflr->dstaddr)->sa_len);
if (xhostid) {
ifra.ifra_dstaddr.sin6_addr.s6_addr32[2] =
xhostid->s6_addr32[2];
ifra.ifra_dstaddr.sin6_addr.s6_addr32[3] =
xhostid->s6_addr32[3];
}
}
if (xhostid) {
ia6_release(ia, &psref);
ia = NULL;
}
curlwp_bindx(bound);
ifra.ifra_prefixmask.sin6_len = sizeof(struct sockaddr_in6);
in6_prefixlen2mask(&ifra.ifra_prefixmask.sin6_addr, prefixlen);
ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
ifra.ifra_flags = iflr->flags & ~IFLR_PREFIX;
return in6_control(so, SIOCAIFADDR_IN6, &ifra, ifp);
}
case SIOCGLIFADDR:
case SIOCDLIFADDR:
{
struct in6_addr mask, candidate, match;
struct sockaddr_in6 *sin6;
int cmp;
int error, s;
memset(&mask, 0, sizeof(mask));
if (iflr->flags & IFLR_PREFIX) {
/* lookup a prefix rather than address. */
in6_prefixlen2mask(&mask, iflr->prefixlen);
sin6 = (struct sockaddr_in6 *)&iflr->addr;
memcpy(&match, &sin6->sin6_addr, sizeof(match));
match.s6_addr32[0] &= mask.s6_addr32[0];
match.s6_addr32[1] &= mask.s6_addr32[1];
match.s6_addr32[2] &= mask.s6_addr32[2];
match.s6_addr32[3] &= mask.s6_addr32[3];
/* if you set extra bits, that's wrong */
if (memcmp(&match, &sin6->sin6_addr, sizeof(match)))
return EINVAL;
cmp = 1;
} else {
if (cmd == SIOCGLIFADDR) {
/* on getting an address, take the 1st match */
cmp = 0; /* XXX */
} else {
/* on deleting an address, do exact match */
in6_prefixlen2mask(&mask, 128);
sin6 = (struct sockaddr_in6 *)&iflr->addr;
memcpy(&match, &sin6->sin6_addr, sizeof(match));
cmp = 1;
}
}
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!cmp)
break;
/*
* XXX: this is ad hoc, but is necessary to allow
* a user to specify fe80::/64 (not /10) for a
* link-local address.
*/
memcpy(&candidate, IFA_IN6(ifa), sizeof(candidate));
in6_clearscope(&candidate);
candidate.s6_addr32[0] &= mask.s6_addr32[0];
candidate.s6_addr32[1] &= mask.s6_addr32[1];
candidate.s6_addr32[2] &= mask.s6_addr32[2];
candidate.s6_addr32[3] &= mask.s6_addr32[3];
if (IN6_ARE_ADDR_EQUAL(&candidate, &match))
break;
}
if (!ifa) {
error = EADDRNOTAVAIL;
goto error;
}
ia = ifa2ia6(ifa);
if (cmd == SIOCGLIFADDR) {
/* fill in the if_laddrreq structure */
memcpy(&iflr->addr, &ia->ia_addr, ia->ia_addr.sin6_len);
error = sa6_recoverscope(
(struct sockaddr_in6 *)&iflr->addr);
if (error != 0)
goto error;
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
memcpy(&iflr->dstaddr, &ia->ia_dstaddr,
ia->ia_dstaddr.sin6_len);
error = sa6_recoverscope(
(struct sockaddr_in6 *)&iflr->dstaddr);
if (error != 0)
goto error;
} else
memset(&iflr->dstaddr, 0, sizeof(iflr->dstaddr));
iflr->prefixlen =
in6_mask2len(&ia->ia_prefixmask.sin6_addr, NULL);
iflr->flags = ia->ia6_flags; /* XXX */
error = 0;
} else {
struct in6_aliasreq ifra;
/* fill in6_aliasreq and do ioctl(SIOCDIFADDR_IN6) */
memset(&ifra, 0, sizeof(ifra));
memcpy(ifra.ifra_name, iflr->iflr_name,
sizeof(ifra.ifra_name));
memcpy(&ifra.ifra_addr, &ia->ia_addr,
ia->ia_addr.sin6_len);
if ((ifp->if_flags & IFF_POINTOPOINT) != 0) {
memcpy(&ifra.ifra_dstaddr, &ia->ia_dstaddr,
ia->ia_dstaddr.sin6_len);
} else {
memset(&ifra.ifra_dstaddr, 0,
sizeof(ifra.ifra_dstaddr));
}
memcpy(&ifra.ifra_prefixmask, &ia->ia_prefixmask,
ia->ia_prefixmask.sin6_len);
ifra.ifra_flags = ia->ia6_flags;
pserialize_read_exit(s);
return in6_control(so, SIOCDIFADDR_IN6, &ifra, ifp);
}
error:
pserialize_read_exit(s);
return error;
}
}
return EOPNOTSUPP; /* just for safety */
}
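/*
 * Editor's sketch (userland, not part of the kernel build): how a
 * program would exercise SIOCALIFADDR with IFLR_PREFIX as handled
 * above; the kernel fills the hostid part from the interface's
 * link-local address.  "ix0" and the prefix are placeholders and
 * error handling is elided.
 */
#if 0
struct if_laddrreq iflr;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&iflr.addr;
int s = socket(AF_INET6, SOCK_DGRAM, 0);

memset(&iflr, 0, sizeof(iflr));
strlcpy(iflr.iflr_name, "ix0", sizeof(iflr.iflr_name));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
inet_pton(AF_INET6, "2001:db8:1:2::", &sin6->sin6_addr);
iflr.prefixlen = 64;		/* must be <= 64 with IFLR_PREFIX */
iflr.flags = IFLR_PREFIX;	/* hostid part left zero on purpose */
(void)ioctl(s, SIOCALIFADDR, &iflr);
#endif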
/*
* Initialize an interface's internet6 address
* and routing table entry.
*/
static int
in6_ifinit(struct ifnet *ifp, struct in6_ifaddr *ia,
const struct sockaddr_in6 *sin6, int newhost)
{
int error = 0, ifacount = 0;
int s;
struct ifaddr *ifa;
KASSERT(mutex_owned(&in6_ifaddr_lock));
/*
* Give the interface a chance to initialize
* if this is its first address,
* and to validate the address if necessary.
*/
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifacount++;
}
pserialize_read_exit(s);
ia->ia_addr = *sin6;
if (ifacount == 0 &&
(error = if_addr_init(ifp, &ia->ia_ifa, true)) != 0) {
return error;
}
ia->ia_ifa.ifa_metric = ifp->if_metric;
/* we could do in(6)_socktrim here, but just omit it at this moment. */
/* Add ownaddr as loopback rtentry, if necessary (ex. on p2p link). */
if (newhost) {
/* set the rtrequest function to create llinfo */
if (ifp->if_flags & IFF_POINTOPOINT)
ia->ia_ifa.ifa_rtrequest = p2p_rtrequest;
else if ((ifp->if_flags & IFF_LOOPBACK) == 0)
ia->ia_ifa.ifa_rtrequest = nd6_rtrequest;
in6_ifaddlocal(&ia->ia_ifa);
} else {
/* Inform the routing socket of new flags/timings */
rt_addrmsg(RTM_NEWADDR, &ia->ia_ifa);
}
/* Add the network prefix route. */
if ((error = in6_ifaddprefix(ia)) != 0) {
if (newhost)
in6_ifremlocal(&ia->ia_ifa);
return error;
}
return error;
}
static struct ifaddr *
bestifa(struct ifaddr *best_ifa, struct ifaddr *ifa)
{
if (best_ifa == NULL || best_ifa->ifa_preference < ifa->ifa_preference)
return ifa;
return best_ifa;
}
/*
* Find an IPv6 interface link-local address specific to an interface.
*/
struct in6_ifaddr *
in6ifa_ifpforlinklocal(const struct ifnet *ifp, const int ignoreflags)
{
struct ifaddr *best_ifa = NULL, *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!IN6_IS_ADDR_LINKLOCAL(IFA_IN6(ifa)))
continue;
if ((((struct in6_ifaddr *)ifa)->ia6_flags & ignoreflags) != 0)
continue;
best_ifa = bestifa(best_ifa, ifa);
}
return (struct in6_ifaddr *)best_ifa;
}
struct in6_ifaddr *
in6ifa_ifpforlinklocal_psref(const struct ifnet *ifp, const int ignoreflags,
struct psref *psref)
{
struct in6_ifaddr *ia;
int s = pserialize_read_enter();
ia = in6ifa_ifpforlinklocal(ifp, ignoreflags);
if (ia != NULL)
ia6_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
/*
* find the internet address corresponding to a given address.
* ifaddr is returned referenced.
*/
struct in6_ifaddr *
in6ifa_ifwithaddr(const struct in6_addr *addr, uint32_t zoneid)
{
struct in6_ifaddr *ia;
int s;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_ADDR_EQUAL(IA6_IN6(ia), addr)) {
if (zoneid != 0 &&
zoneid != ia->ia_addr.sin6_scope_id)
continue;
ifaref(&ia->ia_ifa);
break;
}
}
pserialize_read_exit(s);
return ia;
}
/*
* find the internet address corresponding to a given interface and address.
*/
struct in6_ifaddr *
in6ifa_ifpwithaddr(const struct ifnet *ifp, const struct in6_addr *addr)
{
struct ifaddr *best_ifa = NULL, *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
if (!IN6_ARE_ADDR_EQUAL(addr, IFA_IN6(ifa)))
continue;
best_ifa = bestifa(best_ifa, ifa);
}
return (struct in6_ifaddr *)best_ifa;
}
struct in6_ifaddr *
in6ifa_ifpwithaddr_psref(const struct ifnet *ifp, const struct in6_addr *addr,
struct psref *psref)
{
struct in6_ifaddr *ia;
int s = pserialize_read_enter();
ia = in6ifa_ifpwithaddr(ifp, addr);
if (ia != NULL)
ia6_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
static struct in6_ifaddr *
bestia(struct in6_ifaddr *best_ia, struct in6_ifaddr *ia)
{
if (best_ia == NULL || best_ia->ia_ifa.ifa_preference < ia->ia_ifa.ifa_preference)
return ia;
return best_ia;
}
/*
* Determine if an address is on a local network.
*/
int
in6_localaddr(const struct in6_addr *in6)
{
struct in6_ifaddr *ia;
int s;
if (IN6_IS_ADDR_LOOPBACK(in6) || IN6_IS_ADDR_LINKLOCAL(in6))
return 1;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_MASKED_ADDR_EQUAL(in6, &ia->ia_addr.sin6_addr,
&ia->ia_prefixmask.sin6_addr)) {
pserialize_read_exit(s);
return 1;
}
}
pserialize_read_exit(s);
return 0;
}
int
in6_is_addr_deprecated(struct sockaddr_in6 *sa6)
{
struct in6_ifaddr *ia;
int s;
s = pserialize_read_enter();
IN6_ADDRLIST_READER_FOREACH(ia) {
if (IN6_ARE_ADDR_EQUAL(&ia->ia_addr.sin6_addr,
&sa6->sin6_addr) &&
#ifdef SCOPEDROUTING
ia->ia_addr.sin6_scope_id == sa6->sin6_scope_id &&
#endif
(ia->ia6_flags & IN6_IFF_DEPRECATED) != 0) {
pserialize_read_exit(s);
return 1; /* true */
}
/* XXX: do we still have to go thru the rest of the list? */
}
pserialize_read_exit(s);
return 0; /* false */
}
/*
* return the length of the leading part in which dst and src are equal
* hard coding...
*/
int
in6_matchlen(struct in6_addr *src, struct in6_addr *dst)
{
int match = 0;
u_char *s = (u_char *)src, *d = (u_char *)dst;
u_char *lim = s + 16, r;
while (s < lim)
if ((r = (*d++ ^ *s++)) != 0) {
while (r < 128) {
match++;
r <<= 1;
}
break;
} else
match += NBBY;
return match;
}
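/*
 * Example: in6_matchlen() for 2001:db8::1 vs 2001:db8::2 returns 126,
 * since the addresses agree in the first 15 bytes (120 bits) and the
 * final bytes 0x01 and 0x02 share 6 more leading bits (xor = 0x03).
 */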
void
in6_prefixlen2mask(struct in6_addr *maskp, int len)
{
static const u_char maskarray[NBBY] = {0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};
int bytelen, bitlen, i;
/* sanity check */
if (len < 0 || len > 128) {
log(LOG_ERR, "in6_prefixlen2mask: invalid prefix length(%d)\n",
len);
return;
}
memset(maskp, 0, sizeof(*maskp));
bytelen = len / NBBY;
bitlen = len % NBBY;
for (i = 0; i < bytelen; i++)
maskp->s6_addr[i] = 0xff;
if (bitlen)
maskp->s6_addr[bytelen] = maskarray[bitlen - 1];
}
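/*
 * Editor's example (sketch): a prefix length of 52 yields the mask
 * ffff:ffff:ffff:f000:: -- six full 0xff bytes (52 / NBBY) followed by
 * maskarray[52 % NBBY - 1] == 0xf0 in the seventh byte.
 */
#if 0
struct in6_addr mask;
in6_prefixlen2mask(&mask, 52);	/* mask is now ffff:ffff:ffff:f000:: */
#endif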
/*
* return the best address out of the same scope. if no address was
* found, return the first valid address from designated IF.
*/
struct in6_ifaddr *
in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
{
int dst_scope = in6_addrscope(dst), blen = -1, tlen;
struct ifaddr *ifa;
struct in6_ifaddr *best_ia = NULL, *ia;
struct in6_ifaddr *dep[2]; /* last-resort: deprecated */
dep[0] = dep[1] = NULL;
/*
* We first look for addresses in the same scope.
* If there is one, return it.
* If two or more, return the one that matches the dst the longest.
* If none, return one of the global addresses assigned to other
* interfaces.
*/
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (ia->ia6_flags & IN6_IFF_ANYCAST)
continue; /* XXX: is there any case to allow anycast? */
if (ia->ia6_flags & IN6_IFF_NOTREADY)
continue; /* don't use this interface */
if (ia->ia6_flags & IN6_IFF_DETACHED)
continue;
if (ia->ia6_flags & IN6_IFF_DEPRECATED) {
if (ip6_use_deprecated)
dep[0] = ia;
continue;
}
if (dst_scope != in6_addrscope(IFA_IN6(ifa)))
continue;
/*
* call in6_matchlen() as few as possible
*/
if (best_ia == NULL) {
best_ia = ia;
continue;
}
if (blen == -1)
blen = in6_matchlen(&best_ia->ia_addr.sin6_addr, dst);
tlen = in6_matchlen(IFA_IN6(ifa), dst);
if (tlen > blen) {
blen = tlen;
best_ia = ia;
} else if (tlen == blen)
best_ia = bestia(best_ia, ia);
}
if (best_ia != NULL)
return best_ia;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (ia->ia6_flags & IN6_IFF_ANYCAST)
continue; /* XXX: is there any case to allow anycast? */
if (ia->ia6_flags & IN6_IFF_NOTREADY)
continue; /* don't use this interface */
if (ia->ia6_flags & IN6_IFF_DETACHED)
continue;
if (ia->ia6_flags & IN6_IFF_DEPRECATED) {
if (ip6_use_deprecated)
dep[1] = (struct in6_ifaddr *)ifa;
continue;
}
best_ia = bestia(best_ia, ia);
}
if (best_ia != NULL)
return best_ia;
/* use the last-resort values, i.e., the deprecated addresses */
if (dep[0])
return dep[0];
if (dep[1])
return dep[1];
return NULL;
}
/*
* perform DAD when interface becomes IFF_UP.
*/
void
in6_if_link_up(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
int s, bound;
char ip6buf[INET6_ADDRSTRLEN];
/* Ensure it's sane to run DAD */
if (ifp->if_link_state == LINK_STATE_DOWN)
return;
if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING))
return;
bound = curlwp_bind();
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
ia = (struct in6_ifaddr *)ifa;
/* If detached then mark as tentative */
if (ia->ia6_flags & IN6_IFF_DETACHED) {
ia->ia6_flags &= ~IN6_IFF_DETACHED;
if (ip6_dad_enabled() && if_do_dad(ifp)) {
ia->ia6_flags |= IN6_IFF_TENTATIVE;
nd6log(LOG_ERR, "%s marked tentative\n",
IN6_PRINT(ip6buf,
&ia->ia_addr.sin6_addr));
} else if ((ia->ia6_flags & IN6_IFF_TENTATIVE) == 0)
rt_addrmsg(RTM_NEWADDR, ifa);
}
if (ia->ia6_flags & IN6_IFF_TENTATIVE) {
int rand_delay;
/* Clear the duplicated flag as we're starting DAD. */
ia->ia6_flags &= ~IN6_IFF_DUPLICATED;
/*
* The TENTATIVE flag was likely set by hand
* beforehand, implicitly indicating the need for DAD.
* We may be able to skip the random delay in this
* case, but we impose delays just in case.
*/
rand_delay = cprng_fast32() %
(MAX_RTR_SOLICITATION_DELAY * hz);
/* +1 ensures callout is always used */
nd6_dad_start(ifa, rand_delay + 1);
}
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
}
void
in6_if_up(struct ifnet *ifp)
{
/*
* special cases, like 6to4, are handled in in6_ifattach
*/
in6_ifattach(ifp, NULL);
/* interface may not support link state, so bring it up also */
in6_if_link_up(ifp);
}
/*
* Mark all addresses as detached.
*/
void
in6_if_link_down(struct ifnet *ifp)
{
struct ifaddr *ifa;
struct in6_ifaddr *ia;
int s, bound;
char ip6buf[INET6_ADDRSTRLEN];
bound = curlwp_bind();
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
struct psref psref;
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
ia = (struct in6_ifaddr *)ifa;
/* Stop DAD processing */
nd6_dad_stop(ifa);
/*
* Mark the address as detached.
* This satisfies RFC4862 Section 5.3, but we should apply
* this logic to all addresses to be a good citizen and
* avoid potential duplicated addresses.
* When the interface comes up again, detached addresses
* are marked tentative and DAD commences.
*/
if (!(ia->ia6_flags & IN6_IFF_DETACHED)) {
nd6log(LOG_DEBUG, "%s marked detached\n",
IN6_PRINT(ip6buf, &ia->ia_addr.sin6_addr));
ia->ia6_flags |= IN6_IFF_DETACHED;
ia->ia6_flags &=
~(IN6_IFF_TENTATIVE | IN6_IFF_DUPLICATED);
rt_addrmsg(RTM_NEWADDR, ifa);
}
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
/* Clear ND6_IFF_IFDISABLED to allow DAD again on link-up. */
if (ifp->if_afdata[AF_INET6] != NULL)
ND_IFINFO(ifp)->flags &= ~ND6_IFF_IFDISABLED;
}
void
in6_if_down(struct ifnet *ifp)
{
in6_if_link_down(ifp);
lltable_purge_entries(LLTABLE6(ifp));
}
void
in6_if_link_state_change(struct ifnet *ifp, int link_state)
{
/*
* Treat LINK_STATE_UNKNOWN as UP.
* LINK_STATE_UNKNOWN transitions to LINK_STATE_DOWN when
* if_link_state_change() transitions to LINK_STATE_UP.
*/
if (link_state == LINK_STATE_DOWN)
in6_if_link_down(ifp);
else
in6_if_link_up(ifp);
}
int
in6_tunnel_validate(const struct ip6_hdr *ip6, const struct in6_addr *src,
const struct in6_addr *dst)
{
/* check for address match */
if (!IN6_ARE_ADDR_EQUAL(src, &ip6->ip6_dst) ||
!IN6_ARE_ADDR_EQUAL(dst, &ip6->ip6_src))
return 0;
/* martian filters on outer source - done in ip6_input */
/* NOTE: the packet may be dropped by uRPF. */
/* return valid bytes length */
return sizeof(*src) + sizeof(*dst);
}
#define IN6_LLTBL_DEFAULT_HSIZE 32
#define IN6_LLTBL_HASH(k, h) \
(((((((k >> 8) ^ k) >> 8) ^ k) >> 8) ^ k) & ((h) - 1))
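/*
 * The hash folds the four bytes of its 32-bit key (the low 32 bits of
 * the address, see in6_lltable_hash_dst below) into the low byte by
 * repeated shift/xor, then masks with (h - 1), so the table size must
 * be a power of two.  Example (sketch): k = 0x11223344, h = 32 gives
 * (0x11 ^ 0x22 ^ 0x33 ^ 0x44) & 31 = 4.
 */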
/*
* Do actual deallocation of @lle.
* Called by LLE_FREE_LOCKED when number of references
* drops to zero.
*/
static void
in6_lltable_destroy_lle(struct llentry *lle)
{
KASSERTMSG(lle->la_numheld == 0, "la_numheld=%d", lle->la_numheld);
LLE_WUNLOCK(lle);
LLE_LOCK_DESTROY(lle);
llentry_pool_put(lle);
}
static struct llentry *
in6_lltable_new(const struct in6_addr *addr6, u_int flags)
{
struct llentry *lle;
lle = llentry_pool_get(PR_NOWAIT);
if (lle == NULL) /* NB: caller generates msg */
return NULL;
lle->r_l3addr.addr6 = *addr6;
lle->lle_refcnt = 1;
lle->lle_free = in6_lltable_destroy_lle;
LLE_LOCK_INIT(lle);
callout_init(&lle->lle_timer, CALLOUT_MPSAFE);
return lle;
}
static int
in6_lltable_match_prefix(const struct sockaddr *prefix,
const struct sockaddr *mask, u_int flags, struct llentry *lle)
{
const struct sockaddr_in6 *pfx = (const struct sockaddr_in6 *)prefix;
const struct sockaddr_in6 *msk = (const struct sockaddr_in6 *)mask;
if (IN6_ARE_MASKED_ADDR_EQUAL(&lle->r_l3addr.addr6,
&pfx->sin6_addr, &msk->sin6_addr) &&
((flags & LLE_STATIC) || !(lle->la_flags & LLE_STATIC)))
return 1;
return 0;
}
static void
in6_lltable_free_entry(struct lltable *llt, struct llentry *lle)
{
LLE_WLOCK_ASSERT(lle);
(void) llentry_free(lle);
}
static int
in6_lltable_rtcheck(struct ifnet *ifp, u_int flags,
const struct sockaddr *l3addr, const struct rtentry *rt)
{
char ip6buf[INET6_ADDRSTRLEN];
if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) || rt->rt_ifp != ifp) {
int s;
struct ifaddr *ifa;
/*
* Create an ND6 cache for an IPv6 neighbor
* that is not covered by our own prefix.
*/
/* XXX ifaof_ifpforaddr should take a const param */
s = pserialize_read_enter();
ifa = ifaof_ifpforaddr(l3addr, ifp);
if (ifa != NULL) {
pserialize_read_exit(s);
return 0;
}
pserialize_read_exit(s);
log(LOG_INFO, "IPv6 address: \"%s\" is not on the network\n",
IN6_PRINT(ip6buf,
&((const struct sockaddr_in6 *)l3addr)->sin6_addr));
return EINVAL;
}
return 0;
}
static inline uint32_t
in6_lltable_hash_dst(const struct in6_addr *dst, uint32_t hsize)
{
return IN6_LLTBL_HASH(dst->s6_addr32[3], hsize);
}
static uint32_t
in6_lltable_hash(const struct llentry *lle, uint32_t hsize)
{
return in6_lltable_hash_dst(&lle->r_l3addr.addr6, hsize);
}
static void
in6_lltable_fill_sa_entry(const struct llentry *lle, struct sockaddr *sa)
{
struct sockaddr_in6 *sin6;
sin6 = (struct sockaddr_in6 *)sa;
bzero(sin6, sizeof(*sin6));
sin6->sin6_family = AF_INET6;
sin6->sin6_len = sizeof(*sin6);
sin6->sin6_addr = lle->r_l3addr.addr6;
}
static inline struct llentry *
in6_lltable_find_dst(struct lltable *llt, const struct in6_addr *dst)
{
struct llentry *lle;
struct llentries *lleh;
u_int hashidx;
hashidx = in6_lltable_hash_dst(dst, llt->llt_hsize);
lleh = &llt->lle_head[hashidx];
LIST_FOREACH(lle, lleh, lle_next) {
if (lle->la_flags & LLE_DELETED)
continue;
if (IN6_ARE_ADDR_EQUAL(&lle->r_l3addr.addr6, dst))
break;
}
return lle;
}
static int
in6_lltable_delete(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct llentry *lle;
IF_AFDATA_WLOCK_ASSERT(llt->llt_ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle == NULL) {
#ifdef LLTABLE_DEBUG
char buf[64];
sockaddr_format(l3addr, buf, sizeof(buf));
log(LOG_INFO, "%s: cache for %s is not found\n",
__func__, buf);
#endif
return ENOENT;
}
LLE_WLOCK(lle);
#ifdef LLTABLE_DEBUG
{
char buf[64];
sockaddr_format(l3addr, buf, sizeof(buf));
log(LOG_INFO, "%s: cache for %s (%p) is deleted\n",
__func__, buf, lle);
}
#endif
llentry_free(lle);
return 0;
}
static struct llentry *
in6_lltable_create(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr, const struct rtentry *rt)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct ifnet *ifp = llt->llt_ifp;
struct llentry *lle;
IF_AFDATA_WLOCK_ASSERT(ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle != NULL) {
LLE_WLOCK(lle);
return lle;
}
/*
* A route that covers the given address must have been
* installed first, because we are doing a resolution;
* verify this.
*/
if (!(flags & LLE_IFADDR) &&
in6_lltable_rtcheck(ifp, flags, l3addr, rt) != 0)
return NULL;
lle = in6_lltable_new(&sin6->sin6_addr, flags);
if (lle == NULL) {
log(LOG_INFO, "lla_lookup: new lle malloc failed\n");
return NULL;
}
lle->la_flags = flags;
if ((flags & LLE_IFADDR) == LLE_IFADDR) {
memcpy(&lle->ll_addr, CLLADDR(ifp->if_sadl), ifp->if_addrlen);
lle->la_flags |= LLE_VALID;
}
lltable_link_entry(llt, lle);
LLE_WLOCK(lle);
return lle;
}
static struct llentry *
in6_lltable_lookup(struct lltable *llt, u_int flags,
const struct sockaddr *l3addr)
{
const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)l3addr;
struct llentry *lle;
IF_AFDATA_LOCK_ASSERT(llt->llt_ifp);
KASSERTMSG(l3addr->sa_family == AF_INET6,
"sin_family %d", l3addr->sa_family);
lle = in6_lltable_find_dst(llt, &sin6->sin6_addr);
if (lle == NULL)
return NULL;
if (flags & LLE_EXCLUSIVE)
LLE_WLOCK(lle);
else
LLE_RLOCK(lle);
return lle;
}
static int
in6_lltable_dump_entry(struct lltable *llt, struct llentry *lle,
struct rt_walkarg *w)
{
struct sockaddr_in6 sin6;
LLTABLE_LOCK_ASSERT();
/* skip deleted entries */
if (lle->la_flags & LLE_DELETED)
return 0;
sockaddr_in6_init(&sin6, &lle->r_l3addr.addr6, 0, 0, 0);
return lltable_dump_entry(llt, lle, w, sin6tosa(&sin6));
}
static struct lltable *
in6_lltattach(struct ifnet *ifp)
{
struct lltable *llt;
llt = lltable_allocate_htbl(IN6_LLTBL_DEFAULT_HSIZE);
llt->llt_af = AF_INET6;
llt->llt_ifp = ifp;
llt->llt_lookup = in6_lltable_lookup;
llt->llt_create = in6_lltable_create;
llt->llt_delete = in6_lltable_delete;
llt->llt_dump_entry = in6_lltable_dump_entry;
llt->llt_hash = in6_lltable_hash;
llt->llt_fill_sa_entry = in6_lltable_fill_sa_entry;
llt->llt_free_entry = in6_lltable_free_entry;
llt->llt_match_prefix = in6_lltable_match_prefix;
lltable_link(llt);
return llt;
}
void *
in6_domifattach(struct ifnet *ifp)
{
struct in6_ifextra *ext;
ext = malloc(sizeof(*ext), M_IFADDR, M_WAITOK|M_ZERO);
ext->in6_ifstat = malloc(sizeof(struct in6_ifstat),
M_IFADDR, M_WAITOK|M_ZERO);
ext->icmp6_ifstat = malloc(sizeof(struct icmp6_ifstat),
M_IFADDR, M_WAITOK|M_ZERO);
ext->nd_ifinfo = nd6_ifattach(ifp);
ext->scope6_id = scope6_ifattach(ifp);
ext->lltable = in6_lltattach(ifp);
return ext;
}
void
in6_domifdetach(struct ifnet *ifp, void *aux)
{
struct in6_ifextra *ext = (struct in6_ifextra *)aux;
lltable_free(ext->lltable);
ext->lltable = NULL;
SOFTNET_LOCK_UNLESS_NET_MPSAFE();
nd6_ifdetach(ifp, ext);
SOFTNET_UNLOCK_UNLESS_NET_MPSAFE();
free(ext->in6_ifstat, M_IFADDR);
free(ext->icmp6_ifstat, M_IFADDR);
scope6_ifdetach(ext->scope6_id);
free(ext, M_IFADDR);
}
/*
* Convert IPv4 address stored in struct in_addr to IPv4-Mapped IPv6 address
* stored in struct in6_addr as defined in RFC 4291 section 2.5.5.2.
*/
void
in6_in_2_v4mapin6(const struct in_addr *in, struct in6_addr *in6)
{
in6->s6_addr32[0] = 0;
in6->s6_addr32[1] = 0;
in6->s6_addr32[2] = IPV6_ADDR_INT32_SMP;
in6->s6_addr32[3] = in->s_addr;
}
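/*
 * Example: 192.0.2.1 maps to ::ffff:192.0.2.1 -- the top 80 bits zero,
 * the next 16 bits all-ones (the ::ffff:0:0/96 well-known prefix set
 * via IPV6_ADDR_INT32_SMP), and the IPv4 address in the low 32 bits.
 */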
/*
* Convert sockaddr_in6 to sockaddr_in. Original sockaddr_in6 must be
* v4 mapped addr or v4 compat addr
*/
void
in6_sin6_2_sin(struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
memset(sin, 0, sizeof(*sin));
sin->sin_len = sizeof(struct sockaddr_in);
sin->sin_family = AF_INET;
sin->sin_port = sin6->sin6_port;
sin->sin_addr.s_addr = sin6->sin6_addr.s6_addr32[3];
}
/* Convert sockaddr_in to sockaddr_in6 in v4 mapped addr format. */
void
in6_sin_2_v4mapsin6(const struct sockaddr_in *sin, struct sockaddr_in6 *sin6)
{
memset(sin6, 0, sizeof(*sin6));
sin6->sin6_len = sizeof(struct sockaddr_in6);
sin6->sin6_family = AF_INET6;
sin6->sin6_port = sin->sin_port;
in6_in_2_v4mapin6(&sin->sin_addr, &sin6->sin6_addr);
}
/* Convert sockaddr_in6 into sockaddr_in. */
void
in6_sin6_2_sin_in_sock(struct sockaddr *nam)
{
struct sockaddr_in *sin_p;
struct sockaddr_in6 sin6;
/*
* Save original sockaddr_in6 addr and convert it
* to sockaddr_in.
*/
sin6 = *(struct sockaddr_in6 *)nam;
sin_p = (struct sockaddr_in *)nam;
in6_sin6_2_sin(sin_p, &sin6);
}
/* Convert sockaddr_in into sockaddr_in6 in v4 mapped addr format. */
void
in6_sin_2_v4mapsin6_in_sock(struct sockaddr **nam)
{
struct sockaddr_in *sin_p;
struct sockaddr_in6 *sin6_p;
sin6_p = malloc(sizeof(*sin6_p), M_SONAME, M_WAITOK);
sin_p = (struct sockaddr_in *)*nam;
in6_sin_2_v4mapsin6(sin_p, sin6_p);
free(*nam, M_SONAME);
*nam = sin6tosa(sin6_p);
}
/* $NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $ */
/* $KAME: in6_src.c,v 1.159 2005/10/19 01:40:32 t-momose Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_src.c,v 1.92 2023/08/03 04:24:55 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/nd6.h>
#include <netinet6/scope6_var.h>
#ifdef MIP6
#include <netinet6/mip6.h>
#include <netinet6/mip6_var.h>
#include "mip.h"
#if NMIP > 0
#include <net/if_mip.h>
#endif /* NMIP > 0 */
#endif /* MIP6 */
#include <netinet/tcp_vtw.h>
#define ADDR_LABEL_NOTAPP (-1)
struct in6_addrpolicy defaultaddrpolicy;
int ip6_prefer_tempaddr = 0;
static int in6_selectif(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route *, struct ifnet **, struct psref *);
static struct in6_addrpolicy *lookup_addrsel_policy(struct sockaddr_in6 *);
static void init_policy_queue(void);
static int add_addrsel_policyent(struct in6_addrpolicy *);
static int delete_addrsel_policyent(struct in6_addrpolicy *);
static int walk_addrsel_policy(int (*)(struct in6_addrpolicy *, void *),
void *);
static int dump_addrsel_policyent(struct in6_addrpolicy *, void *);
static struct in6_addrpolicy *match_addrsel_policy(struct sockaddr_in6 *);
#define IFA6_IS_VALIDATED(ia) \
(((ia)->ia6_flags & (IN6_IFF_TENTATIVE | IN6_IFF_DETACHED)) == 0)
/*
* Return an IPv6 address, which is the most appropriate for a given
* destination and user specified options.
* If necessary, this function looks up the routing table and returns
* an entry to the caller for later use.
*/
#if 0 /* disabled ad-hoc */
#define REPLACE(r) do {\
char _buf1[INET6_ADDRSTRLEN], _buf2[INET6_ADDRSTRLEN]; \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
printf("%s: replace %s with %s by %d\n", __func__, ia_best ? \
IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \
IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \
goto replace; \
} while(/*CONSTCOND*/0)
#define NEXT(r) do {\
char _buf1[INET6_ADDRSTRLEN], _buf2[INET6_ADDRSTRLEN]; \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
printf("%s: keep %s against %s by %d\n", __func__, ia_best ? \
IN6_PRINT(_buf1, &ia_best->ia_addr.sin6_addr) : "none", \
IN6_PRINT(_buf2, &ia->ia_addr.sin6_addr), (r)); \
goto next; /* XXX: we can't use 'continue' here */ \
} while(/*CONSTCOND*/0)
#define BREAK(r) do { \
if ((r) < sizeof(ip6stat.ip6s_sources_rule) / \
sizeof(ip6stat.ip6s_sources_rule[0])) /* check for safety */ \
ip6stat.ip6s_sources_rule[(r)]++; \
goto out; /* XXX: we can't use 'break' here */ \
} while(/*CONSTCOND*/0)
#else
#define REPLACE(r) goto replace
#define NEXT(r) goto next
#define BREAK(r) goto out
#endif
/*
* Called inside pserialize critical section. Don't sleep/block.
*/
static struct in6_ifaddr *
in6_select_best_ia(struct sockaddr_in6 *dstsock, struct in6_addr *dst,
const struct ifnet *ifp, const struct ip6_pktopts *opts,
const u_int32_t odstzone)
{
struct in6_ifaddr *ia, *ia_best = NULL;
int dst_scope = -1, best_scope = -1, best_matchlen = -1;
struct in6_addrpolicy *dst_policy = NULL, *best_policy = NULL;
IN6_ADDRLIST_READER_FOREACH(ia) {
int new_scope = -1, new_matchlen = -1;
struct in6_addrpolicy *new_policy = NULL;
u_int32_t srczone, osrczone, dstzone;
struct in6_addr src;
struct ifnet *ifp1 = ia->ia_ifp;
int prefer_tempaddr;
/*
* We'll never take an address that breaks the scope zone
* of the destination. We also skip an address if its zone
* does not contain the outgoing interface.
* XXX: we should probably use sin6_scope_id here.
*/
if (in6_setscope(dst, ifp1, &dstzone) ||
odstzone != dstzone) {
continue;
}
src = ia->ia_addr.sin6_addr;
/* Skip the scope test in impossible cases */
if (!(ifp->if_flags & IFF_LOOPBACK) &&
IN6_IS_ADDR_LOOPBACK(&src))
continue;
if (in6_setscope(&src, ifp, &osrczone) ||
in6_setscope(&src, ifp1, &srczone) ||
osrczone != srczone) {
continue;
}
/* avoid unusable addresses */
if ((ia->ia6_flags & (IN6_IFF_DUPLICATED | IN6_IFF_ANYCAST)))
continue;
if (!ip6_use_deprecated && IFA6_IS_DEPRECATED(ia))
continue;
#if defined(MIP6) && NMIP > 0
/* avoid unusable home addresses. */
if ((ia->ia6_flags & IN6_IFF_HOME) &&
!mip6_ifa6_is_addr_valid_hoa(ia))
continue;
#endif /* MIP6 && NMIP > 0 */
/* Rule 1: Prefer same address */
if (IN6_ARE_ADDR_EQUAL(dst, &ia->ia_addr.sin6_addr)) {
ia_best = ia;
BREAK(1); /* there should be no better candidate */
}
if (ia_best == NULL)
REPLACE(1);
/* Rule 2: Prefer appropriate scope */
if (dst_scope < 0)
dst_scope = in6_addrscope(dst);
new_scope = in6_addrscope(&ia->ia_addr.sin6_addr);
if (IN6_ARE_SCOPE_CMP(best_scope, new_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(best_scope, dst_scope) < 0)
REPLACE(2);
NEXT(2);
} else if (IN6_ARE_SCOPE_CMP(new_scope, best_scope) < 0) {
if (IN6_ARE_SCOPE_CMP(new_scope, dst_scope) < 0)
NEXT(2);
REPLACE(2);
}
/*
* Rule 3: Avoid deprecated addresses. Note that the case of
* !ip6_use_deprecated is already rejected above.
* Treat unvalidated addresses as deprecated here.
*/
if (IFA6_IS_VALIDATED(ia_best) && !IFA6_IS_VALIDATED(ia))
NEXT(3);
if (!IFA6_IS_VALIDATED(ia_best) && IFA6_IS_VALIDATED(ia))
REPLACE(3);
if (!IFA6_IS_DEPRECATED(ia_best) && IFA6_IS_DEPRECATED(ia))
NEXT(3);
if (IFA6_IS_DEPRECATED(ia_best) && !IFA6_IS_DEPRECATED(ia))
REPLACE(3);
/* Rule 4: Prefer home addresses */
#if defined(MIP6) && NMIP > 0
if (!MIP6_IS_MN)
goto skip_rule4;
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
/* neither address is a home address. */
goto skip_rule4;
}
/*
* If SA is simultaneously a home address and care-of
* address and SB is not, then prefer SA. Similarly,
* if SB is simultaneously a home address and care-of
* address and SA is not, then prefer SB.
*/
if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
ia_best->ia_ifp->if_type != IFT_MIP)
&&
((ia->ia6_flags & IN6_IFF_HOME) != 0 &&
ia->ia_ifp->if_type == IFT_MIP))
NEXT(4);
if (((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
ia_best->ia_ifp->if_type == IFT_MIP)
&&
((ia->ia6_flags & IN6_IFF_HOME) != 0 &&
ia->ia_ifp->if_type != IFT_MIP))
REPLACE(4);
if (ip6po_usecoa == 0) {
/*
* If SA is just a home address and SB is just
* a care-of address, then prefer
* SA. Similarly, if SB is just a home address
* and SA is just a care-of address, then
* prefer SB.
*/
if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
NEXT(4);
}
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) != 0) {
REPLACE(4);
}
} else {
/*
* the sender does not want to use a home address
* because:
*
* 1) it cannot be used (e.g. NS or NA to global
* addresses), or
*
* 2) the user specified not to use it
* (e.g. mip6control -u).
*/
if ((ia_best->ia6_flags & IN6_IFF_HOME) == 0 &&
(ia->ia6_flags & IN6_IFF_HOME) != 0) {
/* XXX breaks stat */
NEXT(0);
}
if ((ia_best->ia6_flags & IN6_IFF_HOME) != 0 &&
(ia->ia6_flags & IN6_IFF_HOME) == 0) {
/* XXX breaks stat */
REPLACE(0);
}
}
skip_rule4:
#endif /* MIP6 && NMIP > 0 */
/* Rule 5: Prefer outgoing interface */
if (ia_best->ia_ifp == ifp && ia->ia_ifp != ifp)
NEXT(5);
if (ia_best->ia_ifp != ifp && ia->ia_ifp == ifp)
REPLACE(5);
/*
* Rule 6: Prefer matching label
* Note that best_policy should be non-NULL here.
*/
if (dst_policy == NULL)
dst_policy = lookup_addrsel_policy(dstsock);
if (dst_policy->label != ADDR_LABEL_NOTAPP) {
new_policy = lookup_addrsel_policy(&ia->ia_addr);
if (dst_policy->label == best_policy->label &&
dst_policy->label != new_policy->label)
NEXT(6);
if (dst_policy->label != best_policy->label &&
dst_policy->label == new_policy->label)
REPLACE(6);
}
/*
* Rule 7: Prefer public addresses.
* We allow users to reverse the logic by configuring
* a sysctl variable, so that privacy conscious users can
* always prefer temporary addresses.
*/
if (opts == NULL ||
opts->ip6po_prefer_tempaddr == IP6PO_TEMPADDR_SYSTEM) {
prefer_tempaddr = ip6_prefer_tempaddr;
} else if (opts->ip6po_prefer_tempaddr ==
IP6PO_TEMPADDR_NOTPREFER) {
prefer_tempaddr = 0;
} else
prefer_tempaddr = 1;
if (!(ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
REPLACE(7);
else
NEXT(7);
}
if ((ia_best->ia6_flags & IN6_IFF_TEMPORARY) &&
!(ia->ia6_flags & IN6_IFF_TEMPORARY)) {
if (prefer_tempaddr)
NEXT(7);
else
REPLACE(7);
}
/*
* Rule 8: prefer addresses on alive interfaces.
* This is a KAME specific rule.
*/
if ((ia_best->ia_ifp->if_flags & IFF_UP) &&
!(ia->ia_ifp->if_flags & IFF_UP))
NEXT(8);
if (!(ia_best->ia_ifp->if_flags & IFF_UP) &&
(ia->ia_ifp->if_flags & IFF_UP))
REPLACE(8);
/*
* Rule 9: prefer addresses on "preferred" interfaces.
* This is a KAME specific rule.
*/
#ifdef notyet /* until introducing address selection */
#define NDI_BEST ND_IFINFO(ia_best->ia_ifp)
#define NDI_NEW ND_IFINFO(ia->ia_ifp)
if ((NDI_BEST->flags & ND6_IFF_PREFER_SOURCE) &&
!(NDI_NEW->flags & ND6_IFF_PREFER_SOURCE))
NEXT(9);
if (!(NDI_BEST->flags & ND6_IFF_PREFER_SOURCE) &&
(NDI_NEW->flags & ND6_IFF_PREFER_SOURCE))
REPLACE(9);
#undef NDI_BEST
#undef NDI_NEW
#endif
/*
* Rule 14: Use longest matching prefix.
* Note: in the address selection draft, this rule is
* documented as "Rule 8". However, since it is also
* documented that this rule can be overridden, we assign
* a large number so that it is easy to assign smaller numbers
* to more preferred rules.
*/
new_matchlen = in6_matchlen(&ia->ia_addr.sin6_addr, dst);
if (best_matchlen < new_matchlen)
REPLACE(14);
if (new_matchlen < best_matchlen)
NEXT(14);
/* Rule 15 is reserved. */
/*
* Last resort: just keep the current candidate.
* Or, do we need more rules?
*/
continue;
replace:
ia_best = ia;
best_scope = (new_scope >= 0 ? new_scope :
in6_addrscope(&ia_best->ia_addr.sin6_addr));
best_policy = (new_policy ? new_policy :
lookup_addrsel_policy(&ia_best->ia_addr));
best_matchlen = (new_matchlen >= 0 ? new_matchlen :
in6_matchlen(&ia_best->ia_addr.sin6_addr, dst));
next:
continue;
out:
break;
}
return ia_best;
}
#undef REPLACE
#undef BREAK
#undef NEXT
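/*
 * Example (sketch): sending to a global destination from an interface
 * carrying both fe80::1 and 2001:db8::100, rule 2 (appropriate scope)
 * selects 2001:db8::100; the link-local address would only survive if
 * no candidate of matching scope existed.
 */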
int
in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route *ro, struct in6_addr *laddr,
struct ifnet **ifpp, struct psref *psref, struct in6_addr *ret_ia6)
{
struct in6_addr dst;
struct ifnet *ifp = NULL;
struct in6_ifaddr *ia = NULL;
struct in6_pktinfo *pi = NULL;
u_int32_t odstzone;
int error = 0, iferror;
#if defined(MIP6) && NMIP > 0
u_int8_t ip6po_usecoa = 0;
#endif /* MIP6 && NMIP > 0 */
struct psref local_psref;
int bound = curlwp_bind();
#define PSREF (psref == NULL) ? &local_psref : psref
int s;
KASSERT((ifpp != NULL && psref != NULL) ||
(ifpp == NULL && psref == NULL));
dst = dstsock->sin6_addr; /* make a copy for local operation */
if (ifpp)
*ifpp = NULL;
/*
* Try to determine the outgoing interface for the given destination.
* We do this regardless of whether the socket is bound, since the
* caller may need this information as a side effect of the call
* to this function (e.g., for identifying the appropriate scope zone
* ID).
*/
iferror = in6_selectif(dstsock, opts, mopts, ro, &ifp, PSREF);
if (ifpp != NULL)
*ifpp = ifp;
/*
* If the source address is explicitly specified by the caller,
* check if the requested source address is indeed a unicast address
* assigned to the node, and can be used as the packet's source
* address. If everything is okay, use the address as source.
*/
if (opts && (pi = opts->ip6po_pktinfo) &&
!IN6_IS_ADDR_UNSPECIFIED(&pi->ipi6_addr)) {
struct sockaddr_in6 srcsock;
struct in6_ifaddr *ia6;
int _s;
struct ifaddr *ifa;
/*
* Determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
* If the specified address is ambiguous wrt the scope zone,
* the interface must be specified; otherwise, ifa_ifwithaddr()
* will fail matching the address.
*/
memset(&srcsock, 0, sizeof(srcsock));
srcsock.sin6_family = AF_INET6;
srcsock.sin6_len = sizeof(srcsock);
srcsock.sin6_addr = pi->ipi6_addr;
if (ifp) {
error = in6_setscope(&srcsock.sin6_addr, ifp, NULL);
if (error != 0)
goto exit;
}
_s = pserialize_read_enter();
ifa = ifa_ifwithaddr(sin6tosa(&srcsock));
if ((ia6 = ifatoia6(ifa)) == NULL ||
ia6->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) {
pserialize_read_exit(_s);
error = EADDRNOTAVAIL;
goto exit;
}
pi->ipi6_addr = srcsock.sin6_addr; /* XXX: this overrides pi */
if (ifpp)
*ifpp = ifp;
*ret_ia6 = ia6->ia_addr.sin6_addr;
pserialize_read_exit(_s);
goto exit;
}
/*
* If the socket has already bound the source, just use it. We don't
* care at the moment whether in6_selectif() succeeded above, even
* though it would eventually cause an error.
*/
if (laddr && !IN6_IS_ADDR_UNSPECIFIED(laddr)) {
*ret_ia6 = *laddr;
goto exit;
}
/*
* The outgoing interface is crucial in the general selection procedure
* below. If it is not known at this point, we fail.
*/
if (ifp == NULL) {
error = iferror;
goto exit;
}
/*
* If the address is not yet determined, choose the best one based on
* the outgoing interface and the destination address.
*/
#if defined(MIP6) && NMIP > 0
/*
* a caller can specify IP6PO_USECOA to avoid using a home
* address, for example for neighbour unreachability
* detection to a global address.
*/
if (opts != NULL &&
(opts->ip6po_flags & IP6PO_USECOA) != 0) {
ip6po_usecoa = 1;
}
#endif /* MIP6 && NMIP > 0 */
error = in6_setscope(&dst, ifp, &odstzone);
if (error != 0)
goto exit;
s = pserialize_read_enter();
ia = in6_select_best_ia(dstsock, &dst, ifp, opts, odstzone);
if (ia == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
goto exit;
}
*ret_ia6 = ia->ia_addr.sin6_addr;
pserialize_read_exit(s);
exit:
if (ifpp == NULL)
if_put(ifp, PSREF);
curlwp_bindx(bound);
return error;
#undef PSREF
}
int
in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct route **ro, struct rtentry **retrt, bool count_discard)
{
int error = 0;
struct rtentry *rt = NULL;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
struct sockaddr_in6 dst6;
} u;
KASSERT(ro != NULL);
KASSERT(*ro != NULL);
KASSERT(retrt != NULL);
#if 0
if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
dstsock->sin6_addr.s6_addr32[1] == 0 &&
!IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
char ip6buf[INET6_ADDRSTRLEN];
printf("%s: strange destination %s\n", __func__,
IN6_PRINT(ip6buf, &dstsock->sin6_addr));
} else {
char ip6buf[INET6_ADDRSTRLEN];
printf("%s: destination = %s%%%d\n", __func__,
IN6_PRINT(ip6buf, &dstsock->sin6_addr),
dstsock->sin6_scope_id); /* for debug */
}
#endif
/*
* If the next hop address for the packet is specified by the caller,
* use it as the gateway.
*/
if (opts && opts->ip6po_nexthop) {
struct route *ron;
struct sockaddr_in6 *sin6_next;
sin6_next = satosin6(opts->ip6po_nexthop);
/* at this moment, we only support AF_INET6 next hops */
if (sin6_next->sin6_family != AF_INET6) {
IP6_STATINC(IP6_STAT_ODROPPED);
error = EAFNOSUPPORT; /* or should we proceed? */
goto done;
}
/*
* If the next hop is an IPv6 address, then the node identified
* by that address must be a neighbor of the sending host.
*/
ron = &opts->ip6po_nextroute;
rt = rtcache_lookup(ron, sin6tosa(sin6_next));
if (rt == NULL || (rt->rt_flags & RTF_GATEWAY) != 0 ||
!nd6_is_addr_neighbor(sin6_next, rt->rt_ifp)) {
if (rt != NULL) {
if (count_discard)
in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard);
rtcache_unref(rt, ron);
rt = NULL;
}
rtcache_free(ron);
error = EHOSTUNREACH;
goto done;
}
*ro = ron;
goto done;
}
/*
* Use a cached route if it exists and is valid, else try to allocate
* a new one. Note that we should check the address family of the
* cached destination, in case of sharing the cache with IPv4.
*
* for V4 mapped addresses we want to pick up the v4 route
* see PR kern/56348
*/
if (IN6_IS_ADDR_V4MAPPED(&dstsock->sin6_addr)) {
in6_sin6_2_sin(&u.dst4, dstsock);
} else {
u.dst6 = *dstsock;
u.dst6.sin6_scope_id = 0;
}
rt = rtcache_lookup1(*ro, &u.dst, 1);
if (rt == NULL)
error = EHOSTUNREACH;
/*
* Check if the outgoing interface conflicts with
* the interface specified by ipi6_ifindex (if specified).
* Note that loopback interface is always okay.
* (this may happen when we are sending a packet to one of
* our own addresses.)
*/
if (opts && opts->ip6po_pktinfo && opts->ip6po_pktinfo->ipi6_ifindex) {
if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
rt->rt_ifp->if_index != opts->ip6po_pktinfo->ipi6_ifindex) {
if (count_discard)
in6_ifstat_inc(rt->rt_ifp, ifs6_out_discard);
error = EHOSTUNREACH;
rtcache_unref(rt, *ro);
rt = NULL;
}
}
done:
if (error == EHOSTUNREACH)
IP6_STATINC(IP6_STAT_NOROUTE);
*retrt = rt;
return error;
}
static int
in6_selectif(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
struct ip6_moptions *mopts, struct route *ro, struct ifnet **retifp,
struct psref *psref)
{
int error = 0;
struct rtentry *rt = NULL;
struct in6_addr *dst;
struct in6_pktinfo *pi = NULL;
KASSERT(retifp != NULL);
*retifp = NULL;
dst = &dstsock->sin6_addr;
/* If the caller specifies the outgoing interface explicitly, use it. */
if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
/* XXX boundary check is assumed to be already done. */
*retifp = if_get_byindex(pi->ipi6_ifindex, psref);
if (*retifp != NULL)
return 0;
goto getroute;
}
/*
* If the destination address is a multicast address and the outgoing
* interface for the address is specified by the caller, use it.
*/
if (IN6_IS_ADDR_MULTICAST(dst) && mopts != NULL) {
*retifp = if_get_byindex(mopts->im6o_multicast_if_index, psref);
if (*retifp != NULL)
return 0; /* we do not need a route for multicast. */
}
getroute:
error = in6_selectroute(dstsock, opts, &ro, &rt, false);
if (error != 0)
return error;
*retifp = if_get_byindex(rt->rt_ifp->if_index, psref);
/*
* do not use a rejected or black hole route.
* XXX: this check should be done in the L2 output routine.
* However, if we skipped this check here, we'd see the following
* scenario:
* - install a rejected route for a scoped address prefix
* (like fe80::/10)
* - send a packet to a destination that matches the scoped prefix,
* with ambiguity about the scope zone.
* - pick the outgoing interface from the route, and disambiguate the
* scope zone with the interface.
* - ip6_output() would try to get another route with the "new"
* destination, which may be valid.
* - we'd see no error on output.
* Although this may not be very harmful, it is still confusing.
* We thus reject the case here.
*/
if ((rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
error = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
/* XXX: ifp can be returned with psref even if error */
goto out;
}
/*
* Adjust the "outgoing" interface. If we're going to loop the packet
* back to ourselves, the ifp would be the loopback interface.
* However, we'd rather know the interface associated to the
* destination address (which should probably be one of our own
* addresses.)
*/
if (rt->rt_ifa->ifa_ifp != *retifp &&
!if_is_deactivated(rt->rt_ifa->ifa_ifp)) {
if_put(*retifp, psref);
*retifp = rt->rt_ifa->ifa_ifp;
if_acquire(*retifp, psref);
}
out:
rtcache_unref(rt, ro);
return error;
}
/*
* Default hop limit selection. The precedence is as follows:
* 1. Hoplimit value specified via ioctl.
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
*/
int
in6pcb_selecthlim(struct inpcb *inp, struct ifnet *ifp)
{
if (inp && in6p_hops6(inp) >= 0)
return in6p_hops6(inp);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
else
return (ip6_defhlim);
}
int
in6pcb_selecthlim_rt(struct inpcb *inp)
{
struct rtentry *rt;
if (inp == NULL)
return in6pcb_selecthlim(inp, NULL);
rt = rtcache_validate(&inp->inp_route);
if (rt != NULL) {
int ret = in6pcb_selecthlim(inp, rt->rt_ifp);
rtcache_unref(rt, &inp->inp_route);
return ret;
} else
return in6pcb_selecthlim(inp, NULL);
}
/*
* Find an empty port and set it to the specified PCB.
*/
int
in6pcb_set_port(struct sockaddr_in6 *sin6, struct inpcb *inp, struct lwp *l)
{
struct socket *so = inp->inp_socket;
struct inpcbtable *table = inp->inp_table;
u_int16_t lport, *lastport;
enum kauth_network_req req;
int error = 0;
if (inp->inp_flags & IN6P_LOWPORT) {
#ifndef IPNOPRIVPORTS
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
#else
req = KAUTH_REQ_NETWORK_BIND_PORT;
#endif
lastport = &table->inpt_lastlow;
} else {
req = KAUTH_REQ_NETWORK_BIND_PORT;
lastport = &table->inpt_lastport;
}
/* XXX-kauth: KAUTH_REQ_NETWORK_BIND_AUTOASSIGN_{,PRIV}PORT */
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND, req, so,
sin6, NULL);
if (error)
return (EACCES);
/*
* Use RFC6056 randomized port selection
*/
error = portalgo_randport(&lport, inp, l->l_cred);
if (error)
return error;
inp->inp_flags |= IN6P_ANONPORT;
*lastport = lport;
inp->inp_lport = htons(lport);
in6pcb_set_state(inp, INP_BOUND);
return (0); /* success */
}
void
addrsel_policy_init(void)
{
init_policy_queue();
/* initialize the "last resort" policy */
memset(&defaultaddrpolicy, 0, sizeof(defaultaddrpolicy));
defaultaddrpolicy.label = ADDR_LABEL_NOTAPP;
}
/*
* XXX: NOMPSAFE if a policy is set
*/
static struct in6_addrpolicy *
lookup_addrsel_policy(struct sockaddr_in6 *key)
{
struct in6_addrpolicy *match = NULL;
match = match_addrsel_policy(key);
if (match == NULL)
match = &defaultaddrpolicy;
else
match->use++;
return (match);
}
/*
* Subroutines to manage the address selection policy table via sysctl.
*/
struct sel_walkarg {
size_t w_total;
size_t w_given;
void * w_where;
void *w_limit;
};
int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS);
int
sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS)
{
int error = 0;
int s;
s = splsoftnet();
if (newp) {
error = EPERM;
goto end;
}
if (oldp && oldlenp == NULL) {
error = EINVAL;
goto end;
}
if (oldp || oldlenp) {
struct sel_walkarg w;
size_t oldlen = *oldlenp;
memset(&w, 0, sizeof(w));
w.w_given = oldlen;
w.w_where = oldp;
if (oldp)
w.w_limit = (char *)oldp + oldlen;
error = walk_addrsel_policy(dump_addrsel_policyent, &w);
*oldlenp = w.w_total;
if (oldp && w.w_total > oldlen && error == 0)
error = ENOMEM;
}
end:
splx(s);
return (error);
}
int
in6_src_ioctl(u_long cmd, void *data)
{
int i;
struct in6_addrpolicy ent0;
if (cmd != SIOCAADDRCTL_POLICY && cmd != SIOCDADDRCTL_POLICY)
return (EOPNOTSUPP); /* check for safety */
ent0 = *(struct in6_addrpolicy *)data;
if (ent0.label == ADDR_LABEL_NOTAPP)
return (EINVAL);
/* check if the prefix mask is consecutive. */
if (in6_mask2len(&ent0.addrmask.sin6_addr, NULL) < 0)
return (EINVAL);
/* clear trailing garbage (if any) of the prefix address. */
for (i = 0; i < 4; i++) {
ent0.addr.sin6_addr.s6_addr32[i] &=
ent0.addrmask.sin6_addr.s6_addr32[i];
}
ent0.use = 0;
switch (cmd) {
case SIOCAADDRCTL_POLICY:
return (add_addrsel_policyent(&ent0));
case SIOCDADDRCTL_POLICY:
return (delete_addrsel_policyent(&ent0));
}
return (0); /* XXX: compromise compilers */
}
/*
* The following is an implementation of the policy table using a
* simple tail queue.
* XXX such details should be hidden.
* XXX an implementation using a binary tree would be more efficient.
*/
struct addrsel_policyent {
TAILQ_ENTRY(addrsel_policyent) ape_entry;
struct in6_addrpolicy ape_policy;
};
TAILQ_HEAD(addrsel_policyhead, addrsel_policyent);
struct addrsel_policyhead addrsel_policytab;
static void
init_policy_queue(void)
{
TAILQ_INIT(&addrsel_policytab);
}
static int
add_addrsel_policyent(struct in6_addrpolicy *newpolicy)
{
struct addrsel_policyent *newpol, *pol;
/* duplication check */
TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
if (IN6_ARE_ADDR_EQUAL(&newpolicy->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&newpolicy->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
return (EEXIST); /* or override it? */
}
}
newpol = malloc(sizeof(*newpol), M_IFADDR, M_WAITOK|M_ZERO);
/* XXX: should validate entry */
newpol->ape_policy = *newpolicy;
TAILQ_INSERT_TAIL(&addrsel_policytab, newpol, ape_entry);
return (0);
}
static int
delete_addrsel_policyent(struct in6_addrpolicy *key)
{
struct addrsel_policyent *pol;
/* search for the entry in the table */
for (pol = TAILQ_FIRST(&addrsel_policytab); pol;
pol = TAILQ_NEXT(pol, ape_entry)) {
if (IN6_ARE_ADDR_EQUAL(&key->addr.sin6_addr,
&pol->ape_policy.addr.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&key->addrmask.sin6_addr,
&pol->ape_policy.addrmask.sin6_addr)) {
break;
}
}
if (pol == NULL) {
return (ESRCH);
}
TAILQ_REMOVE(&addrsel_policytab, pol, ape_entry);
return (0);
}
static int
walk_addrsel_policy(int (*callback)(struct in6_addrpolicy *, void *), void *w)
{
struct addrsel_policyent *pol;
int error = 0;
TAILQ_FOREACH(pol, &addrsel_policytab, ape_entry) {
if ((error = (*callback)(&pol->ape_policy, w)) != 0)
return error;
}
return error;
}
static int
dump_addrsel_policyent(struct in6_addrpolicy *pol, void *arg)
{
int error = 0;
struct sel_walkarg *w = arg;
if (w->w_where && (char *)w->w_where + sizeof(*pol) <= (char *)w->w_limit) {
if ((error = copyout(pol, w->w_where, sizeof(*pol))) != 0)
return error;
w->w_where = (char *)w->w_where + sizeof(*pol);
}
w->w_total += sizeof(*pol);
return error;
}
static struct in6_addrpolicy *
match_addrsel_policy(struct sockaddr_in6 *key)
{
struct addrsel_policyent *pent;
struct in6_addrpolicy *bestpol = NULL, *pol;
int matchlen, bestmatchlen = -1;
u_char *mp, *ep, *k, *p, m;
for (pent = TAILQ_FIRST(&addrsel_policytab); pent;
pent = TAILQ_NEXT(pent, ape_entry)) {
matchlen = 0;
pol = &pent->ape_policy;
mp = (u_char *)&pol->addrmask.sin6_addr;
ep = mp + 16; /* XXX: scope field? */
k = (u_char *)&key->sin6_addr;
p = (u_char *)&pol->addr.sin6_addr;
for (; mp < ep && *mp; mp++, k++, p++) {
m = *mp;
if ((*k & m) != *p)
goto next; /* not match */
if (m == 0xff) /* short cut for a typical case */
matchlen += 8;
else {
while (m >= 0x80) {
matchlen++;
m <<= 1;
}
}
}
/* matched. check if this is better than the current best. */
if (bestpol == NULL ||
matchlen > bestmatchlen) {
bestpol = pol;
bestmatchlen = matchlen;
}
next:
continue;
}
return (bestpol);
}
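/*
 * Worked example (illustrative, not from the original source): for a
 * policy whose mask bytes begin 0xff, 0xff, 0xff, 0x80 (prefix length
 * 25), the three 0xff bytes take the short cut and contribute 8 bits
 * each, and the 0x80 byte is scanned bit by bit and contributes 1,
 * giving matchlen = 3*8 + 1 = 25.  The entry with the longest
 * matching prefix is therefore preferred.
 */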
/* $NetBSD: kern_tc.c,v 1.77 2024/05/11 06:34:45 andvar Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* ----------------------------------------------------------------------------
* "THE BEER-WARE LICENSE" (Revision 42):
* <phk@FreeBSD.ORG> wrote this file. As long as you retain this notice you
* can do whatever you want with this stuff. If we meet some day, and you think
* this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
* ---------------------------------------------------------------------------
*/
/*
* https://papers.freebsd.org/2002/phk-timecounters.files/timecounter.pdf
*/
#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.166 2005/09/19 22:16:31 andre Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_tc.c,v 1.77 2024/05/11 06:34:45 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/evcnt.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/reboot.h> /* XXX just to get AB_VERBOSE */
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/timepps.h>
#include <sys/timetc.h>
#include <sys/timex.h>
#include <sys/xcall.h>
/*
* A large step happens on boot. This constant detects such steps.
* It is relatively small so that ntp_update_second gets called enough
* in the typical 'missed a couple of seconds' case, but doesn't loop
* forever when the time step is large.
*/
#define LARGE_STEP 200
/*
* Implement a dummy timecounter which we can use until we get a real one
* in the air. This allows the console and other early stuff to use
* time services.
*/
static u_int
dummy_get_timecount(struct timecounter *tc)
{
static u_int now;
return ++now;
}
static struct timecounter dummy_timecounter = {
.tc_get_timecount = dummy_get_timecount,
.tc_counter_mask = ~0u,
.tc_frequency = 1000000,
.tc_name = "dummy",
.tc_quality = -1000000,
.tc_priv = NULL,
};
struct timehands {
/* These fields must be initialized by the driver. */
struct timecounter *th_counter; /* active timecounter */
int64_t th_adjustment; /* frequency adjustment */
/* (NTP/adjtime) */
uint64_t th_scale; /* scale factor (counter */
/* tick->time) */
uint64_t th_offset_count; /* offset at last time */
/* update (tc_windup()) */
struct bintime th_offset; /* bin (up)time at windup */
struct timeval th_microtime; /* cached microtime */
struct timespec th_nanotime; /* cached nanotime */
/* Fields not to be copied in tc_windup start with th_generation. */
volatile u_int th_generation; /* current generation */
struct timehands *th_next; /* next timehand */
};
static struct timehands th0;
static struct timehands th9 = { .th_next = &th0, };
static struct timehands th8 = { .th_next = &th9, };
static struct timehands th7 = { .th_next = &th8, };
static struct timehands th6 = { .th_next = &th7, };
static struct timehands th5 = { .th_next = &th6, };
static struct timehands th4 = { .th_next = &th5, };
static struct timehands th3 = { .th_next = &th4, };
static struct timehands th2 = { .th_next = &th3, };
static struct timehands th1 = { .th_next = &th2, };
static struct timehands th0 = {
.th_counter = &dummy_timecounter,
.th_scale = (uint64_t)-1 / 1000000,
.th_offset = { .sec = 1, .frac = 0 },
.th_generation = 1,
.th_next = &th1,
};
static struct timehands *volatile timehands = &th0;
struct timecounter *timecounter = &dummy_timecounter;
static struct timecounter *timecounters = &dummy_timecounter;
/* used by savecore(8) */
time_t time_second_legacy asm("time_second");
#ifdef __HAVE_ATOMIC64_LOADSTORE
volatile time_t time__second __cacheline_aligned = 1;
volatile time_t time__uptime __cacheline_aligned = 1;
#else
static volatile struct {
uint32_t lo, hi;
} time__uptime32 __cacheline_aligned = {
.lo = 1,
}, time__second32 __cacheline_aligned = {
.lo = 1,
};
#endif
static struct {
struct bintime bin;
volatile unsigned gen; /* even when stable, odd when changing */
} timebase __cacheline_aligned;
static int timestepwarnings;
kmutex_t timecounter_lock;
static u_int timecounter_mods;
static volatile int timecounter_removals = 1;
static u_int timecounter_bad;
#ifdef __HAVE_ATOMIC64_LOADSTORE
static inline void
setrealuptime(time_t second, time_t uptime)
{
time_second_legacy = second;
atomic_store_relaxed(&time__second, second);
atomic_store_relaxed(&time__uptime, uptime);
}
#else
static inline void
setrealuptime(time_t second, time_t uptime)
{
uint32_t seclo = second & 0xffffffff, sechi = second >> 32;
uint32_t uplo = uptime & 0xffffffff, uphi = uptime >> 32;
KDASSERT(mutex_owned(&timecounter_lock));
time_second_legacy = second;
/*
* Fast path -- no wraparound, just updating the low bits, so
* no need for seqlocked access.
*/
if (__predict_true(sechi == time__second32.hi) &&
__predict_true(uphi == time__uptime32.hi)) {
atomic_store_relaxed(&time__second32.lo, seclo);
atomic_store_relaxed(&time__uptime32.lo, uplo);
return;
}
atomic_store_relaxed(&time__second32.hi, 0xffffffff);
atomic_store_relaxed(&time__uptime32.hi, 0xffffffff);
membar_producer();
atomic_store_relaxed(&time__second32.lo, seclo);
atomic_store_relaxed(&time__uptime32.lo, uplo);
membar_producer();
atomic_store_relaxed(&time__second32.hi, sechi);
atomic_store_relaxed(&time__uptime32.hi, uphi);
}
time_t
getrealtime(void)
{
uint32_t lo, hi;
do {
for (;;) {
hi = atomic_load_relaxed(&time__second32.hi);
if (__predict_true(hi != 0xffffffff))
break;
SPINLOCK_BACKOFF_HOOK;
}
membar_consumer();
lo = atomic_load_relaxed(&time__second32.lo);
membar_consumer();
} while (hi != atomic_load_relaxed(&time__second32.hi));
return ((time_t)hi << 32) | lo;
}
time_t
getuptime(void)
{
uint32_t lo, hi;
do {
for (;;) {
hi = atomic_load_relaxed(&time__uptime32.hi);
if (__predict_true(hi != 0xffffffff))
break;
SPINLOCK_BACKOFF_HOOK;
}
membar_consumer();
lo = atomic_load_relaxed(&time__uptime32.lo);
membar_consumer();
} while (hi != atomic_load_relaxed(&time__uptime32.hi));
return ((time_t)hi << 32) | lo;
}
time_t
getboottime(void)
{
return getrealtime() - getuptime();
}
uint32_t
getuptime32(void)
{
return atomic_load_relaxed(&time__uptime32.lo);
}
#endif /* !defined(__HAVE_ATOMIC64_LOADSTORE) */
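/*
 * Illustrative sketch of the 32-bit protocol used above (only built
 * when 64-bit atomic load/store is unavailable).  The writer
 * publishes a 64-bit value in two halves and uses an all-ones high
 * word as a "change in progress" marker:
 *
 *	writer				reader
 *	hi = 0xffffffff			spin while hi == 0xffffffff
 *	lo = new low word		lo = low word
 *	hi = new high word		retry if hi changed meanwhile
 *
 * so a reader can never return a torn value: it either sees the
 * sentinel and spins, or it observes the same high word before and
 * after reading the low word.
 */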
/*
* sysctl helper routine for kern.timercounter.hardware
*/
static int
sysctl_kern_timecounter_hardware(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int error;
char newname[MAX_TCNAMELEN];
struct timecounter *newtc, *tc;
tc = timecounter;
strlcpy(newname, tc->tc_name, sizeof(newname));
node = *rnode;
node.sysctl_data = newname;
node.sysctl_size = sizeof(newname);
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error ||
newp == NULL ||
strncmp(newname, tc->tc_name, sizeof(newname)) == 0)
return error;
if (l != NULL && (error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS, newname,
NULL, NULL)) != 0)
return error;
if (!cold)
mutex_spin_enter(&timecounter_lock);
error = EINVAL;
for (newtc = timecounters; newtc != NULL; newtc = newtc->tc_next) {
if (strcmp(newname, newtc->tc_name) != 0)
continue;
/* Warm up new timecounter. */
(void)newtc->tc_get_timecount(newtc);
(void)newtc->tc_get_timecount(newtc);
timecounter = newtc;
error = 0;
break;
}
if (!cold)
mutex_spin_exit(&timecounter_lock);
return error;
}
static int
sysctl_kern_timecounter_choice(SYSCTLFN_ARGS)
{
char buf[MAX_TCNAMELEN+48];
char *where;
const char *spc;
struct timecounter *tc;
size_t needed, left, slen;
int error, mods;
if (newp != NULL)
return EPERM;
if (namelen != 0)
return EINVAL;
mutex_spin_enter(&timecounter_lock);
retry:
spc = "";
error = 0;
needed = 0;
left = *oldlenp;
where = oldp;
for (tc = timecounters; error == 0 && tc != NULL; tc = tc->tc_next) {
if (where == NULL) {
needed += sizeof(buf); /* be conservative */
} else {
slen = snprintf(buf, sizeof(buf), "%s%s(q=%d, f=%" PRId64
" Hz)", spc, tc->tc_name, tc->tc_quality,
tc->tc_frequency);
if (left < slen + 1)
break;
mods = timecounter_mods;
mutex_spin_exit(&timecounter_lock);
error = copyout(buf, where, slen + 1);
mutex_spin_enter(&timecounter_lock);
if (mods != timecounter_mods) {
goto retry;
}
spc = " ";
where += slen;
needed += slen;
left -= slen;
}
}
mutex_spin_exit(&timecounter_lock);
*oldlenp = needed;
return error;
}
SYSCTL_SETUP(sysctl_timecounter_setup, "sysctl timecounter setup")
{
const struct sysctlnode *node;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "timecounter",
SYSCTL_DESCR("time counter information"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node != NULL) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "choice",
SYSCTL_DESCR("available counters"),
sysctl_kern_timecounter_choice, 0, NULL, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRING, "hardware",
SYSCTL_DESCR("currently active time counter"),
sysctl_kern_timecounter_hardware, 0, NULL, MAX_TCNAMELEN,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "timestepwarnings",
SYSCTL_DESCR("log time steps"),
NULL, 0, &timestepwarnings, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
}
}
#ifdef TC_COUNTERS
#define TC_STATS(name) \
static struct evcnt n##name = \
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "timecounter", #name); \
EVCNT_ATTACH_STATIC(n##name)
TC_STATS(binuptime); TC_STATS(nanouptime); TC_STATS(microuptime);
TC_STATS(bintime); TC_STATS(nanotime); TC_STATS(microtime);
TC_STATS(getbinuptime); TC_STATS(getnanouptime); TC_STATS(getmicrouptime);
TC_STATS(getbintime); TC_STATS(getnanotime); TC_STATS(getmicrotime);
TC_STATS(setclock);
#define TC_COUNT(var) var.ev_count++
#undef TC_STATS
#else
#define TC_COUNT(var) /* nothing */
#endif /* TC_COUNTERS */
static void tc_windup(void);
/*
* Return the difference between the timehands' counter value now and what
* was when we copied it to the timehands' offset_count.
*/
static inline u_int
tc_delta(struct timehands *th)
{
struct timecounter *tc;
tc = th->th_counter;
return (tc->tc_get_timecount(tc) -
th->th_offset_count) & tc->tc_counter_mask;
}
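/*
 * Worked example (illustrative): with tc_counter_mask = 0xffffffff,
 * th_offset_count = 0xfffffff0 and a current hardware reading of
 * 0x00000010, the subtraction wraps and the mask yields
 *
 *	(0x00000010 - 0xfffffff0) & 0xffffffff = 0x20
 *
 * i.e. 32 ticks elapsed.  Counter wrap is thus handled transparently,
 * provided tc_windup() runs at least once per counter period.
 */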
/*
* Functions for reading the time. We have to loop until we are sure that
* the timehands that we operated on was not updated under our feet. See
* the comment in <sys/timevar.h> for a description of these 12 functions.
*/
void
binuptime(struct bintime *bt)
{
struct timehands *th;
lwp_t *l;
u_int lgen, gen;
TC_COUNT(nbinuptime);
/*
* Provide exclusion against tc_detach().
*
* We record the number of timecounter removals before accessing
* timecounter state. Note that the LWP can be using multiple
* "generations" at once, due to interrupts (interrupted while in
* this function). Hardware interrupts will borrow the interrupted
* LWP's l_tcgen value for this purpose, and can themselves be
* interrupted by higher priority interrupts. In this case we need
* to ensure that the oldest generation in use is recorded.
*
* splsched() is too expensive to use, so we take care to structure
* this code in such a way that it is not required. Likewise, we
* do not disable preemption.
*
* Memory barriers are also too expensive to use for such a
* performance critical function. The good news is that we do not
* need memory barriers for this type of exclusion, as the thread
* updating timecounter_removals will issue a broadcast cross call
* before inspecting our l_tcgen value (this elides memory ordering
* issues).
*
* XXX If the author of the above comment knows how to make it
* safe to avoid memory barriers around the access to
* th->th_generation, I'm all ears.
*/
l = curlwp;
lgen = l->l_tcgen;
if (__predict_true(lgen == 0)) {
l->l_tcgen = timecounter_removals;
}
__insn_barrier();
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
bintime_addx(bt, th->th_scale * tc_delta(th));
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
__insn_barrier();
l->l_tcgen = lgen;
}
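/*
 * In outline, the lock-free read pattern used above (and by the
 * other readers below) is:
 *
 *	do {
 *		th = atomic_load_consume(&timehands);
 *		gen = th->th_generation;
 *		membar_consumer();
 *		... copy the fields of interest ...
 *		membar_consumer();
 *	} while (gen == 0 || gen != th->th_generation);
 *
 * gen == 0 means tc_windup() is currently rewriting this timehands;
 * a changed generation means it was rewritten while we copied, so
 * the copy is discarded and the read retried.
 */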
void
nanouptime(struct timespec *tsp)
{
struct bintime bt;
TC_COUNT(nnanouptime);
binuptime(&bt);
bintime2timespec(&bt, tsp);
}
void
microuptime(struct timeval *tvp)
{
struct bintime bt;
TC_COUNT(nmicrouptime);
binuptime(&bt);
bintime2timeval(&bt, tvp);
}
void
bintime(struct bintime *bt)
{
struct bintime boottime;
TC_COUNT(nbintime);
binuptime(bt);
getbinboottime(&boottime);
bintime_add(bt, &boottime);
}
void
nanotime(struct timespec *tsp)
{
struct bintime bt;
TC_COUNT(nnanotime);
bintime(&bt);
bintime2timespec(&bt, tsp);
}
void
microtime(struct timeval *tvp)
{
struct bintime bt;
TC_COUNT(nmicrotime);
bintime(&bt);
bintime2timeval(&bt, tvp);
}
void
getbinuptime(struct bintime *bt)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetbinuptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanouptime(struct timespec *tsp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetnanouptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
bintime2timespec(&th->th_offset, tsp);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getmicrouptime(struct timeval *tvp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetmicrouptime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
bintime2timeval(&th->th_offset, tvp);
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getbintime(struct bintime *bt)
{
struct timehands *th;
struct bintime boottime;
u_int gen;
TC_COUNT(ngetbintime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*bt = th->th_offset;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
getbinboottime(&boottime);
bintime_add(bt, &boottime);
}
static inline void
dogetnanotime(struct timespec *tsp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetnanotime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*tsp = th->th_nanotime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanotime(struct timespec *tsp)
{
dogetnanotime(tsp);
}
void dtrace_getnanotime(struct timespec *tsp);
void
dtrace_getnanotime(struct timespec *tsp)
{
dogetnanotime(tsp);
}
void
getmicrotime(struct timeval *tvp)
{
struct timehands *th;
u_int gen;
TC_COUNT(ngetmicrotime);
do {
th = atomic_load_consume(&timehands);
gen = th->th_generation;
membar_consumer();
*tvp = th->th_microtime;
membar_consumer();
} while (gen == 0 || gen != th->th_generation);
}
void
getnanoboottime(struct timespec *tsp)
{
struct bintime bt;
getbinboottime(&bt);
bintime2timespec(&bt, tsp);
}
void
getmicroboottime(struct timeval *tvp)
{
struct bintime bt;
getbinboottime(&bt);
bintime2timeval(&bt, tvp);
}
void
getbinboottime(struct bintime *basep)
{
struct bintime base;
unsigned gen;
do {
/* Spin until the timebase isn't changing. */
while ((gen = atomic_load_relaxed(&timebase.gen)) & 1)
SPINLOCK_BACKOFF_HOOK;
/* Read out a snapshot of the timebase. */
membar_consumer();
base = timebase.bin;
membar_consumer();
/* Restart if it changed while we were reading. */
} while (gen != atomic_load_relaxed(&timebase.gen));
*basep = base;
}
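/*
 * The writers (tc_setclock() and tc_windup() below) update timebase
 * under timecounter_lock with the matching protocol:
 *
 *	timebase.gen |= 1;		odd: change in progress
 *	membar_producer();
 *	timebase.bin = ...;
 *	membar_producer();
 *	timebase.gen++;			even again: committed
 *
 * so a reader either spins on an odd generation or notices, by
 * re-reading the generation after its copy, that the value changed
 * underneath it and retries.
 */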
/*
* Initialize a new timecounter and possibly use it.
*/
void
tc_init(struct timecounter *tc)
{
u_int u;
KASSERTMSG(tc->tc_next == NULL, "timecounter %s already initialised",
tc->tc_name);
u = tc->tc_frequency / tc->tc_counter_mask;
/* XXX: We need some margin here, 10% is a guess */
u *= 11;
u /= 10;
if (u > hz && tc->tc_quality >= 0) {
tc->tc_quality = -2000;
aprint_verbose(
"timecounter: Timecounter \"%s\" frequency %ju Hz",
tc->tc_name, (uintmax_t)tc->tc_frequency);
aprint_verbose(" -- Insufficient hz, needs at least %u\n", u);
} else if (tc->tc_quality >= 0 || bootverbose) {
aprint_verbose(
"timecounter: Timecounter \"%s\" frequency %ju Hz "
"quality %d\n", tc->tc_name, (uintmax_t)tc->tc_frequency,
tc->tc_quality);
}
mutex_spin_enter(&timecounter_lock);
tc->tc_next = timecounters;
timecounters = tc;
timecounter_mods++;
/*
* Never automatically use a timecounter with negative quality.
* Even though we run on the dummy counter, switching here may be
* worse since this timecounter may not be monotonic.
*/
if (tc->tc_quality >= 0 && (tc->tc_quality > timecounter->tc_quality ||
(tc->tc_quality == timecounter->tc_quality &&
tc->tc_frequency > timecounter->tc_frequency))) {
(void)tc->tc_get_timecount(tc);
(void)tc->tc_get_timecount(tc);
timecounter = tc;
tc_windup();
}
mutex_spin_exit(&timecounter_lock);
}
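/*
 * Worked example (illustrative): a 16-bit counter running at 1 MHz
 * gives tc_frequency / tc_counter_mask = 1000000 / 65535 = 15 wraps
 * per second, or u = 15 * 11 / 10 = 16 with the 10% margin.  With
 * hz = 100 the check passes (16 <= hz); the counter would only be
 * demoted if it wrapped faster than the windup rate can track.
 */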
/*
* Pick a new timecounter due to the existing counter going bad.
*/
static void
tc_pick(void)
{
struct timecounter *best, *tc;
KASSERT(mutex_owned(&timecounter_lock));
for (best = tc = timecounters; tc != NULL; tc = tc->tc_next) {
if (tc->tc_quality > best->tc_quality)
best = tc;
else if (tc->tc_quality < best->tc_quality)
continue;
else if (tc->tc_frequency > best->tc_frequency)
best = tc;
}
(void)best->tc_get_timecount(best);
(void)best->tc_get_timecount(best);
timecounter = best;
}
/*
* A timecounter has gone bad, arrange to pick a new one at the next
* clock tick.
*/
void
tc_gonebad(struct timecounter *tc)
{
tc->tc_quality = -100;
membar_producer();
atomic_inc_uint(&timecounter_bad);
}
/*
* Stop using a timecounter and remove it from the timecounters list.
*/
int
tc_detach(struct timecounter *target)
{
struct timecounter *tc;
struct timecounter **tcp = NULL;
int removals;
lwp_t *l;
/* First, find the timecounter. */
mutex_spin_enter(&timecounter_lock);
for (tcp = &timecounters, tc = timecounters;
tc != NULL;
tcp = &tc->tc_next, tc = tc->tc_next) {
if (tc == target)
break;
}
if (tc == NULL) {
mutex_spin_exit(&timecounter_lock);
return ESRCH;
}
/* And now, remove it. */
*tcp = tc->tc_next;
if (timecounter == target) {
tc_pick();
tc_windup();
}
timecounter_mods++;
removals = timecounter_removals++;
mutex_spin_exit(&timecounter_lock);
/*
* We now have to determine if any threads in the system are still
* making use of this timecounter.
*
* We issue a broadcast cross call to elide memory ordering issues,
* then scan all LWPs in the system looking at each's timecounter
* generation number. We need to see a value of zero (not actively
* using a timecounter) or a value greater than our removal value.
*
* We may race with threads that read `timecounter_removals' and
* then get preempted before updating `l_tcgen'. This is not
* a problem, since it means that these threads have not yet started
* accessing timecounter state. All we do need is one clean
* snapshot of the system where every thread appears not to be using
* old timecounter state.
*/
for (;;) {
xc_barrier(0);
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
if (l->l_tcgen == 0 || l->l_tcgen > removals) {
/*
* Not using timecounter or old timecounter
* state at time of our xcall or later.
*/
continue;
}
break;
}
mutex_exit(&proc_lock);
/*
* If the timecounter is still in use, wait at least 10ms
* before retrying.
*/
if (l == NULL) {
break;
}
(void)kpause("tcdetach", false, mstohz(10), NULL);
}
tc->tc_next = NULL;
return 0;
}
/* Report the frequency of the current timecounter. */
uint64_t
tc_getfrequency(void)
{
return atomic_load_consume(&timehands)->th_counter->tc_frequency;
}
/*
* Step our concept of UTC. This is done by modifying our estimate of
* when we booted.
*/
void
tc_setclock(const struct timespec *ts)
{
struct timespec ts2;
struct bintime bt, bt2;
mutex_spin_enter(&timecounter_lock);
TC_COUNT(nsetclock);
binuptime(&bt2);
timespec2bintime(ts, &bt);
bintime_sub(&bt, &bt2);
bintime_add(&bt2, &timebase.bin);
timebase.gen |= 1; /* change in progress */
membar_producer();
timebase.bin = bt;
membar_producer();
timebase.gen++; /* commit change */
tc_windup();
mutex_spin_exit(&timecounter_lock);
if (timestepwarnings) {
bintime2timespec(&bt2, &ts2);
log(LOG_INFO,
"Time stepped from %lld.%09ld to %lld.%09ld\n",
(long long)ts2.tv_sec, ts2.tv_nsec,
(long long)ts->tv_sec, ts->tv_nsec);
}
}
/*
* Initialize the next struct timehands in the ring and make
* it the active timehands. Along the way we might switch to a different
* timecounter and/or do seconds processing in NTP. Slightly magic.
*/
static void
tc_windup(void)
{
struct bintime bt;
struct timehands *th, *tho;
uint64_t scale;
u_int delta, ncount, ogen;
int i, s_update;
time_t t;
KASSERT(mutex_owned(&timecounter_lock));
s_update = 0;
/*
* Make the next timehands a copy of the current one, but do not
* overwrite the generation or next pointer. While we update
* the contents, the generation must be zero. Ensure global
* visibility of the generation before proceeding.
*/
tho = timehands;
th = tho->th_next;
ogen = th->th_generation;
th->th_generation = 0;
membar_producer();
bcopy(tho, th, offsetof(struct timehands, th_generation));
/*
* Capture a timecounter delta on the current timecounter and if
* changing timecounters, a counter value from the new timecounter.
* Update the offset fields accordingly.
*/
delta = tc_delta(th);
if (th->th_counter != timecounter)
ncount = timecounter->tc_get_timecount(timecounter);
else
ncount = 0;
th->th_offset_count += delta;
bintime_addx(&th->th_offset, th->th_scale * delta);
/*
* Hardware latching timecounters may not generate interrupts on
* PPS events, so instead we poll them. There is a finite risk that
* the hardware might capture a count which is later than the one we
* got above, and therefore possibly in the next NTP second which might
* have a different rate than the current NTP second. It doesn't
* matter in practice.
*/
if (tho->th_counter->tc_poll_pps)
tho->th_counter->tc_poll_pps(tho->th_counter);
/*
* Deal with NTP second processing. The for loop normally
* iterates at most once, but in extreme situations it might
* keep NTP sane if timeouts are not run for several seconds.
* At boot, the time step can be large when the TOD hardware
* has been read, so on really large steps, we call
* ntp_update_second only twice. We need to call it twice in
* case we missed a leap second.
* If NTP is not compiled in, ntp_update_second still calculates
* the adjustment resulting from adjtime() calls.
*/
bt = th->th_offset;
bintime_add(&bt, &timebase.bin);
i = bt.sec - tho->th_microtime.tv_sec;
if (i > LARGE_STEP)
i = 2;
for (; i > 0; i--) {
t = bt.sec;
ntp_update_second(&th->th_adjustment, &bt.sec);
s_update = 1;
if (bt.sec != t) {
timebase.gen |= 1; /* change in progress */
membar_producer();
timebase.bin.sec += bt.sec - t;
membar_producer();
timebase.gen++; /* commit change */
}
}
/* Update the UTC timestamps used by the get*() functions. */
/* XXX shouldn't do this here. Should force non-`get' versions. */
bintime2timeval(&bt, &th->th_microtime);
bintime2timespec(&bt, &th->th_nanotime);
/* Now is a good time to change timecounters. */
if (th->th_counter != timecounter) {
th->th_counter = timecounter;
th->th_offset_count = ncount;
s_update = 1;
}
/*-
* Recalculate the scaling factor. We want the number of 1/2^64
* fractions of a second per period of the hardware counter, taking
* into account the th_adjustment factor which the NTP PLL/adjtime(2)
* processing provides us with.
*
* The th_adjustment is nanoseconds per second with 32 bit binary
* fraction and we want 64 bit binary fraction of second:
*
* x = a * 2^32 / 10^9 = a * 4.294967296
*
* The range of th_adjustment is +/- 5000PPM so inside a 64bit int
* we can only multiply by about 850 without overflowing, but that
* leaves suitably precise fractions for multiply before divide.
*
* Divide before multiply with a fraction of 2199/512 results in a
* systematic undercompensation of 10PPM of th_adjustment. On a
* 5000PPM adjustment this is a 0.05PPM error. This is acceptable.
*
* We happily sacrifice the lowest of the 64 bits of our result
* to the goddess of code clarity.
*
*/
if (s_update) {
scale = (uint64_t)1 << 63;
scale += (th->th_adjustment / 1024) * 2199;
scale /= th->th_counter->tc_frequency;
th->th_scale = scale * 2;
}
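/*
 * Worked example (illustrative): with no pending adjustment
 * (th_adjustment == 0) and a 1 MHz counter, the code above computes
 *
 *	scale = ((uint64_t)1 << 63) / 1000000 * 2 ~= 2^64 / 10^6
 *
 * so each counter tick advances the 64-bit binary fraction by one
 * microsecond's worth, as expected.
 */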
/*
* Now that the struct timehands is again consistent, set the new
* generation number, making sure to not make it zero. Ensure
* changes are globally visible before changing.
*/
if (++ogen == 0)
ogen = 1;
membar_producer();
th->th_generation = ogen;
/*
* Go live with the new struct timehands. Ensure changes are
* globally visible before changing.
*/
setrealuptime(th->th_microtime.tv_sec, th->th_offset.sec);
atomic_store_release(&timehands, th);
/*
* Force users of the old timehand to move on. This is
* necessary for MP systems; we need to ensure that the
* consumers will move away from the old timehand before
* we begin updating it again when we eventually wrap
* around.
*/
if (++tho->th_generation == 0)
tho->th_generation = 1;
}
/*
* RFC 2783 PPS-API implementation.
*/
int
pps_ioctl(u_long cmd, void *data, struct pps_state *pps)
{
pps_params_t *app;
pps_info_t *pipi;
#ifdef PPS_SYNC
int *epi;
#endif
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
switch (cmd) {
case PPS_IOC_CREATE:
return 0;
case PPS_IOC_DESTROY:
return 0;
case PPS_IOC_SETPARAMS:
app = (pps_params_t *)data;
if (app->mode & ~pps->ppscap)
return EINVAL;
pps->ppsparam = *app;
return 0;
case PPS_IOC_GETPARAMS:
app = (pps_params_t *)data;
*app = pps->ppsparam;
app->api_version = PPS_API_VERS_1;
return 0;
case PPS_IOC_GETCAP:
*(int*)data = pps->ppscap;
return 0;
case PPS_IOC_FETCH:
pipi = (pps_info_t *)data;
pps->ppsinfo.current_mode = pps->ppsparam.mode;
*pipi = pps->ppsinfo;
return 0;
case PPS_IOC_KCBIND:
#ifdef PPS_SYNC
epi = (int *)data;
/* XXX Only root should be able to do this */
if (*epi & ~pps->ppscap)
return EINVAL;
pps->kcmode = *epi;
return 0;
#else
return EOPNOTSUPP;
#endif
default:
return EPASSTHROUGH;
}
}
void
pps_init(struct pps_state *pps)
{
KASSERT(mutex_owned(&timecounter_lock));
pps->ppscap |= PPS_TSFMT_TSPEC;
if (pps->ppscap & PPS_CAPTUREASSERT)
pps->ppscap |= PPS_OFFSETASSERT;
if (pps->ppscap & PPS_CAPTURECLEAR)
pps->ppscap |= PPS_OFFSETCLEAR;
}
/*
* capture a timestamp in the pps structure
*/
void
pps_capture(struct pps_state *pps)
{
struct timehands *th;
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
th = timehands;
pps->capgen = th->th_generation;
pps->capth = th;
pps->capcount = (uint64_t)tc_delta(th) + th->th_offset_count;
if (pps->capgen != th->th_generation)
pps->capgen = 0;
}
#ifdef PPS_DEBUG
int ppsdebug = 0;
#endif
/*
* process a pps_capture()ed event
*/
void
pps_event(struct pps_state *pps, int event)
{
pps_ref_event(pps, event, NULL, PPS_REFEVNT_PPS|PPS_REFEVNT_CAPTURE);
}
/*
* extended pps api / kernel pll/fll entry point
*
* feed reference time stamps to PPS engine
*
* will simulate a PPS event and feed
* the NTP PLL/FLL if requested.
*
* the reference time stamps should arrive roughly once
* a second; they do not need to be exactly in phase
* with the UTC second, but should be close to it.
* this relaxation of requirements allows callout
* driven timestamping mechanisms to feed the pps
* capture/kernel pll logic.
*
* calling pattern is:
* pps_capture() (for PPS_REFEVNT_{CAPTURE|CAPCUR})
* read timestamp from reference source
* pps_ref_event()
*
* supported refmodes:
* PPS_REFEVNT_CAPTURE
* use system timestamp of pps_capture()
* PPS_REFEVNT_CURRENT
* use system timestamp of this call
* PPS_REFEVNT_CAPCUR
* use average of read capture and current system time stamp
* PPS_REFEVNT_PPS
* assume timestamp on second mark - ref_ts is ignored
*
*/
void
pps_ref_event(struct pps_state *pps,
int event,
struct bintime *ref_ts,
int refmode
)
{
struct bintime bt; /* current time */
struct bintime btd; /* time difference */
struct bintime bt_ref; /* reference time */
struct timespec ts, *tsp, *osp;
struct timehands *th;
uint64_t tcount, acount, dcount, *pcount;
int foff, gen;
#ifdef PPS_SYNC
int fhard;
#endif
pps_seq_t *pseq;
KASSERT(mutex_owned(&timecounter_lock));
KASSERT(pps != NULL);
/* pick up current time stamp if needed */
if (refmode & (PPS_REFEVNT_CURRENT|PPS_REFEVNT_CAPCUR)) {
/* pick up current time stamp */
th = timehands;
gen = th->th_generation;
tcount = (uint64_t)tc_delta(th) + th->th_offset_count;
if (gen != th->th_generation)
gen = 0;
/* If the timecounter was wound up underneath us, bail out. */
if (pps->capgen == 0 ||
pps->capgen != pps->capth->th_generation ||
gen == 0 ||
gen != pps->capgen) {
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ...): DROP (wind-up)\n",
pps, event);
}
#endif
return;
}
} else {
tcount = 0; /* keep GCC happy */
}
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
struct timespec tmsp;
if (ref_ts == NULL) {
tmsp.tv_sec = 0;
tmsp.tv_nsec = 0;
} else {
bintime2timespec(ref_ts, &tmsp);
}
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ref_ts=%"PRIi64
".%09"PRIi32", refmode=0x%1x)\n",
pps, event, tmsp.tv_sec, (int32_t)tmsp.tv_nsec, refmode);
}
#endif
/* setup correct event references */
if (event == PPS_CAPTUREASSERT) {
tsp = &pps->ppsinfo.assert_timestamp;
osp = &pps->ppsparam.assert_offset;
foff = pps->ppsparam.mode & PPS_OFFSETASSERT;
#ifdef PPS_SYNC
fhard = pps->kcmode & PPS_CAPTUREASSERT;
#endif
pcount = &pps->ppscount[0];
pseq = &pps->ppsinfo.assert_sequence;
} else {
tsp = &pps->ppsinfo.clear_timestamp;
osp = &pps->ppsparam.clear_offset;
foff = pps->ppsparam.mode & PPS_OFFSETCLEAR;
#ifdef PPS_SYNC
fhard = pps->kcmode & PPS_CAPTURECLEAR;
#endif
pcount = &pps->ppscount[1];
pseq = &pps->ppsinfo.clear_sequence;
}
/* determine system time stamp according to refmode */
dcount = 0; /* keep GCC happy */
switch (refmode & PPS_REFEVNT_RMASK) {
case PPS_REFEVNT_CAPTURE:
acount = pps->capcount; /* use capture timestamp */
break;
case PPS_REFEVNT_CURRENT:
acount = tcount; /* use current timestamp */
break;
case PPS_REFEVNT_CAPCUR:
/*
* calculate counter value between pps_capture() and
* pps_ref_event()
*/
dcount = tcount - pps->capcount;
acount = (dcount / 2) + pps->capcount;
break;
default: /* ignore call error silently */
return;
}
/*
* If the timecounter changed, we cannot compare the count values, so
* we have to drop the rest of the PPS-stuff until the next event.
*/
if (pps->ppstc != pps->capth->th_counter) {
pps->ppstc = pps->capth->th_counter;
pps->capcount = acount;
*pcount = acount;
pps->ppscount[2] = acount;
#ifdef PPS_DEBUG
if (ppsdebug & 0x1) {
log(LOG_DEBUG,
"pps_ref_event(pps=%p, event=%d, ...): DROP (time-counter change)\n",
pps, event);
}
#endif
return;
}
pps->capcount = acount;
/* Convert the count to a bintime. */
bt = pps->capth->th_offset;
bintime_addx(&bt, pps->capth->th_scale * (acount - pps->capth->th_offset_count));
bintime_add(&bt, &timebase.bin);
if ((refmode & PPS_REFEVNT_PPS) == 0) {
/* determine difference to reference time stamp */
bt_ref = *ref_ts;
btd = bt;
bintime_sub(&btd, &bt_ref);
/*
* simulate a PPS timestamp by dropping the fraction
* and applying the offset
*/
if (bt.frac >= (uint64_t)1<<63) /* skip to nearest second */
bt.sec++;
bt.frac = 0;
bintime_add(&bt, &btd);
} else {
/*
* create ref_ts from current time -
* we are supposed to be called on
* the second mark
*/
bt_ref = bt;
if (bt_ref.frac >= (uint64_t)1<<63) /* skip to nearest second */
bt_ref.sec++;
bt_ref.frac = 0;
}
/* convert bintime to timestamp */
bintime2timespec(&bt, &ts);
/* If the timecounter was wound up underneath us, bail out. */
if (pps->capgen != pps->capth->th_generation)
return;
/* store time stamp */
*pcount = pps->capcount;
(*pseq)++;
*tsp = ts;
/* add offset correction */
if (foff) {
timespecadd(tsp, osp, tsp);
if (tsp->tv_nsec < 0) {
tsp->tv_nsec += 1000000000;
tsp->tv_sec -= 1;
}
}
#ifdef PPS_DEBUG
if (ppsdebug & 0x2) {
struct timespec ts2;
struct timespec ts3;
bintime2timespec(&bt_ref, &ts2);
bt.sec = 0;
bt.frac = 0;
if (refmode & PPS_REFEVNT_CAPCUR) {
bintime_addx(&bt, pps->capth->th_scale * dcount);
}
bintime2timespec(&bt, &ts3);
log(LOG_DEBUG, "ref_ts=%"PRIi64".%09"PRIi32
", ts=%"PRIi64".%09"PRIi32", read latency=%"PRIi64" ns\n",
ts2.tv_sec, (int32_t)ts2.tv_nsec,
tsp->tv_sec, (int32_t)tsp->tv_nsec,
timespec2ns(&ts3));
}
#endif
#ifdef PPS_SYNC
if (fhard) {
uint64_t scale;
uint64_t div;
/*
* Feed the NTP PLL/FLL.
* The FLL wants to know how many (hardware) nanoseconds
* elapsed since the previous event (mod 1 second) thus
* we are actually looking at the frequency difference scaled
* in nsec.
* As the counter time stamps are not truly at 1Hz
* we need to scale the count by the elapsed
* reference time.
* valid sampling interval: [0.5..2[ sec
*/
/* calculate elapsed raw count */
tcount = pps->capcount - pps->ppscount[2];
pps->ppscount[2] = pps->capcount;
tcount &= pps->capth->th_counter->tc_counter_mask;
/* calculate elapsed ref time */
btd = bt_ref;
bintime_sub(&btd, &pps->ref_time);
pps->ref_time = bt_ref;
/* check that we stay below 2 sec */
if (btd.sec < 0 || btd.sec > 1)
return;
/* we want at least 0.5 sec between samples */
if (btd.sec == 0 && btd.frac < (uint64_t)1<<63)
return;
/*
* calculate cycles per period by multiplying
* the frequency with the elapsed period
* we pick a fraction of 30 bits
* ~1ns resolution for elapsed time
*/
div = (uint64_t)btd.sec << 30;
div |= (btd.frac >> 34) & (((uint64_t)1 << 30) - 1);
div *= pps->capth->th_counter->tc_frequency;
div >>= 30;
if (div == 0) /* safeguard */
return;
scale = (uint64_t)1 << 63;
scale /= div;
scale *= 2;
bt.sec = 0;
bt.frac = 0;
bintime_addx(&bt, scale * tcount);
bintime2timespec(&bt, &ts);
#ifdef PPS_DEBUG
if (ppsdebug & 0x4) {
struct timespec ts2;
int64_t df;
bintime2timespec(&bt_ref, &ts2);
df = timespec2ns(&ts);
if (df > 500000000)
df -= 1000000000;
log(LOG_DEBUG, "hardpps: ref_ts=%"PRIi64
".%09"PRIi32", ts=%"PRIi64".%09"PRIi32
", freqdiff=%"PRIi64" ns/s\n",
ts2.tv_sec, (int32_t)ts2.tv_nsec,
tsp->tv_sec, (int32_t)tsp->tv_nsec,
df);
}
#endif
hardpps(tsp, timespec2ns(&ts));
}
#endif
}
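/*
 * Worked example for the PPS_SYNC path above (illustrative): with a
 * reference interval of exactly one second (btd.sec == 1 and
 * btd.frac == 0) the cycles-per-period estimate reduces to
 *
 *	div = (((uint64_t)1 << 30) * tc_frequency) >> 30 = tc_frequency
 *
 * and scale becomes ~2^64 / tc_frequency, so the elapsed raw count
 * (about one counter period) converts back to roughly one second of
 * bintime before being handed to hardpps().
 */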
/*
* Timecounters need to be updated every so often to prevent the hardware
* counter from overflowing. Updating also recalculates the cached values
* used by the get*() family of functions, so their precision depends on
* the update frequency.
*/
static int tc_tick;
void
tc_ticktock(void)
{
static int count;
if (++count < tc_tick)
return;
count = 0;
mutex_spin_enter(&timecounter_lock);
if (__predict_false(timecounter_bad != 0)) {
/* An existing timecounter has gone bad, pick a new one. */
(void)atomic_swap_uint(&timecounter_bad, 0);
if (timecounter->tc_quality < 0) {
tc_pick();
}
}
tc_windup();
mutex_spin_exit(&timecounter_lock);
}
void
inittimecounter(void)
{
u_int p;
mutex_init(&timecounter_lock, MUTEX_DEFAULT, IPL_HIGH);
/*
* Set the initial timeout to
* max(1, <approx. number of hardclock ticks in a millisecond>).
* People should probably not use the sysctl to set the timeout
* to smaller than its initial value, since that value is the
* smallest reasonable one. If they want better timestamps they
* should use the non-"get"* functions.
*/
if (hz > 1000)
tc_tick = (hz + 500) / 1000;
else
tc_tick = 1;
p = (tc_tick * 1000000) / hz;
aprint_verbose("timecounter: Timecounters tick every %d.%03u msec\n",
p / 1000, p % 1000);
/* warm up new timecounter (again) and get rolling. */
(void)timecounter->tc_get_timecount(timecounter);
(void)timecounter->tc_get_timecount(timecounter);
}
/* $NetBSD: kern_50.c,v 1.3 2020/01/29 15:47:51 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_50.c,v 1.3 2020/01/29 15:47:51 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/sys/resource.h>
#include <compat/sys/time.h>
#include <compat/common/compat_mod.h>
static const struct syscall_package kern_50_syscalls[] = {
{ SYS_compat_50__lwp_park, 0, (sy_call_t *)compat_50_sys__lwp_park },
{ SYS_compat_50___sigtimedwait, 0,
(sy_call_t *)compat_50_sys___sigtimedwait },
{ SYS_compat_50_wait4, 0, (sy_call_t *)compat_50_sys_wait4 },
{ 0, 0, NULL }
};
int
compat_50_sys__lwp_park(struct lwp *l,
const struct compat_50_sys__lwp_park_args *uap, register_t *retval)
{
/* {
syscallarg(const struct timespec50 *) ts;
syscallarg(lwpid_t) unpark;
syscallarg(const void *) hint;
syscallarg(const void *) unparkhint;
} */
struct timespec ts, *tsp;
struct timespec50 ts50;
int error;
if (SCARG(uap, ts) == NULL)
tsp = NULL;
else {
error = copyin(SCARG(uap, ts), &ts50, sizeof(ts50));
if (error != 0)
return error;
timespec50_to_timespec(&ts50, &ts);
tsp = &ts;
}
if (SCARG(uap, unpark) != 0) {
error = lwp_unpark(&SCARG(uap, unpark), 1);
if (error != 0)
return error;
}
return lwp_park(CLOCK_REALTIME, TIMER_ABSTIME, tsp);
}
static int
tscopyin(const void *u, void *s, size_t len)
{
struct timespec50 ts50;
int error;
KASSERT(len == sizeof(struct timespec));
error = copyin(u, &ts50, sizeof(ts50));
if (error)
return error;
timespec50_to_timespec(&ts50, s);
return 0;
}
static int
tscopyout(const void *s, void *u, size_t len)
{
struct timespec50 ts50;
KASSERT(len == sizeof(struct timespec));
timespec_to_timespec50(s, &ts50);
return copyout(&ts50, u, sizeof(ts50));
}
int
compat_50_sys___sigtimedwait(struct lwp *l,
const struct compat_50_sys___sigtimedwait_args *uap, register_t *retval)
{
int res;
res = sigtimedwait1(l,
(const struct sys_____sigtimedwait50_args *)uap, retval, copyin,
copyout, tscopyin, tscopyout);
if (!res)
*retval = 0; /* XXX NetBSD<=5 was not POSIX compliant */
return res;
}
int
compat_50_sys_wait4(struct lwp *l, const struct compat_50_sys_wait4_args *uap,
register_t *retval)
{
/* {
syscallarg(int) pid;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct rusage50 *) rusage;
} */
int status, error, pid = SCARG(uap, pid);
struct rusage50 ru50;
struct rusage ru;
error = do_sys_wait(&pid, &status, SCARG(uap, options),
SCARG(uap, rusage) != NULL ? &ru : NULL);
retval[0] = pid;
if (pid == 0)
return error;
if (SCARG(uap, rusage)) {
rusage_to_rusage50(&ru, &ru50);
error = copyout(&ru50, SCARG(uap, rusage), sizeof(ru50));
}
if (error == 0 && SCARG(uap, status))
error = copyout(&status, SCARG(uap, status), sizeof(status));
return error;
}
int
kern_50_init(void)
{
return syscall_establish(NULL, kern_50_syscalls);
}
int
kern_50_fini(void)
{
return syscall_disestablish(NULL, kern_50_syscalls);
}
/* $NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Shared support code for kernels built with the DEBUG option.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_debug.c,v 1.7 2008/04/30 20:20:53 ad Exp $");
#include "opt_ddb.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <uvm/uvm_extern.h>
#include <machine/lock.h>
/*
* Allocation/free validation by pointer address. Introduces
* significant overhead and is not enabled by default. Patch
* `debug_freecheck' to 1 at boot time to enable.
*/
#define FREECHECK_BYTES (8*1024*1024)
typedef struct fcitem {
void *i_addr;
struct fcitem *i_next;
} fcitem_t;
fcitem_t *freecheck_free;
__cpu_simple_lock_t freecheck_lock;
u_int debug_freecheck;
void
debug_init(void)
{
size_t cnt;
fcitem_t *i;
__cpu_simple_lock_init(&freecheck_lock);
if (debug_freecheck) {
i = (fcitem_t *)uvm_km_alloc(kernel_map, FREECHECK_BYTES, 0,
UVM_KMF_WIRED);
if (i == NULL) {
printf("freecheck_init: unable to allocate memory");
return;
}
for (cnt = FREECHECK_BYTES / sizeof(*i); cnt != 0; cnt--) {
i->i_next = freecheck_free;
freecheck_free = i++;
}
}
}
void
freecheck_out(void **head, void *addr)
{
fcitem_t *i;
int s;
if (!debug_freecheck)
return;
s = splvm();
__cpu_simple_lock(&freecheck_lock);
for (i = *head; i != NULL; i = i->i_next) {
if (i->i_addr != addr)
continue;
__cpu_simple_unlock(&freecheck_lock);
splx(s);
panic("freecheck_out: %p already out", addr);
}
if ((i = freecheck_free) != NULL) {
freecheck_free = i->i_next;
i->i_addr = addr;
i->i_next = *head;
*head = i;
}
__cpu_simple_unlock(&freecheck_lock);
splx(s);
if (i == NULL) {
if (atomic_swap_uint(&debug_freecheck, 1) == 0)
printf("freecheck_out: no more slots\n");
}
}
void
freecheck_in(void **head, void *addr)
{
fcitem_t *i;
void *pp;
int s;
if (!debug_freecheck)
return;
s = splvm();
__cpu_simple_lock(&freecheck_lock);
for (i = *head, pp = head; i != NULL; pp = &i->i_next, i = i->i_next) {
if (i->i_addr == addr) {
*(fcitem_t **)pp = i->i_next;
i->i_next = freecheck_free;
freecheck_free = i;
break;
}
}
__cpu_simple_unlock(&freecheck_lock);
splx(s);
if (i != NULL)
return;
#ifdef DDB
printf("freecheck_in: %p not out\n", addr);
Debugger();
#else
panic("freecheck_in: %p not out", addr);
#endif
}
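/*
 * Typical usage sketch (illustrative; the list head below is a
 * hypothetical per-subsystem variable, not part of this file):
 *
 *	static void *my_outstanding;
 *
 *	p = allocate_object();
 *	freecheck_out(&my_outstanding, p);	record as "out"
 *	...
 *	freecheck_in(&my_outstanding, p);	must match, or complain
 *	release_object(p);
 *
 * A double free trips the "not out" check in freecheck_in(), and
 * handing out the same address twice trips the "already out" check
 * in freecheck_out().
 */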
/* $NetBSD: md_root.c,v 1.19 2015/08/30 05:24:03 uebayasi Exp $ */
/*-
* Copyright (c) 1996 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Gordon W. Ross.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: md_root.c,v 1.19 2015/08/30 05:24:03 uebayasi Exp $");
#include "opt_md.h"
#include "opt_memory_disk_image.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/reboot.h>
#include <dev/md.h>
#ifdef MEMORY_DISK_DYNAMIC
#ifdef makeoptions_MEMORY_DISK_IMAGE
#error MEMORY_DISK_DYNAMIC is not compatible with MEMORY_DISK_IMAGE
#endif
size_t md_root_size;
char *md_root_image;
#else /* MEMORY_DISK_DYNAMIC */
#ifdef makeoptions_MEMORY_DISK_IMAGE
#ifdef MEMORY_DISK_ROOT_SIZE
#error MEMORY_DISK_ROOT_SIZE is not compatible with MEMORY_DISK_IMAGE
#endif
char md_root_image[] = {
#include "md_root_image.h"
};
uint32_t md_root_size = sizeof(md_root_image) & ~(DEV_BSIZE - 1);
#else /* makeoptions_MEMORY_DISK_IMAGE */
#ifndef MEMORY_DISK_ROOT_SIZE
#define MEMORY_DISK_ROOT_SIZE 512
#endif
#define ROOTBYTES (MEMORY_DISK_ROOT_SIZE << DEV_BSHIFT)
/*
* This array will be patched to contain a file-system image.
* See the program mdsetimage(8) for details.
*/
uint32_t md_root_size = ROOTBYTES;
char md_root_image[ROOTBYTES] = "|This is the root ramdisk!\n";
#endif /* makeoptions_MEMORY_DISK_IMAGE */
#endif /* MEMORY_DISK_DYNAMIC */
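/*
 * Illustrative sizing note: with the default MEMORY_DISK_ROOT_SIZE of
 * 512 DEV_BSIZE-sized blocks and the usual DEV_BSHIFT of 9 (an
 * assumption, not spelled out here), ROOTBYTES is 512 << 9 = 262144
 * bytes, i.e. a 256 KB region reserved in the kernel image for
 * mdsetimage(8) to patch.
 */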
#ifndef MEMORY_DISK_RBFLAGS
#define MEMORY_DISK_RBFLAGS RB_AUTOBOOT /* default boot mode */
#endif
#ifdef MEMORY_DISK_DYNAMIC
void
md_root_setconf(char *addr, size_t size)
{
md_is_root = 1;
md_root_image = addr;
md_root_size = size;
}
#endif /* MEMORY_DISK_DYNAMIC */
/*
* This is called during pseudo-device attachment.
*/
#define PBUFLEN sizeof("99999 KB")
void
md_attach_hook(int unit, struct md_conf *md)
{
char pbuf[PBUFLEN];
if (unit == 0 && md_is_root) {
/* Setup root ramdisk */
md->md_addr = (void *)md_root_image;
md->md_size = (size_t)md_root_size;
md->md_type = MD_KMEM_FIXED;
format_bytes(pbuf, sizeof(pbuf), md->md_size);
aprint_verbose("md%d: internal %s image area\n", unit, pbuf);
}
}
/*
* This is called during open (i.e. mountroot)
*/
void
md_open_hook(int unit, struct md_conf *md)
{
if (unit == 0 && md_is_root) {
boothowto |= MEMORY_DISK_RBFLAGS;
}
}
/* $NetBSD: ufs_dirhash.c,v 1.41 2022/08/07 02:33:47 simonb Exp $ */
/*
* Copyright (c) 2001, 2002 Ian Dowse. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: src/sys/ufs/ufs/ufs_dirhash.c,v 1.3.2.8 2004/12/08 11:54:13 dwmalone Exp $
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_dirhash.c,v 1.41 2022/08/07 02:33:47 simonb Exp $");
/*
* This implements a hash-based lookup scheme for UFS directories.
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/types.h>
#include <sys/hash.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <sys/atomic.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/dirhash.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
/*
* Defaults for dirhash cache sizes:
* - use up to 1/64th of system memory.
* - disable dirhash (set the cache size to 0 bytes) if the
* calculated cache size is less than 2MB.
* - cap maximum size of the dirhash cache at 32MB.
*/
#define DIRHASH_DEFAULT_DIVIDER 64
#define MIN_DEFAULT_DIRHASH_MEM (2 * 1024 * 1024)
#define MAX_DEFAULT_DIRHASH_MEM (32 * 1024 * 1024)
#define WRAPINCR(val, limit) (((val) + 1 == (limit)) ? 0 : ((val) + 1))
#define WRAPDECR(val, limit) (((val) == 0) ? ((limit) - 1) : ((val) - 1))
#define OFSFMT(ip) ((ip)->i_ump->um_maxsymlinklen <= 0)
#define BLKFREE2IDX(n) ((n) > DH_NFSTATS ? DH_NFSTATS : (n))
static u_int ufs_dirhashminblks = 5;
static u_int ufs_dirhashmaxmem = 0;
static u_int ufs_dirhashmem;
static u_int ufs_dirhashcheck = 0;
static int ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen);
static void ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff,
int dirblksiz);
static void ufsdirhash_delslot(struct dirhash *dh, int slot);
static int ufsdirhash_findslot(struct dirhash *dh, const char *name,
int namelen, doff_t offset);
static doff_t ufsdirhash_getprev(struct direct *dp, doff_t offset,
int dirblksiz);
static int ufsdirhash_recycle(int wanted);
static pool_cache_t ufsdirhashblk_cache;
static pool_cache_t ufsdirhash_cache;
#define DIRHASHLIST_LOCK() mutex_enter(&ufsdirhash_lock)
#define DIRHASHLIST_UNLOCK() mutex_exit(&ufsdirhash_lock)
#define DIRHASH_LOCK(dh) mutex_enter(&(dh)->dh_lock)
#define DIRHASH_UNLOCK(dh) mutex_exit(&(dh)->dh_lock)
#define DIRHASH_BLKALLOC() \
pool_cache_get(ufsdirhashblk_cache, PR_NOWAIT)
#define DIRHASH_BLKFREE(ptr) \
pool_cache_put(ufsdirhashblk_cache, ptr)
/* Dirhash list; recently-used entries are near the tail. */
static TAILQ_HEAD(, dirhash) ufsdirhash_list;
/* Protects: ufsdirhash_list, `dh_list' field, ufs_dirhashmem. */
static kmutex_t ufsdirhash_lock;
/*
* Locking order:
* ufsdirhash_lock
* dh_lock
*
* The dh_lock mutex should be acquired either via the inode lock, or via
* ufsdirhash_lock. Only the owner of the inode may free the associated
* dirhash, but anything can steal its memory and set dh_hash to NULL.
*/
/*
* Attempt to build up a hash table for the directory contents in
* inode 'ip'. Returns 0 on success, or -1 if the operation failed.
*/
int
ufsdirhash_build(struct inode *ip)
{
struct dirhash *dh;
struct buf *bp = NULL;
struct direct *ep;
struct vnode *vp;
doff_t bmask, pos;
int dirblocks, i, j, memreqd, nblocks, narrays, nslots, slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
/* Check if we can/should use dirhash. */
if (ip->i_dirhash == NULL) {
if (ufs_dirhashmaxmem == 0 || ip->i_size < (ufs_dirhashminblks * dirblksiz) ||
OFSFMT(ip))
return (-1);
} else {
/* Hash exists, but sysctls could have changed. */
if (ip->i_size < (ufs_dirhashminblks * dirblksiz) ||
ufs_dirhashmem > ufs_dirhashmaxmem) {
ufsdirhash_free(ip);
return (-1);
}
/* Check if hash exists and is intact (note: unlocked read). */
if (ip->i_dirhash->dh_hash != NULL)
return (0);
/* Free the old, recycled hash and build a new one. */
ufsdirhash_free(ip);
}
/* Don't hash removed directories. */
if (ip->i_nlink == 0)
return (-1);
vp = ip->i_vnode;
/* Allocate 50% more entries than this dir size could ever need. */
KASSERT(ip->i_size >= dirblksiz);
nslots = ip->i_size / UFS_DIRECTSIZ(1);
nslots = (nslots * 3 + 1) / 2;
narrays = howmany(nslots, DH_NBLKOFF);
nslots = narrays * DH_NBLKOFF;
dirblocks = howmany(ip->i_size, dirblksiz);
nblocks = (dirblocks * 3 + 1) / 2;
memreqd = sizeof(*dh) + narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
nblocks * sizeof(*dh->dh_blkfree);
while (atomic_add_int_nv(&ufs_dirhashmem, memreqd) >
ufs_dirhashmaxmem) {
atomic_add_int(&ufs_dirhashmem, -memreqd);
if (memreqd > ufs_dirhashmaxmem / 2)
return (-1);
/* Try to free some space. */
if (ufsdirhash_recycle(memreqd) != 0)
return (-1);
else
DIRHASHLIST_UNLOCK();
}
/*
* Use non-blocking allocations so that we will revert to a linear
* lookup on failure rather than potentially blocking forever.
*/
dh = pool_cache_get(ufsdirhash_cache, PR_NOWAIT);
if (dh == NULL) {
atomic_add_int(&ufs_dirhashmem, -memreqd);
return (-1);
}
memset(dh, 0, sizeof(*dh));
mutex_init(&dh->dh_lock, MUTEX_DEFAULT, IPL_NONE);
DIRHASH_LOCK(dh);
dh->dh_hashsz = narrays * sizeof(dh->dh_hash[0]);
dh->dh_hash = kmem_zalloc(dh->dh_hashsz, KM_NOSLEEP);
dh->dh_blkfreesz = nblocks * sizeof(dh->dh_blkfree[0]);
dh->dh_blkfree = kmem_zalloc(dh->dh_blkfreesz, KM_NOSLEEP);
if (dh->dh_hash == NULL || dh->dh_blkfree == NULL)
goto fail;
for (i = 0; i < narrays; i++) {
if ((dh->dh_hash[i] = DIRHASH_BLKALLOC()) == NULL)
goto fail;
for (j = 0; j < DH_NBLKOFF; j++)
dh->dh_hash[i][j] = DIRHASH_EMPTY;
}
/* Initialise the hash table and block statistics. */
dh->dh_narrays = narrays;
dh->dh_hlen = nslots;
dh->dh_nblk = nblocks;
dh->dh_dirblks = dirblocks;
for (i = 0; i < dirblocks; i++)
dh->dh_blkfree[i] = dirblksiz / DIRALIGN;
for (i = 0; i < DH_NFSTATS; i++)
dh->dh_firstfree[i] = -1;
dh->dh_firstfree[DH_NFSTATS] = 0;
dh->dh_seqopt = 0;
dh->dh_seqoff = 0;
dh->dh_score = DH_SCOREINIT;
ip->i_dirhash = dh;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
pos = 0;
while (pos < ip->i_size) {
preempt_point();
/* If necessary, get the next directory block. */
if ((pos & bmask) == 0) {
if (bp != NULL)
brelse(bp, 0);
if (ufs_blkatoff(vp, (off_t)pos, NULL, &bp, false) != 0)
goto fail;
}
/* Add this entry to the hash. */
ep = (struct direct *)((char *)bp->b_data + (pos & bmask));
if (ep->d_reclen == 0 || ep->d_reclen >
dirblksiz - (pos & (dirblksiz - 1))) {
/* Corrupted directory. */
brelse(bp, 0);
goto fail;
}
if (ep->d_ino != 0) {
/* Add the entry (simplified ufsdirhash_add). */
slot = ufsdirhash_hash(dh, ep->d_name, ep->d_namlen);
while (DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
slot = WRAPINCR(slot, dh->dh_hlen);
dh->dh_hused++;
DH_ENTRY(dh, slot) = pos;
ufsdirhash_adjfree(dh, pos, -UFS_DIRSIZ(0, ep, needswap),
dirblksiz);
}
pos += ep->d_reclen;
}
if (bp != NULL)
brelse(bp, 0);
DIRHASHLIST_LOCK();
TAILQ_INSERT_TAIL(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 1;
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
return (0);
fail:
ip->i_dirhash = NULL;
DIRHASH_UNLOCK(dh);
if (dh->dh_hash != NULL) {
for (i = 0; i < narrays; i++)
if (dh->dh_hash[i] != NULL)
DIRHASH_BLKFREE(dh->dh_hash[i]);
kmem_free(dh->dh_hash, dh->dh_hashsz);
}
if (dh->dh_blkfree != NULL)
kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
mutex_destroy(&dh->dh_lock);
pool_cache_put(ufsdirhash_cache, dh);
atomic_add_int(&ufs_dirhashmem, -memreqd);
return (-1);
}
/*
* Free any hash table associated with inode 'ip'.
*/
void
ufsdirhash_free(struct inode *ip)
{
struct dirhash *dh;
int i, mem;
if ((dh = ip->i_dirhash) == NULL)
return;
ip->i_dirhash = NULL;
DIRHASHLIST_LOCK();
if (dh->dh_onlist)
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
DIRHASHLIST_UNLOCK();
/* The dirhash pointed to by 'dh' is exclusively ours now. */
mem = sizeof(*dh);
if (dh->dh_hash != NULL) {
for (i = 0; i < dh->dh_narrays; i++)
DIRHASH_BLKFREE(dh->dh_hash[i]);
kmem_free(dh->dh_hash, dh->dh_hashsz);
kmem_free(dh->dh_blkfree, dh->dh_blkfreesz);
mem += dh->dh_hashsz;
mem += dh->dh_narrays * DH_NBLKOFF * sizeof(**dh->dh_hash);
mem += dh->dh_nblk * sizeof(*dh->dh_blkfree);
}
mutex_destroy(&dh->dh_lock);
pool_cache_put(ufsdirhash_cache, dh);
atomic_add_int(&ufs_dirhashmem, -mem);
}
/*
* Find the offset of the specified name within the given inode.
* Returns 0 on success, ENOENT if the entry does not exist, or
* EJUSTRETURN if the caller should revert to a linear search.
*
* If successful, the directory offset is stored in *offp, and a
* pointer to a struct buf containing the entry is stored in *bpp. If
* prevoffp is non-NULL, the offset of the previous entry within
* the UFS_DIRBLKSIZ-sized block is stored in *prevoffp (if the entry
* is the first in a block, the start of the block is used).
*/
int
ufsdirhash_lookup(struct inode *ip, const char *name, int namelen, doff_t *offp,
struct buf **bpp, doff_t *prevoffp)
{
struct dirhash *dh, *dh_next;
struct direct *dp;
struct vnode *vp;
struct buf *bp;
doff_t blkoff, bmask, offset, prevoff;
int i, slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (EJUSTRETURN);
/*
* Move this dirhash towards the end of the list if it has a
* score higher than the next entry, and acquire the dh_lock.
* Optimise the case where it's already the last by performing
* an unlocked read of the TAILQ_NEXT pointer.
*
* In both cases, end up holding just dh_lock.
*/
if (TAILQ_NEXT(dh, dh_list) != NULL) {
DIRHASHLIST_LOCK();
DIRHASH_LOCK(dh);
/*
* If the new score will be greater than that of the next
* entry, then move this entry past it. With both mutexes
* held, dh_next won't go away, but its dh_score could
* change; that's not important since it is just a hint.
*/
if (dh->dh_hash != NULL && (dh_next = TAILQ_NEXT(dh, dh_list)) != NULL &&
dh->dh_score >= dh_next->dh_score) {
KASSERT(dh->dh_onlist);
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
TAILQ_INSERT_AFTER(&ufsdirhash_list, dh_next, dh,
dh_list);
}
DIRHASHLIST_UNLOCK();
} else {
/* Already the last, though that could change as we wait. */
DIRHASH_LOCK(dh);
}
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (EJUSTRETURN);
}
/* Update the score. */
if (dh->dh_score < DH_SCOREMAX)
dh->dh_score++;
vp = ip->i_vnode;
bmask = VFSTOUFS(vp->v_mount)->um_mountp->mnt_stat.f_iosize - 1;
blkoff = -1;
bp = NULL;
restart:
slot = ufsdirhash_hash(dh, name, namelen);
if (dh->dh_seqopt) {
/*
* Sequential access optimisation. dh_seqoff contains the
* offset of the directory entry immediately following
* the last entry that was looked up. Check if this offset
* appears in the hash chain for the name we are looking for.
*/
for (i = slot; (offset = DH_ENTRY(dh, i)) != DIRHASH_EMPTY;
i = WRAPINCR(i, dh->dh_hlen))
if (offset == dh->dh_seqoff)
break;
if (offset == dh->dh_seqoff) {
/*
* We found an entry with the expected offset. This
* is probably the entry we want, but if not, the
* code below will turn off seqoff and retry.
*/
slot = i;
} else
dh->dh_seqopt = 0;
}
for (; (offset = DH_ENTRY(dh, slot)) != DIRHASH_EMPTY; slot = WRAPINCR(slot, dh->dh_hlen)) {
if (offset == DIRHASH_DEL)
continue;
if (offset < 0 || offset >= ip->i_size)
panic("ufsdirhash_lookup: bad offset in hash array");
if ((offset & ~bmask) != blkoff) {
if (bp != NULL)
brelse(bp, 0);
blkoff = offset & ~bmask;
if (ufs_blkatoff(vp, (off_t)blkoff,
NULL, &bp, false) != 0) {
DIRHASH_UNLOCK(dh);
return (EJUSTRETURN);
}
}
dp = (struct direct *)((char *)bp->b_data + (offset & bmask));
if (dp->d_reclen == 0 || dp->d_reclen >
dirblksiz - (offset & (dirblksiz - 1))) {
/* Corrupted directory. */
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (EJUSTRETURN);
}
if (dp->d_namlen == namelen &&
memcmp(dp->d_name, name, namelen) == 0) {
/* Found. Get the prev offset if needed. */
if (prevoffp != NULL) {
if (offset & (dirblksiz - 1)) {
prevoff = ufsdirhash_getprev(dp,
offset, dirblksiz);
if (prevoff == -1) {
brelse(bp, 0);
return (EJUSTRETURN);
}
} else
prevoff = offset;
*prevoffp = prevoff;
}
/* Check for sequential access, and update offset. */
if (dh->dh_seqopt == 0 && dh->dh_seqoff == offset)
dh->dh_seqopt = 1;
dh->dh_seqoff = offset + UFS_DIRSIZ(0, dp, needswap);
DIRHASH_UNLOCK(dh);
*bpp = bp;
*offp = offset;
return (0);
}
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
if (bp != NULL)
brelse(bp, 0);
ufsdirhash_free(ip);
return (EJUSTRETURN);
}
/*
* When the name doesn't match in the seqopt case, go back
* and search normally.
*/
if (dh->dh_seqopt) {
dh->dh_seqopt = 0;
goto restart;
}
}
DIRHASH_UNLOCK(dh);
if (bp != NULL)
brelse(bp, 0);
return (ENOENT);
}
/*
* Find a directory block with room for 'slotneeded' bytes. Returns
* the offset of the directory entry that begins the free space.
* This will either be the offset of an existing entry that has free
* space at the end, or the offset of an entry with d_ino == 0 at
* the start of a UFS_DIRBLKSIZ block.
*
* To use the space, the caller may need to compact existing entries in
* the directory. The total number of bytes in all of the entries involved
* in the compaction is stored in *slotsize. In other words, all of
* the entries that must be compacted are exactly contained in the
* region beginning at the returned offset and spanning *slotsize bytes.
*
* Returns -1 if no space was found, indicating that the directory
* must be extended.
*/
doff_t
ufsdirhash_findfree(struct inode *ip, int slotneeded, int *slotsize)
{
struct direct *dp;
struct dirhash *dh;
struct buf *bp;
doff_t pos, slotstart;
int dirblock, error, freebytes, i;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (-1);
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (-1);
}
/* Find a directory block with the desired free space. */
dirblock = -1;
for (i = howmany(slotneeded, DIRALIGN); i <= DH_NFSTATS; i++)
if ((dirblock = dh->dh_firstfree[i]) != -1)
break;
if (dirblock == -1) {
DIRHASH_UNLOCK(dh);
return (-1);
}
KASSERT(dirblock < dh->dh_nblk &&
dh->dh_blkfree[dirblock] >= howmany(slotneeded, DIRALIGN));
pos = dirblock * dirblksiz;
error = ufs_blkatoff(ip->i_vnode, (off_t)pos, (void *)&dp, &bp, false);
if (error) {
DIRHASH_UNLOCK(dh);
return (-1);
}
/* Find the first entry with free space. */
for (i = 0; i < dirblksiz; ) {
if (dp->d_reclen == 0) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
if (dp->d_ino == 0 || dp->d_reclen > UFS_DIRSIZ(0, dp, needswap))
break;
i += dp->d_reclen;
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
if (i > dirblksiz) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
slotstart = pos + i;
/* Find the range of entries needed to get enough space */
freebytes = 0;
while (i < dirblksiz && freebytes < slotneeded) {
freebytes += dp->d_reclen;
if (dp->d_ino != 0)
freebytes -= UFS_DIRSIZ(0, dp, needswap);
if (dp->d_reclen == 0) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
i += dp->d_reclen;
dp = (struct direct *)((char *)dp + dp->d_reclen);
}
if (i > dirblksiz) {
DIRHASH_UNLOCK(dh);
brelse(bp, 0);
return (-1);
}
if (freebytes < slotneeded)
panic("ufsdirhash_findfree: free mismatch"); DIRHASH_UNLOCK(dh);
brelse(bp, 0);
*slotsize = pos + i - slotstart;
return (slotstart);
}
/*
* Return the start of the unused space at the end of a directory, or
* -1 if there are no trailing unused blocks.
*/
doff_t
ufsdirhash_enduseful(struct inode *ip)
{
struct dirhash *dh;
int i;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return (-1);
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return (-1);
}
if (dh->dh_blkfree[dh->dh_dirblks - 1] != dirblksiz / DIRALIGN) {
DIRHASH_UNLOCK(dh);
return (-1);
}
for (i = dh->dh_dirblks - 1; i >= 0; i--)
if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
break;
DIRHASH_UNLOCK(dh);
return ((doff_t)(i + 1) * dirblksiz);
}
/*
* Insert information into the hash about a new directory entry. dirp
* points to a struct direct containing the entry, and offset specifies
* the offset of this entry.
*/
void
ufsdirhash_add(struct inode *ip, struct direct *dirp, doff_t offset)
{
struct dirhash *dh;
int slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset < dh->dh_dirblks * dirblksiz);
/*
* Normal hash usage is < 66%. If the usage gets too high then
* remove the hash entirely and let it be rebuilt later.
*/
if (dh->dh_hused >= (dh->dh_hlen * 3) / 4) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
/* Find a free hash slot (empty or deleted), and add the entry. */
slot = ufsdirhash_hash(dh, dirp->d_name, dirp->d_namlen);
while (DH_ENTRY(dh, slot) >= 0)
slot = WRAPINCR(slot, dh->dh_hlen);
if (DH_ENTRY(dh, slot) == DIRHASH_EMPTY)
dh->dh_hused++;
DH_ENTRY(dh, slot) = offset;
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, -UFS_DIRSIZ(0, dirp, needswap), dirblksiz);
DIRHASH_UNLOCK(dh);
}
/*
* Remove the specified directory entry from the hash. The entry to remove
* is defined by the name in `dirp', which must exist at the specified
* `offset' within the directory.
*/
void
ufsdirhash_remove(struct inode *ip, struct direct *dirp, doff_t offset)
{
struct dirhash *dh;
int slot;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset < dh->dh_dirblks * dirblksiz);
/* Find the entry */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, offset);
/* Remove the hash entry. */
ufsdirhash_delslot(dh, slot);
/* Update the per-block summary info. */
ufsdirhash_adjfree(dh, offset, UFS_DIRSIZ(0, dirp, needswap), dirblksiz);
DIRHASH_UNLOCK(dh);
}
/*
* Change the offset associated with a directory entry in the hash. Used
* when compacting directory blocks.
*/
void
ufsdirhash_move(struct inode *ip, struct direct *dirp, doff_t oldoff,
doff_t newoff)
{
struct dirhash *dh;
int slot;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(oldoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz &&
newoff < dh->dh_dirblks * ip->i_ump->um_dirblksiz);
/* Find the entry, and update the offset. */
slot = ufsdirhash_findslot(dh, dirp->d_name, dirp->d_namlen, oldoff);
DH_ENTRY(dh, slot) = newoff;
DIRHASH_UNLOCK(dh);
}
/*
* Inform dirhash that the directory has grown by one block that
* begins at offset (i.e. the new length is offset + UFS_DIRBLKSIZ).
*/
void
ufsdirhash_newblk(struct inode *ip, doff_t offset)
{
struct dirhash *dh;
int block;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset == dh->dh_dirblks * dirblksiz);
block = offset / dirblksiz;
if (block >= dh->dh_nblk) {
/* Out of space; must rebuild. */
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
dh->dh_dirblks = block + 1;
/* Account for the new free block. */
dh->dh_blkfree[block] = dirblksiz / DIRALIGN;
if (dh->dh_firstfree[DH_NFSTATS] == -1)
dh->dh_firstfree[DH_NFSTATS] = block;
DIRHASH_UNLOCK(dh);
}
/*
* Inform dirhash that the directory is being truncated.
*/
void
ufsdirhash_dirtrunc(struct inode *ip, doff_t offset)
{
struct dirhash *dh;
int block, i;
int dirblksiz = ip->i_ump->um_dirblksiz;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
KASSERT(offset <= dh->dh_dirblks * dirblksiz);
block = howmany(offset, dirblksiz);
/*
* If the directory shrinks to less than 1/8 of dh_nblk blocks
* (about 20% of its original size due to the 50% extra added in
* ufsdirhash_build) then free it, and let the caller rebuild
* if necessary.
*/
if (block < dh->dh_nblk / 8 && dh->dh_narrays > 1) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
/*
* Remove any `first free' information pertaining to the
* truncated blocks. All blocks we're removing should be
* completely unused.
*/
if (dh->dh_firstfree[DH_NFSTATS] >= block)
dh->dh_firstfree[DH_NFSTATS] = -1;
for (i = block; i < dh->dh_dirblks; i++)
if (dh->dh_blkfree[i] != dirblksiz / DIRALIGN)
panic("ufsdirhash_dirtrunc: blocks in use");
for (i = 0; i < DH_NFSTATS; i++)
if (dh->dh_firstfree[i] >= block)
panic("ufsdirhash_dirtrunc: first free corrupt");
dh->dh_dirblks = block;
DIRHASH_UNLOCK(dh);
}
/*
* Debugging function to check that the dirhash information about
* a directory block matches its actual contents. Panics if a mismatch
* is detected.
*
* On entry, `sbuf' should point to the start of an in-core
* DIRBLKSIZ-sized directory block, and `offset' should contain the
* offset from the start of the directory of that block.
*/
void
ufsdirhash_checkblock(struct inode *ip, char *sbuf, doff_t offset)
{
struct dirhash *dh;
struct direct *dp;
int block, ffslot, i, nfree;
const int needswap = UFS_MPNEEDSWAP(ip->i_ump);
int dirblksiz = ip->i_ump->um_dirblksiz;
if (!ufs_dirhashcheck)
return;
if ((dh = ip->i_dirhash) == NULL)
return;
DIRHASH_LOCK(dh);
if (dh->dh_hash == NULL) {
DIRHASH_UNLOCK(dh);
ufsdirhash_free(ip);
return;
}
block = offset / dirblksiz;
if ((offset & (dirblksiz - 1)) != 0 || block >= dh->dh_dirblks)
panic("ufsdirhash_checkblock: bad offset");
nfree = 0;
for (i = 0; i < dirblksiz; i += dp->d_reclen) {
dp = (struct direct *)(sbuf + i);
if (dp->d_reclen == 0 || i + dp->d_reclen > dirblksiz)
panic("ufsdirhash_checkblock: bad dir");
if (dp->d_ino == 0) {
#if 0
/*
* XXX entries with d_ino == 0 should only occur
* at the start of a DIRBLKSIZ block. However the
* ufs code is tolerant of such entries at other
* offsets, and fsck does not fix them.
*/
if (i != 0)
panic("ufsdirhash_checkblock: bad dir inode");
#endif
nfree += dp->d_reclen;
continue;
}
/* Check that the entry exists (will panic if it doesn't). */
ufsdirhash_findslot(dh, dp->d_name, dp->d_namlen, offset + i);
nfree += dp->d_reclen - UFS_DIRSIZ(0, dp, needswap);
}
if (i != dirblksiz)
panic("ufsdirhash_checkblock: bad dir end");
if (dh->dh_blkfree[block] * DIRALIGN != nfree)
panic("ufsdirhash_checkblock: bad free count");
ffslot = BLKFREE2IDX(nfree / DIRALIGN);
for (i = 0; i <= DH_NFSTATS; i++)
if (dh->dh_firstfree[i] == block && i != ffslot)
panic("ufsdirhash_checkblock: bad first-free");
if (dh->dh_firstfree[ffslot] == -1)
panic("ufsdirhash_checkblock: missing first-free entry"); DIRHASH_UNLOCK(dh);
}
/*
* Hash the specified filename into a dirhash slot.
*/
static int
ufsdirhash_hash(struct dirhash *dh, const char *name, int namelen)
{
u_int32_t hash;
/*
* We hash the name and then some other bit of data that is
* invariant over the dirhash's lifetime. Otherwise names
* differing only in the last byte are placed close to one
* another in the table, which is bad for linear probing.
*/
hash = hash32_buf(name, namelen, HASH32_BUF_INIT);
hash = hash32_buf(&dh, sizeof(dh), hash);
return (hash % dh->dh_hlen);
}
/*
* Adjust the number of free bytes in the block containing `offset'
* by the value specified by `diff'.
*
* The caller must ensure we have exclusive access to `dh'; normally
* that means that dh_lock should be held, but this is also called
* from ufsdirhash_build() where exclusive access can be assumed.
*/
static void
ufsdirhash_adjfree(struct dirhash *dh, doff_t offset, int diff, int dirblksiz)
{
int block, i, nfidx, ofidx;
KASSERT(mutex_owned(&dh->dh_lock));
/* Update the per-block summary info. */
block = offset / dirblksiz;
KASSERT(block < dh->dh_nblk && block < dh->dh_dirblks);
ofidx = BLKFREE2IDX(dh->dh_blkfree[block]);
dh->dh_blkfree[block] = (int)dh->dh_blkfree[block] + (diff / DIRALIGN);
nfidx = BLKFREE2IDX(dh->dh_blkfree[block]);
/* Update the `first free' list if necessary. */
if (ofidx != nfidx) {
/* If removing, scan forward for the next block. */
if (dh->dh_firstfree[ofidx] == block) {
for (i = block + 1; i < dh->dh_dirblks; i++)
if (BLKFREE2IDX(dh->dh_blkfree[i]) == ofidx)
break;
dh->dh_firstfree[ofidx] = (i < dh->dh_dirblks) ? i : -1;
}
/* Make this the new `first free' if necessary */
if (dh->dh_firstfree[nfidx] > block ||
dh->dh_firstfree[nfidx] == -1)
dh->dh_firstfree[nfidx] = block;
}
}
/*
* Find the specified name which should have the specified offset.
* Returns a slot number, and panics on failure.
*
* `dh' must be locked on entry and remains so on return.
*/
static int
ufsdirhash_findslot(struct dirhash *dh, const char *name, int namelen,
doff_t offset)
{
int slot;
KASSERT(mutex_owned(&dh->dh_lock));
/* Find the entry. */
KASSERT(dh->dh_hused < dh->dh_hlen);
slot = ufsdirhash_hash(dh, name, namelen);
while (DH_ENTRY(dh, slot) != offset &&
DH_ENTRY(dh, slot) != DIRHASH_EMPTY)
slot = WRAPINCR(slot, dh->dh_hlen);
if (DH_ENTRY(dh, slot) != offset)
panic("ufsdirhash_findslot: '%.*s' not found", namelen, name); return (slot);
}
/*
* Remove the entry corresponding to the specified slot from the hash array.
*
* `dh' must be locked on entry and remains so on return.
*/
static void
ufsdirhash_delslot(struct dirhash *dh, int slot)
{
int i;
KASSERT(mutex_owned(&dh->dh_lock));
/* Mark the entry as deleted. */
DH_ENTRY(dh, slot) = DIRHASH_DEL;
/* If this is the end of a chain of DIRHASH_DEL slots, remove them. */
for (i = slot; DH_ENTRY(dh, i) == DIRHASH_DEL; )
i = WRAPINCR(i, dh->dh_hlen);
if (DH_ENTRY(dh, i) == DIRHASH_EMPTY) {
i = WRAPDECR(i, dh->dh_hlen);
while (DH_ENTRY(dh, i) == DIRHASH_DEL) {
DH_ENTRY(dh, i) = DIRHASH_EMPTY;
dh->dh_hused--;
i = WRAPDECR(i, dh->dh_hlen);
}
KASSERT(dh->dh_hused >= 0);
}
}
/*
* Given a directory entry and its offset, find the offset of the
* previous entry in the same UFS_DIRBLKSIZ-sized block. Returns an
* offset, or -1 if there is no previous entry in the block or some
* other problem occurred.
*/
static doff_t
ufsdirhash_getprev(struct direct *dirp, doff_t offset, int dirblksiz)
{
struct direct *dp;
char *blkbuf;
doff_t blkoff, prevoff;
int entrypos, i;
blkoff = offset & ~(dirblksiz - 1); /* offset of start of block */
entrypos = offset & (dirblksiz - 1); /* entry relative to block */
blkbuf = (char *)dirp - entrypos;
prevoff = blkoff;
/* If `offset' is the start of a block, there is no previous entry. */
if (entrypos == 0)
return (-1);
/* Scan from the start of the block until we get to the entry. */
for (i = 0; i < entrypos; i += dp->d_reclen) {
dp = (struct direct *)(blkbuf + i);
if (dp->d_reclen == 0 || i + dp->d_reclen > entrypos)
return (-1); /* Corrupted directory. */
prevoff = blkoff + i;
}
return (prevoff);
}
/*
* Try to free up `wanted' bytes by stealing memory from existing
* dirhashes. Returns zero with list locked if successful.
*/
static int
ufsdirhash_recycle(int wanted)
{
struct dirhash *dh;
doff_t **hash;
u_int8_t *blkfree;
int i, mem, narrays;
size_t hashsz, blkfreesz;
DIRHASHLIST_LOCK();
while (wanted + ufs_dirhashmem > ufs_dirhashmaxmem) {
/* Find a dirhash, and lock it. */
if ((dh = TAILQ_FIRST(&ufsdirhash_list)) == NULL) {
DIRHASHLIST_UNLOCK();
return (-1);
}
DIRHASH_LOCK(dh);
KASSERT(dh->dh_hash != NULL);
/* Decrement the score; only recycle if it becomes zero. */
if (--dh->dh_score > 0) {
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
return (-1);
}
/* Remove it from the list and detach its memory. */
TAILQ_REMOVE(&ufsdirhash_list, dh, dh_list);
dh->dh_onlist = 0;
hash = dh->dh_hash;
hashsz = dh->dh_hashsz;
dh->dh_hash = NULL;
blkfree = dh->dh_blkfree;
blkfreesz = dh->dh_blkfreesz;
dh->dh_blkfree = NULL;
narrays = dh->dh_narrays;
mem = narrays * sizeof(*dh->dh_hash) +
narrays * DH_NBLKOFF * sizeof(**dh->dh_hash) +
dh->dh_nblk * sizeof(*dh->dh_blkfree);
/* Unlock everything, free the detached memory. */
DIRHASH_UNLOCK(dh);
DIRHASHLIST_UNLOCK();
for (i = 0; i < narrays; i++)
DIRHASH_BLKFREE(hash[i]);
kmem_free(hash, hashsz);
kmem_free(blkfree, blkfreesz);
/* Account for the returned memory, and repeat if necessary. */
DIRHASHLIST_LOCK();
atomic_add_int(&ufs_dirhashmem, -mem);
}
/* Success. */
return (0);
}
SYSCTL_SETUP(ufsdirhash_sysctl_init, "ufs_dirhash sysctl")
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ufs",
SYSCTL_DESCR("ufs"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "dirhash",
SYSCTL_DESCR("dirhash"),
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "minblocks",
SYSCTL_DESCR("minimum hashed directory size in blocks"),
NULL, 0, &ufs_dirhashminblks, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxmem",
SYSCTL_DESCR("maximum dirhash memory usage"),
NULL, 0, &ufs_dirhashmaxmem, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "memused",
SYSCTL_DESCR("current dirhash memory usage"),
NULL, 0, &ufs_dirhashmem, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "docheck",
SYSCTL_DESCR("enable extra sanity checks"),
NULL, 0, &ufs_dirhashcheck, 0,
CTL_CREATE, CTL_EOL);
}
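/*
 * Illustrative userland sketch (not part of this file): assuming the
 * nodes created above resolve to "vfs.ufs.dirhash.maxmem", the knob can
 * be read from userland with sysctlbyname(3).
 */
#if 0
#include <sys/param.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int maxmem;
	size_t len = sizeof(maxmem);

	/* Query the current dirhash memory limit. */
	if (sysctlbyname("vfs.ufs.dirhash.maxmem", &maxmem, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	printf("dirhash maxmem: %d bytes\n", maxmem);
	return 0;
}
#endif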
void
ufsdirhash_init(void)
{
/*
* Only initialise defaults for the dirhash size if it hasn't
* been set.
*/
if (ufs_dirhashmaxmem == 0) {
/* Use 64-bit math to avoid overflows. */
uint64_t physmem_bytes, hash_bytes;
physmem_bytes = ctob((uint64_t)physmem);
hash_bytes = physmem_bytes / DIRHASH_DEFAULT_DIVIDER;
if (hash_bytes < MIN_DEFAULT_DIRHASH_MEM)
hash_bytes = 0;
if (hash_bytes > MAX_DEFAULT_DIRHASH_MEM)
hash_bytes = MAX_DEFAULT_DIRHASH_MEM;
ufs_dirhashmaxmem = (u_int)hash_bytes;
}
mutex_init(&ufsdirhash_lock, MUTEX_DEFAULT, IPL_NONE);
ufsdirhashblk_cache = pool_cache_init(DH_NBLKOFF * sizeof(daddr_t), 0,
0, 0, "dirhashblk", NULL, IPL_NONE, NULL, NULL, NULL);
ufsdirhash_cache = pool_cache_init(sizeof(struct dirhash), 0,
0, 0, "dirhash", NULL, IPL_NONE, NULL, NULL, NULL);
TAILQ_INIT(&ufsdirhash_list);
}
void
ufsdirhash_done(void)
{
KASSERT(TAILQ_EMPTY(&ufsdirhash_list));
pool_cache_destroy(ufsdirhashblk_cache);
pool_cache_destroy(ufsdirhash_cache);
mutex_destroy(&ufsdirhash_lock);
}
/* $NetBSD: cons.c,v 1.95 2023/09/02 17:44:59 riastradh Exp $ */
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: cons.c 1.7 92/01/21$
*
* @(#)cons.c 8.2 (Berkeley) 1/12/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cons.c,v 1.95 2023/09/02 17:44:59 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/heartbeat.h>
#include <sys/ioctl.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/vnode.h>
#include <dev/cons.h>
#include "nullcons.h"
dev_type_open(cnopen);
dev_type_close(cnclose);
dev_type_read(cnread);
dev_type_write(cnwrite);
dev_type_ioctl(cnioctl);
dev_type_poll(cnpoll);
dev_type_kqfilter(cnkqfilter);
static bool cn_redirect(dev_t *, int, int *, struct tty **);
static void cn_release(struct tty *);
const struct cdevsw cons_cdevsw = {
.d_open = cnopen,
.d_close = cnclose,
.d_read = cnread,
.d_write = cnwrite,
.d_ioctl = cnioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = cnpoll,
.d_mmap = nommap,
.d_kqfilter = cnkqfilter,
.d_discard = nodiscard,
.d_flag = D_TTY|D_MPSAFE,
};
static struct kmutex cn_lock;
struct tty *volatile constty; /* virtual console output device */
struct consdev *cn_tab; /* physical console device info */
struct vnode *cn_devvp[2]; /* vnode for underlying device. */
void
cn_set_tab(struct consdev *tab)
{
/*
* This is a point that we should have KASSERT(cold) or add
* synchronization in case this can happen after cold boot.
* However, cn_tab initialization is so critical to any
* diagnostics or debugging that we need to tread carefully
* about introducing new ways to crash. So let's put the
* assertion in only after we've audited most or all of the
* cn_tab updates.
*/
cn_tab = tab;
}
int
cnopen(dev_t dev, int flag, int mode, struct lwp *l)
{
dev_t cndev;
int unit, error;
unit = minor(dev);
if (unit > 1)
return ENODEV;
mutex_enter(&cn_lock);
if (cn_tab == NULL) {
error = 0;
goto out;
}
/*
* always open the 'real' console device, so we don't get nailed
* later. This follows normal device semantics; they always get
* open() calls.
*/
cndev = cn_tab->cn_dev;
#if NNULLCONS > 0
if (cndev == NODEV) {
nullconsattach(0);
}
#else /* NNULLCONS > 0 */
if (cndev == NODEV) {
/*
* This is most likely an error in the console attach
* code. Panicking looks better than jumping into nowhere
* through cdevsw below....
*/
panic("cnopen: no console device");
}
#endif /* NNULLCONS > 0 */
if (dev == cndev) {
/*
* This causes cnopen() to be called recursively, which
* is generally a bad thing. It is often caused when
* dev == 0 and cn_dev has not been set, but was probably
* initialised to 0.
*/
panic("cnopen: cn_tab->cn_dev == dev");
}
if (cn_devvp[unit] != NULLVP) {
error = 0;
goto out;
}
if ((error = cdevvp(cndev, &cn_devvp[unit])) != 0) {
printf("cnopen: unable to get vnode reference\n");
goto out;
}
vn_lock(cn_devvp[unit], LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(cn_devvp[unit], flag, kauth_cred_get());
VOP_UNLOCK(cn_devvp[unit]);
out: mutex_exit(&cn_lock);
return error;
}
int
cnclose(dev_t dev, int flag, int mode, struct lwp *l)
{
struct vnode *vp;
int unit, error;
unit = minor(dev);
if (unit > 1)
return ENODEV;
mutex_enter(&cn_lock);
if (cn_tab == NULL) {
error = 0;
goto out;
}
vp = cn_devvp[unit];
cn_devvp[unit] = NULL;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_CLOSE(vp, flag, kauth_cred_get());
VOP_UNLOCK(vp);
vrele(vp);
out: mutex_exit(&cn_lock);
return error;
}
int
cnread(dev_t dev, struct uio *uio, int flag)
{
struct tty *ctp = NULL;
int error;
/*
* If we would redirect input, punt. This will keep strange
* things from happening to people who are using the real
* console. Nothing should be using /dev/console for
* input (except a shell in single-user mode, but then,
* one wouldn't TIOCCONS then).
*/
if (!cn_redirect(&dev, 1, &error, &ctp))
return error;
error = cdev_read(dev, uio, flag);
cn_release(ctp);
return error;
}
int
cnwrite(dev_t dev, struct uio *uio, int flag)
{
struct tty *ctp = NULL;
int error;
/* Redirect output, if that's appropriate. */
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_write(dev, uio, flag);
cn_release(ctp);
return error;
}
int
cnioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
struct tty *ctp = NULL;
int error;
error = 0;
/*
* Superuser can always use this to wrest control of console
* output from the "virtual" console.
*/
if (cmd == TIOCCONS) {
struct tty *tp;
mutex_enter(&constty_lock);
tp = atomic_load_relaxed(&constty);
if (tp == NULL) {
mutex_exit(&constty_lock);
goto passthrough; /* XXX ??? */
}
error = kauth_authorize_device_tty(l->l_cred,
KAUTH_DEVICE_TTY_VIRTUAL, tp);
if (!error)
atomic_store_relaxed(&constty, NULL);
mutex_exit(&constty_lock);
return error;
}
passthrough:
/*
* Redirect the ioctl, if that's appropriate.
* Note that strange things can happen, if a program does
* ioctls on /dev/console, then the console is redirected
* out from under it.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_ioctl(dev, cmd, data, flag, l);
cn_release(ctp);
return error;
}
/*ARGSUSED*/
int
cnpoll(dev_t dev, int events, struct lwp *l)
{
struct tty *ctp = NULL;
int error;
/*
* Redirect the poll, if that's appropriate.
* I don't want to think of the possible side effects
* of console redirection here.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return POLLHUP;
error = cdev_poll(dev, events, l);
cn_release(ctp);
return error;
}
/*ARGSUSED*/
int
cnkqfilter(dev_t dev, struct knote *kn)
{
struct tty *ctp = NULL;
int error;
/*
* Redirect the kqfilter, if that's appropriate.
* I don't want to think of the possible side effects
* of console redirection here.
*/
if (!cn_redirect(&dev, 0, &error, &ctp))
return error;
error = cdev_kqfilter(dev, kn);
cn_release(ctp);
return error;
}
int
cngetc(void)
{
if (cn_tab == NULL)
return (0);
int s = splhigh();
for (;;) {
const int rv = (*cn_tab->cn_getc)(cn_tab->cn_dev);
if (rv >= 0) {
splx(s);
return rv;
}
docritpollhooks();
}
}
int
cngetsn(char *cp, int size)
{
char *lp;
int c, len;
cnpollc(1);
lp = cp;
len = 0;
for (;;) {
c = cngetc();
switch (c) {
case '\n':
case '\r':
printf("\n");
*lp++ = '\0';
cnpollc(0);
return (len);
case '\b':
case '\177':
case '#':
if (len) {
--len;
--lp;
printf("\b \b");
}
continue;
case '@':
case 'u'&037: /* CTRL-u */
len = 0;
lp = cp;
printf("\n");
continue;
default:
if (len + 1 >= size || c < ' ') {
printf("\007");
continue;
}
printf("%c", c);
++len;
*lp++ = c;
}
}
}
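/*
 * Illustrative sketch (hypothetical caller, not compiled in): reading a
 * line from the console with cngetsn(), e.g. from an early-boot prompt.
 * The buffer name and prompt text are arbitrary.
 */
#if 0
	char line[64];
	int n;

	printf("answer: ");
	n = cngetsn(line, sizeof(line));	/* polls, echoes, handles erase */
	if (n > 0)
		printf("read %d characters: %s\n", n, line);
#endif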
void
cnputc(int c)
{
if (cn_tab == NULL)
return;
/*
* XXX
* for some reason this causes ARCS firmware to output an endless stream of
* whitespaces with n32 kernels, so use the pre-1.74 code for now until I can
* figure out why this happens
*/
#ifndef sgimips
if (c) {
if (c == '\n') {
(*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
docritpollhooks();
}
(*cn_tab->cn_putc)(cn_tab->cn_dev, c);
}
#else
if (c) {
(*cn_tab->cn_putc)(cn_tab->cn_dev, c);
if (c == '\n') {
docritpollhooks();
(*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
}
}
#endif
}
void
cnpollc(int on)
{
static int refcount = 0;
if (cn_tab == NULL)
return;
if (!on)
--refcount;
if (refcount == 0) {
if (on) {
/*
* Bind to the current CPU by disabling
* preemption (more convenient than finding a
* place to store a stack to unwind for
* curlwp_bind/bindx, and preemption wouldn't
* happen anyway while spinning at high IPL in
* cngetc) so that curcpu() is stable so that
* we can suspend heartbeat checks for it.
*/
kpreempt_disable();
heartbeat_suspend();
}
(*cn_tab->cn_pollc)(cn_tab->cn_dev, on);
if (!on) {
heartbeat_resume();
kpreempt_enable();
}
}
if (on)
++refcount;
}
void
nullcnpollc(dev_t dev, int on)
{
}
void
cnbell(u_int pitch, u_int period, u_int volume)
{
if (cn_tab == NULL || cn_tab->cn_bell == NULL)
return;
(*cn_tab->cn_bell)(cn_tab->cn_dev, pitch, period, volume);
}
void
cnflush(void)
{
if (cn_tab == NULL || cn_tab->cn_flush == NULL)
return;
(*cn_tab->cn_flush)(cn_tab->cn_dev);
}
void
cnhalt(void)
{
if (cn_tab == NULL || cn_tab->cn_halt == NULL)
return;
(*cn_tab->cn_halt)(cn_tab->cn_dev);
}
/*
* Redirect output, if that's appropriate. If there's no real console,
* return ENXIO.
*/
static bool
cn_redirect(dev_t *devp, int is_read, int *error, struct tty **ctpp)
{
dev_t dev = *devp;
struct tty *ctp;
int s;
bool ok = false;
*error = ENXIO;
*ctpp = NULL;
s = pserialize_read_enter();
if ((ctp = atomic_load_consume(&constty)) != NULL &&
minor(dev) == 0 &&
(cn_tab == NULL || (cn_tab->cn_pri != CN_REMOTE))) {
if (is_read) {
*error = 0;
goto out;
}
tty_acquire(ctp);
*ctpp = ctp;
dev = ctp->t_dev;
} else if (cn_tab == NULL)
goto out;
else
dev = cn_tab->cn_dev;
ok = true;
*devp = dev;
out: pserialize_read_exit(s);
return ok;
}
static void
cn_release(struct tty *ctp)
{
if (ctp == NULL)
return;
tty_release(ctp);
}
MODULE(MODULE_CLASS_DRIVER, cons, NULL);
static int
cons_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
mutex_init(&cn_lock, MUTEX_DEFAULT, IPL_NONE);
return 0;
case MODULE_CMD_FINI:
mutex_destroy(&cn_lock);
return 0;
default:
return ENOTTY;
}
}
/* $NetBSD: mount.h,v 1.16 2024/01/19 18:39:15 christos Exp $ */
/*
* Copyright (c) 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)mount.h 8.21 (Berkeley) 5/20/95
*/
#ifndef _COMPAT_SYS_MOUNT_H_
#define _COMPAT_SYS_MOUNT_H_
#ifdef _KERNEL_OPT
#include "opt_compat_43.h"
#endif
#define MFSNAMELEN 16
struct statfs12 {
short f_type; /* type of file system */
u_short f_oflags; /* deprecated copy of mount flags */
long f_bsize; /* fundamental file system block size */
long f_iosize; /* optimal transfer block size */
long f_blocks; /* total data blocks in file system */
long f_bfree; /* free blocks in fs */
long f_bavail; /* free blocks avail to non-superuser */
long f_files; /* total file nodes in file system */
long f_ffree; /* free file nodes in fs */
fsid_t f_fsid; /* file system id */
uid_t f_owner; /* user that mounted the file system */
long f_flags; /* copy of mount flags */
long f_syncwrites; /* count of sync writes since mount */
long f_asyncwrites; /* count of async writes since mount */
long f_spare[1]; /* spare for later */
char f_fstypename[MFSNAMELEN]; /* fs type name */
char f_mntonname[MNAMELEN]; /* directory on which mounted */
char f_mntfromname[MNAMELEN]; /* mounted file system */
};
#ifndef _KERNEL
#include <string.h>
#endif
/*
* Operations supported on mounted file system.
*/
/*
* Convert from a new statvfs to an old statfs structure.
*/
#define MOUNTNO_NONE 0
#define MOUNTNO_UFS 1 /* UNIX "Fast" Filesystem */
#define MOUNTNO_NFS 2 /* Network Filesystem */
#define MOUNTNO_MFS 3 /* Memory Filesystem */
#define MOUNTNO_MSDOS 4 /* MSDOS Filesystem */
#define MOUNTNO_CD9660 5 /* iso9660 cdrom */
#define MOUNTNO_FDESC 6 /* /dev/fd filesystem */
#define MOUNTNO_KERNFS 7 /* kernel variable filesystem */
#define MOUNTNO_DEVFS 8 /* device node filesystem */
#define MOUNTNO_AFS 9 /* AFS 3.x */
static const struct {
const char *name;
const int value;
} __nv[] = {
{ MOUNT_UFS, MOUNTNO_UFS },
{ MOUNT_NFS, MOUNTNO_NFS },
{ MOUNT_MFS, MOUNTNO_MFS },
{ MOUNT_MSDOS, MOUNTNO_MSDOS },
{ MOUNT_CD9660, MOUNTNO_CD9660 },
{ MOUNT_FDESC, MOUNTNO_FDESC },
{ MOUNT_KERNFS, MOUNTNO_KERNFS },
{ MOUNT_AFS, MOUNTNO_AFS },
};
static __inline void
statvfs_to_statfs12(const struct statvfs *fs, struct statfs12 *s12)
{
size_t i = 0;
memset(s12, 0, sizeof(*s12));
s12->f_type = 0;
s12->f_oflags = (short)fs->f_flag;
for (i = 0; i < sizeof(__nv) / sizeof(__nv[0]); i++) {
if (strcmp(__nv[i].name, fs->f_fstypename) == 0) {
s12->f_type = __nv[i].value;
break;
}
}
#define __STATFSCLAMP(a) (long)(((a) & ~LONG_MAX) ? LONG_MAX : (a))
s12->f_bsize = __STATFSCLAMP(fs->f_frsize);
s12->f_iosize = __STATFSCLAMP(fs->f_iosize);
s12->f_blocks = __STATFSCLAMP(fs->f_blocks);
s12->f_bfree = __STATFSCLAMP(fs->f_bfree);
if (fs->f_bfree > fs->f_bresvd)
s12->f_bavail = __STATFSCLAMP(fs->f_bfree - fs->f_bresvd);
else
s12->f_bavail = -__STATFSCLAMP(fs->f_bresvd - fs->f_bfree);
s12->f_files = __STATFSCLAMP(fs->f_files);
s12->f_ffree = __STATFSCLAMP(fs->f_ffree);
s12->f_fsid = fs->f_fsidx;
s12->f_owner = fs->f_owner;
s12->f_flags = (long)fs->f_flag;
s12->f_syncwrites = __STATFSCLAMP(fs->f_syncwrites);
s12->f_asyncwrites = __STATFSCLAMP(fs->f_asyncwrites);
memcpy(s12->f_fstypename, fs->f_fstypename, sizeof(s12->f_fstypename));
memcpy(s12->f_mntonname, fs->f_mntonname, sizeof(s12->f_mntonname));
memcpy(s12->f_mntfromname, fs->f_mntfromname,
sizeof(s12->f_mntfromname));
}
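/*
 * Illustrative userland sketch (hypothetical caller, not part of this
 * header; assumes <sys/statvfs.h> is included): converting a statvfs(2)
 * result into the old statfs12 layout for consumption by old binaries.
 */
#if 0
	struct statvfs svfs;
	struct statfs12 sfs12;

	if (statvfs("/", &svfs) == 0)
		statvfs_to_statfs12(&svfs, &sfs12);
#endif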
#ifdef _KERNEL
static __inline int
statvfs_to_statfs12_copy(const void *vs, void *vs12, size_t l)
{
struct statfs12 *s12 = kmem_zalloc(sizeof(*s12), KM_SLEEP);
int error;
statvfs_to_statfs12(vs, s12);
error = copyout(s12, vs12, sizeof(*s12));
kmem_free(s12, sizeof(*s12));
return error;
}
/*
* Filesystem configuration information. Not used by NetBSD, but
* defined here to provide a compatible sysctl interface to Lite2.
*/
struct vfsconf {
struct vfsops *vfc_vfsops; /* filesystem operations vector */
char vfc_name[MFSNAMELEN]; /* filesystem type name */
int vfc_typenum; /* historic filesystem type number */
int vfc_refcount; /* number mounted of this type */
int vfc_flags; /* permanent flags */
int (*vfc_mountroot)(void); /* if != NULL, routine to mount root */
struct vfsconf *vfc_next; /* next in list */
};
/* Old, fixed size filehandle structures (used up to and including 3.x) */
struct compat_30_fid {
unsigned short fid_len;
unsigned short fid_reserved;
char fid_data[16];
};
struct compat_30_fhandle {
fsid_t fh_fsid;
struct compat_30_fid fh_fid;
};
#else
__BEGIN_DECLS
int __compat_fstatfs(int, struct statfs12 *) __dso_hidden;
int __compat_getfsstat(struct statfs12 *, long, int) __dso_hidden;
int __compat_statfs(const char *, struct statfs12 *) __dso_hidden;
int __compat_getmntinfo(struct statfs12 **, int) __dso_hidden;
#if defined(_NETBSD_SOURCE)
struct compat_30_fhandle;
int __compat_fhstatfs(const struct compat_30_fhandle *, struct statfs12 *)
__dso_hidden;
struct stat13;
int __compat_fhstat(const struct compat_30_fhandle *, struct stat13 *)
__dso_hidden;
struct stat30;
int __compat___fhstat30(const struct compat_30_fhandle *, struct stat30 *)
__dso_hidden;
int __compat___fhstat40(const void *, size_t, struct stat30 *) __dso_hidden;
struct stat;
int __fhstat50(const void *, size_t, struct stat *);
int __fhopen40(const void *, size_t, int);
int fhopen(const struct compat_30_fhandle *, int);
int __getfh30(const char *, void*, size_t *);
int getfh(const char *path, struct compat_30_fhandle *fhp);
int mount(const char *, const char *, int, void *);
int __mount50(const char *, const char *, int, void *, size_t);
#endif /* _NETBSD_SOURCE */
__END_DECLS
#endif /* _KERNEL */
#endif /* !_COMPAT_SYS_MOUNT_H_ */
/* $NetBSD: kern_mutex.c,v 1.112 2023/10/15 10:28:23 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel mutex implementation, modeled after those found in Solaris,
* a description of which can be found in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*/
#define __MUTEX_PRIVATE
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mutex.c,v 1.112 2023/10/15 10:28:23 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <dev/lockstat.h>
#include <machine/lock.h>
/*
* When not running a debug kernel, spin mutexes are not much
* more than an splraiseipl() and splx() pair.
*/
#if defined(DIAGNOSTIC) || defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
#define FULL
#endif
/*
* Debugging support.
*/
#define MUTEX_WANTLOCK(mtx) \
LOCKDEBUG_WANTLOCK(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_TESTLOCK(mtx) \
LOCKDEBUG_WANTLOCK(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), -1)
#define MUTEX_LOCKED(mtx) \
LOCKDEBUG_LOCKED(MUTEX_DEBUG_P(mtx), (mtx), NULL, \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_UNLOCKED(mtx) \
LOCKDEBUG_UNLOCKED(MUTEX_DEBUG_P(mtx), (mtx), \
(uintptr_t)__builtin_return_address(0), 0)
#define MUTEX_ABORT(mtx, msg) \
mutex_abort(__func__, __LINE__, mtx, msg)
#if defined(LOCKDEBUG)
#define MUTEX_DASSERT(mtx, cond) \
do { \
if (__predict_false(!(cond))) \
MUTEX_ABORT(mtx, "assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else /* LOCKDEBUG */
#define MUTEX_DASSERT(mtx, cond) /* nothing */
#endif /* LOCKDEBUG */
#if defined(DIAGNOSTIC)
#define MUTEX_ASSERT(mtx, cond) \
do { \
if (__predict_false(!(cond))) \
MUTEX_ABORT(mtx, "assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else /* DIAGNOSTIC */
#define MUTEX_ASSERT(mtx, cond) /* nothing */
#endif /* DIAGNOSTIC */
/*
* Some architectures can't use __cpu_simple_lock as is so allow a way
* for them to use an alternate definition.
*/
#ifndef MUTEX_SPINBIT_LOCK_INIT
#define MUTEX_SPINBIT_LOCK_INIT(mtx) __cpu_simple_lock_init(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCKED_P
#define MUTEX_SPINBIT_LOCKED_P(mtx) __SIMPLELOCK_LOCKED_P(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCK_TRY
#define MUTEX_SPINBIT_LOCK_TRY(mtx) __cpu_simple_lock_try(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_SPINBIT_LOCK_UNLOCK
#define MUTEX_SPINBIT_LOCK_UNLOCK(mtx) __cpu_simple_unlock(&(mtx)->mtx_lock)
#endif
#ifndef MUTEX_INITIALIZE_SPIN_IPL
#define MUTEX_INITIALIZE_SPIN_IPL(mtx, ipl) \
((mtx)->mtx_ipl = makeiplcookie((ipl)))
#endif
/*
* Spin mutex SPL save / restore.
*/
#define MUTEX_SPIN_SPLRAISE(mtx) \
do { \
const int s = splraiseipl(MUTEX_SPIN_IPL(mtx)); \
struct cpu_info * const x__ci = curcpu(); \
const int x__cnt = x__ci->ci_mtx_count--; \
__insn_barrier(); \
if (x__cnt == 0) \
x__ci->ci_mtx_oldspl = s; \
} while (/* CONSTCOND */ 0)
#define MUTEX_SPIN_SPLRESTORE(mtx) \
do { \
struct cpu_info * const x__ci = curcpu(); \
const int s = x__ci->ci_mtx_oldspl; \
__insn_barrier(); \
if (++(x__ci->ci_mtx_count) == 0) \
splx(s); \
} while (/* CONSTCOND */ 0)
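/*
 * Illustrative sketch (mirrors the spin-mutex paths in this file; not
 * additional code): the raise/restore pair brackets the lock bit, and
 * the old SPL is saved only when the per-CPU count of held spin mutexes
 * passes through zero.
 */
#if 0
	MUTEX_SPIN_SPLRAISE(mtx);	/* raise SPL, remember old level once */
	while (!MUTEX_SPINBIT_LOCK_TRY(mtx))
		/* spin */;
	/* ... critical section ... */
	MUTEX_SPINBIT_LOCK_UNLOCK(mtx);
	MUTEX_SPIN_SPLRESTORE(mtx);	/* restore SPL when the count returns to 0 */
#endif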
/*
* Memory barriers.
*/
#ifdef __HAVE_ATOMIC_AS_MEMBAR
#define MUTEX_MEMBAR_ENTER()
#else
#define MUTEX_MEMBAR_ENTER() membar_enter()
#endif
/*
* For architectures that provide 'simple' mutexes: they provide a
* CAS function that is either MP-safe, or does not need to be MP
* safe. Adaptive mutexes on these architectures do not require an
* additional interlock.
*/
#ifdef __HAVE_SIMPLE_MUTEXES
#define MUTEX_OWNER(owner) \
(owner & MUTEX_THREAD)
#define MUTEX_HAS_WAITERS(mtx) \
(((int)(mtx)->mtx_owner & MUTEX_BIT_WAITERS) != 0)
#define MUTEX_INITIALIZE_ADAPTIVE(mtx, dodebug) \
do { \
if (!dodebug) \
(mtx)->mtx_owner |= MUTEX_BIT_NODEBUG; \
} while (/* CONSTCOND */ 0)
#define MUTEX_INITIALIZE_SPIN(mtx, dodebug, ipl) \
do { \
(mtx)->mtx_owner = MUTEX_BIT_SPIN; \
if (!dodebug) \
(mtx)->mtx_owner |= MUTEX_BIT_NODEBUG; \
MUTEX_INITIALIZE_SPIN_IPL((mtx), (ipl)); \
MUTEX_SPINBIT_LOCK_INIT((mtx)); \
} while (/* CONSTCOND */ 0)
#define MUTEX_DESTROY(mtx) \
do { \
(mtx)->mtx_owner = MUTEX_THREAD; \
} while (/* CONSTCOND */ 0)
#define MUTEX_SPIN_P(owner) \
(((owner) & MUTEX_BIT_SPIN) != 0)
#define MUTEX_ADAPTIVE_P(owner) \
(((owner) & MUTEX_BIT_SPIN) == 0)
#ifndef MUTEX_CAS
#define MUTEX_CAS(p, o, n) \
(atomic_cas_ulong((volatile unsigned long *)(p), (o), (n)) == (o))
#endif /* MUTEX_CAS */
#define MUTEX_DEBUG_P(mtx) (((mtx)->mtx_owner & MUTEX_BIT_NODEBUG) == 0)
#if defined(LOCKDEBUG)
#define MUTEX_OWNED(owner) (((owner) & ~MUTEX_BIT_NODEBUG) != 0)
#define MUTEX_INHERITDEBUG(n, o) (n) |= (o) & MUTEX_BIT_NODEBUG
#else /* defined(LOCKDEBUG) */
#define MUTEX_OWNED(owner) ((owner) != 0)
#define MUTEX_INHERITDEBUG(n, o) /* nothing */
#endif /* defined(LOCKDEBUG) */
static inline int
MUTEX_ACQUIRE(kmutex_t *mtx, uintptr_t curthread)
{
int rv;
uintptr_t oldown = 0;
uintptr_t newown = curthread;
MUTEX_INHERITDEBUG(oldown, mtx->mtx_owner);
MUTEX_INHERITDEBUG(newown, oldown);
rv = MUTEX_CAS(&mtx->mtx_owner, oldown, newown);
membar_acquire();
return rv;
}
static inline int
MUTEX_SET_WAITERS(kmutex_t *mtx, uintptr_t owner)
{
int rv;
rv = MUTEX_CAS(&mtx->mtx_owner, owner, owner | MUTEX_BIT_WAITERS);
MUTEX_MEMBAR_ENTER();
return rv;
}
static inline void
MUTEX_RELEASE(kmutex_t *mtx)
{
uintptr_t newown;
newown = 0;
MUTEX_INHERITDEBUG(newown, mtx->mtx_owner);
atomic_store_release(&mtx->mtx_owner, newown);
}
#endif /* __HAVE_SIMPLE_MUTEXES */
/*
* Patch in stubs via strong alias where they are not available.
*/
#if defined(LOCKDEBUG)
#undef __HAVE_MUTEX_STUBS
#undef __HAVE_SPIN_MUTEX_STUBS
#endif
#ifndef __HAVE_MUTEX_STUBS
__strong_alias(mutex_enter,mutex_vector_enter);
__strong_alias(mutex_exit,mutex_vector_exit);
#endif
#ifndef __HAVE_SPIN_MUTEX_STUBS
__strong_alias(mutex_spin_enter,mutex_vector_enter);
__strong_alias(mutex_spin_exit,mutex_vector_exit);
#endif
static void mutex_abort(const char *, size_t, volatile const kmutex_t *,
const char *);
static void mutex_dump(const volatile void *, lockop_printer_t);
static lwp_t *mutex_owner(wchan_t);
lockops_t mutex_spin_lockops = {
.lo_name = "Mutex",
.lo_type = LOCKOPS_SPIN,
.lo_dump = mutex_dump,
};
lockops_t mutex_adaptive_lockops = {
.lo_name = "Mutex",
.lo_type = LOCKOPS_SLEEP,
.lo_dump = mutex_dump,
};
syncobj_t mutex_syncobj = {
.sobj_name = "mutex",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = turnstile_unsleep,
.sobj_changepri = turnstile_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = mutex_owner,
};
/*
* mutex_dump:
*
* Dump the contents of a mutex structure.
*/
static void
mutex_dump(const volatile void *cookie, lockop_printer_t pr)
{
const volatile kmutex_t *mtx = cookie;
uintptr_t owner = mtx->mtx_owner;
pr("owner field : %#018lx wait/spin: %16d/%d\n",
(long)MUTEX_OWNER(owner), MUTEX_HAS_WAITERS(mtx),
MUTEX_SPIN_P(owner));
}
/*
* mutex_abort:
*
* Dump information about an error and panic the system. This
* generates a lot of machine code in the DIAGNOSTIC case, so
* we ask the compiler to not inline it.
*/
static void __noinline
mutex_abort(const char *func, size_t line, volatile const kmutex_t *mtx,
const char *msg)
{
LOCKDEBUG_ABORT(func, line, mtx, (MUTEX_SPIN_P(mtx->mtx_owner) ?
&mutex_spin_lockops : &mutex_adaptive_lockops), msg);
}
/*
* mutex_init:
*
* Initialize a mutex for use. Note that adaptive mutexes are in
* essence spin mutexes that can sleep to avoid deadlock and wasting
* CPU time. We can't easily provide a type of mutex that always
* sleeps - see comments in mutex_vector_enter() about releasing
* mutexes unlocked.
*/
void
_mutex_init(kmutex_t *mtx, kmutex_type_t type, int ipl,
uintptr_t return_address)
{
lockops_t *lockops __unused;
bool dodebug;
memset(mtx, 0, sizeof(*mtx));
if (ipl == IPL_NONE || ipl == IPL_SOFTCLOCK ||
ipl == IPL_SOFTBIO || ipl == IPL_SOFTNET ||
ipl == IPL_SOFTSERIAL) {
lockops = (type == MUTEX_NODEBUG ?
NULL : &mutex_adaptive_lockops);
dodebug = LOCKDEBUG_ALLOC(mtx, lockops, return_address);
MUTEX_INITIALIZE_ADAPTIVE(mtx, dodebug);
} else {
lockops = (type == MUTEX_NODEBUG ?
NULL : &mutex_spin_lockops);
dodebug = LOCKDEBUG_ALLOC(mtx, lockops, return_address);
MUTEX_INITIALIZE_SPIN(mtx, dodebug, ipl);
}
}
void
mutex_init(kmutex_t *mtx, kmutex_type_t type, int ipl)
{
_mutex_init(mtx, type, ipl, (uintptr_t)__builtin_return_address(0));
}
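/*
 * Illustrative sketch (hypothetical caller, not part of this file): the
 * usual life cycle of an adaptive mutex created with mutex_init().
 */
#if 0
	static kmutex_t example_lock;	/* hypothetical lock */

	mutex_init(&example_lock, MUTEX_DEFAULT, IPL_NONE);	/* adaptive */
	mutex_enter(&example_lock);
	/* ... access data protected by example_lock ... */
	mutex_exit(&example_lock);
	mutex_destroy(&example_lock);
#endif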
/*
* mutex_destroy:
*
* Tear down a mutex.
*/
void
mutex_destroy(kmutex_t *mtx)
{
uintptr_t owner = mtx->mtx_owner;
if (MUTEX_ADAPTIVE_P(owner)) {
MUTEX_ASSERT(mtx, !MUTEX_OWNED(owner));
MUTEX_ASSERT(mtx, !MUTEX_HAS_WAITERS(mtx));
} else {
MUTEX_ASSERT(mtx, !MUTEX_SPINBIT_LOCKED_P(mtx));
}
LOCKDEBUG_FREE(MUTEX_DEBUG_P(mtx), mtx);
MUTEX_DESTROY(mtx);
}
#ifdef MULTIPROCESSOR
/*
* mutex_oncpu:
*
* Return true if an adaptive mutex owner is running on a CPU in the
* system. If the target is waiting on the kernel big lock, then we
* must release it. This is necessary to avoid deadlock.
*/
static bool
mutex_oncpu(uintptr_t owner)
{
struct cpu_info *ci;
lwp_t *l;
KASSERT(kpreempt_disabled());
if (!MUTEX_OWNED(owner)) {
return false;
}
/*
* See lwp_dtor() why dereference of the LWP pointer is safe.
* We must have kernel preemption disabled for that.
*/
l = (lwp_t *)MUTEX_OWNER(owner);
ci = l->l_cpu;
if (ci && ci->ci_curlwp == l) {
/* Target is running; do we need to block? */
return (atomic_load_relaxed(&ci->ci_biglock_wanted) != l);
}
/* Not running. It may be safe to block now. */
return false;
}
#endif /* MULTIPROCESSOR */
/*
* mutex_vector_enter:
*
* Support routine for mutex_enter() that must handle all cases. In
* the LOCKDEBUG case, mutex_enter() is always aliased here, even if
* fast-path stubs are available. If a mutex_spin_enter() stub is
* not available, then it is also aliased directly here.
*/
void
mutex_vector_enter(kmutex_t *mtx)
{
uintptr_t owner, curthread;
turnstile_t *ts;
#ifdef MULTIPROCESSOR
u_int count;
#endif
LOCKSTAT_COUNTER(spincnt);
LOCKSTAT_COUNTER(slpcnt);
LOCKSTAT_TIMER(spintime);
LOCKSTAT_TIMER(slptime);
LOCKSTAT_FLAG(lsflag);
/*
* Handle spin mutexes.
*/
KPREEMPT_DISABLE(curlwp);
owner = mtx->mtx_owner;
if (MUTEX_SPIN_P(owner)) {
#if defined(LOCKDEBUG) && defined(MULTIPROCESSOR)
u_int spins = 0;
#endif
KPREEMPT_ENABLE(curlwp);
MUTEX_SPIN_SPLRAISE(mtx);
MUTEX_WANTLOCK(mtx);
#ifdef FULL
if (MUTEX_SPINBIT_LOCK_TRY(mtx)) {
MUTEX_LOCKED(mtx);
return;
}
#if !defined(MULTIPROCESSOR)
MUTEX_ABORT(mtx, "locking against myself");
#else /* !MULTIPROCESSOR */
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
/*
* Spin testing the lock word and do exponential backoff
* to reduce cache line ping-ponging between CPUs.
*/
do {
while (MUTEX_SPINBIT_LOCKED_P(mtx)) {
SPINLOCK_SPIN_HOOK;
SPINLOCK_BACKOFF(count);
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins))
MUTEX_ABORT(mtx, "spinout");
#endif /* LOCKDEBUG */
}
} while (!MUTEX_SPINBIT_LOCK_TRY(mtx));
if (count != SPINLOCK_BACKOFF_MIN) {
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_EVENT(lsflag, mtx,
LB_SPIN_MUTEX | LB_SPIN, 1, spintime);
}
LOCKSTAT_EXIT(lsflag);
#endif /* !MULTIPROCESSOR */
#endif /* FULL */
MUTEX_LOCKED(mtx);
return;
}
curthread = (uintptr_t)curlwp;
MUTEX_DASSERT(mtx, MUTEX_ADAPTIVE_P(owner));
MUTEX_ASSERT(mtx, curthread != 0);
MUTEX_ASSERT(mtx, !cpu_intr_p());
MUTEX_WANTLOCK(mtx);
if (__predict_true(panicstr == NULL)) {
KDASSERT(pserialize_not_in_read_section());
LOCKDEBUG_BARRIER(&kernel_lock, 1);
}
LOCKSTAT_ENTER(lsflag);
/*
* Adaptive mutex; spin trying to acquire the mutex. If we
* determine that the owner is not running on a processor,
* then we stop spinning, and sleep instead.
*/
for (;;) {
if (!MUTEX_OWNED(owner)) {
/*
* Mutex owner clear could mean two things:
*
* * The mutex has been released.
* * The owner field hasn't been set yet.
*
* Try to acquire it again. If that fails,
* we'll just loop again.
*/
if (MUTEX_ACQUIRE(mtx, curthread))
break;
owner = mtx->mtx_owner;
continue;
}
if (__predict_false(MUTEX_OWNER(owner) == curthread)) {
MUTEX_ABORT(mtx, "locking against myself");
}
#ifdef MULTIPROCESSOR
/*
* Check to see if the owner is running on a processor.
* If so, then we should just spin, as the owner will
* likely release the lock very soon.
*/
if (mutex_oncpu(owner)) {
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
do {
KPREEMPT_ENABLE(curlwp);
SPINLOCK_BACKOFF(count);
KPREEMPT_DISABLE(curlwp);
owner = mtx->mtx_owner;
} while (mutex_oncpu(owner));
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_COUNT(spincnt, 1);
if (!MUTEX_OWNED(owner))
continue;
}
#endif
ts = turnstile_lookup(mtx);
/*
* Once we have the turnstile chain interlock, mark the
* mutex as having waiters. If that fails, spin again:
* chances are that the mutex has been released.
*/
if (!MUTEX_SET_WAITERS(mtx, owner)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
#ifdef MULTIPROCESSOR
/*
* mutex_exit() is permitted to release the mutex without
* any interlocking instructions, and the following can
* occur as a result:
*
* CPU 1: MUTEX_SET_WAITERS()          CPU 2: mutex_exit()
* ----------------------------        ----------------------------
*          ..                         load mtx->mtx_owner
*          ..                         see has-waiters bit clear
* set has-waiters bit                          ..
*          ..                         store mtx->mtx_owner := 0
* return success
*
* There is another race that can occur: a third CPU could
* acquire the mutex as soon as it is released. Since
* adaptive mutexes are primarily spin mutexes, this is not
* something that we need to worry about too much. What we
* do need to ensure is that the waiters bit gets set.
*
* To allow the unlocked release, we need to make some
* assumptions here:
*
* o Release is the only non-atomic/unlocked operation
* that can be performed on the mutex. (It must still
* be atomic on the local CPU, e.g. in case interrupted
* or preempted).
*
* o At any given time on each mutex, MUTEX_SET_WAITERS()
* can only ever be in progress on one CPU in the
* system - guaranteed by the turnstile chain lock.
*
* o No other operations other than MUTEX_SET_WAITERS()
* and release can modify a mutex with a non-zero
* owner field.
*
* o If the holding LWP switches away, it posts a store
* fence before changing curlwp, ensuring that any
* overwrite of the mutex waiters flag by mutex_exit()
* completes before the modification of curlwp becomes
* visible to this CPU.
*
* o cpu_switchto() posts a store fence after setting curlwp
* and before resuming execution of an LWP.
*
* o _kernel_lock() posts a store fence before setting
* curcpu()->ci_biglock_wanted, and after clearing it.
* This ensures that any overwrite of the mutex waiters
* flag by mutex_exit() completes before the modification
* of ci_biglock_wanted becomes visible.
*
* After MUTEX_SET_WAITERS() succeeds, simultaneously
* confirming that the same LWP still holds the mutex
* since we took the turnstile lock and notifying it that
* we're waiting, we check the lock holder's status again.
* Some of the possible outcomes (not an exhaustive list;
* XXX this should be made exhaustive):
*
* 1. The on-CPU check returns true: the holding LWP is
* running again. The lock may be released soon and
* we should spin. Importantly, we can't trust the
* value of the waiters flag.
*
* 2. The on-CPU check returns false: the holding LWP is
* not running. We now have the opportunity to check
* if mutex_exit() has blatted the modifications made
* by MUTEX_SET_WAITERS().
*
* 3. The on-CPU check returns false: the holding LWP may
* or may not be running. It has context switched at
* some point during our check. Again, we have the
* chance to see if the waiters bit is still set or
* has been overwritten.
*
* 4. The on-CPU check returns false: the holding LWP is
* running on a CPU, but wants the big lock. It's OK
* to check the waiters field in this case.
*
* 5. The has-waiters check fails: the mutex has been
* released, the waiters flag cleared and another LWP
* now owns the mutex.
*
* 6. The has-waiters check fails: the mutex has been
* released.
*
* If the waiters bit is not set it's unsafe to go to sleep,
* as we might never be awoken.
*/
if (mutex_oncpu(owner)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
membar_consumer();
if (!MUTEX_HAS_WAITERS(mtx)) {
turnstile_exit(mtx);
owner = mtx->mtx_owner;
continue;
}
#endif /* MULTIPROCESSOR */
LOCKSTAT_START_TIMER(lsflag, slptime);
turnstile_block(ts, TS_WRITER_Q, mtx, &mutex_syncobj);
LOCKSTAT_STOP_TIMER(lsflag, slptime);
LOCKSTAT_COUNT(slpcnt, 1);
owner = mtx->mtx_owner;
}
KPREEMPT_ENABLE(curlwp);
LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SLEEP1,
slpcnt, slptime);
LOCKSTAT_EVENT(lsflag, mtx, LB_ADAPTIVE_MUTEX | LB_SPIN,
spincnt, spintime);
LOCKSTAT_EXIT(lsflag);
MUTEX_DASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread);
MUTEX_LOCKED(mtx);
}
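/*
 * Illustrative sketch, not part of the original source: the usual adaptive
 * mutex life cycle as seen by a caller.  mutex_enter() normally goes through
 * the fast-path stub and only falls back to mutex_vector_enter() above on
 * contention.  The "example_lock" name is hypothetical.
 *
 *	kmutex_t example_lock;
 *
 *	mutex_init(&example_lock, MUTEX_DEFAULT, IPL_NONE);	(adaptive)
 *	mutex_enter(&example_lock);
 *	... critical section; the holder may sleep ...
 *	mutex_exit(&example_lock);
 *	mutex_destroy(&example_lock);
 */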
/*
* mutex_vector_exit:
*
* Support routine for mutex_exit() that handles all cases.
*/
void
mutex_vector_exit(kmutex_t *mtx)
{
turnstile_t *ts;
uintptr_t curthread;
if (MUTEX_SPIN_P(mtx->mtx_owner)) {
#ifdef FULL
if (__predict_false(!MUTEX_SPINBIT_LOCKED_P(mtx))) {
MUTEX_ABORT(mtx, "exiting unheld spin mutex");
}
MUTEX_UNLOCKED(mtx);
MUTEX_SPINBIT_LOCK_UNLOCK(mtx);
#endif
MUTEX_SPIN_SPLRESTORE(mtx);
return;
}
#ifndef __HAVE_MUTEX_STUBS
/*
* On some architectures without mutex stubs, we can enter here to
* release mutexes before interrupts and whatnot are up and running.
* We need this hack to keep them sweet.
*/
if (__predict_false(cold)) {
MUTEX_UNLOCKED(mtx);
MUTEX_RELEASE(mtx);
return;
}
#endif
curthread = (uintptr_t)curlwp;
MUTEX_DASSERT(mtx, curthread != 0);
MUTEX_ASSERT(mtx, MUTEX_OWNER(mtx->mtx_owner) == curthread);
MUTEX_UNLOCKED(mtx);
#if !defined(LOCKDEBUG)
__USE(curthread);
#endif
#ifdef LOCKDEBUG
/*
* Avoid having to take the turnstile chain lock every time
* around. Raise the priority level to splhigh() in order
* to disable preemption and so make the following atomic.
* This also blocks out soft interrupts that could set the
* waiters bit.
*/
{
int s = splhigh();
if (!MUTEX_HAS_WAITERS(mtx)) {
MUTEX_RELEASE(mtx);
splx(s);
return;
}
splx(s);
}
#endif
/*
* Get this lock's turnstile. This gets the interlock on
* the sleep queue. Once we have that, we can clear the
* lock. If there was no turnstile for the lock, there
* were no waiters remaining.
*/
ts = turnstile_lookup(mtx);
if (ts == NULL) {
MUTEX_RELEASE(mtx);
turnstile_exit(mtx);
} else {
MUTEX_RELEASE(mtx);
turnstile_wakeup(ts, TS_WRITER_Q,
TS_WAITERS(ts, TS_WRITER_Q), NULL);
}
}
#ifndef __HAVE_SIMPLE_MUTEXES
/*
* mutex_wakeup:
*
* Support routine for mutex_exit() that wakes up all waiters.
* We assume that the mutex has been released, but it need not
* be.
*/
void
mutex_wakeup(kmutex_t *mtx)
{
turnstile_t *ts;
ts = turnstile_lookup(mtx);
if (ts == NULL) {
turnstile_exit(mtx);
return;
}
MUTEX_CLEAR_WAITERS(mtx);
turnstile_wakeup(ts, TS_WRITER_Q, TS_WAITERS(ts, TS_WRITER_Q), NULL);
}
#endif /* !__HAVE_SIMPLE_MUTEXES */
/*
* mutex_owned:
*
* Return true if the current LWP (adaptive) or CPU (spin)
* holds the mutex.
*/
int
mutex_owned(const kmutex_t *mtx)
{
if (mtx == NULL)
return 0;
if (MUTEX_ADAPTIVE_P(mtx->mtx_owner))
return MUTEX_OWNER(mtx->mtx_owner) == (uintptr_t)curlwp;
#ifdef FULL
return MUTEX_SPINBIT_LOCKED_P(mtx);
#else
return 1;
#endif
}
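/*
 * Illustrative sketch, an assumption rather than part of the original
 * source: mutex_owned() is intended for diagnostic assertions, e.g.
 *
 *	KASSERT(mutex_owned(&sc->sc_lock));
 *
 * and not for making locking decisions at run time.
 */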
/*
* mutex_owner:
*
* Return the current owner of an adaptive mutex. Used for
* priority inheritance.
*/
static lwp_t *
mutex_owner(wchan_t wchan)
{
volatile const kmutex_t *mtx = wchan;
MUTEX_ASSERT(mtx, MUTEX_ADAPTIVE_P(mtx->mtx_owner));
return (struct lwp *)MUTEX_OWNER(mtx->mtx_owner);
}
/*
* mutex_ownable:
*
* When compiled with DEBUG and LOCKDEBUG defined, ensure that
* the mutex is available. We cannot use !mutex_owned() since
* that won't work correctly for spin mutexes.
*/
int
mutex_ownable(const kmutex_t *mtx)
{
#ifdef LOCKDEBUG
MUTEX_TESTLOCK(mtx);
#endif
return 1;
}
/*
* mutex_tryenter:
*
* Try to acquire the mutex; return non-zero if we did.
*/
int
mutex_tryenter(kmutex_t *mtx)
{
uintptr_t curthread;
/*
* Handle spin mutexes.
*/
if (MUTEX_SPIN_P(mtx->mtx_owner)) {
MUTEX_SPIN_SPLRAISE(mtx);
#ifdef FULL
if (MUTEX_SPINBIT_LOCK_TRY(mtx)) {
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
return 1;
}
MUTEX_SPIN_SPLRESTORE(mtx);
#else
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
return 1;
#endif
} else {
curthread = (uintptr_t)curlwp;
MUTEX_ASSERT(mtx, curthread != 0);
if (MUTEX_ACQUIRE(mtx, curthread)) {
MUTEX_WANTLOCK(mtx);
MUTEX_LOCKED(mtx);
MUTEX_DASSERT(mtx,
MUTEX_OWNER(mtx->mtx_owner) == curthread);
return 1;
}
}
return 0;
}
#if defined(__HAVE_SPIN_MUTEX_STUBS) || defined(FULL)
/*
* mutex_spin_retry:
*
* Support routine for mutex_spin_enter(). Assumes that the caller
* has already raised the SPL, and adjusted counters.
*/
void
mutex_spin_retry(kmutex_t *mtx)
{
#ifdef MULTIPROCESSOR
u_int count;
LOCKSTAT_TIMER(spintime);
LOCKSTAT_FLAG(lsflag);
#ifdef LOCKDEBUG
u_int spins = 0;
#endif /* LOCKDEBUG */
MUTEX_WANTLOCK(mtx);
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
count = SPINLOCK_BACKOFF_MIN;
/*
* Spin testing the lock word and do exponential backoff
* to reduce cache line ping-ponging between CPUs.
*/
do {
while (MUTEX_SPINBIT_LOCKED_P(mtx)) {
SPINLOCK_BACKOFF(count);
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins))
MUTEX_ABORT(mtx, "spinout");
#endif /* LOCKDEBUG */
}
} while (!MUTEX_SPINBIT_LOCK_TRY(mtx));
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_EVENT(lsflag, mtx, LB_SPIN_MUTEX | LB_SPIN, 1, spintime);
LOCKSTAT_EXIT(lsflag);
MUTEX_LOCKED(mtx);
#else /* MULTIPROCESSOR */
MUTEX_ABORT(mtx, "locking against myself");
#endif /* MULTIPROCESSOR */
}
#endif /* defined(__HAVE_SPIN_MUTEX_STUBS) || defined(FULL) */
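/*
 * Illustrative sketch, not part of the original source: a spin mutex is
 * created by initializing with an IPL above IPL_NONE; mutex_spin_enter()
 * and mutex_spin_exit() then raise and restore the SPL around a short,
 * non-sleeping critical section.  The "sc_intr_lock" name is hypothetical.
 *
 *	mutex_init(&sc->sc_intr_lock, MUTEX_DEFAULT, IPL_VM);
 *	mutex_spin_enter(&sc->sc_intr_lock);
 *	... brief critical section, no sleeping ...
 *	mutex_spin_exit(&sc->sc_intr_lock);
 */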
/* $NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran and Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
*/
/*
* System calls of synchronous I/O multiplexing subsystem.
*
* Locking
*
* Two locks are used: <object-lock> and selcluster_t::sc_lock.
*
* The <object-lock> might be a device driver or another subsystem, e.g.
* socket or pipe. This lock is not exported, and thus invisible to this
* subsystem. Mainly, synchronisation between selrecord() and selnotify()
* routines depends on this lock, as it will be described in the comments.
*
* Lock order
*
* <object-lock> ->
* selcluster_t::sc_lock
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_select.c,v 1.66 2023/10/15 10:29:34 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lwp.h>
#include <sys/mount.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sleepq.h>
#include <sys/socketvar.h>
#include <sys/syncobj.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/uio.h>
/* Flags for lwp::l_selflag. */
#define SEL_RESET 0 /* awoken, interrupted, or not yet polling */
#define SEL_SCANNING 1 /* polling descriptors */
#define SEL_BLOCKING 2 /* blocking and waiting for event */
#define SEL_EVENT 3 /* interrupted, events set directly */
/*
* Per-cluster state for select()/poll(). For a system with fewer
* than 64 CPUs, this gives us per-CPU clusters.
*/
#define SELCLUSTERS 64
#define SELCLUSTERMASK (SELCLUSTERS - 1)
typedef struct selcluster {
kmutex_t *sc_lock;
sleepq_t sc_sleepq;
uint64_t sc_mask;
int sc_ncoll;
} selcluster_t;
static inline int selscan(char *, const int, const size_t, register_t *);
static inline int pollscan(struct pollfd *, const int, register_t *);
static void selclear(void);
static const int sel_flag[] = {
POLLRDNORM | POLLHUP | POLLERR,
POLLWRNORM | POLLHUP | POLLERR,
POLLRDBAND
};
/*
* LWPs are woken using the sleep queue only due to a collision, the case
* with the maximum Suck Factor. Save the cost of sorting for named waiters
* by inserting in LIFO order. In the future it would be preferable to not
* enqueue LWPs at all, unless subject to a collision.
*/
syncobj_t select_sobj = {
.sobj_name = "select",
.sobj_flag = SOBJ_SLEEPQ_LIFO,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
static selcluster_t *selcluster[SELCLUSTERS] __read_mostly;
static int direct_select __read_mostly = 0;
/* Operations: either select() or poll(). */
const char selop_select[] = "select";
const char selop_poll[] = "poll";
/*
* Select system call.
*/
int
sys___pselect50(struct lwp *l, const struct sys___pselect50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) nd;
syscallarg(fd_set *) in;
syscallarg(fd_set *) ou;
syscallarg(fd_set *) ex;
syscallarg(const struct timespec *) ts;
syscallarg(sigset_t *) mask;
} */
struct timespec ats, *ts = NULL;
sigset_t amask, *mask = NULL;
int error;
if (SCARG(uap, ts)) {
error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
if (error)
return error;
ts = &ats;
}
if (SCARG(uap, mask) != NULL) {
error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
if (error)
return error;
mask = &amask;
}
return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
SCARG(uap, ou), SCARG(uap, ex), ts, mask);
}
int
sys___select50(struct lwp *l, const struct sys___select50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) nd;
syscallarg(fd_set *) in;
syscallarg(fd_set *) ou;
syscallarg(fd_set *) ex;
syscallarg(struct timeval *) tv;
} */
struct timeval atv;
struct timespec ats, *ts = NULL;
int error;
if (SCARG(uap, tv)) {
error = copyin(SCARG(uap, tv), (void *)&atv, sizeof(atv));
if (error)
return error;
if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
return EINVAL;
TIMEVAL_TO_TIMESPEC(&atv, &ats);
ts = &ats;
}
return selcommon(retval, SCARG(uap, nd), SCARG(uap, in),
SCARG(uap, ou), SCARG(uap, ex), ts, NULL);
}
/*
* sel_do_scan: common code to perform the scan on descriptors.
*/
static int
sel_do_scan(const char *opname, void *fds, const int nf, const size_t ni,
struct timespec *ts, sigset_t *mask, register_t *retval)
{
lwp_t * const l = curlwp;
selcluster_t *sc;
kmutex_t *lock;
struct timespec sleepts;
int error, timo;
timo = 0;
if (ts && inittimeleft(ts, &sleepts) == -1) {
return EINVAL;
}
if (__predict_false(mask))
sigsuspendsetup(l, mask);
/*
* We may context switch during or at any time after picking a CPU
* and cluster to associate with, but it doesn't matter. In the
* unlikely event we migrate elsewhere all we risk is a little lock
* contention; correctness is not sacrificed.
*/
sc = curcpu()->ci_data.cpu_selcluster;
lock = sc->sc_lock;
l->l_selcluster = sc;
if (opname == selop_select) {
l->l_selbits = fds;
l->l_selni = ni;
} else {
l->l_selbits = NULL;
}
for (;;) {
int ncoll;
SLIST_INIT(&l->l_selwait);
l->l_selret = 0;
/*
* No need to lock. If this is overwritten by another value
* while scanning, we will retry below. We only need to see
* exact state from the descriptors that we are about to poll,
* and lock activity resulting from fo_poll is enough to
* provide an up to date value for new polling activity.
*/
if (ts && (ts->tv_sec | ts->tv_nsec | direct_select) == 0) {
/* Non-blocking: no need for selrecord()/selclear() */
l->l_selflag = SEL_RESET;
} else {
l->l_selflag = SEL_SCANNING;
}
ncoll = sc->sc_ncoll;
membar_release();
if (opname == selop_select) {
error = selscan((char *)fds, nf, ni, retval);
} else {
error = pollscan((struct pollfd *)fds, nf, retval);
}
if (error || *retval)
break;
if (ts && (timo = gettimeleft(ts, &sleepts)) <= 0)
break;
/*
* Acquire the lock and perform the (re)checks. Note, if
* collision has occurred, then our state does not matter,
* as we must perform re-scan. Therefore, check it first.
*/
state_check:
mutex_spin_enter(lock);
if (__predict_false(sc->sc_ncoll != ncoll)) {
/* Collision: perform re-scan. */
mutex_spin_exit(lock);
selclear();
continue;
}
if (__predict_true(l->l_selflag == SEL_EVENT)) {
/* Events occurred, they are set directly. */
mutex_spin_exit(lock);
break;
}
if (__predict_true(l->l_selflag == SEL_RESET)) {
/* Events occurred, but re-scan is requested. */
mutex_spin_exit(lock);
selclear();
continue;
}
/* Nothing happened, therefore sleep. */
l->l_selflag = SEL_BLOCKING;
KASSERT(l->l_blcnt == 0);
(void)sleepq_enter(&sc->sc_sleepq, l, lock);
sleepq_enqueue(&sc->sc_sleepq, sc, opname, &select_sobj, true);
error = sleepq_block(timo, true, &select_sobj, 0);
if (error != 0) {
break;
}
/* Awoken: need to check the state. */
goto state_check;
}
selclear();
/* Add direct events if any. */
if (l->l_selflag == SEL_EVENT) {
KASSERT(l->l_selret != 0);
*retval += l->l_selret;
}
if (__predict_false(mask))
sigsuspendteardown(l);
/* select and poll are not restarted after signals... */
if (error == ERESTART)
return EINTR;
if (error == EWOULDBLOCK)
return 0;
return error;
}
int
selcommon(register_t *retval, int nd, fd_set *u_in, fd_set *u_ou,
fd_set *u_ex, struct timespec *ts, sigset_t *mask)
{
char smallbits[howmany(FD_SETSIZE, NFDBITS) *
sizeof(fd_mask) * 6];
char *bits;
int error, nf;
size_t ni;
if (nd < 0)
return (EINVAL);
nf = atomic_load_consume(&curlwp->l_fd->fd_dt)->dt_nfiles;
if (nd > nf) {
/* forgiving; slightly wrong */
nd = nf;
}
ni = howmany(nd, NFDBITS) * sizeof(fd_mask);
if (ni * 6 > sizeof(smallbits))
bits = kmem_alloc(ni * 6, KM_SLEEP);
else
bits = smallbits;
#define getbits(name, x) \
if (u_ ## name) { \
error = copyin(u_ ## name, bits + ni * x, ni); \
if (error) \
goto fail; \
} else \
memset(bits + ni * x, 0, ni);
getbits(in, 0);
getbits(ou, 1);
getbits(ex, 2);
#undef getbits
error = sel_do_scan(selop_select, bits, nd, ni, ts, mask, retval);
if (error == 0 && u_in != NULL)
error = copyout(bits + ni * 3, u_in, ni);
if (error == 0 && u_ou != NULL)
error = copyout(bits + ni * 4, u_ou, ni);
if (error == 0 && u_ex != NULL)
error = copyout(bits + ni * 5, u_ex, ni);
fail:
if (bits != smallbits)
kmem_free(bits, ni * 6);
return (error);
}
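/*
 * Layout of the bitmap buffer used by selcommon() above, as implied by the
 * getbits()/copyout() offsets and by selscan() below: 'bits' holds six
 * fd_set images of 'ni' bytes each.
 *
 *	bits + ni * 0	readfds		(copied in from u_in)
 *	bits + ni * 1	writefds	(copied in from u_ou)
 *	bits + ni * 2	exceptfds	(copied in from u_ex)
 *	bits + ni * 3	readfds		(results, copied out to u_in)
 *	bits + ni * 4	writefds	(results, copied out to u_ou)
 *	bits + ni * 5	exceptfds	(results, copied out to u_ex)
 */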
static inline int
selscan(char *bits, const int nfd, const size_t ni, register_t *retval)
{
fd_mask *ibitp, *obitp;
int msk, i, j, fd, n;
file_t *fp;
lwp_t *l;
ibitp = (fd_mask *)(bits + ni * 0);
obitp = (fd_mask *)(bits + ni * 3);
n = 0;
l = curlwp;
memset(obitp, 0, ni * 3);
for (msk = 0; msk < 3; msk++) {
for (i = 0; i < nfd; i += NFDBITS) {
fd_mask ibits, obits;
ibits = *ibitp;
obits = 0;
while ((j = ffs(ibits)) && (fd = i + --j) < nfd) {
ibits &= ~(1U << j);
if ((fp = fd_getfile(fd)) == NULL)
return (EBADF);
/*
* Setup an argument to selrecord(), which is
* a file descriptor number.
*/
l->l_selrec = fd;
if ((*fp->f_ops->fo_poll)(fp, sel_flag[msk])) {
if (!direct_select) {
/*
* Have events: do nothing in
* selrecord().
*/
l->l_selflag = SEL_RESET;
}
obits |= (1U << j);
n++;
}
fd_putfile(fd);
}
if (obits != 0) {
if (direct_select) {
kmutex_t *lock;
lock = l->l_selcluster->sc_lock;
mutex_spin_enter(lock);
*obitp |= obits;
mutex_spin_exit(lock);
} else {
*obitp |= obits;
}
}
ibitp++;
obitp++;
}
}
*retval = n;
return (0);
}
/*
* Poll system call.
*/
int
sys_poll(struct lwp *l, const struct sys_poll_args *uap, register_t *retval)
{
/* {
syscallarg(struct pollfd *) fds;
syscallarg(u_int) nfds;
syscallarg(int) timeout;
} */
struct timespec ats, *ts = NULL;
if (SCARG(uap, timeout) != INFTIM) {
ats.tv_sec = SCARG(uap, timeout) / 1000;
ats.tv_nsec = (SCARG(uap, timeout) % 1000) * 1000000;
ts = &ats;
}
return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, NULL);
}
/*
* Poll system call.
*/
int
sys___pollts50(struct lwp *l, const struct sys___pollts50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct pollfd *) fds;
syscallarg(u_int) nfds;
syscallarg(const struct timespec *) ts;
syscallarg(const sigset_t *) mask;
} */
struct timespec ats, *ts = NULL;
sigset_t amask, *mask = NULL;
int error;
if (SCARG(uap, ts)) {
error = copyin(SCARG(uap, ts), &ats, sizeof(ats));
if (error)
return error;
ts = &ats;
}
if (SCARG(uap, mask)) {
error = copyin(SCARG(uap, mask), &amask, sizeof(amask));
if (error)
return error;
mask = &amask;
}
return pollcommon(retval, SCARG(uap, fds), SCARG(uap, nfds), ts, mask);
}
int
pollcommon(register_t *retval, struct pollfd *u_fds, u_int nfds,
struct timespec *ts, sigset_t *mask)
{
struct pollfd smallfds[32];
struct pollfd *fds;
int error;
size_t ni;
if (nfds > curlwp->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_max + 1000) {
/*
* Prevent userland from causing over-allocation.
* Raising the default limit too high can still cause
* a lot of memory to be allocated, but this also means
* that the file descriptor array will also be large.
*
* To reduce the memory requirements here, we could
* process the 'fds' array in chunks, but that
* is a lot of code that isn't normally useful.
* (Or just move the copyin/out into pollscan().)
*
* Historically the code silently truncated 'fds' to
* dt_nfiles entries - but that does cause issues.
*
* Using the max limit equivalent to sysctl
* kern.maxfiles is the moral equivalent of OPEN_MAX
* as specified by POSIX.
*
* We add a slop of 1000 in case the resource limit was
* changed after opening descriptors or the same descriptor
* was specified more than once.
*/
return EINVAL;
}
ni = nfds * sizeof(struct pollfd);
if (ni > sizeof(smallfds))
fds = kmem_alloc(ni, KM_SLEEP);
else
fds = smallfds;
error = copyin(u_fds, fds, ni);
if (error)
goto fail;
error = sel_do_scan(selop_poll, fds, nfds, ni, ts, mask, retval);
if (error == 0)
error = copyout(fds, u_fds, ni);
fail:
if (fds != smallfds)
kmem_free(fds, ni);
return (error);
}
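/*
 * Illustrative sketch, not part of the original source: the userland view
 * of the pollfd array handled above (timeout given in milliseconds).
 *
 *	struct pollfd pfd = { .fd = sock, .events = POLLIN };
 *	int n = poll(&pfd, 1, 1000);
 *
 *	if (n > 0 && (pfd.revents & POLLIN) != 0)
 *		... descriptor is readable ...
 */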
static inline int
pollscan(struct pollfd *fds, const int nfd, register_t *retval)
{
file_t *fp;
int i, n = 0, revents;
for (i = 0; i < nfd; i++, fds++) {
fds->revents = 0;
if (fds->fd < 0) {
revents = 0;
} else if ((fp = fd_getfile(fds->fd)) == NULL) {
revents = POLLNVAL;
} else {
/*
* Perform poll: registers select request or returns
* the events which are set. Setup an argument for
* selrecord(), which is a pointer to struct pollfd.
*/
curlwp->l_selrec = (uintptr_t)fds;
revents = (*fp->f_ops->fo_poll)(fp,
fds->events | POLLERR | POLLHUP);
fd_putfile(fds->fd);
}
if (revents) {
if (!direct_select) {
/* Have events: do nothing in selrecord(). */
curlwp->l_selflag = SEL_RESET;
}
fds->revents = revents;
n++;
}
}
*retval = n;
return (0);
}
int
seltrue(dev_t dev, int events, lwp_t *l)
{
return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
/*
* Record a select request. Concurrency issues:
*
* The caller holds the same lock across calls to selrecord() and
* selnotify(), so we don't need to consider a concurrent wakeup
* while in this routine.
*
* The only activity we need to guard against is selclear(), called by
* another thread that is exiting sel_do_scan().
* `sel_lwp' can only become non-NULL while the caller's lock is held,
* so it cannot become non-NULL due to a change made by another thread
* while we are in this routine. It can only become _NULL_ due to a
* call to selclear().
*
* If it is non-NULL and != selector there is the potential for
* selclear() to be called by another thread. If either of those
* conditions are true, we're not interested in touching the `named
* waiter' part of the selinfo record because we need to record a
* collision. Hence there is no need for additional locking in this
* routine.
*/
void
selrecord(lwp_t *selector, struct selinfo *sip)
{
selcluster_t *sc;
lwp_t *other;
KASSERT(selector == curlwp);
sc = selector->l_selcluster;
other = sip->sel_lwp;
if (selector->l_selflag == SEL_RESET) {
/* 0. We're not going to block - will poll again if needed. */
} else if (other == selector) {
/* 1. We (selector) already claimed to be the first LWP. */
KASSERT(sip->sel_cluster == sc);
} else if (other == NULL) {
/*
* 2. No first LWP, therefore we (selector) are the first.
*
* There may be unnamed waiters (collisions). Issue a memory
* barrier to ensure that we access sel_lwp (above) before
* other fields - this guards against a call to selclear().
*/
membar_acquire();
sip->sel_lwp = selector;
SLIST_INSERT_HEAD(&selector->l_selwait, sip, sel_chain);
/* Copy the argument, which is for selnotify(). */
sip->sel_fdinfo = selector->l_selrec;
/* Replace selinfo's lock with the chosen cluster's lock. */
sip->sel_cluster = sc;
} else {
/* 3. Multiple waiters: record a collision. */
sip->sel_collision |= sc->sc_mask;
KASSERT(sip->sel_cluster != NULL);
}
}
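/*
 * Illustrative sketch, an assumption rather than part of the original
 * source: a driver's poll routine records the calling LWP when no event is
 * pending, and the producer side later calls selnotify() on the same
 * selinfo while holding the same object lock.  The softc, cfdriver and
 * field names are hypothetical.
 *
 *	int
 *	foo_poll(dev_t dev, int events, lwp_t *l)
 *	{
 *		struct foo_softc *sc =
 *		    device_lookup_private(&foo_cd, minor(dev));
 *		int revents = 0;
 *
 *		mutex_enter(&sc->sc_lock);
 *		if ((events & (POLLIN | POLLRDNORM)) != 0) {
 *			if (sc->sc_havedata)
 *				revents |= events & (POLLIN | POLLRDNORM);
 *			else
 *				selrecord(l, &sc->sc_rsel);
 *		}
 *		mutex_exit(&sc->sc_lock);
 *		return revents;
 *	}
 *
 * and, when data arrives (with sc_lock held):
 *
 *	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, NOTE_SUBMIT);
 */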
/*
* Record a knote.
*
* The caller holds the same lock as for selrecord().
*/
void
selrecord_knote(struct selinfo *sip, struct knote *kn)
{
klist_insert(&sip->sel_klist, kn);
}
/*
* Remove a knote.
*
* The caller holds the same lock as for selrecord().
*
* Returns true if the last knote was removed and the list
* is now empty.
*/
bool
selremove_knote(struct selinfo *sip, struct knote *kn)
{
return klist_remove(&sip->sel_klist, kn);
}
/*
* sel_setevents: a helper function for selnotify(), to set the events
* for LWP sleeping in selcommon() or pollcommon().
*/
static inline bool
sel_setevents(lwp_t *l, struct selinfo *sip, const int events)
{
const int oflag = l->l_selflag;
int ret = 0;
/*
* If we require re-scan or it was required by somebody else,
* then just (re)set SEL_RESET and return.
*/
if (__predict_false(events == 0 || oflag == SEL_RESET)) {
l->l_selflag = SEL_RESET;
return true;
}
/*
* Direct set. Note: select state of LWP is locked. First,
* determine whether it is selcommon() or pollcommon().
*/
if (l->l_selbits != NULL) {
const size_t ni = l->l_selni;
fd_mask *fds = (fd_mask *)l->l_selbits;
fd_mask *ofds = (fd_mask *)((char *)fds + ni * 3);
const int fd = sip->sel_fdinfo, fbit = 1 << (fd & __NFDMASK);
const int idx = fd >> __NFDSHIFT;
int n;
for (n = 0; n < 3; n++) {
if ((fds[idx] & fbit) != 0 && (ofds[idx] & fbit) == 0 &&
(sel_flag[n] & events)) {
ofds[idx] |= fbit;
ret++;
}
fds = (fd_mask *)((char *)fds + ni);
ofds = (fd_mask *)((char *)ofds + ni);
}
} else {
struct pollfd *pfd = (void *)sip->sel_fdinfo;
int revents = events & (pfd->events | POLLERR | POLLHUP);
if (revents) {
if (pfd->revents == 0)
ret = 1;
pfd->revents |= revents;
}
}
/* Check whether there are any events to return. */
if (!ret) {
return false;
}
/* Indicate direct set and note the event (cluster lock is held). */
l->l_selflag = SEL_EVENT;
l->l_selret += ret;
return true;
}
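/*
 * Worked example of the index arithmetic above, assuming a 32-bit fd_mask
 * (__NFDSHIFT == 5, __NFDMASK == 31): for fd 70,
 *
 *	idx  = 70 >> __NFDSHIFT       = 2
 *	fbit = 1 << (70 & __NFDMASK)  = 1 << 6 = 0x40
 *
 * so bit 6 of the third fd_mask word is tested in each input set and, on a
 * match with the notified events, set in the corresponding result set.
 */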
/*
* Do a wakeup when a selectable event occurs. Concurrency issues:
*
* As per selrecord(), the caller's object lock is held. If there
* is a named waiter, we must acquire the associated selcluster's lock
* in order to synchronize with selclear() and pollers going to sleep
* in sel_do_scan().
*
* sip->sel_cluster cannot change at this point, as it is only changed
* in selrecord(), and concurrent calls to selrecord() are locked
* out by the caller.
*/
void
selnotify(struct selinfo *sip, int events, long knhint)
{
selcluster_t *sc;
uint64_t mask;
int index, oflag;
lwp_t *l;
kmutex_t *lock;
KNOTE(&sip->sel_klist, knhint);
if (sip->sel_lwp != NULL) {
/* One named LWP is waiting. */
sc = sip->sel_cluster;
lock = sc->sc_lock;
mutex_spin_enter(lock);
/* Still there? */
if (sip->sel_lwp != NULL) {
/*
* Set the events for our LWP and indicate that.
* Otherwise, request for a full re-scan.
*/
l = sip->sel_lwp;
oflag = l->l_selflag;
if (!direct_select) {
l->l_selflag = SEL_RESET;
} else if (!sel_setevents(l, sip, events)) {
/* No events to return. */
mutex_spin_exit(lock);
return;
}
/*
* If thread is sleeping, wake it up. If it's not
* yet asleep, it will notice the change in state
* and will re-poll the descriptors.
*/
if (oflag == SEL_BLOCKING && l->l_mutex == lock) {
KASSERT(l->l_wchan == sc);
sleepq_remove(l->l_sleepq, l, true);
}
}
mutex_spin_exit(lock);
}
if ((mask = sip->sel_collision) != 0) {
/*
* There was a collision (multiple waiters): we must
* inform all potentially interested waiters.
*/
sip->sel_collision = 0;
do {
index = ffs64(mask) - 1;
mask ^= __BIT(index);
sc = selcluster[index];
lock = sc->sc_lock;
mutex_spin_enter(lock);
sc->sc_ncoll++;
sleepq_wake(&sc->sc_sleepq, sc, (u_int)-1, lock);
} while (__predict_false(mask != 0));
}
}
/*
* Remove an LWP from all objects that it is waiting for. Concurrency
* issues:
*
* The object owner's (e.g. device driver) lock is not held here. Calls
* can be made to selrecord() and we do not synchronize against those
* directly using locks. However, we use `sel_lwp' to lock out changes.
* Before clearing it we must use memory barriers to ensure that we can
* safely traverse the list of selinfo records.
*/
static void
selclear(void)
{
struct selinfo *sip, *next;
selcluster_t *sc;
lwp_t *l;
kmutex_t *lock;
l = curlwp;
sc = l->l_selcluster;
lock = sc->sc_lock;
/*
* If the request was non-blocking, or we found events on the first
* descriptor, there will be no need to clear anything - avoid
* taking the lock.
*/
if (SLIST_EMPTY(&l->l_selwait)) {
return;
}
mutex_spin_enter(lock);
for (sip = SLIST_FIRST(&l->l_selwait); sip != NULL; sip = next) {
KASSERT(sip->sel_lwp == l);
KASSERT(sip->sel_cluster == l->l_selcluster);
/*
* Read link to next selinfo record, if any.
* It's no longer safe to touch `sip' after clearing
* `sel_lwp', so ensure that the read of `sel_chain'
* completes before the clearing of sel_lwp becomes
* globally visible.
*/
next = SLIST_NEXT(sip, sel_chain);
/* Release the record for another named waiter to use. */
atomic_store_release(&sip->sel_lwp, NULL);
}
mutex_spin_exit(lock);
}
/*
* Initialize the select/poll system calls. Called once for each
* CPU in the system, as they are attached.
*/
void
selsysinit(struct cpu_info *ci)
{
selcluster_t *sc;
u_int index;
/* If already a cluster in place for this bit, re-use. */
index = cpu_index(ci) & SELCLUSTERMASK;
sc = selcluster[index];
if (sc == NULL) {
sc = kmem_alloc(roundup2(sizeof(selcluster_t),
coherency_unit) + coherency_unit, KM_SLEEP);
sc = (void *)roundup2((uintptr_t)sc, coherency_unit);
sc->sc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
sleepq_init(&sc->sc_sleepq);
sc->sc_ncoll = 0;
sc->sc_mask = __BIT(index);
selcluster[index] = sc;
}
ci->ci_data.cpu_selcluster = sc;
}
/*
* Initialize a selinfo record.
*/
void
selinit(struct selinfo *sip)
{
memset(sip, 0, sizeof(*sip));
klist_init(&sip->sel_klist);
}
/*
* Destroy a selinfo record. The owning object must not gain new
* references while this is in progress: all activity on the record
* must be stopped.
*
* Concurrency issues: we only need guard against a call to selclear()
* by a thread exiting sel_do_scan(). The caller has prevented further
* references being made to the selinfo record via selrecord(), and it
* will not call selnotify() again.
*/
void
seldestroy(struct selinfo *sip)
{
selcluster_t *sc;
kmutex_t *lock;
lwp_t *l;
klist_fini(&sip->sel_klist);
if (sip->sel_lwp == NULL)
return;
/*
* Lock out selclear(). The selcluster pointer can't change while
* we are here since it is only ever changed in selrecord(),
* and that will not be entered again for this record because
* it is dying.
*/
KASSERT(sip->sel_cluster != NULL);
sc = sip->sel_cluster;
lock = sc->sc_lock;
mutex_spin_enter(lock);
if ((l = sip->sel_lwp) != NULL) {
/*
* This should rarely happen, so although SLIST_REMOVE()
* is slow, using it here is not a problem.
*/
KASSERT(l->l_selcluster == sc);
SLIST_REMOVE(&l->l_selwait, sip, selinfo, sel_chain);
sip->sel_lwp = NULL;
}
mutex_spin_exit(lock);
}
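/*
 * Illustrative sketch, an assumption rather than part of the original
 * source: the usual selinfo life cycle in a driver, with hypothetical
 * names.
 *
 *	attach:		selinit(&sc->sc_rsel);
 *	event source:	selnotify(&sc->sc_rsel, POLLIN | POLLRDNORM, 0);
 *	poll routine:	selrecord(l, &sc->sc_rsel);
 *	detach:		seldestroy(&sc->sc_rsel);
 *
 * As noted above, seldestroy() may only run once all selrecord() and
 * selnotify() activity on the record has stopped.
 */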
/*
* System control nodes.
*/
SYSCTL_SETUP(sysctl_select_setup, "sysctl select setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "direct_select",
SYSCTL_DESCR("Enable/disable direct select (for testing)"),
NULL, 0, &direct_select, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: tty_conf.c,v 1.57 2021/08/09 20:49:10 andvar Exp $ */
/*-
* Copyright (c) 2005, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tty_conf.c 8.5 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_conf.c,v 1.57 2021/08/09 20:49:10 andvar Exp $");
#define TTY_ALLOW_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/ttycom.h>
#include <sys/conf.h>
#include <sys/mutex.h>
#include <sys/queue.h>
static struct linesw termios_disc = {
.l_name = "termios",
.l_open = ttylopen,
.l_close = ttylclose,
.l_read = ttread,
.l_write = ttwrite,
.l_ioctl = ttynullioctl,
.l_rint = ttyinput,
.l_start = ttstart,
.l_modem = ttymodem,
.l_poll = ttpoll
};
/*
* This is for the benefit of old BSD TTY compatibility, but since it is
* identical to termios (except for the name), don't bother conditionalizing
* it.
*/
static struct linesw ntty_disc = { /* old NTTYDISC */
.l_name = "ntty",
.l_open = ttylopen,
.l_close = ttylclose,
.l_read = ttread,
.l_write = ttwrite,
.l_ioctl = ttynullioctl,
.l_rint = ttyinput,
.l_start = ttstart,
.l_modem = ttymodem,
.l_poll = ttpoll
};
static LIST_HEAD(, linesw) ttyldisc_list = LIST_HEAD_INITIALIZER(ttyldisc_head);
/*
* Note: We don't bother refcounting termios_disc and ntty_disc; they can't
* be removed from the list, and termios_disc is likely to have very many
* references (could we overflow the count?).
*/
#define TTYLDISC_ISSTATIC(disc) \
((disc) == &termios_disc || (disc) == &ntty_disc)
#define TTYLDISC_HOLD(disc) \
do { \
if (! TTYLDISC_ISSTATIC(disc)) { \
KASSERT((disc)->l_refcnt != UINT_MAX); \
(disc)->l_refcnt++; \
} \
} while (/*CONSTCOND*/0)
#define TTYLDISC_RELE(disc) \
do { \
if (! TTYLDISC_ISSTATIC(disc)) { \
KASSERT((disc)->l_refcnt != 0); \
(disc)->l_refcnt--; \
} \
} while (/*CONSTCOND*/0)
#define TTYLDISC_ISINUSE(disc) \
(TTYLDISC_ISSTATIC(disc) || (disc)->l_refcnt != 0)
/*
* Do nothing specific version of line
* discipline specific ioctl command.
*/
/*ARGSUSED*/
int
ttynullioctl(struct tty *tp, u_long cmd, void *data, int flags, struct lwp *l)
{
return (EPASSTHROUGH);
}
/*
* Return error to line discipline
* specific poll call.
*/
/*ARGSUSED*/
int
ttyerrpoll(struct tty *tp, int events, struct lwp *l)
{
return (POLLERR);
}
void
ttyldisc_init(void)
{
if (ttyldisc_attach(&termios_disc) != 0)
panic("ttyldisc_init: termios_disc");
if (ttyldisc_attach(&ntty_disc) != 0)
panic("ttyldisc_init: ntty_disc");
}
static struct linesw *
ttyldisc_lookup_locked(const char *name)
{
struct linesw *disc;
LIST_FOREACH(disc, &ttyldisc_list, l_list) {
if (strcmp(name, disc->l_name) == 0)
return (disc);
}
return (NULL);
}
/*
* Look up a line discipline by its name. Caller holds a reference on
* the returned line discipline.
*/
struct linesw *
ttyldisc_lookup(const char *name)
{
struct linesw *disc;
mutex_spin_enter(&tty_lock);
disc = ttyldisc_lookup_locked(name);
if (disc != NULL)
TTYLDISC_HOLD(disc);
mutex_spin_exit(&tty_lock);
return (disc);
}
/*
* Look up a line discipline by its legacy number. Caller holds a
* reference on the returned line discipline.
*/
struct linesw *
ttyldisc_lookup_bynum(int num)
{
struct linesw *disc;
mutex_spin_enter(&tty_lock);
LIST_FOREACH(disc, &ttyldisc_list, l_list) {
if (disc->l_no == num) {
TTYLDISC_HOLD(disc);
mutex_spin_exit(&tty_lock);
return (disc);
}
}
mutex_spin_exit(&tty_lock);
return (NULL);
}
/*
* Release a reference on a line discipline previously added by
* ttyldisc_lookup() or ttyldisc_lookup_bynum().
*/
void
ttyldisc_release(struct linesw *disc)
{
if (disc == NULL)
return;
mutex_spin_enter(&tty_lock);
TTYLDISC_RELE(disc);
mutex_spin_exit(&tty_lock);
}
#define TTYLDISC_LEGACY_NUMBER_MIN 10
#define TTYLDISC_LEGACY_NUMBER_MAX INT_MAX
static void
ttyldisc_assign_legacy_number(struct linesw *disc)
{
static const struct {
const char *name;
int num;
} table[] = {
{ "termios", TTYDISC },
{ "ntty", 2 /* XXX old NTTYDISC */ },
{ "tablet", TABLDISC },
{ "slip", SLIPDISC },
{ "ppp", PPPDISC },
{ "strip", STRIPDISC },
{ "hdlc", HDLCDISC },
{ NULL, 0 }
};
struct linesw *ldisc;
int i;
for (i = 0; table[i].name != NULL; i++) {
if (strcmp(disc->l_name, table[i].name) == 0) {
disc->l_no = table[i].num;
return;
}
}
disc->l_no = TTYLDISC_LEGACY_NUMBER_MIN;
LIST_FOREACH(ldisc, &ttyldisc_list, l_list) {
if (disc->l_no == ldisc->l_no) {
KASSERT(disc->l_no < TTYLDISC_LEGACY_NUMBER_MAX);
disc->l_no++;
}
}
}
/*
* Register a line discipline.
*/
int
ttyldisc_attach(struct linesw *disc)
{
KASSERT(disc->l_name != NULL);
KASSERT(disc->l_open != NULL);
KASSERT(disc->l_close != NULL);
KASSERT(disc->l_read != NULL);
KASSERT(disc->l_write != NULL);
KASSERT(disc->l_ioctl != NULL);
KASSERT(disc->l_rint != NULL);
KASSERT(disc->l_start != NULL);
KASSERT(disc->l_modem != NULL);
KASSERT(disc->l_poll != NULL);
/* You are not allowed to exceed TTLINEDNAMELEN */
if (strlen(disc->l_name) >= TTLINEDNAMELEN)
return (ENAMETOOLONG);
mutex_spin_enter(&tty_lock);
if (ttyldisc_lookup_locked(disc->l_name) != NULL) {
mutex_spin_exit(&tty_lock);
return (EEXIST);
}
ttyldisc_assign_legacy_number(disc);
LIST_INSERT_HEAD(&ttyldisc_list, disc, l_list);
mutex_spin_exit(&tty_lock);
return (0);
}
/*
* Remove a line discipline.
*/
int
ttyldisc_detach(struct linesw *disc)
{
#ifdef DIAGNOSTIC
struct linesw *ldisc = ttyldisc_lookup(disc->l_name);
KASSERT(ldisc != NULL);
KASSERT(ldisc == disc);
ttyldisc_release(ldisc);
#endif
mutex_spin_enter(&tty_lock);
if (TTYLDISC_ISINUSE(disc)) {
mutex_spin_exit(&tty_lock);
return (EBUSY);
}
LIST_REMOVE(disc, l_list);
mutex_spin_exit(&tty_lock);
return (0);
}
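/*
 * Illustrative sketch, an assumption rather than part of the original
 * source: an external line discipline supplies every callback checked by
 * ttyldisc_attach() and registers/unregisters itself when loaded and
 * unloaded.  The "foo" names are hypothetical; the tt*() helpers are the
 * stock routines used by the static disciplines above.
 *
 *	static struct linesw foo_disc = {
 *		.l_name = "foo",
 *		.l_open = fooopen,
 *		.l_close = fooclose,
 *		.l_read = ttread,
 *		.l_write = ttwrite,
 *		.l_ioctl = ttynullioctl,
 *		.l_rint = fooinput,
 *		.l_start = ttstart,
 *		.l_modem = ttymodem,
 *		.l_poll = ttpoll,
 *	};
 *
 *	load:	error = ttyldisc_attach(&foo_disc);
 *	unload:	error = ttyldisc_detach(&foo_disc);	(EBUSY while in use)
 */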
/*
* Return the default line discipline.
*/
struct linesw *
ttyldisc_default(void)
{
return (&termios_disc);
}
/* $NetBSD: sys_syscall.c,v 1.15 2022/06/29 16:33:09 hannken Exp $ */
/*-
* Copyright (c) 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by David Laight.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_syscall.c,v 1.15 2022/06/29 16:33:09 hannken Exp $");
#include <sys/syscall_stats.h>
#include <sys/syscallvar.h>
/*
* MI indirect system call support.
* Included from sys_indirect.c and compat/netbsd32/netbsd32_indirect.c
*
* SYS_SYSCALL is set to the required function name.
*/
#define CONCAT(a,b) __CONCAT(a,b)
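/*
 * Illustrative sketch, an assumption not verified against the including
 * files: each includer is expected to define SYS_SYSCALL to the handler it
 * wants generated before pulling this file in, roughly:
 *
 *	#define	SYS_SYSCALL	sys_syscall
 *	#include "sys_syscall.c"
 *	#undef	SYS_SYSCALL
 *
 * and again with SYS_SYSCALL defined to sys___syscall for the second
 * indirect entry point.
 */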
static void
CONCAT(SYS_SYSCALL, _biglockcheck)(struct proc *p, int code)
{
#ifdef DIAGNOSTIC
kpreempt_disable(); /* make curcpu() stable */
KASSERTMSG(curcpu()->ci_biglock_count == 0,
"syscall %ld of emul %s leaked %d kernel locks",
(long)code, p->p_emul->e_name, curcpu()->ci_biglock_count);
kpreempt_enable();
#endif
}
int
SYS_SYSCALL(struct lwp *l, const struct CONCAT(SYS_SYSCALL, _args) *uap,
register_t *rval)
{
/* {
syscallarg(int) code;
syscallarg(register_t) args[SYS_MAXSYSARGS];
} */
const struct sysent *callp;
struct proc *p = l->l_proc;
int code;
int error;
#ifdef NETBSD32_SYSCALL
register_t args64[SYS_MAXSYSARGS];
int i, narg;
#define TRACE_ARGS args64
#else
#define TRACE_ARGS &SCARG(uap, args[0])
#endif
callp = p->p_emul->e_sysent;
code = SCARG(uap, code) & (SYS_NSYSENT - 1);
SYSCALL_COUNT(syscall_counts, code);
callp += code;
if (__predict_false(callp->sy_flags & SYCALL_INDIRECT))
return ENOSYS;
if (__predict_true(!p->p_trace_enabled)) {
error = sy_call(callp, l, &uap->args, rval);
CONCAT(SYS_SYSCALL, _biglockcheck)(p, code);
return error;
}
#ifdef NETBSD32_SYSCALL
narg = callp->sy_narg;
for (i = 0; i < narg; i++)
args64[i] = SCARG(uap, args[i]);
#endif
error = trace_enter(code, callp, TRACE_ARGS);
if (__predict_true(error == 0))
error = sy_call(callp, l, &uap->args, rval);
trace_exit(code, callp, &uap->args, rval, error);
CONCAT(SYS_SYSCALL, _biglockcheck)(p, code);
return error;
#undef TRACE_ARGS
}
/* $NetBSD: ip_output.c,v 1.326 2023/04/19 22:00:18 mlelstv Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_output.c 8.3 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_output.c,v 1.326 2023/04/19 22:00:18 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_mrouting.h"
#include "opt_net_mpsafe.h"
#include "opt_mpls.h"
#endif
#include "arp.h"
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/kauth.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/ip_private.h>
#include <netinet/in_offload.h>
#include <netinet/portalgo.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
#endif
#ifdef MROUTING
#include <netinet/ip_mroute.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#include <netmpls/mpls_var.h>
#endif
static int ip_pcbopts(struct inpcb *, const struct sockopt *);
static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
static struct ifnet *ip_multicast_if(struct in_addr *, int *);
static void ip_mloopback(struct ifnet *, struct mbuf *,
const struct sockaddr_in *);
static int ip_ifaddrvalid(const struct in_ifaddr *);
extern pfil_head_t *inet_pfil_hook; /* XXX */
int ip_do_loopback_cksum = 0;
static int
ip_mark_mpls(struct ifnet * const ifp, struct mbuf * const m,
const struct rtentry *rt)
{
int error = 0;
#ifdef MPLS
union mpls_shim msh;
if (rt == NULL || rt_gettag(rt) == NULL ||
rt_gettag(rt)->sa_family != AF_MPLS ||
(m->m_flags & (M_MCAST | M_BCAST)) != 0 ||
ifp->if_type != IFT_ETHER)
return 0;
msh.s_addr = MPLS_GETSADDR(rt);
if (msh.shim.label != MPLS_LABEL_IMPLNULL) {
struct m_tag *mtag;
/*
* XXX tentative solution to tell ether_output
* it's MPLS. Need some more efficient solution.
*/
mtag = m_tag_get(PACKET_TAG_MPLS,
sizeof(int) /* dummy */,
M_NOWAIT);
if (mtag == NULL)
return ENOMEM;
m_tag_prepend(m, mtag);
}
#endif
return error;
}
/*
* Send an IP packet to a host.
*/
int
ip_if_output(struct ifnet * const ifp, struct mbuf * const m,
const struct sockaddr * const dst, const struct rtentry *rt)
{
int error = 0;
if (rt != NULL) {
error = rt_check_reject_route(rt, ifp);
if (error != 0) {
IP_STATINC(IP_STAT_RTREJECT);
m_freem(m);
return error;
}
}
error = ip_mark_mpls(ifp, m, rt);
if (error != 0) {
m_freem(m);
return error;
}
error = if_output_lock(ifp, ifp, m, dst, rt);
return error;
}
/*
* IP output. The packet in mbuf chain m contains a skeletal IP
* header (with len, off, ttl, proto, tos, src, dst).
* The mbuf chain containing the packet will be freed.
* The mbuf opt, if present, will not be freed.
*/
int
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
struct ip_moptions *imo, struct inpcb *inp)
{
struct rtentry *rt;
struct ip *ip;
struct ifnet *ifp, *mifp = NULL;
struct mbuf *m = m0;
int len, hlen, error = 0;
struct route iproute;
const struct sockaddr_in *dst;
struct in_ifaddr *ia = NULL;
struct ifaddr *ifa;
int isbroadcast;
int sw_csum;
u_long mtu;
bool natt_frag = false;
bool rtmtu_nolock;
union {
struct sockaddr sa;
struct sockaddr_in sin;
} udst, usrc;
struct sockaddr *rdst = &udst.sa; /* real IP destination, as
* opposed to the nexthop
*/
struct psref psref, psref_ia;
int bound;
bool bind_need_restore = false;
const struct sockaddr *sa;
len = 0;
MCLAIM(m, &ip_tx_mowner);
KASSERT((m->m_flags & M_PKTHDR) != 0);
KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv6|M_CSUM_UDPv6)) == 0);
KASSERT((m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) !=
(M_CSUM_TCPv4|M_CSUM_UDPv4));
KASSERT(m->m_len >= sizeof(struct ip));
hlen = sizeof(struct ip);
if (opt) {
m = ip_insertoptions(m, opt, &len);
hlen = len;
}
ip = mtod(m, struct ip *);
/*
* Fill in IP header.
*/
if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
ip->ip_v = IPVERSION;
ip->ip_off = htons(0);
/* ip->ip_id filled in after we find out source ia */
ip->ip_hl = hlen >> 2;
IP_STATINC(IP_STAT_LOCALOUT);
} else {
hlen = ip->ip_hl << 2;
}
/*
* Route packet.
*/
if (ro == NULL) {
memset(&iproute, 0, sizeof(iproute));
ro = &iproute;
}
sockaddr_in_init(&udst.sin, &ip->ip_dst, 0);
dst = satocsin(rtcache_getdst(ro));
/*
* If there is a cached route, check that it is to the same
* destination and is still up. If not, free it and try again.
* The address family should also be checked in case of sharing
* the cache with IPv6.
*/
if (dst && (dst->sin_family != AF_INET ||
!in_hosteq(dst->sin_addr, ip->ip_dst)))
rtcache_free(ro);
/* XXX must be before rtcache operations */
bound = curlwp_bind();
bind_need_restore = true;
if ((rt = rtcache_validate(ro)) == NULL &&
(rt = rtcache_update(ro, 1)) == NULL) {
dst = &udst.sin;
error = rtcache_setdst(ro, &udst.sa);
if (error != 0) {
IP_STATINC(IP_STAT_ODROPPED);
goto bad;
}
}
/*
* If routing to interface only, short circuit routing lookup.
*/
if (flags & IP_ROUTETOIF) {
ifa = ifa_ifwithladdr_psref(sintocsa(dst), &psref_ia);
if (ifa == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/* ia is already referenced by psref_ia */
ia = ifatoia(ifa);
ifp = ia->ia_ifp;
mtu = ifp->if_mtu;
ip->ip_ttl = 1;
isbroadcast = in_broadcast(dst->sin_addr, ifp);
} else if (((IN_MULTICAST(ip->ip_dst.s_addr) ||
ip->ip_dst.s_addr == INADDR_BROADCAST) ||
(flags & IP_ROUTETOIFINDEX)) && imo != NULL && imo->imo_multicast_if_index != 0) {
ifp = mifp = if_get_byindex(imo->imo_multicast_if_index, &psref);
if (ifp == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
mtu = ifp->if_mtu;
ia = in_get_ia_from_ifp_psref(ifp, &psref_ia);
if (IN_MULTICAST(ip->ip_dst.s_addr) ||
ip->ip_dst.s_addr == INADDR_BROADCAST) {
isbroadcast = 0;
} else {
/* IP_ROUTETOIFINDEX */
isbroadcast = in_broadcast(dst->sin_addr, ifp);
if ((isbroadcast == 0) && ((ifp->if_flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) == 0) &&
(in_direct(dst->sin_addr, ifp) == 0)) {
/* gateway address required */
if (rt == NULL)
rt = rtcache_init(ro);
if (rt == NULL || rt->rt_ifp != ifp) {
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
rt->rt_use++;
if (rt->rt_flags & RTF_GATEWAY)
dst = satosin(rt->rt_gateway);
if (rt->rt_flags & RTF_HOST)
isbroadcast =
rt->rt_flags & RTF_BROADCAST;
}
}
} else {
if (rt == NULL)
rt = rtcache_init(ro);
if (rt == NULL) {
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
if (ifa_is_destroying(rt->rt_ifa)) {
rtcache_unref(rt, ro);
rt = NULL;
IP_STATINC(IP_STAT_NOROUTE);
error = EHOSTUNREACH;
goto bad;
}
ifa_acquire(rt->rt_ifa, &psref_ia);
ia = ifatoia(rt->rt_ifa);
ifp = rt->rt_ifp;
if ((mtu = rt->rt_rmx.rmx_mtu) == 0)
mtu = ifp->if_mtu;
rt->rt_use++;
if (rt->rt_flags & RTF_GATEWAY)
dst = satosin(rt->rt_gateway);
if (rt->rt_flags & RTF_HOST)
isbroadcast = rt->rt_flags & RTF_BROADCAST;
else
isbroadcast = in_broadcast(dst->sin_addr, ifp);
}
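/*
 * Note whether the route's MTU is administratively locked; Path MTU
 * Discovery below only sets IP_DF when the MTU is not locked.
 */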
rtmtu_nolock = rt && (rt->rt_rmx.rmx_locks & RTV_MTU) == 0;
if (IN_MULTICAST(ip->ip_dst.s_addr) ||
(ip->ip_dst.s_addr == INADDR_BROADCAST)) {
bool inmgroup;
m->m_flags |= (ip->ip_dst.s_addr == INADDR_BROADCAST) ?
M_BCAST : M_MCAST;
/*
* See if the caller provided any multicast options
*/
if (imo != NULL)
ip->ip_ttl = imo->imo_multicast_ttl;
else
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
/*
* if we don't know the outgoing ifp yet, we can't generate
* output
*/
if (!ifp) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/*
* If the packet is multicast or broadcast, confirm that
* the outgoing interface can transmit it.
*/
if (((m->m_flags & M_MCAST) &&
(ifp->if_flags & IFF_MULTICAST) == 0) ||
((m->m_flags & M_BCAST) &&
(ifp->if_flags & (IFF_BROADCAST|IFF_POINTOPOINT)) == 0)) {
IP_STATINC(IP_STAT_NOROUTE);
error = ENETUNREACH;
goto bad;
}
/*
* If source address not specified yet, use an address
* of outgoing interface.
*/
if (in_nullhost(ip->ip_src)) {
struct in_ifaddr *xia;
struct ifaddr *xifa;
struct psref _psref;
xia = in_get_ia_from_ifp_psref(ifp, &_psref);
if (!xia) {
IP_STATINC(IP_STAT_IFNOADDR);
error = EADDRNOTAVAIL;
goto bad;
}
xifa = &xia->ia_ifa;
if (xifa->ifa_getifa != NULL) {
ia4_release(xia, &_psref);
/* FIXME ifa_getifa is NOMPSAFE */
xia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
if (xia == NULL) {
IP_STATINC(IP_STAT_IFNOADDR);
error = EADDRNOTAVAIL;
goto bad;
}
ia4_acquire(xia, &_psref);
}
ip->ip_src = xia->ia_addr.sin_addr;
ia4_release(xia, &_psref);
}
inmgroup = in_multi_group(ip->ip_dst, ifp, flags);
if (inmgroup && (imo == NULL || imo->imo_multicast_loop)) {
/*
* If we belong to the destination multicast group
* on the outgoing interface, and the caller did not
* forbid loopback, loop back a copy.
*/
ip_mloopback(ifp, m, &udst.sin);
}
#ifdef MROUTING
else {
/*
* If we are acting as a multicast router, perform
* multicast forwarding as if the packet had just
* arrived on the interface to which we are about
* to send. The multicast forwarding function
* recursively calls this function, using the
* IP_FORWARDING flag to prevent infinite recursion.
*
* Multicasts that are looped back by ip_mloopback(),
* above, will be forwarded by the ip_input() routine,
* if necessary.
*/
extern struct socket *ip_mrouter;
if (ip_mrouter && (flags & IP_FORWARDING) == 0) {
if (ip_mforward(m, ifp) != 0) {
m_freem(m);
goto done;
}
}
}
#endif
/*
* Multicasts with a time-to-live of zero may be looped-
* back, above, but must not be transmitted on a network.
* Also, multicasts addressed to the loopback interface
* are not sent -- the above call to ip_mloopback() will
* loop back a copy if this host actually belongs to the
* destination group on the loopback interface.
*/
if (ip->ip_ttl == 0 || (ifp->if_flags & IFF_LOOPBACK) != 0) {
IP_STATINC(IP_STAT_ODROPPED);
m_freem(m);
goto done;
}
goto sendit;
}
/*
* If source address not specified yet, use address
* of outgoing interface.
*/
if (in_nullhost(ip->ip_src)) {
struct ifaddr *xifa;
xifa = &ia->ia_ifa;
if (xifa->ifa_getifa != NULL) {
ia4_release(ia, &psref_ia);
/* FIXME ifa_getifa is NOMPSAFE */
ia = ifatoia((*xifa->ifa_getifa)(xifa, rdst));
if (ia == NULL) {
error = EADDRNOTAVAIL;
goto bad;
}
ia4_acquire(ia, &psref_ia);
}
ip->ip_src = ia->ia_addr.sin_addr;
}
/*
* Packets with Class-D address as source are not valid per
* RFC1112.
*/
if (IN_MULTICAST(ip->ip_src.s_addr)) {
IP_STATINC(IP_STAT_ODROPPED);
error = EADDRNOTAVAIL;
goto bad;
}
/*
* Look for broadcast address and verify user is allowed to
* send such a packet.
*/
if (isbroadcast) {
if ((ifp->if_flags & IFF_BROADCAST) == 0) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EADDRNOTAVAIL;
goto bad;
}
if ((flags & IP_ALLOWBROADCAST) == 0) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EACCES;
goto bad;
}
/* don't allow broadcast messages to be fragmented */
if (ntohs(ip->ip_len) > ifp->if_mtu) {
IP_STATINC(IP_STAT_BCASTDENIED);
error = EMSGSIZE;
goto bad;
}
m->m_flags |= M_BCAST;
} else
m->m_flags &= ~M_BCAST;
sendit:
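/*
 * Select an IP identification value. Packets shorter than
 * IP_MINFRAGSIZE can never be fragmented, so 0 is used; otherwise a
 * fresh id (or, for TSO, a whole range of ids) is allocated.
 */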
if ((flags & (IP_FORWARDING|IP_NOIPNEWID)) == 0) {
if (m->m_pkthdr.len < IP_MINFRAGSIZE) {
ip->ip_id = 0;
} else if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
ip->ip_id = ip_newid(ia);
} else {
/*
* TSO capable interfaces (typically?) increment
* ip_id for each segment.
* "allocate" enough ids here to increase the chance
* for them to be unique.
*
* note that the following calculation is not
* needed to be precise. wasting some ip_id is fine.
*/
unsigned int segsz = m->m_pkthdr.segsz;
unsigned int datasz = ntohs(ip->ip_len) - hlen;
unsigned int num = howmany(datasz, segsz);
ip->ip_id = ip_newid_range(ia, num);
}
}
if (ia != NULL) {
ia4_release(ia, &psref_ia);
ia = NULL;
}
/*
* If we're doing Path MTU Discovery, we need to set DF unless
* the route's MTU is locked.
*/
if ((flags & IP_MTUDISC) != 0 && rtmtu_nolock) {
ip->ip_off |= htons(IP_DF);
}
#ifdef IPSEC
if (ipsec_used) {
bool ipsec_done = false;
bool count_drop = false;
/* Perform IPsec processing, if any. */
error = ipsec4_output(m, inp, flags, &mtu, &natt_frag,
&ipsec_done, &count_drop);
if (count_drop)
IP_STATINC(IP_STAT_IPSECDROP_OUT);
if (error || ipsec_done)
goto done;
}
if (!ipsec_used || !natt_frag)
#endif
{
/*
* Run through list of hooks for output packets.
*/
error = pfil_run_hooks(inet_pfil_hook, &m, ifp, PFIL_OUT);
if (error || m == NULL) {
IP_STATINC(IP_STAT_PFILDROP_OUT);
goto done;
}
}
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
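/*
 * Record the IP header length in the upper 16 bits of csum_data
 * (M_CSUM_DATA_IPv4_IPHL) so later checksum processing knows where
 * the transport header starts.
 */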
m->m_pkthdr.csum_data |= hlen << 16;
/*
* search for the source address structure to
* maintain output statistics, and verify address
* validity
*/
KASSERT(ia == NULL);
sockaddr_in_init(&usrc.sin, &ip->ip_src, 0);
ifa = ifaof_ifpforaddr_psref(&usrc.sa, ifp, &psref_ia);
if (ifa != NULL)
ia = ifatoia(ifa);
/*
* Ensure we only send from a valid address.
* A NULL address is valid because the packet could be
* generated from a packet filter.
*/
if (ia != NULL && (flags & IP_FORWARDING) == 0 && (error = ip_ifaddrvalid(ia)) != 0)
{
ARPLOG(LOG_ERR,
"refusing to send from invalid address %s (pid %d)\n",
ARPLOGADDR(&ip->ip_src), curproc->p_pid);
IP_STATINC(IP_STAT_ODROPPED);
if (error == 1)
/*
* Address exists, but is tentative or detached.
* We can't send from it because it's invalid,
* so we drop the packet.
*/
error = 0;
else
error = EADDRNOTAVAIL;
goto bad;
}
/* Maybe skip checksums on loopback interfaces. */
if (IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) {
m->m_pkthdr.csum_flags |= M_CSUM_IPv4;
}
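/*
 * sw_csum is the set of requested checksums that the interface cannot
 * offload; they are computed in software below.
 */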
sw_csum = m->m_pkthdr.csum_flags & ~ifp->if_csum_flags_tx;
/* Need to fragment the packet */
if (ntohs(ip->ip_len) > mtu &&
(m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
goto fragment;
}
#if IFA_STATS
if (ia)
ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
#endif
/*
* Always initialize the sum to 0! Some HW assisted
* checksumming requires this.
*/
ip->ip_sum = 0;
if ((m->m_pkthdr.csum_flags & M_CSUM_TSOv4) == 0) {
/*
* Perform any checksums that the hardware can't do
* for us.
*
* XXX Does any hardware require the {th,uh}_sum
* XXX fields to be 0?
*/
if (sw_csum & M_CSUM_IPv4) {
KASSERT(IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4));
ip->ip_sum = in_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
}
if (sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
if (IN_NEED_CHECKSUM(ifp,
sw_csum & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
in_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &=
~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
}
sa = (m->m_flags & M_MCAST) ? sintocsa(rdst) : sintocsa(dst);
/* Send it */
if (__predict_false(sw_csum & M_CSUM_TSOv4)) {
/*
* TSO4 is required by a packet, but disabled for
* the interface.
*/
error = ip_tso_output(ifp, m, sa, rt);
} else
error = ip_if_output(ifp, m, sa, rt);
goto done;
fragment:
/*
* We can't use HW checksumming if we're about to fragment the packet.
*
* XXX Some hardware can do this.
*/
if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
if (IN_NEED_CHECKSUM(ifp,
m->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4))) {
in_undefer_cksum_tcpudp(m);
}
m->m_pkthdr.csum_flags &= ~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
/*
* Too large for interface; fragment if possible.
* Must be able to put at least 8 bytes per fragment.
*/
if (ntohs(ip->ip_off) & IP_DF) {
if (flags & IP_RETURNMTU) {
KASSERT(inp != NULL);
in4p_errormtu(inp) = mtu;
}
error = EMSGSIZE;
IP_STATINC(IP_STAT_CANTFRAG);
goto bad;
}
error = ip_fragment(m, ifp, mtu);
if (error) {
m = NULL;
goto bad;
}
for (; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
if (error) {
m_freem(m);
continue;
}
#if IFA_STATS
if (ia)
ia->ia_ifa.ifa_data.ifad_outbytes += ntohs(ip->ip_len);
#endif
/*
* If we get there, the packet has not been handled by
* IPsec whereas it should have. Now that it has been
* fragmented, re-inject it in ip_output so that IPsec
* processing can occur.
*/
if (natt_frag) {
error = ip_output(m, opt, NULL,
flags | IP_RAWOUTPUT | IP_NOIPNEWID,
imo, inp);
} else {
KASSERT((m->m_pkthdr.csum_flags &
(M_CSUM_UDPv4 | M_CSUM_TCPv4)) == 0);
error = ip_if_output(ifp, m, (m->m_flags & M_MCAST) ?
sintocsa(rdst) : sintocsa(dst), rt);
}
}
if (error == 0) {
IP_STATINC(IP_STAT_FRAGMENTED);
}
done:
ia4_release(ia, &psref_ia);
rtcache_unref(rt, ro);
if (ro == &iproute) {
rtcache_free(&iproute);
}
if (mifp != NULL) {
if_put(mifp, &psref);
}
if (bind_need_restore)
curlwp_bindx(bound);
return error;
bad:
m_freem(m);
goto done;
}
int
ip_fragment(struct mbuf *m, struct ifnet *ifp, u_long mtu)
{
struct ip *ip, *mhip;
struct mbuf *m0;
int len, hlen, off;
int mhlen, firstlen;
struct mbuf **mnext;
int sw_csum = m->m_pkthdr.csum_flags;
int fragments = 0;
int error = 0;
int ipoff, ipflg;
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
/* Preserve the offset and flags. */
ipoff = ntohs(ip->ip_off) & IP_OFFMASK;
ipflg = ntohs(ip->ip_off) & (IP_RF|IP_DF|IP_MF);
if (ifp != NULL)
sw_csum &= ~ifp->if_csum_flags_tx;
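/*
 * Round the per-fragment data length down to a multiple of 8 bytes,
 * since the fragment offset field counts 8-byte units.
 */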
len = (mtu - hlen) &~ 7;
if (len < 8) {
IP_STATINC(IP_STAT_CANTFRAG);
m_freem(m);
return EMSGSIZE;
}
firstlen = len;
mnext = &m->m_nextpkt;
/*
* Loop through length of segment after first fragment,
* make new header and copy data of each part and link onto chain.
*/
m0 = m;
mhlen = sizeof(struct ip);
for (off = hlen + len; off < ntohs(ip->ip_len); off += len) {
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
goto sendorfree;
}
MCLAIM(m, m0->m_owner);
*mnext = m;
mnext = &m->m_nextpkt;
m->m_data += max_linkhdr;
mhip = mtod(m, struct ip *);
*mhip = *ip;
/* we must inherit the flags */
m->m_flags |= m0->m_flags & M_COPYFLAGS;
if (hlen > sizeof(struct ip)) {
mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
mhip->ip_hl = mhlen >> 2;
}
m->m_len = mhlen;
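/* Fragment offsets are carried in 8-byte units. */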
mhip->ip_off = ((off - hlen) >> 3) + ipoff;
mhip->ip_off |= ipflg;
if (off + len >= ntohs(ip->ip_len))
len = ntohs(ip->ip_len) - off;
else
mhip->ip_off |= IP_MF;
HTONS(mhip->ip_off);
mhip->ip_len = htons((u_int16_t)(len + mhlen));
m->m_next = m_copym(m0, off, len, M_DONTWAIT);
if (m->m_next == NULL) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
goto sendorfree;
}
m->m_pkthdr.len = mhlen + len;
m_reset_rcvif(m);
mhip->ip_sum = 0;
KASSERT((m->m_pkthdr.csum_flags & M_CSUM_IPv4) == 0);
if (sw_csum & M_CSUM_IPv4) {
mhip->ip_sum = in_cksum(m, mhlen);
} else {
/*
* checksum is hw-offloaded or not necessary.
*/
m->m_pkthdr.csum_flags |=
m0->m_pkthdr.csum_flags & M_CSUM_IPv4;
m->m_pkthdr.csum_data |= mhlen << 16;
KASSERT(!(ifp != NULL &&
IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
(m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
}
IP_STATINC(IP_STAT_OFRAGMENTS);
fragments++;
}
/*
* Update first fragment by trimming what's been copied out
* and updating header, then send each fragment (in order).
*/
m = m0;
m_adj(m, hlen + firstlen - ntohs(ip->ip_len));
m->m_pkthdr.len = hlen + firstlen;
ip->ip_len = htons((u_int16_t)m->m_pkthdr.len);
ip->ip_off |= htons(IP_MF);
ip->ip_sum = 0;
if (sw_csum & M_CSUM_IPv4) {
ip->ip_sum = in_cksum(m, hlen);
m->m_pkthdr.csum_flags &= ~M_CSUM_IPv4;
} else {
/*
* checksum is hw-offloaded or not necessary.
*/
KASSERT(!(ifp != NULL && IN_NEED_CHECKSUM(ifp, M_CSUM_IPv4)) ||
(m->m_pkthdr.csum_flags & M_CSUM_IPv4) != 0);
KASSERT(M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data) >=
sizeof(struct ip));
}
sendorfree:
/*
* If there is no room for all the fragments, don't queue
* any of them.
*/
if (ifp != NULL) {
IFQ_LOCK(&ifp->if_snd);
if (ifp->if_snd.ifq_maxlen - ifp->if_snd.ifq_len < fragments &&
error == 0) {
error = ENOBUFS;
IP_STATINC(IP_STAT_ODROPPED);
IFQ_INC_DROPS(&ifp->if_snd);
}
IFQ_UNLOCK(&ifp->if_snd);
}
if (error) {
for (m = m0; m; m = m0) {
m0 = m->m_nextpkt;
m->m_nextpkt = NULL;
m_freem(m);
}
}
return error;
}
/*
* Determine the maximum length of the options to be inserted;
* we would far rather allocate too much space than too little.
*/
u_int
ip_optlen(struct inpcb *inp)
{
struct mbuf *m = inp->inp_options;
if (m && m->m_len > offsetof(struct ipoption, ipopt_dst)) {
return (m->m_len - offsetof(struct ipoption, ipopt_dst));
}
return 0;
}
/*
* Insert IP options into preformed packet.
* Adjust IP destination as required for IP source routing,
* as indicated by a non-zero in_addr at the start of the options.
*/
static struct mbuf *
ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
{
struct ipoption *p = mtod(opt, struct ipoption *);
struct mbuf *n;
struct ip *ip = mtod(m, struct ip *);
unsigned optlen;
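/*
 * The option buffer (struct ipoption) holds the first-hop destination
 * for source routing in ipopt_dst (zero if unset), followed by the raw
 * option bytes in ipopt_list; only the option bytes are inserted here.
 */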
optlen = opt->m_len - sizeof(p->ipopt_dst);
KASSERT(optlen % 4 == 0);
if (optlen + ntohs(ip->ip_len) > IP_MAXPACKET)
return m; /* XXX should fail */
if (!in_nullhost(p->ipopt_dst))
ip->ip_dst = p->ipopt_dst;
if (M_READONLY(m) || M_LEADINGSPACE(m) < optlen) {
MGETHDR(n, M_DONTWAIT, MT_HEADER);
if (n == NULL)
return m;
MCLAIM(n, m->m_owner);
m_move_pkthdr(n, m);
m->m_len -= sizeof(struct ip);
m->m_data += sizeof(struct ip);
n->m_next = m;
n->m_len = optlen + sizeof(struct ip);
n->m_data += max_linkhdr;
memcpy(mtod(n, void *), ip, sizeof(struct ip));
m = n;
} else {
m->m_data -= optlen;
m->m_len += optlen;
memmove(mtod(m, void *), ip, sizeof(struct ip));
}
m->m_pkthdr.len += optlen;
ip = mtod(m, struct ip *);
memcpy(ip + 1, p->ipopt_list, optlen);
*phlen = sizeof(struct ip) + optlen;
ip->ip_len = htons(ntohs(ip->ip_len) + optlen);
return m;
}
/*
* Copy options from ipsrc to ipdst, omitting those not copied during
* fragmentation.
*/
int
ip_optcopy(struct ip *ipsrc, struct ip *ipdst)
{
u_char *cp, *dp;
int opt, optlen, cnt;
cp = (u_char *)(ipsrc + 1);
dp = (u_char *)(ipdst + 1);
cnt = (ipsrc->ip_hl << 2) - sizeof(struct ip);
for (; cnt > 0; cnt -= optlen, cp += optlen) {
opt = cp[0];
if (opt == IPOPT_EOL)
break;
if (opt == IPOPT_NOP) {
/* Preserve for IP mcast tunnel's LSRR alignment. */
*dp++ = IPOPT_NOP;
optlen = 1;
continue;
}
KASSERT(cnt >= IPOPT_OLEN + sizeof(*cp));
optlen = cp[IPOPT_OLEN];
KASSERT(optlen >= IPOPT_OLEN + sizeof(*cp) && optlen < cnt);
/* Invalid lengths should have been caught by ip_dooptions. */
if (optlen > cnt)
optlen = cnt;
if (IPOPT_COPIED(opt)) {
bcopy((void *)cp, (void *)dp, (unsigned)optlen);
dp += optlen;
}
}
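/* Pad the copied options with IPOPT_EOL to a 4-byte boundary. */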
for (optlen = dp - (u_char *)(ipdst+1); optlen & 0x3; optlen++) {
*dp++ = IPOPT_EOL;
}
return optlen;
}
/*
* IP socket option processing.
*/
int
ip_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
struct inpcb *inp = sotoinpcb(so);
struct ip *ip = &in4p_ip(inp);
int inpflags = inp->inp_flags;
int optval = 0, error = 0;
struct in_pktinfo pktinfo;
KASSERT(solocked(so));
if (sopt->sopt_level != IPPROTO_IP) {
if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER)
return 0;
return ENOPROTOOPT;
}
switch (op) {
case PRCO_SETOPT:
switch (sopt->sopt_name) {
case IP_OPTIONS:
#ifdef notyet
case IP_RETOPTS:
#endif
error = ip_pcbopts(inp, sopt);
break;
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVIF:
case IP_RECVPKTINFO:
case IP_RECVTTL:
case IP_BINDANY:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (sopt->sopt_name) {
case IP_TOS:
ip->ip_tos = optval;
break;
case IP_TTL:
ip->ip_ttl = optval;
break;
case IP_MINTTL:
if (optval > 0 && optval <= MAXTTL)
in4p_ip_minttl(inp) = optval;
else
error = EINVAL;
break;
#define OPTSET(bit) \
if (optval) \
inpflags |= bit; \
else \
inpflags &= ~bit;
case IP_RECVOPTS:
OPTSET(INP_RECVOPTS);
break;
case IP_RECVPKTINFO:
OPTSET(INP_RECVPKTINFO);
break;
case IP_RECVRETOPTS:
OPTSET(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
OPTSET(INP_RECVDSTADDR);
break;
case IP_RECVIF:
OPTSET(INP_RECVIF);
break;
case IP_RECVTTL:
OPTSET(INP_RECVTTL);
break;
case IP_BINDANY:
error = kauth_authorize_network(
kauth_cred_get(), KAUTH_NETWORK_BIND,
KAUTH_REQ_NETWORK_BIND_ANYADDR, so,
NULL, NULL);
if (error == 0) {
OPTSET(INP_BINDANY);
}
break;
}
break;
case IP_PKTINFO:
error = sockopt_getint(sopt, &optval);
if (!error) {
/* Linux compatibility */
OPTSET(INP_RECVPKTINFO);
break;
}
error = sockopt_get(sopt, &pktinfo, sizeof(pktinfo));
if (error)
break;
if (pktinfo.ipi_ifindex == 0) {
in4p_prefsrcip(inp) = pktinfo.ipi_addr;
break;
}
/* Solaris compatibility */
struct ifnet *ifp;
struct in_ifaddr *ia;
int s;
/* pick up primary address */
s = pserialize_read_enter();
ifp = if_byindex(pktinfo.ipi_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
ia = in_get_ia_from_ifp(ifp);
if (ia == NULL) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
in4p_prefsrcip(inp) = IA_SIN(ia)->sin_addr;
pserialize_read_exit(s);
break;
#undef OPTSET
case IP_MULTICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
error = ip_setmoptions(&inp->inp_moptions, sopt);
break;
case IP_PORTRANGE:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (optval) {
case IP_PORTRANGE_DEFAULT:
case IP_PORTRANGE_HIGH:
inpflags &= ~(INP_LOWPORT);
break;
case IP_PORTRANGE_LOW:
inpflags |= INP_LOWPORT;
break;
default:
error = EINVAL;
break;
}
break;
case IP_PORTALGO:
error = sockopt_getint(sopt, &optval);
if (error)
break;
error = portalgo_algo_index_select(inp, optval);
break;
#if defined(IPSEC)
case IP_IPSEC_POLICY:
if (ipsec_enabled) {
error = ipsec_set_policy(inp,
sopt->sopt_data, sopt->sopt_size,
curlwp->l_cred);
} else
error = ENOPROTOOPT;
break;
#endif /* IPSEC */
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
switch (sopt->sopt_name) {
case IP_OPTIONS:
case IP_RETOPTS: {
struct mbuf *mopts = inp->inp_options;
if (mopts) {
struct mbuf *m;
m = m_copym(mopts, 0, M_COPYALL, M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
break;
}
error = sockopt_setmbuf(sopt, m);
}
break;
}
case IP_TOS:
case IP_TTL:
case IP_MINTTL:
case IP_RECVOPTS:
case IP_RECVRETOPTS:
case IP_RECVDSTADDR:
case IP_RECVIF:
case IP_RECVPKTINFO:
case IP_RECVTTL:
case IP_ERRORMTU:
case IP_BINDANY:
switch (sopt->sopt_name) {
case IP_TOS:
optval = ip->ip_tos;
break;
case IP_TTL:
optval = ip->ip_ttl;
break;
case IP_MINTTL:
optval = in4p_ip_minttl(inp);
break;
case IP_ERRORMTU:
optval = in4p_errormtu(inp);
break;
#define OPTBIT(bit) (inpflags & bit ? 1 : 0)
case IP_RECVOPTS:
optval = OPTBIT(INP_RECVOPTS);
break;
case IP_RECVPKTINFO:
optval = OPTBIT(INP_RECVPKTINFO);
break;
case IP_RECVRETOPTS:
optval = OPTBIT(INP_RECVRETOPTS);
break;
case IP_RECVDSTADDR:
optval = OPTBIT(INP_RECVDSTADDR);
break;
case IP_RECVIF:
optval = OPTBIT(INP_RECVIF);
break;
case IP_RECVTTL:
optval = OPTBIT(INP_RECVTTL);
break;
case IP_BINDANY:
optval = OPTBIT(INP_BINDANY);
break;
}
error = sockopt_setint(sopt, optval);
break;
case IP_PKTINFO:
switch (sopt->sopt_size) {
case sizeof(int):
/* Linux compatibility */
optval = OPTBIT(INP_RECVPKTINFO);
error = sockopt_setint(sopt, optval);
break;
case sizeof(struct in_pktinfo):
/* Solaris compatibility */
pktinfo.ipi_ifindex = 0;
pktinfo.ipi_addr = in4p_prefsrcip(inp);
error = sockopt_set(sopt, &pktinfo,
sizeof(pktinfo));
break;
default:
/*
* While size is stuck at 0, and, later, if
* the caller doesn't use an exactly sized
* recipient for the data, default to Linux
* compatibility
*/
optval = OPTBIT(INP_RECVPKTINFO);
error = sockopt_setint(sopt, optval);
break;
}
break;
#if 0 /* defined(IPSEC) */
case IP_IPSEC_POLICY:
{
struct mbuf *m = NULL;
/* XXX this will return EINVAL as sopt is empty */
error = ipsec_get_policy(inp, sopt->sopt_data,
sopt->sopt_size, &m);
if (error == 0)
error = sockopt_setmbuf(sopt, m);
break;
}
#endif /*IPSEC*/
case IP_MULTICAST_IF:
case IP_MULTICAST_TTL:
case IP_MULTICAST_LOOP:
case IP_ADD_MEMBERSHIP:
case IP_DROP_MEMBERSHIP:
error = ip_getmoptions(inp->inp_moptions, sopt);
break;
case IP_PORTRANGE:
if (inpflags & INP_LOWPORT)
optval = IP_PORTRANGE_LOW;
else
optval = IP_PORTRANGE_DEFAULT;
error = sockopt_setint(sopt, optval);
break;
case IP_PORTALGO:
optval = inp->inp_portalgo;
error = sockopt_setint(sopt, optval);
break;
default:
error = ENOPROTOOPT;
break;
}
break;
}
if (!error) {
inp->inp_flags = inpflags;
}
return error;
}
static int
ip_pktinfo_prepare(const struct inpcb *inp, const struct in_pktinfo *pktinfo,
struct ip_pktopts *pktopts, int *flags, kauth_cred_t cred)
{
struct ip_moptions *imo;
int error = 0;
bool addrset = false;
if (!in_nullhost(pktinfo->ipi_addr)) {
pktopts->ippo_laddr.sin_addr = pktinfo->ipi_addr;
/* EADDRNOTAVAIL? */
error = inpcb_bindableaddr(inp, &pktopts->ippo_laddr, cred);
if (error != 0)
return error;
addrset = true;
}
if (pktinfo->ipi_ifindex != 0) {
if (!addrset) {
struct ifnet *ifp;
struct in_ifaddr *ia;
int s;
/* pick up primary address */
s = pserialize_read_enter();
ifp = if_byindex(pktinfo->ipi_ifindex);
if (ifp == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
ia = in_get_ia_from_ifp(ifp);
if (ia == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
pktopts->ippo_laddr.sin_addr = IA_SIN(ia)->sin_addr;
pserialize_read_exit(s);
}
/*
* If specified ipi_ifindex,
* use copied or locally initialized ip_moptions.
* Original ip_moptions must not be modified.
*/
imo = &pktopts->ippo_imobuf; /* local buf in pktopts */
if (pktopts->ippo_imo != NULL) {
memcpy(imo, pktopts->ippo_imo, sizeof(*imo));
} else {
memset(imo, 0, sizeof(*imo));
imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
}
imo->imo_multicast_if_index = pktinfo->ipi_ifindex;
pktopts->ippo_imo = imo;
*flags |= IP_ROUTETOIFINDEX;
}
return error;
}
/*
* Set up IP outgoing packet options. Even if control is NULL,
* pktopts->ippo_laddr and pktopts->ippo_imo are set and used.
*/
int
ip_setpktopts(struct mbuf *control, struct ip_pktopts *pktopts, int *flags,
struct inpcb *inp, kauth_cred_t cred)
{
struct cmsghdr *cm;
struct in_pktinfo pktinfo;
int error;
pktopts->ippo_imo = inp->inp_moptions;
struct in_addr *ia = in_nullhost(in4p_prefsrcip(inp)) ? &in4p_laddr(inp) :
&in4p_prefsrcip(inp);
sockaddr_in_init(&pktopts->ippo_laddr, ia, 0);
if (control == NULL)
return 0;
/*
* XXX: Currently, we assume all the optional information is
* stored in a single mbuf.
*/
if (control->m_next)
return EINVAL;
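/*
 * Walk the cmsghdr records in the control mbuf, advancing by the
 * CMSG_ALIGN()ed length of each record.
 */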
for (; control->m_len > 0;
control->m_data += CMSG_ALIGN(cm->cmsg_len),
control->m_len -= CMSG_ALIGN(cm->cmsg_len)) {
cm = mtod(control, struct cmsghdr *);
if ((control->m_len < sizeof(*cm)) || (cm->cmsg_len == 0) ||
(cm->cmsg_len > control->m_len)) {
return EINVAL;
}
if (cm->cmsg_level != IPPROTO_IP)
continue;
switch (cm->cmsg_type) {
case IP_PKTINFO:
if (cm->cmsg_len != CMSG_LEN(sizeof(pktinfo)))
return EINVAL;
memcpy(&pktinfo, CMSG_DATA(cm), sizeof(pktinfo));
error = ip_pktinfo_prepare(inp, &pktinfo, pktopts,
flags, cred);
if (error)
return error;
break;
case IP_SENDSRCADDR: /* FreeBSD compatibility */
if (cm->cmsg_len != CMSG_LEN(sizeof(struct in_addr)))
return EINVAL;
pktinfo.ipi_ifindex = 0;
pktinfo.ipi_addr =
((struct in_pktinfo *)CMSG_DATA(cm))->ipi_addr;
error = ip_pktinfo_prepare(inp, &pktinfo, pktopts,
flags, cred);
if (error)
return error;
break;
default:
return ENOPROTOOPT;
}
}
return 0;
}
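/*
 * Userland sketch (illustrative only, not part of this file): the
 * ancillary data parsed above is typically built with the CMSG macros
 * before sendmsg(2); "pi", "idx", "srcaddr" and "msg" are placeholders.
 *
 *	struct in_pktinfo pi;
 *	char cbuf[CMSG_SPACE(sizeof(pi))];
 *	struct cmsghdr *cm;
 *
 *	pi.ipi_ifindex = idx;
 *	pi.ipi_addr = srcaddr;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = CMSG_SPACE(sizeof(pi));
 *	cm = CMSG_FIRSTHDR(&msg);
 *	cm->cmsg_level = IPPROTO_IP;
 *	cm->cmsg_type = IP_PKTINFO;
 *	cm->cmsg_len = CMSG_LEN(sizeof(pi));
 *	memcpy(CMSG_DATA(cm), &pi, sizeof(pi));
 */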
/*
* Set up IP options in pcb for insertion in output packets.
* Store in mbuf with pointer in pcbopt, adding pseudo-option
* with destination address if source routed.
*/
static int
ip_pcbopts(struct inpcb *inp, const struct sockopt *sopt)
{
struct mbuf *m;
const u_char *cp;
u_char *dp;
int cnt;
KASSERT(inp_locked(inp));
/* Turn off any old options. */
if (inp->inp_options) {
m_free(inp->inp_options);
}
inp->inp_options = NULL;
if ((cnt = sopt->sopt_size) == 0) {
/* Only turning off any previous options. */
return 0;
}
cp = sopt->sopt_data;
if (cnt % 4) {
/* Must be 4-byte aligned, because there's no padding. */
return EINVAL;
}
m = m_get(M_DONTWAIT, MT_SOOPTS);
if (m == NULL)
return ENOBUFS;
dp = mtod(m, u_char *);
memset(dp, 0, sizeof(struct in_addr));
dp += sizeof(struct in_addr);
m->m_len = sizeof(struct in_addr);
/*
* IP option list according to RFC791. Each option is of the form
*
* [optval] [olen] [(olen - 2) data bytes]
*
* We validate the list and copy options to an mbuf for prepending
* to data packets. The IP first-hop destination address will be
* stored before actual options and is zero if unset.
*/
while (cnt > 0) {
uint8_t optval, olen, offset;
optval = cp[IPOPT_OPTVAL];
if (optval == IPOPT_EOL || optval == IPOPT_NOP) {
olen = 1;
} else {
if (cnt < IPOPT_OLEN + 1)
goto bad;
olen = cp[IPOPT_OLEN];
if (olen < IPOPT_OLEN + 1 || olen > cnt)
goto bad;
}
if (optval == IPOPT_LSRR || optval == IPOPT_SSRR) {
/*
* user process specifies route as:
* ->A->B->C->D
* D must be our final destination (but we can't
* check that since we may not have connected yet).
* A is first hop destination, which doesn't appear in
* actual IP option, but is stored before the options.
*/
if (olen < IPOPT_OFFSET + 1 + sizeof(struct in_addr))
goto bad;
offset = cp[IPOPT_OFFSET];
memcpy(mtod(m, u_char *), cp + IPOPT_OFFSET + 1,
sizeof(struct in_addr));
cp += sizeof(struct in_addr);
cnt -= sizeof(struct in_addr);
olen -= sizeof(struct in_addr);
if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
memcpy(dp, cp, olen);
dp[IPOPT_OPTVAL] = optval;
dp[IPOPT_OLEN] = olen;
dp[IPOPT_OFFSET] = offset;
} else {
if (m->m_len + olen > MAX_IPOPTLEN + sizeof(struct in_addr))
goto bad;
memcpy(dp, cp, olen);
}
dp += olen;
m->m_len += olen;
if (optval == IPOPT_EOL)
break;
cp += olen;
cnt -= olen;
}
inp->inp_options = m;
return 0;
bad:
(void)m_free(m);
return EINVAL;
}
/*
* following RFC1724 section 3.3, 0.0.0.0/8 is interpreted as interface index.
* Must be called in a pserialize critical section.
*/
static struct ifnet *
ip_multicast_if(struct in_addr *a, int *ifindexp)
{
int ifindex;
struct ifnet *ifp = NULL;
struct in_ifaddr *ia;
if (ifindexp)
*ifindexp = 0;
if (ntohl(a->s_addr) >> 24 == 0) {
ifindex = ntohl(a->s_addr) & 0xffffff;
ifp = if_byindex(ifindex);
if (!ifp)
return NULL;
if (ifindexp)
*ifindexp = ifindex;
} else {
IN_ADDRHASH_READER_FOREACH(ia, a->s_addr) {
if (in_hosteq(ia->ia_addr.sin_addr, *a) &&
(ia->ia_ifp->if_flags & IFF_MULTICAST) != 0) {
ifp = ia->ia_ifp;
if (if_is_deactivated(ifp))
ifp = NULL;
break;
}
}
}
return ifp;
}
static int
ip_getoptval(const struct sockopt *sopt, u_int8_t *val, u_int maxval)
{
u_int tval;
u_char cval;
int error;
if (sopt == NULL)
return EINVAL;
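/*
 * The option value may be supplied as either a u_char or a u_int;
 * accept both sizes for compatibility with existing applications.
 */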
switch (sopt->sopt_size) {
case sizeof(u_char):
error = sockopt_get(sopt, &cval, sizeof(u_char));
tval = cval;
break;
case sizeof(u_int):
error = sockopt_get(sopt, &tval, sizeof(u_int));
break;
default:
error = EINVAL;
}
if (error)
return error;
if (tval > maxval)
return EINVAL;
*val = tval;
return 0;
}
static int
ip_get_membership(const struct sockopt *sopt, struct ifnet **ifp,
struct psref *psref, struct in_addr *ia, bool add)
{
int error;
struct ip_mreq mreq;
error = sockopt_get(sopt, &mreq, sizeof(mreq));
if (error)
return error;
if (!IN_MULTICAST(mreq.imr_multiaddr.s_addr))
return EINVAL;
memcpy(ia, &mreq.imr_multiaddr, sizeof(*ia));
if (in_nullhost(mreq.imr_interface)) {
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
struct route ro;
if (!add) {
*ifp = NULL;
return 0;
}
/*
* If no interface address was provided, use the interface of
* the route to the given multicast address.
*/
struct rtentry *rt;
memset(&ro, 0, sizeof(ro));
sockaddr_in_init(&u.dst4, ia, 0);
error = rtcache_setdst(&ro, &u.dst);
if (error != 0)
return error;
*ifp = (rt = rtcache_init(&ro)) != NULL ? rt->rt_ifp : NULL;
if (*ifp != NULL) {
if (if_is_deactivated(*ifp))
*ifp = NULL;
else
if_acquire(*ifp, psref);
}
rtcache_unref(rt, &ro);
rtcache_free(&ro);
} else {
int s = pserialize_read_enter();
*ifp = ip_multicast_if(&mreq.imr_interface, NULL);
if (!add && *ifp == NULL) {
pserialize_read_exit(s);
return EADDRNOTAVAIL;
}
if (*ifp != NULL) {
if (if_is_deactivated(*ifp))
*ifp = NULL;
else
if_acquire(*ifp, psref);
}
pserialize_read_exit(s);
}
return 0;
}
/*
* Add a multicast group membership.
* Group must be a valid IP multicast address.
*/
static int
ip_add_membership(struct ip_moptions *imo, const struct sockopt *sopt)
{
struct ifnet *ifp = NULL; // XXX: gcc [ppc]
struct in_addr ia;
int i, error, bound;
struct psref psref;
/* imo is protected by solock or referenced only by the caller */
bound = curlwp_bind();
if (sopt->sopt_size == sizeof(struct ip_mreq))
error = ip_get_membership(sopt, &ifp, &psref, &ia, true);
else {
#ifdef INET6
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
#else
error = EINVAL;
#endif
}
if (error)
goto out;
/*
* See if we found an interface, and confirm that it
* supports multicast.
*/
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
/*
* See if the membership already exists or if all the
* membership slots are full.
*/
for (i = 0; i < imo->imo_num_memberships; ++i) {
if (imo->imo_membership[i]->inm_ifp == ifp &&
in_hosteq(imo->imo_membership[i]->inm_addr, ia))
break;
}
if (i < imo->imo_num_memberships) {
error = EADDRINUSE;
goto out;
}
if (i == IP_MAX_MEMBERSHIPS) {
error = ETOOMANYREFS;
goto out;
}
/*
* Everything looks good; add a new record to the multicast
* address list for the given interface.
*/
imo->imo_membership[i] = in_addmulti(&ia, ifp);
if (imo->imo_membership[i] == NULL) {
error = ENOBUFS;
goto out;
}
++imo->imo_num_memberships;
error = 0;
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
/*
* Drop a multicast group membership.
* Group must be a valid IP multicast address.
*/
static int
ip_drop_membership(struct ip_moptions *imo, const struct sockopt *sopt)
{
struct in_addr ia = { .s_addr = 0 }; // XXX: gcc [ppc]
struct ifnet *ifp = NULL; // XXX: gcc [ppc]
int i, error, bound;
struct psref psref;
/* imo is protected by solock or referenced only by the caller */
bound = curlwp_bind();
if (sopt->sopt_size == sizeof(struct ip_mreq))
error = ip_get_membership(sopt, &ifp, &psref, &ia, false);
else {
#ifdef INET6
error = ip6_get_membership(sopt, &ifp, &psref, &ia, sizeof(ia));
#else
error = EINVAL;
#endif
}
if (error)
goto out;
/*
* Find the membership in the membership array.
*/
for (i = 0; i < imo->imo_num_memberships; ++i) {
if ((ifp == NULL || imo->imo_membership[i]->inm_ifp == ifp) &&
in_hosteq(imo->imo_membership[i]->inm_addr, ia))
break;
}
if (i == imo->imo_num_memberships) {
error = EADDRNOTAVAIL;
goto out;
}
/*
* Give up the multicast address record to which the
* membership points.
*/
in_delmulti(imo->imo_membership[i]);
/*
* Remove the gap in the membership array.
*/
for (++i; i < imo->imo_num_memberships; ++i)
imo->imo_membership[i-1] = imo->imo_membership[i];
--imo->imo_num_memberships;
error = 0;
out:
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
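/*
 * Userland sketch (illustrative only, not part of this file): the
 * options handled below are normally driven by setsockopt(2), e.g.
 * selecting an outgoing interface by one of its addresses and then
 * joining a group; "s", "ifaddr" and "group" are placeholders.
 *
 *	struct ip_mreq mr;
 *
 *	setsockopt(s, IPPROTO_IP, IP_MULTICAST_IF, &ifaddr, sizeof(ifaddr));
 *	mr.imr_multiaddr = group;
 *	mr.imr_interface = ifaddr;
 *	setsockopt(s, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mr, sizeof(mr));
 */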
/*
* Set the IP multicast options in response to user setsockopt().
*/
int
ip_setmoptions(struct ip_moptions **pimo, const struct sockopt *sopt)
{
struct ip_moptions *imo = *pimo;
struct in_addr addr;
struct ifnet *ifp;
int ifindex, error = 0;
/* The passed imo isn't NULL, it should be protected by solock */
if (!imo) {
/*
* No multicast option buffer attached to the pcb;
* allocate one and initialize to default values.
*/
imo = kmem_intr_alloc(sizeof(*imo), KM_NOSLEEP);
if (imo == NULL)
return ENOBUFS;
imo->imo_multicast_if_index = 0;
imo->imo_multicast_addr.s_addr = INADDR_ANY;
imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
imo->imo_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
imo->imo_num_memberships = 0;
*pimo = imo;
}
switch (sopt->sopt_name) {
case IP_MULTICAST_IF: {
int s;
/*
* Select the interface for outgoing multicast packets.
*/
error = sockopt_get(sopt, &addr, sizeof(addr));
if (error)
break;
/*
* INADDR_ANY is used to remove a previous selection.
* When no interface is selected, a default one is
* chosen every time a multicast packet is sent.
*/
if (in_nullhost(addr)) {
imo->imo_multicast_if_index = 0;
break;
}
/*
* The selected interface is identified by its local
* IP address. Find the interface and confirm that
* it supports multicasting.
*/
s = pserialize_read_enter();
ifp = ip_multicast_if(&addr, &ifindex);
if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0) {
pserialize_read_exit(s);
error = EADDRNOTAVAIL;
break;
}
imo->imo_multicast_if_index = ifp->if_index;
pserialize_read_exit(s);
if (ifindex)
imo->imo_multicast_addr = addr;
else
imo->imo_multicast_addr.s_addr = INADDR_ANY;
break;
}
case IP_MULTICAST_TTL:
/*
* Set the IP time-to-live for outgoing multicast packets.
*/
error = ip_getoptval(sopt, &imo->imo_multicast_ttl, MAXTTL);
break;
case IP_MULTICAST_LOOP:
/*
* Set the loopback flag for outgoing multicast packets.
* Must be zero or one.
*/
error = ip_getoptval(sopt, &imo->imo_multicast_loop, 1);
break;
case IP_ADD_MEMBERSHIP: /* IPV6_JOIN_GROUP */
error = ip_add_membership(imo, sopt);
break;
case IP_DROP_MEMBERSHIP: /* IPV6_LEAVE_GROUP */
error = ip_drop_membership(imo, sopt);
break;
default:
error = EOPNOTSUPP;
break;
}
/*
* If all options have default values, no need to keep the mbuf.
*/
if (imo->imo_multicast_if_index == 0 &&
imo->imo_multicast_ttl == IP_DEFAULT_MULTICAST_TTL &&
imo->imo_multicast_loop == IP_DEFAULT_MULTICAST_LOOP &&
imo->imo_num_memberships == 0) {
kmem_intr_free(imo, sizeof(*imo));
*pimo = NULL;
}
return error;
}
/*
* Return the IP multicast options in response to user getsockopt().
*/
int
ip_getmoptions(struct ip_moptions *imo, struct sockopt *sopt)
{
struct in_addr addr;
uint8_t optval;
int error = 0;
/* imo is protected by solock or referenced only by the caller */
switch (sopt->sopt_name) {
case IP_MULTICAST_IF:
if (imo == NULL || imo->imo_multicast_if_index == 0)
addr = zeroin_addr;
else if (imo->imo_multicast_addr.s_addr) {
/* return the value user has set */
addr = imo->imo_multicast_addr;
} else {
struct ifnet *ifp;
struct in_ifaddr *ia = NULL;
int s = pserialize_read_enter();
ifp = if_byindex(imo->imo_multicast_if_index);
if (ifp != NULL) {
ia = in_get_ia_from_ifp(ifp);
}
addr = ia ? ia->ia_addr.sin_addr : zeroin_addr;
pserialize_read_exit(s);
}
error = sockopt_set(sopt, &addr, sizeof(addr));
break;
case IP_MULTICAST_TTL:
optval = imo ? imo->imo_multicast_ttl
: IP_DEFAULT_MULTICAST_TTL;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
case IP_MULTICAST_LOOP:
optval = imo ? imo->imo_multicast_loop
: IP_DEFAULT_MULTICAST_LOOP;
error = sockopt_set(sopt, &optval, sizeof(optval));
break;
default:
error = EOPNOTSUPP;
}
return error;
}
/*
* Discard the IP multicast options.
*/
void
ip_freemoptions(struct ip_moptions *imo)
{
int i;
/* The owner of imo (inp) should be protected by solock */
if (imo != NULL) {
for (i = 0; i < imo->imo_num_memberships; ++i) {
struct in_multi *inm = imo->imo_membership[i];
in_delmulti(inm);
/* ifp should not leave thanks to solock */
}
kmem_intr_free(imo, sizeof(*imo));
}
}
/*
* Routine called from ip_output() to loop back a copy of an IP multicast
* packet to the input queue of a specified interface. Note that this
* calls the output routine of the loopback "driver", but with an interface
* pointer that might NOT be lo0ifp -- easier than replicating that code here.
*/
static void
ip_mloopback(struct ifnet *ifp, struct mbuf *m, const struct sockaddr_in *dst)
{
struct ip *ip;
struct mbuf *copym;
copym = m_copypacket(m, M_DONTWAIT);
if (copym != NULL && (copym->m_flags & M_EXT || copym->m_len < sizeof(struct ip)))
copym = m_pullup(copym, sizeof(struct ip));
if (copym == NULL)
return;
/*
* We don't bother to fragment if the IP length is greater
* than the interface's MTU. Can this possibly matter?
*/
ip = mtod(copym, struct ip *);
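/*
 * Complete any deferred TCP/UDP checksum here, since the looped-back
 * copy bypasses the interface's checksum offload, then recompute the
 * IP header checksum.
 */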
if (copym->m_pkthdr.csum_flags & (M_CSUM_TCPv4|M_CSUM_UDPv4)) {
in_undefer_cksum_tcpudp(copym);
copym->m_pkthdr.csum_flags &=
~(M_CSUM_TCPv4|M_CSUM_UDPv4);
}
ip->ip_sum = 0;
ip->ip_sum = in_cksum(copym, ip->ip_hl << 2);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
(void)looutput(ifp, copym, sintocsa(dst), NULL);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
/*
* Ensure sending address is valid.
* Returns 0 on success, -1 if an error should be sent back or 1
* if the packet could be dropped without error (protocol dependent).
*/
static int
ip_ifaddrvalid(const struct in_ifaddr *ia)
{
if (ia->ia_addr.sin_addr.s_addr == INADDR_ANY)
return 0;
if (ia->ia4_flags & IN_IFF_DUPLICATED)
return -1;
else if (ia->ia4_flags & (IN_IFF_TENTATIVE | IN_IFF_DETACHED))
return 1;
return 0;
}
/* $NetBSD: uvm_page_array.c,v 1.9 2020/05/26 21:52:12 ad Exp $ */
/*-
* Copyright (c)2011 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page_array.c,v 1.9 2020/05/26 21:52:12 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_page.h>
#include <uvm/uvm_page_array.h>
/*
* uvm_page_array_init: initialize the array.
*/
void
uvm_page_array_init(struct uvm_page_array *ar, struct uvm_object *uobj,
unsigned int flags)
{
ar->ar_idx = 0;
ar->ar_npages = 0;
ar->ar_uobj = uobj;
ar->ar_flags = flags;
}
/*
* uvm_page_array_fini: clean up the array.
*/
void
uvm_page_array_fini(struct uvm_page_array *ar)
{
/*
* currently nothing to do.
*/
#if defined(DIAGNOSTIC)
/*
* poison to trigger assertion in uvm_page_array_peek to
* detect usage errors.
*/
ar->ar_npages = 1;
ar->ar_idx = 1000;
#endif /* defined(DIAGNOSTIC) */
}
/*
* uvm_page_array_clear: forget the cached pages and initialize the array.
*/
void
uvm_page_array_clear(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
ar->ar_idx = 0;
ar->ar_npages = 0;
}
/*
* uvm_page_array_peek: return the next cached page.
*/
struct vm_page *
uvm_page_array_peek(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
if (ar->ar_idx == ar->ar_npages) {
return NULL;
}
return ar->ar_pages[ar->ar_idx];
}
/*
* uvm_page_array_advance: advance the array to the next cached page
*/
void
uvm_page_array_advance(struct uvm_page_array *ar)
{
KASSERT(ar->ar_idx <= ar->ar_npages);
ar->ar_idx++;
KASSERT(ar->ar_idx <= ar->ar_npages);
}
/*
* uvm_page_array_fill: lookup pages and keep them cached.
*
* return 0 on success. in that case, cache the result in the array
* so that they will be picked by later uvm_page_array_peek.
*
* nwant is a number of pages to fetch. a caller should consider it a hint.
* nwant == 0 means the caller has no specific preference.
*
* return ENOENT if no pages are found.
*
* called with object lock held.
*/
int
uvm_page_array_fill(struct uvm_page_array *ar, voff_t off, unsigned int nwant)
{
unsigned int npages;
#if defined(DEBUG)
unsigned int i;
#endif /* defined(DEBUG) */
unsigned int maxpages = __arraycount(ar->ar_pages);
struct uvm_object *uobj = ar->ar_uobj;
const int flags = ar->ar_flags;
const bool dense = (flags & UVM_PAGE_ARRAY_FILL_DENSE) != 0;
const bool backward = (flags & UVM_PAGE_ARRAY_FILL_BACKWARD) != 0;
int error = 0;
if (nwant != 0 && nwant < maxpages) {
maxpages = nwant;
}
#if 0 /* called from DDB for "show obj/f" without lock */
KASSERT(rw_lock_held(uobj->vmobjlock));
#endif
KASSERT(uvm_page_array_peek(ar) == NULL);
if ((flags & UVM_PAGE_ARRAY_FILL_DIRTY) != 0) {
unsigned int tagmask = UVM_PAGE_DIRTY_TAG;
if ((flags & UVM_PAGE_ARRAY_FILL_WRITEBACK) != 0) {
tagmask |= UVM_PAGE_WRITEBACK_TAG;
}
npages =
(backward ? radix_tree_gang_lookup_tagged_node_reverse :
radix_tree_gang_lookup_tagged_node)(
&uobj->uo_pages, off >> PAGE_SHIFT, (void **)ar->ar_pages,
maxpages, dense, tagmask);
} else {
npages =
(backward ? radix_tree_gang_lookup_node_reverse :
radix_tree_gang_lookup_node)(
&uobj->uo_pages, off >> PAGE_SHIFT, (void **)ar->ar_pages,
maxpages, dense);
}
if (npages == 0) {
if (flags != 0) {
/*
* if dense or looking for tagged entries (or
* working backwards), fail right away.
*/
npages = 0;
} else {
/*
* there's nothing else to be found with the current
* set of arguments, in the current version of the
* tree.
*
* minimize repeated tree lookups by "finding" a
* null pointer, in case the caller keeps looping (a
* common use case).
*/
npages = 1;
ar->ar_pages[0] = NULL;
}
error = ENOENT;
}
KASSERT(npages <= maxpages);
ar->ar_npages = npages;
ar->ar_idx = 0;
#if defined(DEBUG)
for (i = 0; error == 0 && i < ar->ar_npages; i++) {
struct vm_page * const pg = ar->ar_pages[i];
KASSERT(pg != NULL);
KDASSERT(pg->uobject == uobj);
if (backward) {
KDASSERT(pg->offset <= off);
KDASSERT(i == 0 ||
pg->offset < ar->ar_pages[i - 1]->offset);
} else {
KDASSERT(pg->offset >= off);
KDASSERT(i == 0 ||
pg->offset > ar->ar_pages[i - 1]->offset);
}
}
#endif /* defined(DEBUG) */
return error;
}
/*
* uvm_page_array_fill_and_peek:
* same as uvm_page_array_peek except that, if the array is empty, try to fill
* it first.
*/
struct vm_page *
uvm_page_array_fill_and_peek(struct uvm_page_array *ar, voff_t off,
unsigned int nwant)
{
int error;
if (ar->ar_idx != ar->ar_npages) {
return ar->ar_pages[ar->ar_idx];
}
error = uvm_page_array_fill(ar, off, nwant);
if (error != 0) {
return NULL;
}
return uvm_page_array_peek(ar);
}
/* $NetBSD: in_pcb.c,v 1.202 2022/11/04 09:05:41 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*-
* Copyright (c) 1998, 2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Coyote Point Systems, Inc.
* This code is derived from software contributed to The NetBSD Foundation
* by Public Access Networks Corporation ("Panix"). It was developed under
* contract to Panix by Eric Haszlakiewicz and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in_pcb.c,v 1.202 2022/11/04 09:05:41 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/once.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>
#include <sys/domain.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#include <netinet/portalgo.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet/tcp_vtw.h>
struct in_addr zeroin_addr;
#define INPCBHASH_PORT(table, lport) \
&(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash]
#define INPCBHASH_BIND(table, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
((ntohl((laddr).s_addr) + ntohs(lport))) & (table)->inpt_bindhash]
#define INPCBHASH_CONNECT(table, faddr, fport, laddr, lport) \
&(table)->inpt_connecthashtbl[ \
((ntohl((faddr).s_addr) + ntohs(fport)) + \
(ntohl((laddr).s_addr) + ntohs(lport))) & (table)->inpt_connecthash]
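/*
 * A PCB can be looked up three ways: by local port only (porthash), by
 * local address and port (bindhash), and by the full local/foreign
 * address and port pairs (connecthash).
 */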
int anonportmin = IPPORT_ANONMIN;
int anonportmax = IPPORT_ANONMAX;
int lowportmin = IPPORT_RESERVEDMIN;
int lowportmax = IPPORT_RESERVEDMAX;
static pool_cache_t in4pcb_pool_cache;
#ifdef INET6
static pool_cache_t in6pcb_pool_cache;
#endif
static int
inpcb_poolinit(void)
{
in4pcb_pool_cache = pool_cache_init(sizeof(struct in4pcb), coherency_unit,
0, 0, "in4pcbpl", NULL, IPL_NET, NULL, NULL, NULL);
#ifdef INET6
in6pcb_pool_cache = pool_cache_init(sizeof(struct in6pcb), coherency_unit,
0, 0, "in6pcbpl", NULL, IPL_NET, NULL, NULL, NULL);
#endif
return 0;
}
void
inpcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize)
{
static ONCE_DECL(control);
TAILQ_INIT(&table->inpt_queue);
table->inpt_porthashtbl = hashinit(bindhashsize, HASH_LIST, true,
&table->inpt_porthash);
table->inpt_bindhashtbl = hashinit(bindhashsize, HASH_LIST, true,
&table->inpt_bindhash);
table->inpt_connecthashtbl = hashinit(connecthashsize, HASH_LIST, true,
&table->inpt_connecthash);
table->inpt_lastlow = IPPORT_RESERVEDMAX;
table->inpt_lastport = (in_port_t)anonportmax;
RUN_ONCE(&control, inpcb_poolinit);
}
/*
* inpcb_create: construct a new PCB and associated with a given socket.
* Sets the PCB state to INP_ATTACHED and makes PCB globally visible.
*/
int
inpcb_create(struct socket *so, void *v)
{
struct inpcbtable *table = v;
struct inpcb *inp;
int s;
#ifdef INET6
KASSERT(soaf(so) == AF_INET || soaf(so) == AF_INET6);
if (soaf(so) == AF_INET)
inp = pool_cache_get(in4pcb_pool_cache, PR_NOWAIT);
else
inp = pool_cache_get(in6pcb_pool_cache, PR_NOWAIT);
#else
KASSERT(soaf(so) == AF_INET);
inp = pool_cache_get(in4pcb_pool_cache, PR_NOWAIT);
#endif
if (inp == NULL)
return ENOBUFS;
if (soaf(so) == AF_INET)
memset(inp, 0, sizeof(struct in4pcb));
#ifdef INET6
else
memset(inp, 0, sizeof(struct in6pcb));
#endif
inp->inp_af = soaf(so);
inp->inp_table = table;
inp->inp_socket = so;
inp->inp_portalgo = PORTALGO_DEFAULT;
inp->inp_bindportonsend = false;
if (inp->inp_af == AF_INET) {
in4p_errormtu(inp) = -1;
in4p_prefsrcip(inp).s_addr = INADDR_ANY;
}
#ifdef INET6
else {
in6p_hops6(inp) = -1; /* use kernel default */
if (ip6_v6only)
inp->inp_flags |= IN6P_IPV6_V6ONLY;
}
#endif
#if defined(IPSEC)
if (ipsec_enabled) {
int error = ipsec_init_pcbpolicy(so, &inp->inp_sp);
if (error != 0) {
#ifdef INET6
if (inp->inp_af == AF_INET)
pool_cache_put(in4pcb_pool_cache, inp);
else
pool_cache_put(in6pcb_pool_cache, inp);
#else
KASSERT(inp->inp_af == AF_INET);
pool_cache_put(in4pcb_pool_cache, inp);
#endif
return error;
}
inp->inp_sp->sp_inp = inp;
}
#endif
so->so_pcb = inp;
s = splsoftnet();
TAILQ_INSERT_HEAD(&table->inpt_queue, inp, inp_queue);
LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), inp,
inp_lhash);
inpcb_set_state(inp, INP_ATTACHED);
splx(s);
return 0;
}
static int
inpcb_set_port(struct sockaddr_in *sin, struct inpcb *inp, kauth_cred_t cred)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
in_port_t *lastport;
in_port_t lport = 0;
enum kauth_network_req req;
int error;
if (inp->inp_flags & INP_LOWPORT) {
#ifndef IPNOPRIVPORTS
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
#else
req = KAUTH_REQ_NETWORK_BIND_PORT;
#endif
lastport = &table->inpt_lastlow;
} else {
req = KAUTH_REQ_NETWORK_BIND_PORT;
lastport = &table->inpt_lastport;
}
/* XXX-kauth: KAUTH_REQ_NETWORK_BIND_AUTOASSIGN_{,PRIV}PORT */
error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req, so, sin,
NULL);
if (error)
return EACCES;
/*
* Use RFC6056 randomized port selection
*/
error = portalgo_randport(&lport, inp, cred);
if (error)
return error;
inp->inp_flags |= INP_ANONPORT;
*lastport = lport;
lport = htons(lport);
inp->inp_lport = lport;
inpcb_set_state(inp, INP_BOUND);
return 0;
}
int
inpcb_bindableaddr(const struct inpcb *inp, struct sockaddr_in *sin,
kauth_cred_t cred)
{
int error = EADDRNOTAVAIL;
struct ifaddr *ifa = NULL;
int s;
if (sin->sin_family != AF_INET)
return EAFNOSUPPORT;
s = pserialize_read_enter();
if (IN_MULTICAST(sin->sin_addr.s_addr)) {
/* Always succeed; port reuse handled in inpcb_bind_port(). */
} else if (!in_nullhost(sin->sin_addr)) {
struct in_ifaddr *ia;
ia = in_get_ia(sin->sin_addr);
/* check for broadcast addresses */
if (ia == NULL) {
ifa = ifa_ifwithaddr(sintosa(sin));
if (ifa != NULL)
ia = ifatoia(ifa);
else if ((inp->inp_flags & INP_BINDANY) != 0) {
error = 0;
goto error;
}
}
if (ia == NULL)
goto error;
if (ia->ia4_flags & IN_IFF_DUPLICATED)
goto error;
}
error = 0;
error:
pserialize_read_exit(s);
return error;
}
static int
inpcb_bind_addr(struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred)
{
int error;
error = inpcb_bindableaddr(inp, sin, cred);
if (error == 0)
in4p_laddr(inp) = sin->sin_addr;
return error;
}
static int
inpcb_bind_port(struct inpcb *inp, struct sockaddr_in *sin, kauth_cred_t cred)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
int reuseport = (so->so_options & SO_REUSEPORT);
int wild = 0, error;
if (IN_MULTICAST(sin->sin_addr.s_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT))
reuseport = SO_REUSEADDR|SO_REUSEPORT;
}
if (sin->sin_port == 0) {
error = inpcb_set_port(sin, inp, cred);
if (error)
return error;
} else {
struct inpcb *t;
vestigial_inpcb_t vestige;
#ifdef INET6
struct inpcb *t6;
struct in6_addr mapped;
#endif
enum kauth_network_req req;
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
wild = 1;
#ifndef IPNOPRIVPORTS
if (ntohs(sin->sin_port) < IPPORT_RESERVED)
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
else
#endif /* !IPNOPRIVPORTS */
req = KAUTH_REQ_NETWORK_BIND_PORT;
error = kauth_authorize_network(cred, KAUTH_NETWORK_BIND, req,
so, sin, NULL);
if (error)
return EACCES;
#ifdef INET6
in6_in_2_v4mapin6(&sin->sin_addr, &mapped);
t6 = in6pcb_lookup_local(table, &mapped, sin->sin_port, wild, &vestige);
if (t6 && (reuseport & t6->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t6 && vestige.valid) {
if (!!reuseport != !!vestige.reuse_port) {
return EADDRINUSE;
}
}
#endif
/* XXX-kauth */
if (so->so_uidinfo->ui_uid && !IN_MULTICAST(sin->sin_addr.s_addr)) {
t = inpcb_lookup_local(table, sin->sin_addr, sin->sin_port, 1, &vestige);
/*
* XXX: investigate ramifications of loosening this
* restriction so that as long as both ports have
* SO_REUSEPORT allow the bind
*/
if (t && (!in_nullhost(sin->sin_addr) || !in_nullhost(in4p_laddr(t)) ||
(t->inp_socket->so_options & SO_REUSEPORT) == 0)
&& (so->so_uidinfo->ui_uid != t->inp_socket->so_uidinfo->ui_uid)) {
return EADDRINUSE;
}
if (!t && vestige.valid) {
if ((!in_nullhost(sin->sin_addr) || !in_nullhost(vestige.laddr.v4) || !vestige.reuse_port)
&& so->so_uidinfo->ui_uid != vestige.uid) {
return EADDRINUSE;
}
}
}
t = inpcb_lookup_local(table, sin->sin_addr, sin->sin_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
inp->inp_lport = sin->sin_port;
inpcb_set_state(inp, INP_BOUND);
}
LIST_REMOVE(inp, inp_lhash);
LIST_INSERT_HEAD(INPCBHASH_PORT(table, inp->inp_lport), inp,
inp_lhash);
return 0;
}
/*
* inpcb_bind: assign a local IP address and port number to the PCB.
*
* If the address is not a wildcard, verify that it corresponds to a
* local interface. If a port is specified and it is privileged, then
* check the permission. Check whether the address or port is in use,
* and if so, whether we can re-use them.
*/
int
inpcb_bind(void *v, struct sockaddr_in *sin, struct lwp *l)
{
struct inpcb *inp = v;
struct sockaddr_in lsin;
int error;
if (inp->inp_af != AF_INET)
return EINVAL;
if (inp->inp_lport || !in_nullhost(in4p_laddr(inp)))
return EINVAL;
if (NULL != sin) {
if (sin->sin_len != sizeof(*sin))
return EINVAL;
} else {
lsin = *((const struct sockaddr_in *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
sin = &lsin;
}
/* Bind address. */
error = inpcb_bind_addr(inp, sin, l->l_cred);
if (error)
return error;
/* Bind port. */
error = inpcb_bind_port(inp, sin, l->l_cred);
if (error) {
in4p_laddr(inp).s_addr = INADDR_ANY;
return error;
}
return 0;
}
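/*
 * Illustrative sketch (not part of this file): how a protocol's
 * bind path might call inpcb_bind() with an explicit local address
 * and port.  The helper name and the surrounding context (an already
 * attached inpcb, the caller's lwp) are assumptions for the example.
 */
#if 0
static int
example_bind_to_port(struct inpcb *inp, struct in_addr addr,
    in_port_t port /* network byte order */, struct lwp *l)
{
	struct sockaddr_in sin;

	memset(&sin, 0, sizeof(sin));
	sin.sin_len = sizeof(sin);
	sin.sin_family = AF_INET;
	sin.sin_addr = addr;
	sin.sin_port = port;

	/* Verifies the address, checks port permissions and conflicts. */
	return inpcb_bind(inp, &sin, l);
}
#endif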
/*
* inpcb_connect: connect from a socket to a specified address, i.e.,
* assign a foreign IP address and port number to the PCB.
*
* Both address and port must be specified in the name argument.
* If there is no local address for this socket yet, then pick one.
*/
int
inpcb_connect(void *v, struct sockaddr_in *sin, struct lwp *l)
{
struct inpcb *inp = v;
vestigial_inpcb_t vestige;
int error;
struct in_addr laddr;
if (inp->inp_af != AF_INET)
return EINVAL;
if (sin->sin_len != sizeof (*sin))
return EINVAL;
if (sin->sin_family != AF_INET)
return EAFNOSUPPORT;
if (sin->sin_port == 0)
return EADDRNOTAVAIL;
if (IN_MULTICAST(sin->sin_addr.s_addr) &&
inp->inp_socket->so_type == SOCK_STREAM)
return EADDRNOTAVAIL;
if (!IN_ADDRLIST_READER_EMPTY()) {
/*
* If the destination address is INADDR_ANY,
* use any local address (likely loopback).
* If the supplied address is INADDR_BROADCAST,
* use the broadcast address of an interface
* which supports broadcast. (loopback does not)
*/
if (in_nullhost(sin->sin_addr)) {
/* XXX racy */
sin->sin_addr =
IN_ADDRLIST_READER_FIRST()->ia_addr.sin_addr;
} else if (sin->sin_addr.s_addr == INADDR_BROADCAST) {
struct in_ifaddr *ia;
int s = pserialize_read_enter();
IN_ADDRLIST_READER_FOREACH(ia) {
if (ia->ia_ifp->if_flags & IFF_BROADCAST) {
sin->sin_addr =
ia->ia_broadaddr.sin_addr;
break;
}
}
pserialize_read_exit(s);
}
}
/*
* If we haven't bound which network number to use as ours,
* we will use the number of the outgoing interface.
* This depends on having done a routing lookup, which
* we will probably have to do anyway, so we might
* as well do it now. On the other hand if we are
* sending to multiple destinations we may have already
* done the lookup, so see if we can use the route
* from before. In any case, we only
* choose a port number once, even if sending to multiple
* destinations.
*/
if (in_nullhost(in4p_laddr(inp))) {
int xerror;
struct in_ifaddr *ia, *_ia;
int s;
struct psref psref;
int bound;
bound = curlwp_bind();
ia = in_selectsrc(sin, &inp->inp_route,
inp->inp_socket->so_options, inp->inp_moptions, &xerror,
&psref);
if (ia == NULL) {
curlwp_bindx(bound);
if (xerror == 0)
xerror = EADDRNOTAVAIL;
return xerror;
}
s = pserialize_read_enter();
_ia = in_get_ia(IA_SIN(ia)->sin_addr);
if (_ia == NULL && (inp->inp_flags & INP_BINDANY) == 0) {
pserialize_read_exit(s);
ia4_release(ia, &psref);
curlwp_bindx(bound);
return EADDRNOTAVAIL;
}
pserialize_read_exit(s);
laddr = IA_SIN(ia)->sin_addr;
ia4_release(ia, &psref);
curlwp_bindx(bound);
} else
laddr = in4p_laddr(inp);
if (inpcb_lookup(inp->inp_table, sin->sin_addr, sin->sin_port, laddr, inp->inp_lport, &vestige) != NULL ||
vestige.valid) {
return EADDRINUSE;
}
if (in_nullhost(in4p_laddr(inp))) {
if (inp->inp_lport == 0) {
error = inpcb_bind(inp, NULL, l);
/*
* This used to ignore the return value
* completely, but we need to check for
* ephemeral port shortage and for attempts to
* request low ports when not root.
*/
if (error != 0)
return error;
}
in4p_laddr(inp) = laddr;
}
in4p_faddr(inp) = sin->sin_addr;
inp->inp_fport = sin->sin_port;
/* Late bind, if needed */
if (inp->inp_bindportonsend) {
struct sockaddr_in lsin = *((const struct sockaddr_in *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin.sin_addr = in4p_laddr(inp);
lsin.sin_port = 0;
if ((error = inpcb_bind_port(inp, &lsin, l->l_cred)) != 0)
return error;
}
inpcb_set_state(inp, INP_CONNECTED);
#if defined(IPSEC)
if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM)
ipsec_pcbconn(inp->inp_sp);
#endif
return 0;
}
/*
* inpcb_disconnect: remove any foreign IP/port association.
*
* Note: destroys the PCB if socket was closed.
*/
void
inpcb_disconnect(void *v)
{
struct inpcb *inp = v;
if (inp->inp_af != AF_INET)
return;
in4p_faddr(inp) = zeroin_addr;
inp->inp_fport = 0;
inpcb_set_state(inp, INP_BOUND);
#if defined(IPSEC)
if (ipsec_enabled)
ipsec_pcbdisconn(inp->inp_sp);
#endif
if (inp->inp_socket->so_state & SS_NOFDREF)
inpcb_destroy(inp);
}
/*
* inpcb_destroy: destroy PCB as well as the associated socket.
*/
void
inpcb_destroy(void *v)
{
struct inpcb *inp = v;
struct socket *so = inp->inp_socket;
int s;
KASSERT(inp->inp_af == AF_INET || inp->inp_af == AF_INET6);
#if defined(IPSEC)
if (ipsec_enabled)
ipsec_delete_pcbpolicy(inp);
#endif
so->so_pcb = NULL;
s = splsoftnet();
inpcb_set_state(inp, INP_ATTACHED);
LIST_REMOVE(inp, inp_lhash);
TAILQ_REMOVE(&inp->inp_table->inpt_queue, inp, inp_queue);
splx(s);
if (inp->inp_options) {
m_free(inp->inp_options);
}
rtcache_free(&inp->inp_route);
ip_freemoptions(inp->inp_moptions);
#ifdef INET6
if (inp->inp_af == AF_INET6) {
if (in6p_outputopts(inp) != NULL) {
ip6_clearpktopts(in6p_outputopts(inp), -1);
free(in6p_outputopts(inp), M_IP6OPT);
}
ip6_freemoptions(in6p_moptions(inp));
}
#endif
sofree(so); /* drops the socket's lock */
#ifdef INET6
if (inp->inp_af == AF_INET)
pool_cache_put(in4pcb_pool_cache, inp);
else
pool_cache_put(in6pcb_pool_cache, inp);
#else
KASSERT(inp->inp_af == AF_INET);
pool_cache_put(in4pcb_pool_cache, inp);
#endif
mutex_enter(softnet_lock); /* reacquire the softnet_lock */
}
/*
* inpcb_fetch_sockaddr: fetch the local IP address and port number.
*/
void
inpcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in *sin)
{
if (inp->inp_af != AF_INET)
return;
sockaddr_in_init(sin, &in4p_laddr(inp), inp->inp_lport);
}
/*
* inpcb_fetch_peeraddr: fetch the foreign IP address and port number.
*/
void
inpcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in *sin)
{
if (inp->inp_af != AF_INET)
return;
sockaddr_in_init(sin, &in4p_faddr(inp), inp->inp_fport);
}
/*
* inpcb_notify: pass some notification to all connections of a protocol
* associated with destination address. The local address and/or port
* numbers may be specified to limit the search. The "usual action" will
* be taken, depending on the command.
*
* The caller must filter any commands that are not interesting (e.g.,
* no error in the map). Call the protocol specific routine (if any) to
* report any errors for each matching socket.
*
* Must be called at splsoftnet.
*/
int
inpcb_notify(struct inpcbtable *table, struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg, int errno,
void (*notify)(struct inpcb *, int))
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
int nmatch;
if (in_nullhost(faddr) || notify == NULL)
return 0;
nmatch = 0;
head = INPCBHASH_CONNECT(table, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr) &&
inp->inp_fport == fport &&
inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr)) {
(*notify)(inp, errno);
nmatch++;
}
}
return nmatch;
}
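/*
 * Illustrative sketch (not part of this file): a protocol's ctlinput
 * handler typically maps an ICMP error to an errno and then walks the
 * matching PCBs with inpcb_notify() or inpcb_notifyall().  The callback
 * and the surrounding error-mapping context are hypothetical.
 */
#if 0
static void
example_notify(struct inpcb *inp, int error)
{
	/* A real protocol would record the error on the socket and
	 * wake up any waiters. */
	inp->inp_socket->so_error = error;
	sorwakeup(inp->inp_socket);
	sowwakeup(inp->inp_socket);
}

static void
example_ctlinput(struct inpcbtable *table, struct in_addr faddr,
    in_port_t fport, struct in_addr laddr, in_port_t lport, int error)
{
	/* Notify only exact 4-tuple matches ... */
	(void)inpcb_notify(table, faddr, fport, laddr, lport, error,
	    example_notify);
	/* ... or every connection to the foreign address. */
	inpcb_notifyall(table, faddr, error, example_notify);
}
#endif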
void
inpcb_notifyall(struct inpcbtable *table, struct in_addr faddr, int errno,
void (*notify)(struct inpcb *, int))
{
struct inpcb *inp;
if (in_nullhost(faddr) || notify == NULL)
return;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr))
(*notify)(inp, errno);
}
}
void
in_purgeifmcast(struct ip_moptions *imo, struct ifnet *ifp)
{
int i, gap;
/* The owner of imo should be protected by solock */
KASSERT(ifp != NULL);
if (imo == NULL)
return;
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (imo->imo_multicast_if_index == ifp->if_index)
imo->imo_multicast_if_index = 0;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
*/
for (i = 0, gap = 0; i < imo->imo_num_memberships; i++) {
if (imo->imo_membership[i]->inm_ifp == ifp) {
in_delmulti(imo->imo_membership[i]);
gap++;
} else if (gap != 0)
imo->imo_membership[i - gap] = imo->imo_membership[i];
}
imo->imo_num_memberships -= gap;
}
void
inpcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp)
{
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
bool need_unlock = false;
if (inp->inp_af != AF_INET)
continue;
/* The caller holds the lock of one of the inps */
if (!inp_locked(inp)) {
inp_lock(inp);
need_unlock = true;
}
in_purgeifmcast(inp->inp_moptions, ifp);
if (need_unlock)
inp_unlock(inp);
}
}
void
inpcb_purgeif(struct inpcbtable *table, struct ifnet *ifp)
{
struct rtentry *rt;
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET)
continue;
if ((rt = rtcache_validate(&inp->inp_route)) != NULL &&
rt->rt_ifp == ifp) {
rtcache_unref(rt, &inp->inp_route);
inpcb_rtchange(inp, 0);
} else
rtcache_unref(rt, &inp->inp_route);
}
}
/*
* inpcb_losing: check for alternatives when higher level complains about
* service problems. For now, invalidate cached routing information.
* If the route was created dynamically (by a redirect), time to try a
* default gateway again.
*/
void
inpcb_losing(struct inpcb *inp)
{
struct rtentry *rt;
struct rt_addrinfo info;
if (inp->inp_af != AF_INET)
return;
if ((rt = rtcache_validate(&inp->inp_route)) == NULL)
return;
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_DST] = rtcache_getdst(&inp->inp_route);
info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
info.rti_info[RTAX_NETMASK] = rt_mask(rt);
rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
if (rt->rt_flags & RTF_DYNAMIC) {
int error;
struct rtentry *nrt;
error = rtrequest(RTM_DELETE, rt_getkey(rt),
rt->rt_gateway, rt_mask(rt), rt->rt_flags, &nrt);
rtcache_unref(rt, &inp->inp_route);
if (error == 0) {
rt_newmsg_dynamic(RTM_DELETE, nrt);
rt_free(nrt);
}
} else
rtcache_unref(rt, &inp->inp_route);
/*
* A new route can be allocated
* the next time output is attempted.
*/
rtcache_free(&inp->inp_route);
}
/*
* inpcb_rtchange: after a routing change, flush old routing.
* A new route can be allocated the next time output is attempted.
*/
void
inpcb_rtchange(struct inpcb *inp, int errno)
{
if (inp->inp_af != AF_INET)
return;
rtcache_free(&inp->inp_route);
/* XXX SHOULD NOTIFY HIGHER-LEVEL PROTOCOLS */
}
/*
* inpcb_lookup_local: find a PCB by looking at the local port and matching
* the local address or resolving the wildcards. Primarily used to detect
* when the local address is already in use.
*/
struct inpcb *
inpcb_lookup_local(struct inpcbtable *table, struct in_addr laddr,
u_int lport_arg, int lookup_wildcard, vestigial_inpcb_t *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
struct inpcb *match = NULL;
int matchwild = 3;
int wildcard;
in_port_t lport = lport_arg;
if (vp)
vp->valid = 0;
head = INPCBHASH_PORT(table, lport);
LIST_FOREACH(inp, head, inp_lhash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport != lport)
continue;
/*
* check if inp's faddr and laddr match with ours.
* our faddr is considered null.
* count the number of wildcard matches. (0 - 2)
*
* null null match
* A null wildcard match
* null B wildcard match
* A B non match
* A A match
*/
wildcard = 0;
if (!in_nullhost(in4p_faddr(inp)))
wildcard++;
if (in_nullhost(in4p_laddr(inp))) {
if (!in_nullhost(laddr))
wildcard++;
} else {
if (in_nullhost(laddr))
wildcard++;
else {
if (!in_hosteq(in4p_laddr(inp), laddr))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
/*
* prefer an address with fewer wildcards.
*/
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (match && matchwild == 0)
return match;
if (vp && table->vestige) {
void *state = (*table->vestige->init_ports4)(laddr, lport_arg, lookup_wildcard);
vestigial_inpcb_t better;
bool has_better = false;
while (table->vestige && (*table->vestige->next_port4)(state, vp)) {
if (vp->lport != lport)
continue;
wildcard = 0;
if (!in_nullhost(vp->faddr.v4))
wildcard++;
if (in_nullhost(vp->laddr.v4)) {
if (!in_nullhost(laddr))
wildcard++;
} else {
if (in_nullhost(laddr))
wildcard++;
else {
if (!in_hosteq(vp->laddr.v4, laddr))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
better = *vp;
has_better = true;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (has_better) {
*vp = better;
return 0;
}
}
return match;
}
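/*
 * Illustrative sketch (not part of this file): the wildcard count used
 * by inpcb_lookup_local() above, pulled out as a stand-alone helper so
 * the comment table reads directly as code.  The helper name is
 * hypothetical.
 */
#if 0
static int
example_wildcard_count(struct inpcb *inp, struct in_addr laddr)
{
	int wildcard = 0;

	/* Our (the searcher's) faddr is null, so a bound faddr on the
	 * candidate counts as one wildcard component. */
	if (!in_nullhost(in4p_faddr(inp)))
		wildcard++;

	/* A null laddr on exactly one side counts as a wildcard;
	 * two different non-null laddrs do not match at all. */
	if (in_nullhost(in4p_laddr(inp))) {
		if (!in_nullhost(laddr))
			wildcard++;
	} else if (in_nullhost(laddr)) {
		wildcard++;
	} else if (!in_hosteq(in4p_laddr(inp), laddr)) {
		return -1;		/* no match */
	}

	return wildcard;		/* 0, 1 or 2 */
}
#endif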
#ifdef DIAGNOSTIC
int inpcb_notifymiss = 0;
#endif
/*
* inpcb_lookup: perform a full 4-tuple PCB lookup.
*/
struct inpcb *
inpcb_lookup(struct inpcbtable *table,
struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg,
vestigial_inpcb_t *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
if (vp)
vp->valid = 0;
head = INPCBHASH_CONNECT(table, faddr, fport, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (in_hosteq(in4p_faddr(inp), faddr) && inp->inp_fport == fport && inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr))
goto out;
}
if (vp && table->vestige) {
if ((*table->vestige->lookup4)(faddr, fport_arg,
laddr, lport_arg, vp))
return 0;
}
#ifdef DIAGNOSTIC
if (inpcb_notifymiss) {
printf("inpcb_lookup: faddr=%08x fport=%d laddr=%08x lport=%d\n",
ntohl(faddr.s_addr), ntohs(fport),
ntohl(laddr.s_addr), ntohs(lport));
}
#endif
return 0;
out:
/* Move this PCB to the head of hash chain. */
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
/*
* inpcb_lookup_bound: find a PCB by looking at the local address and port.
* Primarily used to find the listening (i.e., already bound) socket.
*/
struct inpcb *
inpcb_lookup_bound(struct inpcbtable *table,
struct in_addr laddr, u_int lport_arg)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t lport = lport_arg;
head = INPCBHASH_BIND(table, laddr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), laddr))
goto out;
}
head = INPCBHASH_BIND(table, zeroin_addr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET)
continue;
if (inp->inp_lport == lport &&
in_hosteq(in4p_laddr(inp), zeroin_addr))
goto out;
}
#ifdef DIAGNOSTIC
if (inpcb_notifymiss) {
printf("inpcb_lookup_bound: laddr=%08x lport=%d\n",
ntohl(laddr.s_addr), ntohs(lport));
}
#endif
return 0;
out:
/* Move this PCB to the head of hash chain. */
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
void
inpcb_set_state(struct inpcb *inp, int state)
{
#ifdef INET6
if (inp->inp_af == AF_INET6) {
in6pcb_set_state(inp, state);
return;
}
#else
if (inp->inp_af != AF_INET)
return;
#endif
if (inp->inp_state > INP_ATTACHED)
LIST_REMOVE(inp, inp_hash);
switch (state) {
case INP_BOUND:
LIST_INSERT_HEAD(INPCBHASH_BIND(inp->inp_table,
in4p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
case INP_CONNECTED:
LIST_INSERT_HEAD(INPCBHASH_CONNECT(inp->inp_table,
in4p_faddr(inp), inp->inp_fport,
in4p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
}
inp->inp_state = state;
}
struct rtentry *
inpcb_rtentry(struct inpcb *inp)
{
struct route *ro;
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
#ifdef INET6
if (inp->inp_af == AF_INET6)
return in6pcb_rtentry(inp);
#endif
if (inp->inp_af != AF_INET)
return NULL;
ro = &inp->inp_route;
sockaddr_in_init(&u.dst4, &in4p_faddr(inp), 0);
return rtcache_lookup(ro, &u.dst);
}
void
inpcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp)
{
rtcache_unref(rt, &inp->inp_route);
}
/* $NetBSD: tmpfs_specops.c,v 1.16 2021/07/19 01:30:25 dholland Exp $ */
/*
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* tmpfs vnode interface for special devices.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_specops.c,v 1.16 2021/07/19 01:30:25 dholland Exp $");
#include <sys/param.h>
#include <sys/vnode.h>
#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_specops.h>
/*
* vnode operations vector used for special devices stored in a tmpfs
* file system.
*/
int (**tmpfs_specop_p)(void *);
const struct vnodeopv_entry_desc tmpfs_specop_entries[] = {
{ &vop_default_desc, vn_default_error },
GENFS_SPECOP_ENTRIES,
{ &vop_close_desc, tmpfs_spec_close },
{ &vop_access_desc, tmpfs_access },
{ &vop_accessx_desc, genfs_accessx },
{ &vop_getattr_desc, tmpfs_getattr },
{ &vop_setattr_desc, tmpfs_setattr },
{ &vop_read_desc, tmpfs_spec_read },
{ &vop_write_desc, tmpfs_spec_write },
{ &vop_fcntl_desc, genfs_fcntl },
{ &vop_fsync_desc, spec_fsync },
{ &vop_inactive_desc, tmpfs_inactive },
{ &vop_reclaim_desc, tmpfs_reclaim },
{ &vop_lock_desc, genfs_lock },
{ &vop_unlock_desc, genfs_unlock },
{ &vop_print_desc, tmpfs_print },
{ &vop_islocked_desc, genfs_islocked },
{ &vop_bwrite_desc, vn_bwrite },
{ NULL, NULL }
};
const struct vnodeopv_desc tmpfs_specop_opv_desc = {
&tmpfs_specop_p, tmpfs_specop_entries
};
int
tmpfs_spec_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap __unused = v;
return VOCALL(spec_vnodeop_p, VOFFSET(vop_close), v);
}
int
tmpfs_spec_read(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
tmpfs_update(vp, TMPFS_UPDATE_ATIME);
return VOCALL(spec_vnodeop_p, VOFFSET(vop_read), v);
}
int
tmpfs_spec_write(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
tmpfs_update(vp, TMPFS_UPDATE_MTIME);
return VOCALL(spec_vnodeop_p, VOFFSET(vop_write), v);
}
/* $NetBSD: clock_subr.c,v 1.27 2016/08/15 15:51:39 jakllsch Exp $ */
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1982, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: clock.c 1.18 91/01/21$
*
* @(#)clock.c 8.2 (Berkeley) 1/12/94
*/
/*
* Generic routines to convert between a POSIX date
* (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
* Derived from arch/hp300/hp300/clock.c
*/
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif /* HAVE_NBTOOL_CONFIG_H */
#ifdef _KERNEL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clock_subr.c,v 1.27 2016/08/15 15:51:39 jakllsch Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#else /* ! _KERNEL */
#include <string.h>
#include <time.h>
#include <errno.h>
#endif /* ! _KERNEL */
#include "../sys/clock.h"
#include <dev/clock_subr.h>
#define FEBRUARY 2
/* for easier alignment:
* time from the epoch to 2001 (there were 8 leap years): */
#define DAYSTO2001 (365*31+8)
/* 4 year intervals include 1 leap year */
#define DAYS4YEARS (365*4+1)
/* 100 year intervals include 24 leap years */
#define DAYS100YEARS (365*100+24)
/* 400 year intervals include 97 leap years */
#define DAYS400YEARS (365*400+97)
time_t
clock_ymdhms_to_secs(struct clock_ymdhms *dt)
{
uint64_t secs, i, year, days;
year = dt->dt_year;
/*
* Compute days since start of time
* First from years, then from months.
*/
if (year < POSIX_BASE_YEAR)
return -1;
days = 0;
if (is_leap_year(year) && dt->dt_mon > FEBRUARY)
days++;
if (year < 2001) {
/* simple way for early years */
for (i = POSIX_BASE_YEAR; i < year; i++)
days += days_per_year(i);
} else {
/* years are properly aligned */
days += DAYSTO2001;
year -= 2001;
i = year / 400;
days += i * DAYS400YEARS;
year -= i * 400;
i = year / 100;
days += i * DAYS100YEARS;
year -= i * 100;
i = year / 4;
days += i * DAYS4YEARS;
year -= i * 4;
for (i = dt->dt_year-year; i < dt->dt_year; i++)
days += days_per_year(i);
}
/* Months */
for (i = 1; i < dt->dt_mon; i++)
days += days_in_month(i);
days += (dt->dt_day - 1);
/* Add hours, minutes, seconds. */
secs = (((uint64_t)days
* 24 + dt->dt_hour)
* 60 + dt->dt_min)
* 60 + dt->dt_sec;
if ((time_t)secs < 0 || secs > __type_max(time_t))
return -1;
return secs;
}
int
clock_secs_to_ymdhms(time_t secs, struct clock_ymdhms *dt)
{
int leap;
uint64_t i;
time_t days;
time_t rsec; /* remainder seconds */
if (secs < 0)
return EINVAL;
days = secs / SECS_PER_DAY;
rsec = secs % SECS_PER_DAY;
/* Day of week (Note: 1/1/1970 was a Thursday) */
dt->dt_wday = (days + 4) % 7;
if (days >= DAYSTO2001) {
days -= DAYSTO2001;
dt->dt_year = 2001;
i = days / DAYS400YEARS;
days -= i*DAYS400YEARS;
dt->dt_year += i*400;
i = days / DAYS100YEARS;
days -= i*DAYS100YEARS;
dt->dt_year += i*100;
i = days / DAYS4YEARS;
days -= i*DAYS4YEARS;
dt->dt_year += i*4;
for (i = dt->dt_year; days >= days_per_year(i); i++)
days -= days_per_year(i);
dt->dt_year = i;
} else {
/* Subtract out whole years, counting them in i. */
for (i = POSIX_BASE_YEAR; days >= days_per_year(i); i++)
days -= days_per_year(i);
dt->dt_year = i;
}
/* Subtract out whole months, counting them in i. */
for (leap = 0, i = 1; days >= days_in_month(i)+leap; i++) {
days -= days_in_month(i)+leap;
if (i == 1 && is_leap_year(dt->dt_year))
leap = 1;
else
leap = 0;
}
dt->dt_mon = i;
/* Days are what is left over (+1) from all that. */
dt->dt_day = days + 1;
/* Hours, minutes, seconds are easy */
dt->dt_hour = rsec / SECS_PER_HOUR;
rsec = rsec % SECS_PER_HOUR;
dt->dt_min = rsec / SECS_PER_MINUTE;
rsec = rsec % SECS_PER_MINUTE;
dt->dt_sec = rsec;
return 0;
}
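/*
 * Illustrative sketch (not part of this file, kernel build assumed for
 * KASSERT): round-tripping a known date through the two conversions.
 * 2001-01-01T00:00:00 UTC is DAYSTO2001 days after the epoch, i.e.
 * 11323 * 86400 = 978307200 seconds, so both directions can be checked
 * against that constant.
 */
#if 0
static void
example_clock_roundtrip(void)
{
	struct clock_ymdhms dt;
	time_t secs;

	memset(&dt, 0, sizeof(dt));
	dt.dt_year = 2001;
	dt.dt_mon = 1;		/* January */
	dt.dt_day = 1;
	dt.dt_hour = dt.dt_min = dt.dt_sec = 0;

	secs = clock_ymdhms_to_secs(&dt);
	KASSERT(secs == (time_t)DAYSTO2001 * SECS_PER_DAY);	/* 978307200 */

	if (clock_secs_to_ymdhms(secs, &dt) == 0) {
		KASSERT(dt.dt_year == 2001 && dt.dt_mon == 1 && dt.dt_day == 1);
		KASSERT(dt.dt_wday == 1);	/* 2001-01-01 was a Monday */
	}
}
#endif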
/* $NetBSD: ipsec.h,v 1.93 2022/10/28 05:23:09 ozaki-r Exp $ */
/* $FreeBSD: ipsec.h,v 1.2.4.2 2004/02/14 22:23:23 bms Exp $ */
/* $KAME: ipsec.h,v 1.53 2001/11/20 08:32:38 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _NETIPSEC_IPSEC_H_
#define _NETIPSEC_IPSEC_H_
#if defined(_KERNEL_OPT)
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <net/pfkeyv2.h>
#ifdef _KERNEL
#include <sys/socketvar.h>
#include <sys/localcount.h>
#include <netinet/in_pcb.h>
#include <netipsec/keydb.h>
/*
* Security Policy Index
* Ensure that the address families in "src" and "dst" are the same.
* When the value of the ul_proto is ICMPv6, the port field in "src"
* specifies ICMPv6 type, and the port field in "dst" specifies ICMPv6 code.
*/
struct secpolicyindex {
u_int8_t dir; /* direction of packet flow, see below */
union sockaddr_union src; /* IP src address for SP */
union sockaddr_union dst; /* IP dst address for SP */
u_int8_t prefs; /* prefix length in bits for src */
u_int8_t prefd; /* prefix length in bits for dst */
u_int16_t ul_proto; /* upper layer Protocol */
};
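/*
 * Illustrative sketch (not part of this header): filling a selector for
 * an ICMPv6 flow, where, as noted above, the "port" fields carry the
 * ICMPv6 type and code rather than transport ports.  The helper name
 * and the use of the sockaddr_in6 member of sockaddr_union are
 * assumptions made for the example.
 */
#if 0
static void
example_fill_icmp6_spidx(struct secpolicyindex *spidx,
    const struct in6_addr *src, const struct in6_addr *dst,
    uint8_t type, uint8_t code)
{
	memset(spidx, 0, sizeof(*spidx));
	spidx->dir = IPSEC_DIR_OUTBOUND;
	spidx->ul_proto = IPPROTO_ICMPV6;

	spidx->src.sin6.sin6_family = AF_INET6;
	spidx->src.sin6.sin6_len = sizeof(struct sockaddr_in6);
	spidx->src.sin6.sin6_addr = *src;
	spidx->src.sin6.sin6_port = htons(type);	/* ICMPv6 type */
	spidx->prefs = 128;

	spidx->dst.sin6.sin6_family = AF_INET6;
	spidx->dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
	spidx->dst.sin6.sin6_addr = *dst;
	spidx->dst.sin6.sin6_port = htons(code);	/* ICMPv6 code */
	spidx->prefd = 128;
}
#endif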
/* Security Policy Data Base */
struct secpolicy {
struct pslist_entry pslist_entry;
struct localcount localcount; /* reference count */
struct secpolicyindex spidx; /* selector */
u_int32_t id; /* It's a unique number on the system. */
u_int state; /* 0: dead, others: alive */
#define IPSEC_SPSTATE_DEAD 0
#define IPSEC_SPSTATE_ALIVE 1
u_int origin; /* who generated this SP. */
#define IPSEC_SPORIGIN_USER 0
#define IPSEC_SPORIGIN_KERNEL 1
u_int policy; /* DISCARD, NONE or IPSEC, see keyv2.h */
struct ipsecrequest *req;
/* pointer to the ipsec request tree, */
/* if policy == IPSEC else this value == NULL.*/
/*
* lifetime handler.
* the policy can be used without limitation if both lifetime and
* validtime are zero.
* "lifetime" is passed by sadb_lifetime.sadb_lifetime_addtime.
* "validtime" is passed by sadb_lifetime.sadb_lifetime_usetime.
*/
time_t created; /* time the policy was created */
time_t lastused; /* updated whenever the kernel sends a packet */
time_t lifetime; /* duration of the lifetime of this policy */
time_t validtime; /* duration this policy is valid without use */
};
/* Request for IPsec */
struct ipsecrequest {
struct ipsecrequest *next;
/* pointer to next structure */
/* If NULL, it means the end of chain. */
struct secasindex saidx;/* hint for search proper SA */
/* if __ss_len == 0 then no address specified.*/
u_int level; /* IPsec level defined below. */
struct secpolicy *sp; /* back pointer to SP */
};
/* security policy in PCB */
struct inpcbpolicy {
struct secpolicy *sp_in;
struct secpolicy *sp_out;
int priv; /* privileged socket ? */
/* cached policy */
struct {
struct secpolicy *cachesp;
struct secpolicyindex cacheidx;
int cachehint; /* processing requirement hint: */
#define IPSEC_PCBHINT_UNKNOWN 0 /* Unknown */
#define IPSEC_PCBHINT_YES 1 /* IPsec processing is required */
#define IPSEC_PCBHINT_NO 2 /* IPsec processing not required */
u_int cachegen; /* spdgen when cache filled */
} sp_cache[3]; /* XXX 3 == IPSEC_DIR_MAX */
int sp_cacheflags;
#define IPSEC_PCBSP_CONNECTED 1
struct inpcb *sp_inp; /* back pointer */
};
extern u_int ipsec_spdgen;
static __inline bool
ipsec_pcb_skip_ipsec(struct inpcbpolicy *pcbsp, int dir)
{
KASSERT(inp_locked(pcbsp->sp_inp));
return pcbsp->sp_cache[(dir)].cachehint == IPSEC_PCBHINT_NO &&
pcbsp->sp_cache[(dir)].cachegen == ipsec_spdgen;
}
/* SP acquiring list table. */
struct secspacq {
LIST_ENTRY(secspacq) chain;
struct secpolicyindex spidx;
time_t created; /* for lifetime */
int count; /* for lifetime */
/* XXX: here is mbuf place holder to be sent ? */
};
#endif /* _KERNEL */
/* buffer size for formatted output of ipsec address (addr + '%' + scope_id?) */
#define IPSEC_ADDRSTRLEN (INET6_ADDRSTRLEN + 11)
/* buffer size for ipsec_logsastr() */
#define IPSEC_LOGSASTRLEN 192
/* according to IANA assignment, port 0x0000 and proto 0xff are reserved. */
#define IPSEC_PORT_ANY 0
#define IPSEC_ULPROTO_ANY 255
#define IPSEC_PROTO_ANY 255
/* mode of security protocol */
/* NOTE: DON'T use IPSEC_MODE_ANY at SPD. It's only used in SAD. */
#define IPSEC_MODE_ANY 0 /* i.e. wildcard. */
#define IPSEC_MODE_TRANSPORT 1
#define IPSEC_MODE_TUNNEL 2
#define IPSEC_MODE_TCPMD5 3 /* TCP MD5 mode */
/*
* Direction of security policy.
* NOTE: INVALID is used only as a flag.
* The others are also used as loop counters.
*/
#define IPSEC_DIR_ANY 0
#define IPSEC_DIR_INBOUND 1
#define IPSEC_DIR_OUTBOUND 2
#define IPSEC_DIR_MAX 3
#define IPSEC_DIR_INVALID 4
#define IPSEC_DIR_IS_VALID(dir) ((dir) >= 0 && (dir) <= IPSEC_DIR_MAX)
#define IPSEC_DIR_IS_INOROUT(dir) ((dir) == IPSEC_DIR_INBOUND || \
(dir) == IPSEC_DIR_OUTBOUND)
/* Policy level */
/*
* IPSEC, ENTRUST and BYPASS are allowed for setsockopt() in PCB,
* DISCARD, IPSEC and NONE are allowed for setkey() in SPD.
* DISCARD and NONE are allowed for system default.
*/
#define IPSEC_POLICY_DISCARD 0 /* discarding packet */
#define IPSEC_POLICY_NONE 1 /* through IPsec engine */
#define IPSEC_POLICY_IPSEC 2 /* do IPsec */
#define IPSEC_POLICY_ENTRUST 3 /* consulting SPD if present. */
#define IPSEC_POLICY_BYPASS 4 /* only for privileged socket. */
/* Security protocol level */
#define IPSEC_LEVEL_DEFAULT 0 /* reference to system default */
#define IPSEC_LEVEL_USE 1 /* use SA if present. */
#define IPSEC_LEVEL_REQUIRE 2 /* require SA. */
#define IPSEC_LEVEL_UNIQUE 3 /* unique SA. */
#define IPSEC_MANUAL_REQID_MAX 0x3fff
/*
* If the security policy level == unique, this id
* indicates the corresponding SA to use; otherwise
* it is zero.
* 1 - 0x3fff are reserved for manual keying.
* 0 is reserved for the above reason.  The rest is
* for kernel use.
* Note that this id does not identify the SA by
* itself.
*/
#define IPSEC_REPLAYWSIZE 32
#ifdef _KERNEL
extern int ipsec_debug;
#ifdef IPSEC_DEBUG
extern int ipsec_replay;
extern int ipsec_integrity;
#endif
extern struct secpolicy ip4_def_policy;
extern int ip4_esp_trans_deflev;
extern int ip4_esp_net_deflev;
extern int ip4_ah_trans_deflev;
extern int ip4_ah_net_deflev;
extern int ip4_ah_cleartos;
extern int ip4_ah_offsetmask;
extern int ip4_ipsec_dfbit;
extern int ip4_ipsec_ecn;
extern int crypto_support;
#include <sys/syslog.h>
#define DPRINTF(fmt, args...) \
do { \
if (ipsec_debug) \
log(LOG_DEBUG, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#define IPSECLOG(level, fmt, args...) \
do { \
if (ipsec_debug) \
log(level, "%s: " fmt, __func__, ##args); \
} while (/*CONSTCOND*/0)
#define ipsec_indone(m) \
((m->m_flags & M_AUTHIPHDR) || (m->m_flags & M_DECRYPTED))
#define ipsec_outdone(m) \
(m_tag_find((m), PACKET_TAG_IPSEC_OUT_DONE) != NULL)
static __inline bool
ipsec_skip_pfil(struct mbuf *m)
{
bool rv;
if (ipsec_indone(m) &&
((m->m_pkthdr.pkthdr_flags & PKTHDR_FLAG_IPSEC_SKIP_PFIL) != 0)) {
m->m_pkthdr.pkthdr_flags &= ~PKTHDR_FLAG_IPSEC_SKIP_PFIL;
rv = true;
} else {
rv = false;
}
return rv;
}
void ipsec_pcbconn(struct inpcbpolicy *);
void ipsec_pcbdisconn(struct inpcbpolicy *);
void ipsec_invalpcbcacheall(void);
struct inpcb;
int ipsec4_output(struct mbuf *, struct inpcb *, int, u_long *, bool *, bool *, bool *);
int ipsec_ip_input_checkpolicy(struct mbuf *, bool);
void ipsec_mtu(struct mbuf *, int *);
#ifdef INET6
void ipsec6_udp_cksum(struct mbuf *);
#endif
struct inpcb;
int ipsec_init_pcbpolicy(struct socket *so, struct inpcbpolicy **);
int ipsec_copy_policy(const struct inpcbpolicy *, struct inpcbpolicy *);
u_int ipsec_get_reqlevel(const struct ipsecrequest *);
int ipsec_set_policy(struct inpcb *, const void *, size_t, kauth_cred_t);
int ipsec_get_policy(struct inpcb *, const void *, size_t, struct mbuf **);
int ipsec_delete_pcbpolicy(struct inpcb *);
int ipsec_in_reject(struct mbuf *, struct inpcb *);
struct secasvar *ipsec_lookup_sa(const struct ipsecrequest *,
const struct mbuf *);
struct secas;
struct tcpcb;
int ipsec_chkreplay(u_int32_t, const struct secasvar *);
int ipsec_updatereplay(u_int32_t, const struct secasvar *);
size_t ipsec_hdrsiz(struct mbuf *, u_int, struct inpcb *);
size_t ipsec4_hdrsiz_tcp(struct tcpcb *);
union sockaddr_union;
const char *ipsec_address(const union sockaddr_union* sa, char *, size_t);
const char *ipsec_logsastr(const struct secasvar *, char *, size_t);
/* NetBSD protosw ctlin entrypoint */
void *esp4_ctlinput(int, const struct sockaddr *, void *);
void *ah4_ctlinput(int, const struct sockaddr *, void *);
void ipsec_output_init(void);
struct m_tag;
void ipsec4_common_input(struct mbuf *m, int, int);
int ipsec4_common_input_cb(struct mbuf *, struct secasvar *, int, int);
int ipsec4_process_packet(struct mbuf *, const struct ipsecrequest *, u_long *);
int ipsec_process_done(struct mbuf *, const struct ipsecrequest *,
struct secasvar *, int);
struct mbuf *m_clone(struct mbuf *);
struct mbuf *m_makespace(struct mbuf *, int, int, int *);
void *m_pad(struct mbuf *, int);
int m_striphdr(struct mbuf *, int, int);
extern int ipsec_used __read_mostly;
extern int ipsec_enabled __read_mostly;
#endif /* _KERNEL */
#ifndef _KERNEL
char *ipsec_set_policy(const char *, int);
int ipsec_get_policylen(char *);
char *ipsec_dump_policy(char *, const char *);
const char *ipsec_strerror(void);
#endif /* !_KERNEL */
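/*
 * Illustrative sketch (not part of this header): how a userland program
 * might use the libipsec helpers declared above to install a per-socket
 * policy.  The policy string, includes and socket setup are assumptions
 * for the example; see ipsec_set_policy(3) for authoritative usage.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int
example_require_esp(int s)
{
	const char *policy = "out ipsec esp/transport//require";
	char *buf;
	int len, error;

	/* Compile the textual policy into a sadb_x_policy buffer. */
	buf = ipsec_set_policy(policy, strlen(policy));
	if (buf == NULL) {
		fprintf(stderr, "%s\n", ipsec_strerror());
		return -1;
	}
	len = ipsec_get_policylen(buf);
	error = setsockopt(s, IPPROTO_IP, IP_IPSEC_POLICY, buf, len);
	free(buf);
	return error;
}
#endif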
#ifdef _KERNEL
/* External declarations of per-file init functions */
void ah_attach(void);
void esp_attach(void);
void ipcomp_attach(void);
void ipe4_attach(void);
void tcpsignature_attach(void);
void ipsec_attach(void);
void sysctl_net_inet_ipsec_setup(struct sysctllog **);
#ifdef INET6
void sysctl_net_inet6_ipsec6_setup(struct sysctllog **);
#endif
#endif /* _KERNEL */
#endif /* !_NETIPSEC_IPSEC_H_ */
/* $NetBSD: nfs_export.c,v 1.63 2021/06/04 10:44:58 hannken Exp $ */
/*-
* Copyright (c) 1997, 1998, 2004, 2005, 2008, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
/*
* VFS exports list management.
*
* Lock order: vfs_busy -> mnt_updating -> netexport_lock.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nfs_export.c,v 1.63 2021/06/04 10:44:58 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/dirent.h>
#include <sys/socket.h> /* XXX for AF_MAX */
#include <sys/kauth.h>
#include <net/radix.h>
#include <netinet/in.h>
#include <nfs/rpcv2.h>
#include <nfs/nfsproto.h>
#include <nfs/nfs.h>
#include <nfs/nfs_var.h>
/*
* Network address lookup element.
*/
struct netcred {
struct radix_node netc_rnodes[2];
int netc_refcnt;
int netc_exflags;
kauth_cred_t netc_anon;
};
/*
* Network export information.
*/
struct netexport {
TAILQ_ENTRY(netexport) ne_list;
struct mount *ne_mount;
struct netcred ne_defexported; /* Default export */
struct radix_node_head *ne_rtable[AF_MAX+1]; /* Individual exports */
};
TAILQ_HEAD(, netexport) netexport_list =
TAILQ_HEAD_INITIALIZER(netexport_list);
/* Publicly exported file system. */
struct nfs_public nfs_pub;
/*
* Local prototypes.
*/
static int init_exports(struct mount *, struct netexport **);
static int hang_addrlist(struct mount *, struct netexport *,
const struct export_args *);
static int sacheck(struct sockaddr *);
static int free_netcred(struct radix_node *, void *);
static int export(struct netexport *, const struct export_args *);
static int setpublicfs(struct mount *, struct netexport *,
const struct export_args *);
static struct netcred *netcred_lookup(struct netexport *, struct mbuf *);
static struct netexport *netexport_lookup(const struct mount *);
static struct netexport *netexport_lookup_byfsid(const fsid_t *);
static void netexport_clear(struct netexport *);
static void netexport_insert(struct netexport *);
static void netexport_remove(struct netexport *);
static void netexport_wrlock(void);
static void netexport_wrunlock(void);
static int nfs_export_update_30(struct mount *mp, const char *path, void *);
static krwlock_t netexport_lock;
/*
* PUBLIC INTERFACE
*/
/*
* Declare and initialize the file system export hooks.
*/
static void netexport_unmount(struct mount *);
struct vfs_hooks nfs_export_hooks = {
{ NULL, NULL },
.vh_unmount = netexport_unmount,
.vh_reexport = nfs_export_update_30,
};
/*
* VFS unmount hook for NFS exports.
*
* Releases NFS exports list resources if the given mount point has some.
* As allocation happens lazily, it may be that it doesn't have this
* information, although it theoretically should.
*/
static void
netexport_unmount(struct mount *mp)
{
struct netexport *ne;
KASSERT(mp != NULL);
netexport_wrlock();
ne = netexport_lookup(mp);
if (ne == NULL) {
netexport_wrunlock();
return;
}
netexport_clear(ne);
netexport_remove(ne);
netexport_wrunlock();
kmem_free(ne, sizeof(*ne));
}
void
netexport_init(void)
{
rw_init(&netexport_lock);
}
void
netexport_fini(void)
{
struct netexport *ne;
struct mount *mp;
int error;
while (!TAILQ_EMPTY(&netexport_list)) {
netexport_wrlock();
ne = TAILQ_FIRST(&netexport_list);
mp = ne->ne_mount;
error = vfs_busy(mp);
netexport_wrunlock();
if (error != 0) {
kpause("nfsfini", false, hz, NULL);
continue;
}
mutex_enter(mp->mnt_updating); /* mnt_flag */
netexport_unmount(mp);
mutex_exit(mp->mnt_updating); /* mnt_flag */
vfs_unbusy(mp);
}
rw_destroy(&netexport_lock);
}
/*
* Atomically set the NFS exports list of the given file system, replacing
* it with a new list of entries.
*
* Returns zero on success or an appropriate error code otherwise.
*
* Helper function for the nfssvc(2) system call (NFSSVC_SETEXPORTSLIST
* and NFSSVC_REPLACEEXPORTSLIST commands).
*/
int
mountd_set_exports_list(const struct mountd_exports_list *mel, struct lwp *l,
struct mount *nmp, int cmd)
{
int error;
size_t i;
struct mount *mp;
struct netexport *ne;
struct pathbuf *pb;
struct nameidata nd;
struct vnode *vp;
size_t fid_size;
if (kauth_authorize_network(l->l_cred, KAUTH_NETWORK_NFS,
KAUTH_REQ_NETWORK_NFS_EXPORT, NULL, NULL, NULL) != 0)
return EPERM;
/* Look up the file system path. */
error = pathbuf_copyin(mel->mel_path, &pb);
if (error) {
return error;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
error = namei(&nd);
if (error != 0) {
pathbuf_destroy(pb);
return error;
}
vp = nd.ni_vp;
mp = vp->v_mount;
KASSERT(nmp == NULL || nmp == mp);
pathbuf_destroy(pb);
/*
* Make sure the file system can do vptofh. If the file system
* knows the handle's size, just trust it's able to do the
* actual translation also (otherwise we should check fhtovp
* also, and that's getting a wee bit ridiculous).
*/
fid_size = 0;
if ((error = VFS_VPTOFH(vp, NULL, &fid_size)) != E2BIG) {
vput(vp);
return EOPNOTSUPP;
}
/* Mark the file system busy. */
error = vfs_busy(mp);
vput(vp);
if (error != 0)
return error;
if (nmp == NULL)
mutex_enter(mp->mnt_updating); /* mnt_flag */
netexport_wrlock();
ne = netexport_lookup(mp);
if (ne == NULL) {
error = init_exports(mp, &ne);
if (error != 0) {
goto out;
}
}
KASSERT(ne != NULL);
KASSERT(ne->ne_mount == mp);
if (cmd == NFSSVC_SETEXPORTSLIST) {
if (mel->mel_nexports == 0)
netexport_clear(ne);
else if (mel->mel_nexports == 1)
error = export(ne, &mel->mel_exports[0]);
else {
printf("%s: Cannot set more than one "
"entry at once (unimplemented)\n", __func__);
error = EOPNOTSUPP;
}
} else if (cmd == NFSSVC_REPLACEEXPORTSLIST) {
netexport_clear(ne);
for (i = 0; error == 0 && i < mel->mel_nexports; i++)
error = export(ne, &mel->mel_exports[i]);
} else {
printf("%s: Command %#x not implemented\n", __func__, cmd);
error = EOPNOTSUPP;
}
out:
netexport_wrunlock();
if (nmp == NULL)
mutex_exit(mp->mnt_updating); /* mnt_flag */
vfs_unbusy(mp);
return error;
}
static void
netexport_insert(struct netexport *ne)
{
TAILQ_INSERT_HEAD(&netexport_list, ne, ne_list);
}
static void
netexport_remove(struct netexport *ne)
{
TAILQ_REMOVE(&netexport_list, ne, ne_list);
}
static struct netexport *
netexport_lookup(const struct mount *mp)
{
struct netexport *ne;
TAILQ_FOREACH(ne, &netexport_list, ne_list) {
if (ne->ne_mount == mp) {
goto done;
}
}
ne = NULL;
done:
return ne;
}
static struct netexport *
netexport_lookup_byfsid(const fsid_t *fsid)
{
struct netexport *ne;
TAILQ_FOREACH(ne, &netexport_list, ne_list) {
const struct mount *mp = ne->ne_mount;
if (mp->mnt_stat.f_fsidx.__fsid_val[0] == fsid->__fsid_val[0] &&
mp->mnt_stat.f_fsidx.__fsid_val[1] == fsid->__fsid_val[1]) {
goto done;
}
}
ne = NULL;
done:
return ne;
}
/*
* Check if the file system identified by 'fsid' is exported to the
* client whose network address is contained in the 'mb' mbuf.  On
* success, the mount point is returned in 'mpp', the export flags for
* the client in 'wh', and the anonymous credentials in 'anon'.
*
* This function is used exclusively by the NFS server. It is generally
* invoked before VFS_FHTOVP to validate that a client has access to the
* file system.
*/
int
netexport_check(const fsid_t *fsid, struct mbuf *mb, struct mount **mpp,
int *wh, kauth_cred_t *anon)
{
struct netexport *ne;
struct netcred *np;
ne = netexport_lookup_byfsid(fsid);
if (ne == NULL) {
return EACCES;
}
np = netcred_lookup(ne, mb);
if (np == NULL) {
return EACCES;
}
*mpp = ne->ne_mount;
*wh = np->netc_exflags;
*anon = np->netc_anon;
return 0;
}
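/*
 * Illustrative sketch (not part of this file): a typical caller in the
 * NFS server takes the exports read lock around the check before going
 * on to translate the file handle.  Variable names and the surrounding
 * request context are hypothetical.
 */
#if 0
static int
example_check_export(const fsid_t *fsid, struct mbuf *nam)
{
	struct mount *mp;
	kauth_cred_t anon;
	int exflags, error;

	netexport_rdlock();
	error = netexport_check(fsid, nam, &mp, &exflags, &anon);
	if (error == 0) {
		/* mp, exflags and anon describe the client's access;
		 * the handle would now be translated with VFS_FHTOVP(). */
	}
	netexport_rdunlock();
	return error;
}
#endif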
/*
* Handles legacy export requests. In this case, the export information
* is hardcoded in a specific place of the mount arguments structure (given
* in data); the request for an update is given through the fspec field
* (also in a known location), which must be a null pointer.
*
* Returns EJUSTRETURN if the given command was not an export request.
* Otherwise, returns 0 on success or an appropriate error code.
*/
static int
nfs_export_update_30(struct mount *mp, const char *path, void *data)
{
struct mountd_exports_list mel;
struct mnt_export_args30 *args;
args = data;
mel.mel_path = path;
if (args->fspec != NULL)
return EJUSTRETURN;
if (args->eargs.ex_flags & 0x00020000) {
/* Request to delete exports. The mask above holds the
* value that used to be in MNT_DELEXPORT. */
mel.mel_nexports = 0;
} else {
/*
* The following code assumes export_args has not
* changed since export_args30, so check that.
*/
__CTASSERT(sizeof(args->eargs) == sizeof(*mel.mel_exports));
mel.mel_nexports = 1;
mel.mel_exports = (void *)&args->eargs;
}
return mountd_set_exports_list(&mel, curlwp, mp, NFSSVC_SETEXPORTSLIST);
}
/*
* INTERNAL FUNCTIONS
*/
/*
* Initializes NFS exports for the mountpoint given in 'mp'.
* If successful, returns 0 and sets *nep to the address of the new
* netexport item; otherwise returns an appropriate error code
* and *nep remains unmodified.
*/
static int
init_exports(struct mount *mp, struct netexport **nep)
{
int error;
struct export_args ea;
struct netexport *ne;
KASSERT(mp != NULL);
/* Ensure that we do not already have this mount point. */
KASSERT(netexport_lookup(mp) == NULL);
ne = kmem_zalloc(sizeof(*ne), KM_SLEEP);
ne->ne_mount = mp;
/* Set the default export entry. Handled internally by export upon
* first call. */
memset(&ea, 0, sizeof(ea));
ea.ex_root = -2;
if (mp->mnt_flag & MNT_RDONLY)
ea.ex_flags |= MNT_EXRDONLY;
error = export(ne, &ea);
if (error != 0) {
kmem_free(ne, sizeof(*ne));
} else {
netexport_insert(ne);
*nep = ne;
}
return error;
}
/*
* Build hash lists of net addresses and hang them off the mount point.
* Called by export() to set up a new entry in the lists of export
* addresses.
*/
static int
hang_addrlist(struct mount *mp, struct netexport *nep,
const struct export_args *argp)
{
int error, i;
struct netcred *np, *enp;
struct radix_node_head *rnh;
struct sockaddr *saddr, *smask;
struct domain *dom;
smask = NULL;
if (argp->ex_addrlen == 0) {
if (mp->mnt_flag & MNT_DEFEXPORTED)
return EPERM;
np = &nep->ne_defexported;
KASSERT(np->netc_anon == NULL);
np->netc_anon = kauth_cred_alloc();
np->netc_exflags = argp->ex_flags;
kauth_uucred_to_cred(np->netc_anon, &argp->ex_anon);
mp->mnt_flag |= MNT_DEFEXPORTED;
return 0;
}
if (argp->ex_addrlen > MLEN || argp->ex_masklen > MLEN)
return EINVAL;
i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
np = malloc(i, M_NETADDR, M_WAITOK | M_ZERO);
np->netc_anon = kauth_cred_alloc();
saddr = (struct sockaddr *)(np + 1);
error = copyin(argp->ex_addr, saddr, argp->ex_addrlen);
if (error)
goto out;
if (saddr->sa_len > argp->ex_addrlen)
saddr->sa_len = argp->ex_addrlen;
if (sacheck(saddr) == -1) {
error = EINVAL;
goto out;
}
if (argp->ex_masklen) {
smask = (struct sockaddr *)((char *)saddr + argp->ex_addrlen);
error = copyin(argp->ex_mask, smask, argp->ex_masklen);
if (error)
goto out;
if (smask->sa_len > argp->ex_masklen)
smask->sa_len = argp->ex_masklen;
if (smask->sa_family != saddr->sa_family) {
error = EINVAL;
goto out;
}
if (sacheck(smask) == -1) {
error = EINVAL;
goto out;
}
}
i = saddr->sa_family;
if ((rnh = nep->ne_rtable[i]) == 0) {
/*
* Seems silly to initialize every AF when most are not
* used, do so on demand here.
*/
DOMAIN_FOREACH(dom) {
if (dom->dom_family == i && dom->dom_rtattach) {
rn_inithead((void **)&nep->ne_rtable[i],
dom->dom_rtoffset);
break;
}
}
if ((rnh = nep->ne_rtable[i]) == 0) {
error = ENOBUFS;
goto out;
}
}
enp = (struct netcred *)(*rnh->rnh_addaddr)(saddr, smask, rnh,
np->netc_rnodes);
if (enp != np) {
if (enp == NULL) {
enp = (struct netcred *)(*rnh->rnh_lookup)(saddr,
smask, rnh);
if (enp == NULL) {
error = EPERM;
goto out;
}
} else
enp->netc_refcnt++;
goto check;
} else
enp->netc_refcnt = 1;
np->netc_exflags = argp->ex_flags;
kauth_uucred_to_cred(np->netc_anon, &argp->ex_anon);
return 0;
check:
if (enp->netc_exflags != argp->ex_flags || kauth_cred_uucmp(enp->netc_anon, &argp->ex_anon) != 0)
error = EPERM;
else
error = 0;
out:
KASSERT(np->netc_anon != NULL);
kauth_cred_free(np->netc_anon);
free(np, M_NETADDR);
return error;
}
/*
* Ensure that the address stored in 'sa' is valid.
* Returns zero on success, otherwise -1.
*/
static int
sacheck(struct sockaddr *sa)
{
switch (sa->sa_family) {
case AF_INET: {
struct sockaddr_in *sin = (struct sockaddr_in *)sa;
char *p = (char *)sin->sin_zero;
size_t i;
if (sin->sin_len != sizeof(*sin))
return -1;
if (sin->sin_port != 0)
return -1;
for (i = 0; i < sizeof(sin->sin_zero); i++)
if (*p++ != '\0')
return -1;
return 0;
}
case AF_INET6: {
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa;
if (sin6->sin6_len != sizeof(*sin6))
return -1;
if (sin6->sin6_port != 0)
return -1;
return 0;
}
default:
return -1;
}
}
/*
* Free the netcred object pointed to by the 'rn' radix node.
* 'w' holds a pointer to the radix tree head.
*/
static int
free_netcred(struct radix_node *rn, void *w)
{
struct radix_node_head *rnh = (struct radix_node_head *)w;
struct netcred *np = (struct netcred *)(void *)rn;
(*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
if (--(np->netc_refcnt) <= 0) {
KASSERT(np->netc_anon != NULL);
kauth_cred_free(np->netc_anon);
free(np, M_NETADDR);
}
return 0;
}
/*
* Clears the exports list for a given file system.
*/
static void
netexport_clear(struct netexport *ne)
{
struct radix_node_head *rnh;
struct mount *mp = ne->ne_mount;
int i;
if (mp->mnt_flag & MNT_EXPUBLIC) {
setpublicfs(NULL, NULL, NULL);
mp->mnt_flag &= ~MNT_EXPUBLIC;
}
for (i = 0; i <= AF_MAX; i++) {
if ((rnh = ne->ne_rtable[i]) != NULL) {
rn_walktree(rnh, free_netcred, rnh);
free(rnh, M_RTABLE);
ne->ne_rtable[i] = NULL;
}
}
if ((mp->mnt_flag & MNT_DEFEXPORTED) != 0) {
struct netcred *np = &ne->ne_defexported;
KASSERT(np->netc_anon != NULL);
kauth_cred_free(np->netc_anon);
np->netc_anon = NULL;
} else {
KASSERT(ne->ne_defexported.netc_anon == NULL);
}
mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
}
/*
* Add a new export entry (described by an export_args structure) to the
* given file system.
*/
static int
export(struct netexport *nep, const struct export_args *argp)
{
struct mount *mp = nep->ne_mount;
int error;
if (argp->ex_flags & MNT_EXPORTED) {
if (argp->ex_flags & MNT_EXPUBLIC) {
if ((error = setpublicfs(mp, nep, argp)) != 0)
return error;
mp->mnt_flag |= MNT_EXPUBLIC;
}
if ((error = hang_addrlist(mp, nep, argp)) != 0)
return error;
mp->mnt_flag |= MNT_EXPORTED;
}
return 0;
}
/*
* Set the publicly exported filesystem (WebNFS). Currently, only
* one public filesystem is possible in the spec (RFC 2054 and 2055)
*/
static int
setpublicfs(struct mount *mp, struct netexport *nep,
const struct export_args *argp)
{
char *cp;
int error;
struct vnode *rvp;
size_t fhsize;
/*
* mp == NULL --> invalidate the current info; the FS is
* no longer exported. May be called from either export
* or unmount, so check if it hasn't already been done.
*/
if (mp == NULL) {
if (nfs_pub.np_valid) {
nfs_pub.np_valid = 0;
if (nfs_pub.np_handle != NULL) {
free(nfs_pub.np_handle, M_TEMP);
nfs_pub.np_handle = NULL;
}
if (nfs_pub.np_index != NULL) {
free(nfs_pub.np_index, M_TEMP);
nfs_pub.np_index = NULL;
}
}
return 0;
}
/*
* Only one allowed at a time.
*/
if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount)
return EBUSY;
/*
* Get real filehandle for root of exported FS.
*/
if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp)))
return error;
fhsize = 0;
error = vfs_composefh(rvp, NULL, &fhsize);
if (error != E2BIG)
return error;
nfs_pub.np_handle = malloc(fhsize, M_TEMP, M_NOWAIT);
if (nfs_pub.np_handle == NULL)
error = ENOMEM;
else
error = vfs_composefh(rvp, nfs_pub.np_handle, &fhsize);
if (error)
return error;
vput(rvp);
/*
* If an indexfile was specified, pull it in.
*/
if (argp->ex_indexfile != NULL) {
nfs_pub.np_index = malloc(NFS_MAXNAMLEN + 1, M_TEMP, M_WAITOK);
error = copyinstr(argp->ex_indexfile, nfs_pub.np_index,
NFS_MAXNAMLEN, (size_t *)0);
if (!error) {
/*
* Check for illegal filenames.
*/
for (cp = nfs_pub.np_index; *cp; cp++) {
if (*cp == '/') {
error = EINVAL;
break;
}
}
}
if (error) {
free(nfs_pub.np_index, M_TEMP);
return error;
}
}
nfs_pub.np_mount = mp;
nfs_pub.np_valid = 1;
return 0;
}
/*
* Look up an export entry in the exports list that matches the address
* stored in 'nam'. If no entry is found, the default one is used instead
* (if available).
*/
static struct netcred *
netcred_lookup(struct netexport *ne, struct mbuf *nam)
{
struct netcred *np;
struct radix_node_head *rnh;
struct sockaddr *saddr;
if ((ne->ne_mount->mnt_flag & MNT_EXPORTED) == 0) {
return NULL;
}
/*
* Look in the export list first.
*/
np = NULL;
if (nam != NULL) {
saddr = mtod(nam, struct sockaddr *);
rnh = ne->ne_rtable[saddr->sa_family];
if (rnh != NULL) {
np = (struct netcred *)
(*rnh->rnh_matchaddr)((void *)saddr,
rnh);
if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
np = NULL;
}
}
/*
* If no address match, use the default if it exists.
*/
if (np == NULL && ne->ne_mount->mnt_flag & MNT_DEFEXPORTED)
np = &ne->ne_defexported;
return np;
}
void
netexport_rdlock(void)
{
rw_enter(&netexport_lock, RW_READER);
}
void
netexport_rdunlock(void)
{
rw_exit(&netexport_lock);
}
static void
netexport_wrlock(void)
{
rw_enter(&netexport_lock, RW_WRITER);
}
static void
netexport_wrunlock(void)
{
rw_exit(&netexport_lock);
}
bool
netexport_hasexports(void)
{
return nfs_pub.np_valid || !TAILQ_EMPTY(&netexport_list);
}
/* $NetBSD: subr_pcq.c,v 1.20 2023/02/24 11:02:27 riastradh Exp $ */
/*-
* Copyright (c) 2009, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Lockless producer/consumer queue.
*
* Summary of the producer algorithm in pcq_put (may run many in
* parallel with each other and with a consumer):
*
* P1. initialize an item
*
* P2. atomic_cas(&pcq->pcq_pc) loop to advance the producer
* pointer, reserving a space at c (fails if not enough space)
*
* P3. atomic_store_release(&pcq->pcq_items[c], item) to publish
* the item in the space it reserved
*
* Summary of the consumer algorithm in pcq_get (must be serialized by
* caller with other consumers, may run in parallel with any number of
* producers):
*
* C1. atomic_load_relaxed(&pcq->pcq_pc) to get the consumer
* pointer and a snapshot of the producer pointer, which may
* point to null items or point to initialized items (fails if
* no space reserved for published items yet)
*
* C2. atomic_load_consume(&pcq->pcq_items[c]) to get the next
* unconsumed but potentially published item (fails if item
* not published yet)
*
* C3. pcq->pcq_items[c] = NULL to consume the next unconsumed but
* published item
*
* C4. membar_producer
*
* C5. atomic_cas(&pcq->pcq_pc) loop to advance the consumer
* pointer
*
* C6. use the item
*
* Note that there is a weird bare membar_producer which is not matched
* by membar_consumer. This is one of the rare cases of a memory
* barrier on one side that is not matched by a memory barrier on
* another side, but the ordering works out, with a somewhat more
* involved proof.
*
* Some properties that need to be proved:
*
* Theorem 1. For pcq_put call that leads into pcq_get:
* Initializing item at P1 is dependency-ordered before usage of
* item at C6, so items placed by pcq_put can be safely used by
* the caller of pcq_get.
*
* Proof sketch.
*
* Assume load/store P2 synchronizes with load/store C1
* (if not, pcq_get fails in `if (p == c) return NULL').
*
* Assume store-release P3 synchronizes with load-consume
* C2 (if not, pcq_get fails in `if (item == NULL) return
* NULL').
*
* Then:
*
* - P1 is sequenced before store-release P3
* - store-release P3 synchronizes with load-consume C2
* - load-consume C2 is dependency-ordered before C6
*
* Hence transitively, P1 is dependency-ordered before C6,
* QED.
*
* Theorem 2. For pcq_get call followed by pcq_put: Nulling out
* location at store C3 happens before placing a new item in the
* same location at store P3, so items are not lost.
*
* Proof sketch.
*
* Assume load/store C5 synchronizes with load/store P2
* (otherwise pcq_peek starts over the CAS loop or fails).
*
* Then:
*
* - store C3 is sequenced before membar_producer C4
* - membar_producer C4 is sequenced before load/store C5
* - load/store C5 synchronizes with load/store P2 at &pcq->pcq_pc
* - P2 is sequenced before store-release P3
*
* Hence transitively, store C3 happens before
* store-release P3, QED.
*/
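/*
 * Illustrative usage sketch (not from this file; "item" and "consume()"
 * are hypothetical placeholders).  Producers may run concurrently; the
 * single consumer must be serialized by its caller, e.g. from a softint
 * or under a lock of its own:
 *
 *	pcq_t *q = pcq_create(128, KM_SLEEP);
 *
 *	(producer)	if (!pcq_put(q, item))
 *				... queue full: back off or drop ...
 *
 *	(consumer)	while ((item = pcq_get(q)) != NULL)
 *				consume(item);
 *
 *	pcq_destroy(q);
 */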
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pcq.c,v 1.20 2023/02/24 11:02:27 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/pcq.h>
/*
* Internal producer-consumer queue structure. Note: providing a separate
* cache-line both for pcq_t::pcq_pc and pcq_t::pcq_items.
*/
struct pcq {
u_int pcq_nitems;
uint8_t pcq_pad1[COHERENCY_UNIT - sizeof(u_int)];
volatile uint32_t pcq_pc;
uint8_t pcq_pad2[COHERENCY_UNIT - sizeof(uint32_t)];
void * volatile pcq_items[];
};
/*
* Producer (p) - stored in the lower 16 bits of pcq_t::pcq_pc.
* Consumer (c) - in the higher 16 bits.
*
* We have a limitation of 16 bits i.e. 0xffff items in the queue.
* The PCQ_MAXLEN constant is set accordingly.
*/
static inline void
pcq_split(uint32_t v, u_int *p, u_int *c)
{
*p = v & 0xffff;
*c = v >> 16;
}
static inline uint32_t
pcq_combine(u_int p, u_int c)
{
return p | (c << 16);
}
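/*
 * Worked example of the encoding above (illustrative): with p = 2 and
 * c = 3, pcq_combine() yields 0x00030002; pcq_split() of that value
 * recovers p = 0x0002 from the low half and c = 0x0003 from the high
 * half.
 */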
static inline u_int
pcq_advance(pcq_t *pcq, u_int pc)
{
if (__predict_false(++pc == pcq->pcq_nitems)) {
return 0;
}
return pc;
}
/*
* pcq_put: place an item at the end of the queue.
*/
bool
pcq_put(pcq_t *pcq, void *item)
{
uint32_t v, nv;
u_int op, p, c;
KASSERT(item != NULL);
do {
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &op, &c);
p = pcq_advance(pcq, op);
if (p == c) {
/* Queue is full. */
return false;
}
nv = pcq_combine(p, c);
} while (atomic_cas_32(&pcq->pcq_pc, v, nv) != v);
/*
* Ensure that the update to pcq_pc is globally visible before the
* data item. See pcq_get(). This also ensures that any changes
* that the caller made to the data item are globally visible
* before we put it onto the list.
*/
atomic_store_release(&pcq->pcq_items[op], item);
/*
* Synchronization activity to wake up the consumer will ensure
* that the update to pcq_items[] is visible before the wakeup
* arrives. So, we do not need an additional memory barrier here.
*/
return true;
}
/*
* pcq_peek: return the next item from the queue without removal.
*/
void *
pcq_peek(pcq_t *pcq)
{
const uint32_t v = atomic_load_relaxed(&pcq->pcq_pc);
u_int p, c;
pcq_split(v, &p, &c);
/* See comment on race below in pcq_get(). */
return (p == c) ? NULL : atomic_load_consume(&pcq->pcq_items[c]);
}
/*
* pcq_get: remove and return the next item for consumption or NULL if empty.
*
* => The caller must prevent concurrent gets from occurring.
*/
void *
pcq_get(pcq_t *pcq)
{
uint32_t v, nv;
u_int p, c;
void *item;
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &p, &c);
if (p == c) {
/* Queue is empty: nothing to return. */
return NULL;
}
item = atomic_load_consume(&pcq->pcq_items[c]);
if (item == NULL) {
/*
* Raced with sender: we rely on a notification (e.g. softint
* or wakeup) being generated after the producer's pcq_put(),
* causing us to retry pcq_get() later.
*/
return NULL;
}
/*
* We have exclusive access to this slot, so no need for
* atomic_store_*.
*/
pcq->pcq_items[c] = NULL;
c = pcq_advance(pcq, c);
nv = pcq_combine(p, c);
/*
* Ensure that update to pcq_items[c] becomes globally visible
* before the update to pcq_pc. If it were reordered to occur
* after it, we could in theory wipe out a modification made
* to pcq_items[c] by pcq_put().
*
* No need for load-before-store ordering of membar_release
* because the only load we need to ensure happens first is the
* load of pcq->pcq_items[c], but that necessarily happens
* before the store to pcq->pcq_items[c] to null it out because
* it is at the same memory location. Yes, this is a bare
* membar_producer with no matching membar_consumer.
*/
membar_producer();
while (__predict_false(atomic_cas_32(&pcq->pcq_pc, v, nv) != v)) {
v = atomic_load_relaxed(&pcq->pcq_pc);
pcq_split(v, &p, &c);
c = pcq_advance(pcq, c);
nv = pcq_combine(p, c);
}
return item;
}
pcq_t *
pcq_create(size_t nitems, km_flag_t kmflags)
{
pcq_t *pcq;
KASSERT(nitems > 0);
KASSERT(nitems <= PCQ_MAXLEN);
pcq = kmem_zalloc(offsetof(pcq_t, pcq_items[nitems]), kmflags);
if (pcq != NULL) {
pcq->pcq_nitems = nitems;
}
return pcq;
}
void
pcq_destroy(pcq_t *pcq)
{
kmem_free(pcq, offsetof(pcq_t, pcq_items[pcq->pcq_nitems]));
}
size_t
pcq_maxitems(pcq_t *pcq)
{
return pcq->pcq_nitems;
}
/* $NetBSD: pmap.h,v 1.134 2022/08/20 23:49:31 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* pmap.h: see pmap.c for the history of this pmap module.
*/
#ifndef _X86_PMAP_H_
#define _X86_PMAP_H_
#if defined(_KERNEL)
#include <x86/pmap_pv.h>
#include <uvm/pmap/pmap_pvt.h>
/*
* MD flags that we use for pmap_enter and pmap_kenter_pa:
*/
/*
* macros
*/
#define pmap_clear_modify(pg) pmap_clear_attrs(pg, PP_ATTRS_D)
#define pmap_clear_reference(pg) pmap_clear_attrs(pg, PP_ATTRS_A)
#define pmap_copy(DP,SP,D,L,S) __USE(L)
#define pmap_is_modified(pg) pmap_test_attrs(pg, PP_ATTRS_D)
#define pmap_is_referenced(pg) pmap_test_attrs(pg, PP_ATTRS_A)
#define pmap_move(DP,SP,D,L,S)
#define pmap_phys_address(ppn) (x86_ptob(ppn) & ~X86_MMAP_FLAG_MASK)
#define pmap_mmap_flags(ppn) x86_mmap_flags(ppn)
#if defined(__x86_64__) || defined(PAE)
#define X86_MMAP_FLAG_SHIFT (64 - PGSHIFT)
#else
#define X86_MMAP_FLAG_SHIFT (32 - PGSHIFT)
#endif
#define X86_MMAP_FLAG_MASK 0xf
#define X86_MMAP_FLAG_PREFETCH 0x1
/*
* prototypes
*/
void pmap_activate(struct lwp *);
void pmap_bootstrap(vaddr_t);
bool pmap_clear_attrs(struct vm_page *, unsigned);
bool pmap_pv_clear_attrs(paddr_t, unsigned);
void pmap_deactivate(struct lwp *);
void pmap_page_remove(struct vm_page *);
void pmap_pv_remove(paddr_t);
void pmap_remove(struct pmap *, vaddr_t, vaddr_t);
bool pmap_test_attrs(struct vm_page *, unsigned);
void pmap_write_protect(struct pmap *, vaddr_t, vaddr_t, vm_prot_t);
void pmap_load(void);
paddr_t pmap_init_tmp_pgtbl(paddr_t);
bool pmap_remove_all(struct pmap *);
void pmap_ldt_cleanup(struct lwp *);
void pmap_ldt_sync(struct pmap *);
void pmap_kremove_local(vaddr_t, vsize_t);
#define __HAVE_PMAP_PV_TRACK 1
void pmap_pv_init(void);
void pmap_pv_track(paddr_t, psize_t);
void pmap_pv_untrack(paddr_t, psize_t);
u_int x86_mmap_flags(paddr_t);
#define PMAP_GROWKERNEL /* turn on pmap_growkernel interface */
#define PMAP_FORK /* turn on pmap_fork interface */
/*
* inline functions
*/
/*
* pmap_page_protect: change the protection of all recorded mappings
* of a managed page
*
* => this function is a frontend for pmap_page_remove/pmap_clear_attrs
* => we only have to worry about making the page more protected.
* unprotecting a page is done on-demand at fault time.
*/
__inline static void __unused
pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
(void)pmap_clear_attrs(pg, PP_ATTRS_W);
} else {
pmap_page_remove(pg);
}
}
}
/*
* pmap_pv_protect: change the protection of all recorded mappings
* of an unmanaged page
*/
__inline static void __unused
pmap_pv_protect(paddr_t pa, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
(void)pmap_pv_clear_attrs(pa, PP_ATTRS_W);
} else {
pmap_pv_remove(pa);
}
}
}
/*
* pmap_protect: change the protection of pages in a pmap
*
* => this function is a frontend for pmap_remove/pmap_write_protect
* => we only have to worry about making the page more protected.
* unprotecting a page is done on-demand at fault time.
*/
__inline static void __unused
pmap_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
if ((prot & VM_PROT_WRITE) == 0) {
if (prot & (VM_PROT_READ|VM_PROT_EXECUTE)) {
pmap_write_protect(pmap, sva, eva, prot);
} else {
pmap_remove(pmap, sva, eva);
}
}
}
paddr_t vtophys(vaddr_t);
vaddr_t pmap_map(vaddr_t, paddr_t, paddr_t, vm_prot_t);
void pmap_cpu_init_late(struct cpu_info *);
/* pmap functions with machine addresses */
void pmap_kenter_ma(vaddr_t, paddr_t, vm_prot_t, u_int);
int pmap_enter_ma(struct pmap *, vaddr_t, paddr_t, paddr_t,
vm_prot_t, u_int, int);
bool pmap_extract_ma(pmap_t, vaddr_t, paddr_t *);
paddr_t pmap_get_physpage(void);
/*
* Hooks for the pool allocator.
*/
#define POOL_VTOPHYS(va) vtophys((vaddr_t) (va))
#ifdef __HAVE_DIRECT_MAP
extern vaddr_t pmap_direct_base;
extern vaddr_t pmap_direct_end;
#define PMAP_DIRECT_BASE pmap_direct_base
#define PMAP_DIRECT_END pmap_direct_end
#define PMAP_DIRECT_MAP(pa) ((vaddr_t)PMAP_DIRECT_BASE + (pa))
#define PMAP_DIRECT_UNMAP(va) ((paddr_t)(va) - PMAP_DIRECT_BASE)
/*
* Alternate mapping hooks for pool pages.
*/
#define PMAP_MAP_POOLPAGE(pa) PMAP_DIRECT_MAP((pa))
#define PMAP_UNMAP_POOLPAGE(va) PMAP_DIRECT_UNMAP((va))
#endif /* __HAVE_DIRECT_MAP */
#define __HAVE_VM_PAGE_MD
#define VM_MDPAGE_INIT(pg) \
memset(&(pg)->mdpage, 0, sizeof((pg)->mdpage)); \
PMAP_PAGE_INIT(&(pg)->mdpage.mp_pp)
struct vm_page_md {
struct pmap_page mp_pp;
};
#endif /* _KERNEL */
#endif /* _X86_PMAP_H_ */
/* $NetBSD: radio.c,v 1.31 2021/08/07 16:19:08 thorpej Exp $ */
/* $OpenBSD: radio.c,v 1.2 2001/12/05 10:27:06 mickey Exp $ */
/* $RuOBSD: radio.c,v 1.7 2001/12/04 06:03:05 tm Exp $ */
/*
* Copyright (c) 2001 Maxim Tsyplakov <tm@oganer.net>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/* This is the /dev/radio driver from OpenBSD */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: radio.c,v 1.31 2021/08/07 16:19:08 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/vnode.h>
#include <sys/radioio.h>
#include <sys/conf.h>
#include <dev/radio_if.h>
#include "ioconf.h"
struct radio_softc {
void *hw_hdl; /* hardware driver handle */
device_t sc_dev; /* hardware device struct */
const struct radio_hw_if *hw_if; /* hardware interface */
};
static int radioprobe(device_t, cfdata_t, void *);
static void radioattach(device_t, device_t, void *);
static int radioprint(void *, const char *);
static int radiodetach(device_t, int);
CFATTACH_DECL_NEW(radio, sizeof(struct radio_softc),
radioprobe, radioattach, radiodetach, NULL);
static dev_type_open(radioopen);
static dev_type_close(radioclose);
static dev_type_ioctl(radioioctl);
const struct cdevsw radio_cdevsw = {
.d_open = radioopen,
.d_close = radioclose,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = radioioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER,
};
static int
radioprobe(device_t parent, cfdata_t match, void *aux)
{
return (1);
}
static void
radioattach(device_t parent, device_t self, void *aux)
{
struct radio_softc *sc = device_private(self);
struct radio_attach_args *sa = aux;
const struct radio_hw_if *hwp = sa->hwif;
void *hdlp = sa->hdl;
aprint_naive("\n");
aprint_normal("\n");
sc->hw_if = hwp;
sc->hw_hdl = hdlp;
sc->sc_dev = self;
}
static int
radioopen(dev_t dev, int flags, int fmt, struct lwp *l)
{
int unit;
struct radio_softc *sc;
unit = RADIOUNIT(dev);
sc = device_lookup_private(&radio_cd, unit);
if (sc == NULL || sc->hw_if == NULL)
return (ENXIO);
if (sc->hw_if->open != NULL)
return (sc->hw_if->open(sc->hw_hdl, flags, fmt, l->l_proc));
else
return (0);
}
static int
radioclose(dev_t dev, int flags, int fmt, struct lwp *l)
{
struct radio_softc *sc;
sc = device_lookup_private(&radio_cd, RADIOUNIT(dev));
if (sc->hw_if->close != NULL)
return (sc->hw_if->close(sc->hw_hdl, flags, fmt, l->l_proc));
else
return (0);
}
static int
radioioctl(dev_t dev, u_long cmd, void *data, int flags,
struct lwp *l)
{
struct radio_softc *sc;
int unit, error;
unit = RADIOUNIT(dev);
sc = device_lookup_private(&radio_cd, unit);
if (sc == NULL || sc->hw_if == NULL)
return (ENXIO);
error = EOPNOTSUPP;
switch (cmd) {
case RIOCGINFO:
if (sc->hw_if->get_info)
error = (sc->hw_if->get_info)(sc->hw_hdl,
(struct radio_info *)data);
break;
case RIOCSINFO:
if (sc->hw_if->set_info)
error = (sc->hw_if->set_info)(sc->hw_hdl,
(struct radio_info *)data);
break;
case RIOCSSRCH:
if (sc->hw_if->search)
error = (sc->hw_if->search)(sc->hw_hdl,
*(int *)data);
break;
default:
error = EINVAL;
}
return (error);
}
/*
* Called from hardware driver. This is where the MI radio driver gets
* probed/attached to the hardware driver.
*/
device_t
radio_attach_mi(const struct radio_hw_if *rhwp, void *hdlp, device_t dev)
{
struct radio_attach_args arg;
arg.hwif = rhwp;
arg.hdl = hdlp;
return (config_found(dev, &arg, radioprint, CFARGS_NONE));
}
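/*
 * Illustrative sketch (the "xx" names are hypothetical; the authoritative
 * member list is in <dev/radio_if.h>): a hardware driver's attach routine
 * registers with this MI layer roughly as follows, after which
 * radioattach() above binds the two together:
 *
 *	static const struct radio_hw_if xx_hw_if = {
 *		.open = xx_open,
 *		.close = xx_close,
 *		.get_info = xx_get_info,
 *		.set_info = xx_set_info,
 *		.search = xx_search,
 *	};
 *
 *	(void)radio_attach_mi(&xx_hw_if, sc, self);
 */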
static int
radioprint(void *aux, const char *pnp)
{
if (pnp != NULL)
aprint_normal("radio at %s", pnp);
return (UNCONF);
}
static int
radiodetach(device_t self, int flags)
{
int maj, mn;
/* locate the major number */
maj = cdevsw_lookup_major(&radio_cdevsw);
/* Nuke the vnodes for any open instances (calls close). */
mn = device_unit(self);
vdevgone(maj, mn, mn, VCHR);
return (0);
}
/* $NetBSD: subr_fault.c,v 1.2 2020/06/30 16:28:17 maxv Exp $ */
/*
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_fault.c,v 1.2 2020/06/30 16:28:17 maxv Exp $");
#include <sys/module.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/types.h>
#include <sys/specificdata.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/ioccom.h>
#include <sys/lwp.h>
#include <sys/fault.h>
typedef struct {
volatile bool enabled;
volatile bool oneshot;
volatile unsigned long nth;
volatile unsigned long cnt;
volatile unsigned long nfaults;
} fault_t;
static fault_t fault_global __cacheline_aligned = {
.enabled = false,
.oneshot = false,
.nth = FAULT_NTH_MIN,
.cnt = 0,
.nfaults = 0
};
static kmutex_t fault_global_lock __cacheline_aligned;
static specificdata_key_t fault_lwp_key;
/* -------------------------------------------------------------------------- */
bool
fault_inject(void)
{
volatile unsigned long cnt;
fault_t *f;
if (__predict_false(cold))
return false;
if (__predict_false(atomic_load_acquire(&fault_global.enabled))) {
f = &fault_global;
} else {
f = lwp_getspecific(fault_lwp_key);
if (__predict_true(f == NULL))
return false;
if (__predict_false(!f->enabled))
return false;
}
if (atomic_load_relaxed(&f->oneshot)) {
if (__predict_true(atomic_load_relaxed(&f->nfaults) > 0))
return false;
}
cnt = atomic_inc_ulong_nv(&f->cnt);
if (__predict_false(cnt % atomic_load_relaxed(&f->nth) == 0)) {
atomic_inc_ulong(&f->nfaults);
return true;
}
return false;
}
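/*
 * Illustrative sketch (hypothetical caller, not part of this file): a
 * code path that wants to be testable under fault injection consults
 * fault_inject() before doing the real work, e.g.
 *
 *	void *
 *	xx_alloc(size_t len)
 *	{
 *		if (fault_inject())
 *			return NULL;	// simulated allocation failure
 *		return kmem_alloc(len, KM_SLEEP);
 *	}
 */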
/* -------------------------------------------------------------------------- */
static int
fault_open(dev_t dev, int flag, int mode, struct lwp *l)
{
return 0;
}
static int
fault_close(dev_t dev, int flag, int mode, struct lwp *l)
{
return 0;
}
static int
fault_ioc_enable(struct fault_ioc_enable *args)
{
fault_t *f;
if (args->mode != FAULT_MODE_NTH_ONESHOT)
return EINVAL;
if (args->nth < FAULT_NTH_MIN)
return EINVAL;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
mutex_enter(&fault_global_lock);
if (fault_global.enabled) {
mutex_exit(&fault_global_lock);
return EEXIST;
}
fault_global.oneshot = true;
atomic_store_relaxed(&fault_global.nth, args->nth);
fault_global.cnt = 0;
fault_global.nfaults = 0;
atomic_store_release(&fault_global.enabled, true);
mutex_exit(&fault_global_lock);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f != NULL) {
if (f->enabled)
return EEXIST;
} else {
f = kmem_zalloc(sizeof(*f), KM_SLEEP);
lwp_setspecific(fault_lwp_key, f);
}
f->oneshot = true;
atomic_store_relaxed(&f->nth, args->nth);
f->cnt = 0;
f->nfaults = 0;
atomic_store_release(&f->enabled, true);
break;
default:
return EINVAL;
}
return 0;
}
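/*
 * Illustrative userland sketch (assumes a /dev/fault device node for
 * this driver; includes and error handling omitted).  The field names
 * match what fault_ioc_enable() above consumes.  This arms one-shot
 * injection for the calling LWP, so the 5th call to fault_inject()
 * fails, once:
 *
 *	struct fault_ioc_enable args = {
 *		.scope = FAULT_SCOPE_LWP,
 *		.mode = FAULT_MODE_NTH_ONESHOT,
 *		.nth = 5,
 *	};
 *	int fd = open("/dev/fault", O_RDWR);
 *	ioctl(fd, FAULT_IOC_ENABLE, &args);
 */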
static int
fault_ioc_disable(struct fault_ioc_disable *args)
{
fault_t *f;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
mutex_enter(&fault_global_lock);
if (!fault_global.enabled) {
mutex_exit(&fault_global_lock);
return ENOENT;
}
atomic_store_release(&fault_global.enabled, false);
mutex_exit(&fault_global_lock);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f == NULL)
return ENOENT;
if (!f->enabled)
return ENOENT;
atomic_store_release(&f->enabled, false);
break;
default:
return EINVAL;
}
return 0;
}
static int
fault_ioc_getinfo(struct fault_ioc_getinfo *args)
{
fault_t *f;
switch (args->scope) {
case FAULT_SCOPE_GLOBAL:
args->nfaults = atomic_load_relaxed(&fault_global.nfaults);
break;
case FAULT_SCOPE_LWP:
f = lwp_getspecific(fault_lwp_key);
if (f == NULL)
return ENOENT;
args->nfaults = atomic_load_relaxed(&f->nfaults);
break;
default:
return EINVAL;
}
return 0;
}
static int
fault_ioctl(dev_t dev, u_long cmd, void *addr, int flag, struct lwp *l)
{
switch (cmd) {
case FAULT_IOC_ENABLE:
return fault_ioc_enable(addr);
case FAULT_IOC_DISABLE:
return fault_ioc_disable(addr);
case FAULT_IOC_GETINFO:
return fault_ioc_getinfo(addr);
default:
return EINVAL;
}
}
const struct cdevsw fault_cdevsw = {
.d_open = fault_open,
.d_close = fault_close,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = fault_ioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
/* -------------------------------------------------------------------------- */
MODULE(MODULE_CLASS_MISC, fault, NULL);
static void
fault_lwp_free(void *arg)
{
fault_t *f = (fault_t *)arg;
if (f == NULL) {
return;
}
kmem_free(f, sizeof(*f));
}
static void
fault_init(void)
{
mutex_init(&fault_global_lock, MUTEX_DEFAULT, IPL_NONE);
lwp_specific_key_create(&fault_lwp_key, fault_lwp_free);
}
static int
fault_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
fault_init();
return 0;
case MODULE_CMD_FINI:
return EINVAL;
default:
return ENOTTY;
}
}
/* $NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_io.c,v 1.104 2024/04/05 13:05:40 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/buf.h>
#include <sys/atomic.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pager.h>
#include <uvm/uvm_page_array.h>
static int genfs_do_directio(struct vmspace *, vaddr_t, size_t, struct vnode *,
off_t, enum uio_rw);
static void genfs_dio_iodone(struct buf *);
static int genfs_getpages_read(struct vnode *, struct vm_page **, int, off_t,
off_t, bool, bool, bool, bool);
static int genfs_do_io(struct vnode *, off_t, vaddr_t, size_t, int, enum uio_rw,
void (*)(struct buf *));
static void genfs_rel_pages(struct vm_page **, unsigned int);
int genfs_maxdio = MAXPHYS;
static void
genfs_rel_pages(struct vm_page **pgs, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL || pg == PGO_DONTCARE)
continue;
KASSERT(uvm_page_owner_locked_p(pg, true));
if (pg->flags & PG_FAKE) {
pg->flags |= PG_RELEASED;
}
}
uvm_page_unbusy(pgs, npages);
}
/*
* generic VM getpages routine.
* Return PG_BUSY pages for the given range,
* reading from backing store if necessary.
*/
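/*
 * Illustrative sketch of a typical caller (hypothetical; the real
 * parameter list is the struct vop_getpages_args below): fetch one busy
 * page for a read, synchronously:
 *
 *	npages = 1;
 *	pg = NULL;
 *	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
 *	error = VOP_GETPAGES(vp, trunc_page(off), &pg, &npages, 0,
 *	    VM_PROT_READ, UVM_ADV_NORMAL, PGO_SYNCIO);
 *
 * On success the page is returned PG_BUSY and the object lock has been
 * dropped by the getpages call (except in the PGO_LOCKED case handled
 * below).
 */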
int
genfs_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ * const ap = v;
off_t diskeof, memeof;
int i, error, npages, iflag;
const int flags = ap->a_flags;
struct vnode * const vp = ap->a_vp;
struct uvm_object * const uobj = &vp->v_uobj;
const bool async = (flags & PGO_SYNCIO) == 0;
const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
const bool overwrite = (flags & PGO_OVERWRITE) != 0;
const bool blockalloc = memwrite && (flags & PGO_NOBLOCKALLOC) == 0;
const bool need_wapbl = (vp->v_mount->mnt_wapbl &&
(flags & PGO_JOURNALLOCKED) == 0);
const bool glocked = (flags & PGO_GLOCKHELD) != 0;
bool holds_wapbl = false;
struct mount *trans_mount = NULL;
UVMHIST_FUNC("genfs_getpages"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx/%jx count %jd",
(uintptr_t)vp, ap->a_offset >> 32, ap->a_offset, *ap->a_count);
KASSERT(memwrite >= overwrite);
KASSERT(vp->v_type == VREG || vp->v_type == VDIR ||
vp->v_type == VLNK || vp->v_type == VBLK);
/*
* the object must be locked. it can only be a read lock when
* processing a read fault with PGO_LOCKED.
*/
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT(rw_write_held(uobj->vmobjlock) ||
((flags & PGO_LOCKED) != 0 && !memwrite));
#ifdef DIAGNOSTIC
if ((flags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
/*
* check for reclaimed vnode. v_interlock is not held here, but
* VI_DEADCHECK is set with vmobjlock held.
*/
iflag = atomic_load_relaxed(&vp->v_iflag);
if (__predict_false((iflag & VI_DEADCHECK) != 0)) {
mutex_enter(vp->v_interlock);
error = vdead_check(vp, VDEAD_NOWAIT);
mutex_exit(vp->v_interlock);
if (error) {
if ((flags & PGO_LOCKED) == 0)
rw_exit(uobj->vmobjlock);
return error;
}
}
startover:
error = 0;
const voff_t origvsize = vp->v_size;
const off_t origoffset = ap->a_offset;
const int orignpages = *ap->a_count;
GOP_SIZE(vp, origvsize, &diskeof, 0);
if (flags & PGO_PASTEOF) {
off_t newsize;
#if defined(DIAGNOSTIC)
off_t writeeof;
#endif /* defined(DIAGNOSTIC) */
newsize = MAX(origvsize,
origoffset + (orignpages << PAGE_SHIFT));
GOP_SIZE(vp, newsize, &memeof, GOP_SIZE_MEM);
#if defined(DIAGNOSTIC)
GOP_SIZE(vp, vp->v_writesize, &writeeof, GOP_SIZE_MEM);
if (newsize > round_page(writeeof)) {
panic("%s: past eof: %" PRId64 " vs. %" PRId64,
__func__, newsize, round_page(writeeof));
}
#endif /* defined(DIAGNOSTIC) */
} else {
GOP_SIZE(vp, origvsize, &memeof, GOP_SIZE_MEM);
}
KASSERT(ap->a_centeridx >= 0 || ap->a_centeridx <= orignpages);
KASSERT((origoffset & (PAGE_SIZE - 1)) == 0);
KASSERT(origoffset >= 0);
KASSERT(orignpages > 0);
/*
* Bounds-check the request.
*/
if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= memeof) {
if ((flags & PGO_LOCKED) == 0) {
rw_exit(uobj->vmobjlock);
}
UVMHIST_LOG(ubchist, "off 0x%jx count %jd goes past EOF 0x%jx",
origoffset, *ap->a_count, memeof,0);
error = EINVAL;
goto out_err;
}
/* uobj is locked */
if ((flags & PGO_NOTIMESTAMP) == 0 && (vp->v_type != VBLK ||
(vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
int updflags = 0;
if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
updflags = GOP_UPDATE_ACCESSED;
}
if (memwrite) {
updflags |= GOP_UPDATE_MODIFIED;
}
if (updflags != 0) {
GOP_MARKUPDATE(vp, updflags);
}
}
/*
* For PGO_LOCKED requests, just return whatever's in memory.
*/
if (flags & PGO_LOCKED) {
int nfound;
struct vm_page *pg;
KASSERT(!glocked);
npages = *ap->a_count;
#if defined(DEBUG)
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
KASSERT(pg == NULL || pg == PGO_DONTCARE);
}
#endif /* defined(DEBUG) */
nfound = uvn_findpages(uobj, origoffset, &npages,
ap->a_m, NULL,
UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY |
(memwrite ? UFP_NORDONLY : 0));
KASSERT(npages == *ap->a_count);
if (nfound == 0) {
error = EBUSY;
goto out_err;
}
/*
* lock and unlock g_glock to ensure that no one is truncating
* the file behind us.
*/
if (!genfs_node_rdtrylock(vp)) {
/*
* restore the array.
*/
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
if (pg != NULL && pg != PGO_DONTCARE) {
ap->a_m[i] = NULL;
}
KASSERT(ap->a_m[i] == NULL ||
ap->a_m[i] == PGO_DONTCARE);
}
} else {
genfs_node_unlock(vp);
}
error = (ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0);
if (error == 0 && memwrite) {
for (i = 0; i < npages; i++) {
pg = ap->a_m[i];
if (pg == NULL || pg == PGO_DONTCARE) {
continue;
}
if (uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN) {
uvm_pagemarkdirty(pg,
UVM_PAGE_STATUS_UNKNOWN);
}
}
}
goto out_err;
}
rw_exit(uobj->vmobjlock);
/*
* find the requested pages and make some simple checks.
* leave space in the page array for a whole block.
*/
const int fs_bshift = (vp->v_type != VBLK) ? vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
const int fs_bsize = 1 << fs_bshift;
#define blk_mask (fs_bsize - 1)
#define trunc_blk(x) ((x) & ~blk_mask)
#define round_blk(x) (((x) + blk_mask) & ~blk_mask)
const int orignmempages = MIN(orignpages,
round_page(memeof - origoffset) >> PAGE_SHIFT);
npages = orignmempages;
const off_t startoffset = trunc_blk(origoffset);
const off_t endoffset = MIN(
round_page(round_blk(origoffset + (npages << PAGE_SHIFT))),
round_page(memeof));
const int ridx = (origoffset - startoffset) >> PAGE_SHIFT;
const int pgs_size = sizeof(struct vm_page *) *
((endoffset - startoffset) >> PAGE_SHIFT);
struct vm_page **pgs, *pgs_onstack[UBC_MAX_PAGES];
if (pgs_size > sizeof(pgs_onstack)) {
pgs = kmem_zalloc(pgs_size, async ? KM_NOSLEEP : KM_SLEEP);
if (pgs == NULL) {
pgs = pgs_onstack;
error = ENOMEM;
goto out_err;
}
} else {
pgs = pgs_onstack;
(void)memset(pgs, 0, pgs_size);
}
UVMHIST_LOG(ubchist, "ridx %jd npages %jd startoff %#jx endoff %#jx",
ridx, npages, startoffset, endoffset);
if (trans_mount == NULL) {
trans_mount = vp->v_mount;
fstrans_start(trans_mount);
/*
* check if this vnode is still valid.
*/
mutex_enter(vp->v_interlock);
error = vdead_check(vp, 0);
mutex_exit(vp->v_interlock);
if (error)
goto out_err_free;
/*
* XXX: This assumes that we come here only via
* the mmio path
*/
if (blockalloc && need_wapbl) {
error = WAPBL_BEGIN(trans_mount);
if (error)
goto out_err_free;
holds_wapbl = true;
}
}
/*
* hold g_glock to prevent a race with truncate.
*
* check if our idea of v_size is still valid.
*/
KASSERT(!glocked || genfs_node_wrlocked(vp));
if (!glocked) {
if (blockalloc) {
genfs_node_wrlock(vp);
} else {
genfs_node_rdlock(vp);
}
}
rw_enter(uobj->vmobjlock, RW_WRITER);
if (vp->v_size < origvsize) {
if (!glocked) {
genfs_node_unlock(vp);
}
if (pgs != pgs_onstack)
kmem_free(pgs, pgs_size);
goto startover;
}
if (uvn_findpages(uobj, origoffset, &npages, &pgs[ridx], NULL,
async ? UFP_NOWAIT : UFP_ALL) != orignmempages) {
if (!glocked) {
genfs_node_unlock(vp);
}
KASSERT(async != 0);
genfs_rel_pages(&pgs[ridx], orignmempages);
rw_exit(uobj->vmobjlock);
error = EBUSY;
goto out_err_free;
}
/*
* if PGO_OVERWRITE is set, don't bother reading the pages.
*/
if (overwrite) {
if (!glocked) {
genfs_node_unlock(vp);
}
UVMHIST_LOG(ubchist, "PGO_OVERWRITE",0,0,0,0);
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[ridx + i];
/*
* it's caller's responsibility to allocate blocks
* beforehand for the overwrite case.
*/
KASSERT((pg->flags & PG_RDONLY) == 0 || !blockalloc);
pg->flags &= ~PG_RDONLY;
/*
* mark the page DIRTY.
* otherwise another thread can do putpages and pull
* our vnode from syncer's queue before our caller does
* ubc_release. note that putpages won't see CLEAN
* pages even if they are BUSY.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
}
npages += ridx;
goto out;
}
/*
* if the pages are already resident, just return them.
*/
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[ridx + i];
if ((pg->flags & PG_FAKE) || (blockalloc && (pg->flags & PG_RDONLY) != 0)) {
break;
}
}
if (i == npages) {
if (!glocked) {
genfs_node_unlock(vp);
}
UVMHIST_LOG(ubchist, "returning cached pages", 0,0,0,0);
npages += ridx;
goto out;
}
/*
* the page wasn't resident and we're not overwriting,
* so we're going to have to do some i/o.
* find any additional pages needed to cover the expanded range.
*/
npages = (endoffset - startoffset) >> PAGE_SHIFT;
if (startoffset != origoffset || npages != orignmempages) {
int npgs;
/*
* we need to avoid deadlocks caused by locking
* additional pages at lower offsets than pages we
* already have locked. unlock them all and start over.
*/
genfs_rel_pages(&pgs[ridx], orignmempages);
memset(pgs, 0, pgs_size);
UVMHIST_LOG(ubchist, "reset npages start 0x%jx end 0x%jx",
startoffset, endoffset, 0,0);
npgs = npages;
if (uvn_findpages(uobj, startoffset, &npgs, pgs, NULL,
async ? UFP_NOWAIT : UFP_ALL) != npages) {
if (!glocked) {
genfs_node_unlock(vp);
}
KASSERT(async != 0);
genfs_rel_pages(pgs, npages);
rw_exit(uobj->vmobjlock);
error = EBUSY;
goto out_err_free;
}
}
rw_exit(uobj->vmobjlock);
error = genfs_getpages_read(vp, pgs, npages, startoffset, diskeof,
async, memwrite, blockalloc, glocked);
if (!glocked) {
genfs_node_unlock(vp);
}
if (error == 0 && async)
goto out_err_free;
rw_enter(uobj->vmobjlock, RW_WRITER);
/*
* we're almost done! release the pages...
* for errors, we free the pages.
* otherwise we activate them and mark them as valid and clean.
* also, unbusy pages that were not actually requested.
*/
if (error) {
genfs_rel_pages(pgs, npages);
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(ubchist, "returning error %jd", error,0,0,0);
goto out_err_free;
}
out:
UVMHIST_LOG(ubchist, "succeeding, npages %jd", npages,0,0,0);
error = 0;
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL) {
continue;
}
UVMHIST_LOG(ubchist, "examining pg %#jx flags 0x%jx",
(uintptr_t)pg, pg->flags, 0,0);
if (pg->flags & PG_FAKE && !overwrite) {
/*
* we've read the page's contents from the backing storage.
*
* for a read fault, we keep them CLEAN; if we
* encountered a hole while reading, the pages may
* already have been dirtied with zeros.
*/
KASSERTMSG(blockalloc || uvm_pagegetdirty(pg) ==
UVM_PAGE_STATUS_CLEAN, "page %p not clean", pg);
pg->flags &= ~PG_FAKE;
}
KASSERT(!memwrite || !blockalloc || (pg->flags & PG_RDONLY) == 0);
if (i < ridx || i >= ridx + orignmempages || async) {
UVMHIST_LOG(ubchist, "unbusy pg %#jx offset 0x%jx",
(uintptr_t)pg, pg->offset,0,0);
if (pg->flags & PG_FAKE) {
KASSERT(overwrite);
uvm_pagezero(pg);
}
if (pg->flags & PG_RELEASED) {
uvm_pagefree(pg);
continue;
}
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
} else if (memwrite && !overwrite &&
uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
/*
* for a write fault, start dirtiness tracking of
* requested pages.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
}
}
rw_exit(uobj->vmobjlock);
if (ap->a_m != NULL) {
memcpy(ap->a_m, &pgs[ridx],
orignmempages * sizeof(struct vm_page *));
}
out_err_free:
if (pgs != NULL && pgs != pgs_onstack)
kmem_free(pgs, pgs_size);
out_err:
if (trans_mount != NULL) {
if (holds_wapbl)
WAPBL_END(trans_mount);
fstrans_done(trans_mount);
}
return error;
}
/*
* genfs_getpages_read: Read the pages in with VOP_BMAP/VOP_STRATEGY.
*
* "glocked" (which is currently not actually used) tells us not whether
* the genfs_node is locked on entry (it always is) but whether it was
* locked on entry to genfs_getpages.
*/
static int
genfs_getpages_read(struct vnode *vp, struct vm_page **pgs, int npages,
off_t startoffset, off_t diskeof,
bool async, bool memwrite, bool blockalloc, bool glocked)
{
struct uvm_object * const uobj = &vp->v_uobj;
const int fs_bshift = (vp->v_type != VBLK) ? vp->v_mount->mnt_fs_bshift : DEV_BSHIFT;
const int dev_bshift = (vp->v_type != VBLK) ?
vp->v_mount->mnt_dev_bshift : DEV_BSHIFT;
kauth_cred_t const cred = curlwp->l_cred; /* XXXUBC curlwp */
size_t bytes, iobytes, tailstart, tailbytes, totalbytes, skipbytes;
vaddr_t kva;
struct buf *bp, *mbp;
bool sawhole = false;
int i;
int error = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
/*
* read the desired page(s).
*/
totalbytes = npages << PAGE_SHIFT;
bytes = MIN(totalbytes, MAX(diskeof - startoffset, 0));
tailbytes = totalbytes - bytes;
skipbytes = 0;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | (async ? 0 : UVMPAGER_MAPIN_WAITOK));
if (kva == 0)
return EBUSY;
mbp = getiobuf(vp, true);
mbp->b_bufsize = totalbytes;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY;
if (async) {
mbp->b_flags = B_READ | B_ASYNC;
mbp->b_iodone = uvm_aio_aiodone;
} else {
mbp->b_flags = B_READ;
mbp->b_iodone = NULL;
}
if (async)
BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
else
BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
/*
* if EOF is in the middle of the range, zero the part past EOF.
* skip over pages which are not PG_FAKE since in that case they have
* valid data that we need to preserve.
*/
tailstart = bytes;
while (tailbytes > 0) {
const int len = PAGE_SIZE - (tailstart & PAGE_MASK);
KASSERT(len <= tailbytes);
if ((pgs[tailstart >> PAGE_SHIFT]->flags & PG_FAKE) != 0) {
memset((void *)(kva + tailstart), 0, len);
UVMHIST_LOG(ubchist, "tailbytes %#jx 0x%jx 0x%jx",
(uintptr_t)kva, tailstart, len, 0);
}
tailstart += len;
tailbytes -= len;
}
/*
* now loop over the pages, reading as needed.
*/
bp = NULL;
off_t offset;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
int run;
daddr_t lbn, blkno;
int pidx;
struct vnode *devvp;
/*
* skip pages which don't need to be read.
*/
pidx = (offset - startoffset) >> PAGE_SHIFT;
while ((pgs[pidx]->flags & PG_FAKE) == 0) {
size_t b;
KASSERT((offset & (PAGE_SIZE - 1)) == 0);
if ((pgs[pidx]->flags & PG_RDONLY)) {
sawhole = true;
}
b = MIN(PAGE_SIZE, bytes);
offset += b;
bytes -= b;
skipbytes += b;
pidx++;
UVMHIST_LOG(ubchist, "skipping, new offset 0x%jx",
offset, 0,0,0);
if (bytes == 0) {
goto loopdone;
}
}
/*
* bmap the file to find out the blkno to read from and
* how much we can read in one i/o. if bmap returns an error,
* skip the rest of the top-level i/o.
*/
lbn = offset >> fs_bshift;
error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
if (error) {
UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
lbn,error,0,0);
skipbytes += bytes;
bytes = 0;
goto loopdone;
}
/*
* see how many pages can be read with this i/o.
* reduce the i/o size if necessary to avoid
* overwriting pages with valid data.
*/
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
if (offset + iobytes > round_page(offset)) {
int pcount;
pcount = 1;
while (pidx + pcount < npages &&
pgs[pidx + pcount]->flags & PG_FAKE) {
pcount++;
}
iobytes = MIN(iobytes, (pcount << PAGE_SHIFT) -
(offset - trunc_page(offset)));
}
/*
* if this block isn't allocated, zero it instead of
* reading it. unless we are going to allocate blocks,
* mark the pages we zeroed PG_RDONLY.
*/
if (blkno == (daddr_t)-1) {
int holepages = (round_page(offset + iobytes) -
trunc_page(offset)) >> PAGE_SHIFT;
UVMHIST_LOG(ubchist, "lbn 0x%jx -> HOLE", lbn,0,0,0);
sawhole = true;
memset((char *)kva + (offset - startoffset), 0,
iobytes);
skipbytes += iobytes;
if (!blockalloc) {
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < holepages; i++) {
pgs[pidx + i]->flags |= PG_RDONLY;
}
rw_exit(uobj->vmobjlock);
}
continue;
}
/*
* allocate a sub-buf for this piece of the i/o
* (or just use mbp if there's only 1 piece),
* and start it going.
*/
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
} else {
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
bp = getiobuf(vp, true);
nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
}
bp->b_lblkno = 0;
/* adjust physical blkno for partial blocks */
bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
dev_bshift);
UVMHIST_LOG(ubchist,
"bp %#jx offset 0x%x bcount 0x%x blkno 0x%x",
(uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
VOP_STRATEGY(devvp, bp);
}
loopdone:
nestiobuf_done(mbp, skipbytes, error);
if (async) {
UVMHIST_LOG(ubchist, "returning 0 (async)",0,0,0,0);
return 0;
}
if (bp != NULL) {
error = biowait(mbp);
}
/* Remove the mapping (make KVA available as soon as possible) */
uvm_pagermapout(kva, npages);
/*
* if we encountered a hole then we have to do a little more work.
* for read faults, we marked the page PG_RDONLY so that future
* write accesses to the page will fault again.
* for write faults, we must make sure that the backing store for
* the page is completely allocated while the pages are locked.
*/
if (!error && sawhole && blockalloc) {
error = GOP_ALLOC(vp, startoffset,
npages << PAGE_SHIFT, 0, cred);
UVMHIST_LOG(ubchist, "gop_alloc off 0x%jx/0x%jx -> %jd",
startoffset, npages << PAGE_SHIFT, error,0);
if (!error) {
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
struct vm_page *pg = pgs[i];
if (pg == NULL) {
continue;
}
pg->flags &= ~PG_RDONLY;
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
UVMHIST_LOG(ubchist, "mark dirty pg %#jx",
(uintptr_t)pg, 0, 0, 0);
}
rw_exit(uobj->vmobjlock);
}
}
putiobuf(mbp);
return error;
}
/*
* generic VM putpages routine.
* Write the given range of pages to backing store.
*
* => "offhi == 0" means flush all pages at or after "offlo".
* => object should be locked by caller. we return with the
* object unlocked.
* => if PGO_CLEANIT or PGO_SYNCIO is set, we may block (due to I/O).
* thus, a caller might want to unlock higher level resources
* (e.g. vm_map) before calling flush.
* => if neither PGO_CLEANIT nor PGO_SYNCIO is set, we will not block
* => if PGO_ALLPAGES is set, then all pages in the object will be processed.
*
* note on "cleaning" object and PG_BUSY pages:
* this routine is holding the lock on the object. the only time
* that it can run into a PG_BUSY page that it does not own is if
* some other process has started I/O on the page (e.g. either
* a pagein, or a pageout). if the PG_BUSY page is being paged
* in, then it can not be dirty (!UVM_PAGE_STATUS_CLEAN) because no
* one has had a chance to modify it yet. if the PG_BUSY page is
* being paged out then it means that someone else has already started
* cleaning the page for us (how nice!). in this case, if we
* have syncio specified, then after we make our pass through the
* object we need to wait for the other PG_BUSY pages to clear
* off (i.e. we need to do an iosync). also note that once a
* page is PG_BUSY it must stay in its object until it is un-busyed.
*/
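/*
 * Illustrative sketch of a typical caller (hypothetical): flush and
 * sync every page of a vnode, e.g. for fsync-like semantics:
 *
 *	rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
 *	error = VOP_PUTPAGES(vp, 0, 0,
 *	    PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO);
 *
 * As noted above, the object must be locked on entry and is unlocked
 * on return.
 */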
int
genfs_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ * const ap = v;
return genfs_do_putpages(ap->a_vp, ap->a_offlo, ap->a_offhi,
ap->a_flags, NULL);
}
int
genfs_do_putpages(struct vnode *vp, off_t startoff, off_t endoff,
int origflags, struct vm_page **busypg)
{
struct uvm_object * const uobj = &vp->v_uobj;
krwlock_t * const slock = uobj->vmobjlock;
off_t nextoff;
int i, error, npages, nback;
int freeflag;
/*
* This array is larger than it should be so that its size is constant.
* The right size is MAXPAGES.
*/
struct vm_page *pgs[MAXPHYS / MIN_PAGE_SIZE];
#define MAXPAGES (MAXPHYS / PAGE_SIZE)
struct vm_page *pg, *tpg;
struct uvm_page_array a;
bool wasclean, needs_clean;
bool async = (origflags & PGO_SYNCIO) == 0;
bool pagedaemon = curlwp == uvm.pagedaemon_lwp;
struct mount *trans_mp;
int flags;
bool modified; /* if we write out any pages */
bool holds_wapbl;
bool cleanall; /* try to pull off from the syncer's list */
bool onworklst;
bool nodirty;
const bool dirtyonly = (origflags & (PGO_DEACTIVATE|PGO_FREE)) == 0;
UVMHIST_FUNC("genfs_putpages"); UVMHIST_CALLED(ubchist);
KASSERT(origflags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE));
KASSERT((startoff & PAGE_MASK) == 0);
KASSERT((endoff & PAGE_MASK) == 0);
KASSERT(startoff < endoff || endoff == 0);
KASSERT(rw_write_held(slock));
UVMHIST_LOG(ubchist, "vp %#jx pages %jd off 0x%jx len 0x%jx",
(uintptr_t)vp, uobj->uo_npages, startoff, endoff - startoff);
#ifdef DIAGNOSTIC
if ((origflags & PGO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
trans_mp = NULL;
holds_wapbl = false;
retry:
modified = false;
flags = origflags;
/*
* shortcut if we have no pages to process.
*/
nodirty = uvm_obj_clean_p(uobj);
#ifdef DIAGNOSTIC
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_ONWORKLST) != 0 || nodirty);
mutex_exit(vp->v_interlock);
#endif
if (uobj->uo_npages == 0 || (dirtyonly && nodirty)) {
mutex_enter(vp->v_interlock);
if (vp->v_iflag & VI_ONWORKLST && LIST_EMPTY(&vp->v_dirtyblkhd)) {
vn_syncer_remove_from_worklist(vp);
}
mutex_exit(vp->v_interlock);
if (trans_mp) {
if (holds_wapbl)
WAPBL_END(trans_mp);
fstrans_done(trans_mp);
}
rw_exit(slock);
return (0);
}
/*
* the vnode has pages, set up to process the request.
*/
if (trans_mp == NULL && (flags & PGO_CLEANIT) != 0) {
if (pagedaemon) {
/* Pagedaemon must not sleep here. */
trans_mp = vp->v_mount;
error = fstrans_start_nowait(trans_mp);
if (error) {
rw_exit(slock);
return error;
}
} else {
/*
* Cannot use vdeadcheck() here as this operation
* usually gets used from VOP_RECLAIM(). Test for
* change of v_mount instead and retry on change.
*/
rw_exit(slock);
trans_mp = vp->v_mount;
fstrans_start(trans_mp);
if (vp->v_mount != trans_mp) {
fstrans_done(trans_mp);
trans_mp = NULL;
} else {
holds_wapbl = (trans_mp->mnt_wapbl &&
(origflags & PGO_JOURNALLOCKED) == 0);
if (holds_wapbl) {
error = WAPBL_BEGIN(trans_mp);
if (error) {
fstrans_done(trans_mp);
return error;
}
}
}
rw_enter(slock, RW_WRITER);
goto retry;
}
}
error = 0;
wasclean = uvm_obj_nowriteback_p(uobj);
nextoff = startoff;
if (endoff == 0 || flags & PGO_ALLPAGES) {
endoff = trunc_page(LLONG_MAX);
}
/*
* if this vnode is known not to have dirty pages,
* don't bother to clean it out.
*/
if (nodirty) {
/* We handled the dirtyonly && nodirty case above. */
KASSERT(!dirtyonly);
flags &= ~PGO_CLEANIT;
}
/*
* start the loop to scan pages.
*/
cleanall = true;
freeflag = pagedaemon ? PG_PAGEOUT : PG_RELEASED;
uvm_page_array_init(&a, uobj, dirtyonly ? (UVM_PAGE_ARRAY_FILL_DIRTY |
(!async ? UVM_PAGE_ARRAY_FILL_WRITEBACK : 0)) : 0);
for (;;) {
bool pgprotected;
/*
* if !dirtyonly, iterate over all resident pages in the range.
*
* if dirtyonly, only possibly dirty pages are interesting.
* however, if we are asked to sync for integrity, we should
* wait on pages being written back by other threads as well.
*/
pg = uvm_page_array_fill_and_peek(&a, nextoff, 0);
if (pg == NULL) {
break;
}
KASSERT(pg->uobject == uobj);
KASSERT((pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
(pg->flags & (PG_BUSY)) != 0);
KASSERT(pg->offset >= startoff);
KASSERT(pg->offset >= nextoff);
KASSERT(!dirtyonly ||
uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN ||
uvm_obj_page_writeback_p(pg));
if (pg->offset >= endoff) {
break;
}
/*
* a preempt point.
*/
if (preempt_needed()) {
nextoff = pg->offset; /* visit this page again */
rw_exit(slock);
preempt();
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
continue;
}
/*
* if the current page is busy, wait for it to become unbusy.
*/
if ((pg->flags & PG_BUSY) != 0) {
UVMHIST_LOG(ubchist, "busy %#jx", (uintptr_t)pg,
0, 0, 0);
if ((pg->flags & (PG_RELEASED|PG_PAGEOUT)) != 0
&& (flags & PGO_BUSYFAIL) != 0) {
UVMHIST_LOG(ubchist, "busyfail %#jx",
(uintptr_t)pg, 0, 0, 0);
error = EDEADLK;
if (busypg != NULL)
*busypg = pg;
break;
}
if (pagedaemon) {
/*
* someone has taken the page while we
* dropped the lock for fstrans_start.
*/
break;
}
/*
* don't bother to wait on other's activities
* unless we are asked to sync for integrity.
*/
if (!async && (flags & PGO_RECLAIM) == 0) {
wasclean = false;
nextoff = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
continue;
}
nextoff = pg->offset; /* visit this page again */
uvm_pagewait(pg, slock, "genput");
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
continue;
}
nextoff = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
/*
* if we're freeing, remove all mappings of the page now.
* if we're cleaning, check if the page needs to be cleaned.
*/
pgprotected = false;
if (flags & PGO_FREE) {
pmap_page_protect(pg, VM_PROT_NONE);
pgprotected = true;
} else if (flags & PGO_CLEANIT) {
/*
* if we still have some hope to pull this vnode off
* from the syncer queue, write-protect the page.
*/
if (cleanall && wasclean) {
/*
* uobj pages get wired only by uvm_fault
* where uobj is locked.
*/
if (pg->wire_count == 0) {
pmap_page_protect(pg,
VM_PROT_READ|VM_PROT_EXECUTE);
pgprotected = true;
} else {
cleanall = false;
}
}
}
if (flags & PGO_CLEANIT) {
needs_clean = uvm_pagecheckdirty(pg, pgprotected);
} else {
needs_clean = false;
}
/*
* if we're cleaning, build a cluster.
* the cluster will consist of pages which are currently dirty.
* if not cleaning, just operate on the one page.
*/
if (needs_clean) {
wasclean = false;
memset(pgs, 0, sizeof(pgs));
pg->flags |= PG_BUSY;
UVM_PAGE_OWN(pg, "genfs_putpages");
/*
* let the fs constrain the offset range of the cluster.
* we additionally constrain the range here such that
* it fits in the "pgs" pages array.
*/
off_t fslo, fshi, genlo, lo, off = pg->offset;
GOP_PUTRANGE(vp, off, &fslo, &fshi);
KASSERT(fslo == trunc_page(fslo));
KASSERT(fslo <= off);
KASSERT(fshi == trunc_page(fshi));
KASSERT(fshi == 0 || off < fshi);
if (off > MAXPHYS / 2)
genlo = trunc_page(off - (MAXPHYS / 2));
else
genlo = 0;
lo = MAX(fslo, genlo);
/*
* first look backward.
*/
npages = (off - lo) >> PAGE_SHIFT;
nback = npages;
uvn_findpages(uobj, off - PAGE_SIZE, &nback,
&pgs[0], NULL,
UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY|UFP_BACKWARD);
if (nback) {
memmove(&pgs[0], &pgs[npages - nback],
nback * sizeof(pgs[0]));
if (npages - nback < nback)
memset(&pgs[nback], 0,
(npages - nback) * sizeof(pgs[0]));
else
memset(&pgs[npages - nback], 0,
nback * sizeof(pgs[0]));
}
/*
* then plug in our page of interest.
*/
pgs[nback] = pg;
/*
* then look forward to fill in the remaining space in
* the array of pages.
*
* pass our cached array of pages so that hopefully
* uvn_findpages can find some good pages in it.
* the array a was filled above with one of the
* following sets of flags:
* 0
* UVM_PAGE_ARRAY_FILL_DIRTY
* UVM_PAGE_ARRAY_FILL_DIRTY|WRITEBACK
*
* XXX this is fragile but it'll work: the array
* was earlier filled sparsely, but UFP_DIRTYONLY
* implies dense. see corresponding comment in
* uvn_findpages().
*/
npages = MAXPAGES - nback - 1;
if (fshi)
npages = MIN(npages,
(fshi - off - 1) >> PAGE_SHIFT);
uvn_findpages(uobj, off + PAGE_SIZE, &npages,
&pgs[nback + 1], &a,
UFP_NOWAIT|UFP_NOALLOC|UFP_DIRTYONLY);
npages += nback + 1;
} else {
pgs[0] = pg;
npages = 1;
nback = 0;
}
/*
* apply FREE or DEACTIVATE options if requested.
*/
for (i = 0; i < npages; i++) {
tpg = pgs[i];
KASSERT(tpg->uobject == uobj);
KASSERT(i == 0 ||
pgs[i-1]->offset + PAGE_SIZE == tpg->offset);
KASSERT(!needs_clean || uvm_pagegetdirty(pgs[i]) !=
UVM_PAGE_STATUS_DIRTY);
if (needs_clean) {
/*
* mark pages as WRITEBACK so that concurrent
* fsync can find and wait for our activities.
*/
uvm_obj_page_set_writeback(pgs[i]);
}
if (tpg->offset < startoff || tpg->offset >= endoff)
continue;
if (flags & PGO_DEACTIVATE && tpg->wire_count == 0) {
uvm_pagelock(tpg);
uvm_pagedeactivate(tpg);
uvm_pageunlock(tpg);
} else if (flags & PGO_FREE) {
pmap_page_protect(tpg, VM_PROT_NONE);
if (tpg->flags & PG_BUSY) {
tpg->flags |= freeflag;
if (pagedaemon) {
uvm_pageout_start(1);
uvm_pagelock(tpg);
uvm_pagedequeue(tpg);
uvm_pageunlock(tpg);
}
} else {
/*
* ``page is not busy''
* implies that npages is 1
* and needs_clean is false.
*/
KASSERT(npages == 1);
KASSERT(!needs_clean);
KASSERT(pg == tpg);
KASSERT(nextoff ==
tpg->offset + PAGE_SIZE);
uvm_pagefree(tpg);
if (pagedaemon)
uvmexp.pdfreed++;
}
}
}
if (needs_clean) {
modified = true;
KASSERT(nextoff == pg->offset + PAGE_SIZE);
KASSERT(nback < npages);
nextoff = pg->offset + ((npages - nback) << PAGE_SHIFT);
KASSERT(pgs[nback] == pg);
KASSERT(nextoff == pgs[npages - 1]->offset + PAGE_SIZE);
/*
* start the i/o.
*/
rw_exit(slock);
error = GOP_WRITE(vp, pgs, npages, flags);
/*
* as we dropped the object lock, our cached pages can
* be stale.
*/
uvm_page_array_clear(&a);
rw_enter(slock, RW_WRITER);
if (error) {
break;
}
}
}
uvm_page_array_fini(&a);
/*
* update ctime/mtime if the modification we started writing out might
* be from mmap'ed write.
*
* this is necessary when an application keeps a file mmaped and
* repeatedly modifies it via the window. note that, because we
* don't always write-protect pages when cleaning, such modifications
* might not involve any page faults.
*/
mutex_enter(vp->v_interlock);
if (modified && (vp->v_iflag & VI_WRMAP) != 0 && (vp->v_type != VBLK ||
(vp->v_mount->mnt_flag & MNT_NODEVMTIME) == 0)) {
GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
}
/*
* if we no longer have any possibly dirty pages, take us off the
* syncer list.
*/
if ((vp->v_iflag & VI_ONWORKLST) != 0 && uvm_obj_clean_p(uobj) &&
LIST_EMPTY(&vp->v_dirtyblkhd)) {
vn_syncer_remove_from_worklist(vp);
}
/* Wait for output to complete. */
rw_exit(slock);
if (!wasclean && !async && vp->v_numoutput != 0) {
while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, vp->v_interlock);
}
onworklst = (vp->v_iflag & VI_ONWORKLST) != 0;
mutex_exit(vp->v_interlock);
if ((flags & PGO_RECLAIM) != 0 && onworklst) {
/*
* in the case of PGO_RECLAIM, make sure the vnode ends up clean.
* retrying is not a big deal because, in many cases,
* uobj->uo_npages is already 0 here.
*/
rw_enter(slock, RW_WRITER);
goto retry;
}
if (trans_mp) {
if (holds_wapbl)
WAPBL_END(trans_mp);
fstrans_done(trans_mp);
}
return (error);
}
/*
* Default putrange method for file systems that do not care
* how many pages are given to one GOP_WRITE() call.
*/
void
genfs_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
{
*lop = 0;
*hip = 0;
}
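/*
* A file system that wants tighter clusters than the default
* genfs_gop_putrange above can provide its own GOP_PUTRANGE method via
* genfs_ops. The sketch below is hypothetical (examplefs does not
* exist) and assumes the file system block size is a multiple of the
* page size; it restricts each GOP_WRITE() cluster to a single file
* system block:
*
*	void
*	examplefs_gop_putrange(struct vnode *vp, off_t off,
*	    off_t *lop, off_t *hip)
*	{
*		const int bshift = vp->v_mount->mnt_fs_bshift;
*
*		*lop = (off >> bshift) << bshift;
*		*hip = *lop + (1 << bshift);
*	}
*/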
int
genfs_gop_write(struct vnode *vp, struct vm_page **pgs, int npages, int flags)
{
off_t off;
vaddr_t kva;
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
len = npages << PAGE_SHIFT;
error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
uvm_aio_aiodone);
return error;
}
/*
* genfs_gop_write_rwmap:
*
* a variant of genfs_gop_write. it's used by UDF for its directory buffers.
* this maps pages with PROT_WRITE so that VOP_STRATEGY can modify
* the contents before writing it out to the underlying storage.
*/
int
genfs_gop_write_rwmap(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
off_t off;
vaddr_t kva;
size_t len;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx pgs %#jx npages %jd flags 0x%jx",
(uintptr_t)vp, (uintptr_t)pgs, npages, flags);
off = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
len = npages << PAGE_SHIFT;
error = genfs_do_io(vp, off, kva, len, flags, UIO_WRITE,
uvm_aio_aiodone);
return error;
}
/*
* Backend routine for doing I/O to vnode pages. Pages are already locked
* and mapped into kernel memory. Here we just look up the underlying
* device block addresses and call the strategy routine.
*/
static int
genfs_do_io(struct vnode *vp, off_t off, vaddr_t kva, size_t len, int flags,
enum uio_rw rw, void (*iodone)(struct buf *))
{
int s, error;
int fs_bshift, dev_bshift;
off_t eof, offset, startoffset;
size_t bytes, iobytes, skipbytes;
struct buf *mbp, *bp;
const bool async = (flags & PGO_SYNCIO) == 0;
const bool lazy = (flags & PGO_LAZY) == 0;
const bool iowrite = rw == UIO_WRITE;
const int brw = iowrite ? B_WRITE : B_READ;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx kva %#jx len 0x%jx flags 0x%jx",
(uintptr_t)vp, (uintptr_t)kva, len, flags);
KASSERT(vp->v_size != VSIZENOTSET);
KASSERT(vp->v_writesize != VSIZENOTSET);
KASSERTMSG(vp->v_size <= vp->v_writesize, "vp=%p"
" v_size=0x%llx v_writesize=0x%llx", vp,
(unsigned long long)vp->v_size,
(unsigned long long)vp->v_writesize);
GOP_SIZE(vp, vp->v_writesize, &eof, 0);
if (vp->v_type != VBLK) {
fs_bshift = vp->v_mount->mnt_fs_bshift;
dev_bshift = vp->v_mount->mnt_dev_bshift;
} else {
fs_bshift = DEV_BSHIFT;
dev_bshift = DEV_BSHIFT;
}
error = 0;
startoffset = off;
bytes = MIN(len, eof - startoffset);
skipbytes = 0;
KASSERT(bytes != 0);
if (iowrite) {
/*
* why += 2?
* 1 for biodone, 1 for uvm_aio_aiodone.
*/
mutex_enter(vp->v_interlock);
vp->v_numoutput += 2;
mutex_exit(vp->v_interlock);
}
mbp = getiobuf(vp, true);
UVMHIST_LOG(ubchist, "vp %#jx mbp %#jx num now %jd bytes 0x%jx",
(uintptr_t)vp, (uintptr_t)mbp, vp->v_numoutput, bytes);
mbp->b_bufsize = len;
mbp->b_data = (void *)kva;
mbp->b_resid = mbp->b_bcount = bytes;
mbp->b_cflags |= BC_BUSY | BC_AGE;
if (async) {
mbp->b_flags = brw | B_ASYNC;
mbp->b_iodone = iodone;
} else {
mbp->b_flags = brw;
mbp->b_iodone = NULL;
}
if (curlwp == uvm.pagedaemon_lwp)
BIO_SETPRIO(mbp, BPRIO_TIMELIMITED);
else if (async || lazy)
BIO_SETPRIO(mbp, BPRIO_TIMENONCRITICAL);
else
BIO_SETPRIO(mbp, BPRIO_TIMECRITICAL);
bp = NULL;
for (offset = startoffset;
bytes > 0;
offset += iobytes, bytes -= iobytes) {
int run;
daddr_t lbn, blkno;
struct vnode *devvp;
/*
* bmap the file to find out the blkno to read from and
* how much we can read in one i/o. if bmap returns an error,
* skip the rest of the top-level i/o.
*/
lbn = offset >> fs_bshift;
error = VOP_BMAP(vp, lbn, &devvp, &blkno, &run);
if (error) {
UVMHIST_LOG(ubchist, "VOP_BMAP lbn 0x%jx -> %jd",
lbn, error, 0, 0);
skipbytes += bytes;
bytes = 0;
goto loopdone;
}
/*
* see how many pages can be read with this i/o.
* reduce the i/o size if necessary to avoid
* overwriting pages with valid data.
*/
iobytes = MIN((((off_t)lbn + 1 + run) << fs_bshift) - offset,
bytes);
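/*
* Example with assumed values: fs_bshift == 13 (8 KiB blocks),
* lbn == 4, run == 3 and offset == 0x9000. VOP_BMAP says blocks 4..7
* are contiguous, so this piece may cover up to
* ((4 + 1 + 3) << 13) - 0x9000 == 0x7000 bytes, further clipped to
* the remaining "bytes".
*/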
/*
* if this block isn't allocated, zero it instead of
* reading it. unless we are going to allocate blocks,
* mark the pages we zeroed PG_RDONLY.
*/
if (blkno == (daddr_t)-1) {
if (!iowrite) {
memset((char *)kva + (offset - startoffset), 0,
iobytes);
}
skipbytes += iobytes;
continue;
}
/*
* allocate a sub-buf for this piece of the i/o
* (or just use mbp if there's only 1 piece),
* and start it going.
*/
if (offset == startoffset && iobytes == bytes) {
bp = mbp;
} else {
UVMHIST_LOG(ubchist, "vp %#jx bp %#jx num now %jd",
(uintptr_t)vp, (uintptr_t)bp, vp->v_numoutput, 0);
bp = getiobuf(vp, true);
nestiobuf_setup(mbp, bp, offset - startoffset, iobytes);
}
bp->b_lblkno = 0;
/* adjust physical blkno for partial blocks */
bp->b_blkno = blkno + ((offset - ((off_t)lbn << fs_bshift)) >>
dev_bshift);
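/*
* Example with assumed values: fs_bshift == 13, dev_bshift == 9. If
* this piece starts 0x1000 bytes into logical block lbn, b_blkno is
* advanced by 0x1000 >> 9 == 8 sectors past the blkno from VOP_BMAP.
*/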
UVMHIST_LOG(ubchist,
"bp %#jx offset 0x%jx bcount 0x%jx blkno 0x%jx",
(uintptr_t)bp, offset, bp->b_bcount, bp->b_blkno);
VOP_STRATEGY(devvp, bp);
}
loopdone:
if (skipbytes) {
UVMHIST_LOG(ubchist, "skipbytes %jd", skipbytes, 0,0,0);
}
nestiobuf_done(mbp, skipbytes, error);
if (async) {
UVMHIST_LOG(ubchist, "returning 0 (async)", 0,0,0,0);
return (0);
}
UVMHIST_LOG(ubchist, "waiting for mbp %#jx", (uintptr_t)mbp, 0, 0, 0);
error = biowait(mbp);
s = splbio();
(*iodone)(mbp);
splx(s);
UVMHIST_LOG(ubchist, "returning, error %jd", error, 0, 0, 0);
return (error);
}
int
genfs_compat_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
off_t origoffset;
struct vnode *vp = ap->a_vp;
struct uvm_object *uobj = &vp->v_uobj;
struct vm_page *pg, **pgs;
vaddr_t kva;
int i, error, orignpages, npages;
struct iovec iov;
struct uio uio;
kauth_cred_t cred = curlwp->l_cred;
const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
error = 0;
origoffset = ap->a_offset;
orignpages = *ap->a_count;
pgs = ap->a_m;
if (ap->a_flags & PGO_LOCKED) {
uvn_findpages(uobj, origoffset, ap->a_count, ap->a_m, NULL,
UFP_NOWAIT|UFP_NOALLOC| (memwrite ? UFP_NORDONLY : 0));
error = ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
return error;
}
if (origoffset + (ap->a_centeridx << PAGE_SHIFT) >= vp->v_size) {
rw_exit(uobj->vmobjlock);
return EINVAL;
}
if ((ap->a_flags & PGO_SYNCIO) == 0) {
rw_exit(uobj->vmobjlock);
return 0;
}
npages = orignpages;
uvn_findpages(uobj, origoffset, &npages, pgs, NULL, UFP_ALL);
rw_exit(uobj->vmobjlock);
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
for (i = 0; i < npages; i++) {
pg = pgs[i];
if ((pg->flags & PG_FAKE) == 0) {
continue;
}
iov.iov_base = (char *)kva + (i << PAGE_SHIFT);
iov.iov_len = PAGE_SIZE;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = origoffset + (i << PAGE_SHIFT);
uio.uio_rw = UIO_READ;
uio.uio_resid = PAGE_SIZE;
UIO_SETUP_SYSSPACE(&uio);
/* XXX vn_lock */
error = VOP_READ(vp, &uio, 0, cred);
if (error) {
break;
}
if (uio.uio_resid) {
memset(iov.iov_base, 0, uio.uio_resid);
}
}
uvm_pagermapout(kva, npages);
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
pg = pgs[i];
if (error && (pg->flags & PG_FAKE) != 0) {
pg->flags |= PG_RELEASED;
} else {
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
if (error) {
uvm_page_unbusy(pgs, npages);
}
rw_exit(uobj->vmobjlock);
return error;
}
int
genfs_compat_gop_write(struct vnode *vp, struct vm_page **pgs, int npages,
int flags)
{
off_t offset;
struct iovec iov;
struct uio uio;
kauth_cred_t cred = curlwp->l_cred;
struct buf *bp;
vaddr_t kva;
int error;
offset = pgs[0]->offset;
kva = uvm_pagermapin(pgs, npages,
UVMPAGER_MAPIN_WRITE | UVMPAGER_MAPIN_WAITOK);
iov.iov_base = (void *)kva;
iov.iov_len = npages << PAGE_SHIFT;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = offset;
uio.uio_rw = UIO_WRITE;
uio.uio_resid = npages << PAGE_SHIFT;
UIO_SETUP_SYSSPACE(&uio);
/* XXX vn_lock */
error = VOP_WRITE(vp, &uio, 0, cred);
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
bp = getiobuf(vp, true);
bp->b_cflags |= BC_BUSY | BC_AGE;
bp->b_lblkno = offset >> vp->v_mount->mnt_fs_bshift;
bp->b_data = (char *)kva;
bp->b_bcount = npages << PAGE_SHIFT;
bp->b_bufsize = npages << PAGE_SHIFT;
bp->b_resid = 0;
bp->b_error = error;
uvm_aio_aiodone(bp);
return (error);
}
/*
* Process a uio using direct I/O. If we reach a part of the request
* which cannot be processed in this fashion for some reason, just return.
* The caller must handle some additional part of the request using
* buffered I/O before trying direct I/O again.
*/
void
genfs_directio(struct vnode *vp, struct uio *uio, int ioflag)
{
struct vmspace *vs;
struct iovec *iov;
vaddr_t va;
size_t len;
const int mask = DEV_BSIZE - 1;
int error;
bool need_wapbl = (vp->v_mount && vp->v_mount->mnt_wapbl &&
(ioflag & IO_JOURNALLOCKED) == 0);
#ifdef DIAGNOSTIC
if ((ioflag & IO_JOURNALLOCKED) && vp->v_mount->mnt_wapbl)
WAPBL_JLOCK_ASSERT(vp->v_mount);
#endif
/*
* We only support direct I/O to user space for now.
*/
if (VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
return;
}
/*
* If the vnode is mapped, we would need to get the getpages lock
* to stabilize the bmap, but then we would get into trouble while
* locking the pages if the pages belong to this same vnode (or a
* multi-vnode cascade to the same effect). Just fall back to
* buffered I/O if the vnode is mapped to avoid this mess.
*/
if (vp->v_vflag & VV_MAPPED) {
return;
}
if (need_wapbl) {
error = WAPBL_BEGIN(vp->v_mount);
if (error)
return;
}
/*
* Do as much of the uio as possible with direct I/O.
*/
vs = uio->uio_vmspace;
while (uio->uio_resid) {
iov = uio->uio_iov;
if (iov->iov_len == 0) {
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
va = (vaddr_t)iov->iov_base;
len = MIN(iov->iov_len, genfs_maxdio);
len &= ~mask;
/*
* If the next chunk is smaller than DEV_BSIZE or extends past
* the current EOF, then fall back to buffered I/O.
*/
if (len == 0 || uio->uio_offset + len > vp->v_size) {
break;
}
/*
* Check alignment. The file offset must be at least
* sector-aligned. The exact constraint on memory alignment
* is very hardware-dependent, but requiring sector-aligned
* addresses there too is safe.
*/
if (uio->uio_offset & mask || va & mask) {
break;
}
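/*
* Example (addresses are illustrative only): with DEV_BSIZE == 512
* the mask is 0x1ff, so a request at file offset 8192 from a user
* buffer at 0x7f0000001200 passes both checks, while a buffer at
* 0x7f0000001234 would fall back to buffered i/o.
*/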
error = genfs_do_directio(vs, va, len, vp, uio->uio_offset,
uio->uio_rw);
if (error) {
break;
}
iov->iov_base = (char *)iov->iov_base + len;
iov->iov_len -= len;
uio->uio_offset += len;
uio->uio_resid -= len;
}
if (need_wapbl)
WAPBL_END(vp->v_mount);
}
/*
* Iodone routine for direct I/O. We don't do much here since the request is
* always synchronous, so the caller will do most of the work after biowait().
*/
static void
genfs_dio_iodone(struct buf *bp)
{
KASSERT((bp->b_flags & B_ASYNC) == 0);
if ((bp->b_flags & B_READ) == 0 && (bp->b_cflags & BC_AGE) != 0) {
mutex_enter(bp->b_objlock);
vwakeup(bp);
mutex_exit(bp->b_objlock);
}
putiobuf(bp);
}
/*
* Process one chunk of a direct I/O request.
*/
static int
genfs_do_directio(struct vmspace *vs, vaddr_t uva, size_t len, struct vnode *vp,
off_t off, enum uio_rw rw)
{
struct vm_map *map;
struct pmap *upm, *kpm __unused;
size_t klen = round_page(uva + len) - trunc_page(uva);
off_t spoff, epoff;
vaddr_t kva, puva;
paddr_t pa;
vm_prot_t prot;
int error, rv __diagused, poff, koff;
const int pgoflags = PGO_CLEANIT | PGO_SYNCIO | PGO_JOURNALLOCKED |
(rw == UIO_WRITE ? PGO_FREE : 0);
/*
* For writes, verify that this range of the file already has fully
* allocated backing store. If there are any holes, just punt and
* make the caller take the buffered write path.
*/
if (rw == UIO_WRITE) {
daddr_t lbn, elbn, blkno;
int bsize, bshift, run;
bshift = vp->v_mount->mnt_fs_bshift;
bsize = 1 << bshift;
lbn = off >> bshift;
elbn = (off + len + bsize - 1) >> bshift;
while (lbn < elbn) {
error = VOP_BMAP(vp, lbn, NULL, &blkno, &run);
if (error) {
return error;
}
if (blkno == (daddr_t)-1) {
return ENOSPC;
}
lbn += 1 + run;
}
}
/*
* Flush any cached pages for parts of the file that we're about to
* access. If we're writing, invalidate pages as well.
*/
spoff = trunc_page(off);
epoff = round_page(off + len);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, spoff, epoff, pgoflags);
if (error) {
return error;
}
/*
* Wire the user pages and remap them into kernel memory.
*/
prot = rw == UIO_READ ? VM_PROT_READ | VM_PROT_WRITE : VM_PROT_READ;
error = uvm_vslock(vs, (void *)uva, len, prot);
if (error) {
return error;
}
map = &vs->vm_map;
upm = vm_map_pmap(map);
kpm = vm_map_pmap(kernel_map);
puva = trunc_page(uva);
kva = uvm_km_alloc(kernel_map, klen, atop(puva) & uvmexp.colormask,
UVM_KMF_VAONLY | UVM_KMF_WAITVA | UVM_KMF_COLORMATCH);
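/*
* UVM_KMF_COLORMATCH gives the kernel mapping the same virtual cache
* color as the user mapping. Illustrative example (assuming
* uvmexp.colormask == 3, i.e. 4 colors): a user page whose virtual
* page number ends in binary ..10 is mapped at a kernel VA whose page
* number also ends in ..10, avoiding aliasing in virtually indexed
* caches between the two mappings.
*/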
for (poff = 0; poff < klen; poff += PAGE_SIZE) {
rv = pmap_extract(upm, puva + poff, &pa);
KASSERT(rv);
pmap_kenter_pa(kva + poff, pa, prot, PMAP_WIRED);
}
pmap_update(kpm);
/*
* Do the I/O.
*/
koff = uva - trunc_page(uva);
error = genfs_do_io(vp, off, kva + koff, len, PGO_SYNCIO, rw,
genfs_dio_iodone);
/*
* Tear down the kernel mapping.
*/
pmap_kremove(kva, klen);
pmap_update(kpm);
uvm_km_free(kernel_map, kva, klen, UVM_KMF_VAONLY);
/*
* Unwire the user pages.
*/
uvm_vsunlock(vs, (void *)uva, len);
return error;
}
/* $NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $ */
/*-
* Copyright (c) 1997, 1998, 2004, 2005, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, by Andrew Doran,
* by Marshall Kirk McKusick and Greg Ganger at the University of Michigan.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_subr.c,v 1.500 2023/04/30 08:46:11 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_43.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm_ddb.h>
SDT_PROBE_DEFINE3(vfs, syncer, worklist, vnode__add,
"struct vnode *"/*vp*/,
"int"/*delayx*/,
"int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, vnode__update,
"struct vnode *"/*vp*/,
"int"/*delayx*/,
"int"/*oslot*/,
"int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, vnode__remove,
"struct vnode *"/*vp*/);
SDT_PROBE_DEFINE3(vfs, syncer, worklist, mount__add,
"struct mount *"/*mp*/,
"int"/*vdelay*/,
"int"/*slot*/);
SDT_PROBE_DEFINE4(vfs, syncer, worklist, mount__update,
"struct mount *"/*vp*/,
"int"/*vdelay*/,
"int"/*oslot*/,
"int"/*nslot*/);
SDT_PROBE_DEFINE1(vfs, syncer, worklist, mount__remove,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, start,
"int"/*starttime*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__start,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, mount__done,
"struct mount *"/*mp*/,
"int"/*error*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, mount__skip,
"struct mount *"/*mp*/);
SDT_PROBE_DEFINE1(vfs, syncer, sync, vnode__start,
"struct vnode *"/*vp*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__done,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__lock,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, vnode__fail__vget,
"struct vnode *"/*vp*/,
"int"/*error*/);
SDT_PROBE_DEFINE2(vfs, syncer, sync, done,
"int"/*starttime*/,
"int"/*endtime*/);
const enum vtype iftovt_tab[16] = {
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
const int vttoif_tab[9] = {
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
S_IFSOCK, S_IFIFO, S_IFMT,
};
/*
* Insq/Remq for the vnode usage lists.
*/
#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
#define bufremvn(bp) { \
LIST_REMOVE(bp, b_vnbufs); \
(bp)->b_vnbufs.le_next = NOLIST; \
}
int doforce = 1; /* 1 => permit forcible unmounting */
/*
* Local declarations.
*/
static void vn_initialize_syncerd(void);
/*
* Initialize the vnode management data structures.
*/
void
vntblinit(void)
{
vn_initialize_syncerd();
vfs_mount_sysinit();
vfs_vnode_sysinit();
}
/*
* Flush out and invalidate all buffers associated with a vnode.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vinvalbuf(struct vnode *vp, int flags, kauth_cred_t cred, struct lwp *l,
bool catch_p, int slptimeo)
{
struct buf *bp, *nbp;
int error;
int flushflags = PGO_ALLPAGES | PGO_FREE | PGO_SYNCIO |
(flags & V_SAVE ? PGO_CLEANIT | PGO_RECLAIM : 0);
/* XXXUBC this doesn't look at flags or slp* */
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, 0, 0, flushflags);
if (error) {
return error;
}
if (flags & V_SAVE) {
error = VOP_FSYNC(vp, cred, FSYNC_WAIT|FSYNC_RECLAIM, 0, 0);
if (error)
return (error);
KASSERT(LIST_EMPTY(&vp->v_dirtyblkhd));
}
mutex_enter(&bufcache_lock);
restart:
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
/*
* XXX Since there are no node locks for NFS, I believe
* there is a slight chance that a delayed write will
* occur while sleeping just above, so check for it.
*/
if ((bp->b_oflags & BO_DELWRI) && (flags & V_SAVE)) {
#ifdef DEBUG
printf("buffer still DELWRI\n");
#endif
bp->b_cflags |= BC_BUSY | BC_VFLUSH;
mutex_exit(&bufcache_lock);
VOP_BWRITE(bp->b_vp, bp);
mutex_enter(&bufcache_lock);
goto restart;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
#ifdef DIAGNOSTIC
if (!LIST_EMPTY(&vp->v_cleanblkhd) || !LIST_EMPTY(&vp->v_dirtyblkhd))
panic("vinvalbuf: flush failed, vp %p", vp);
#endif
mutex_exit(&bufcache_lock);
return (0);
}
/*
* Destroy any in core blocks past the truncation length.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vtruncbuf(struct vnode *vp, daddr_t lbn, bool catch_p, int slptimeo)
{
struct buf *bp, *nbp;
int error;
voff_t off;
off = round_page((voff_t)lbn << vp->v_mount->mnt_fs_bshift);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, off, 0, PGO_FREE | PGO_SYNCIO);
if (error) {
return error;
}
mutex_enter(&bufcache_lock);
restart:
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_lblkno < lbn)
continue;
error = bbusy(bp, catch_p, slptimeo, NULL);
if (error != 0) {
if (error == EPASSTHROUGH)
goto restart;
mutex_exit(&bufcache_lock);
return (error);
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
mutex_exit(&bufcache_lock);
return (0);
}
/*
* Flush all dirty buffers from a vnode.
* Called with the underlying vnode locked, which should prevent new dirty
* buffers from being queued.
*/
int
vflushbuf(struct vnode *vp, int flags)
{
struct buf *bp, *nbp;
int error, pflags;
bool dirty, sync;
sync = (flags & FSYNC_WAIT) != 0;
pflags = PGO_CLEANIT | PGO_ALLPAGES |
(sync ? PGO_SYNCIO : 0) |
((flags & FSYNC_LAZY) ? PGO_LAZY : 0);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
(void) VOP_PUTPAGES(vp, 0, 0, pflags);
loop:
mutex_enter(&bufcache_lock);
for (bp = LIST_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
KASSERT(bp->b_vp == vp);
nbp = LIST_NEXT(bp, b_vnbufs);
if ((bp->b_cflags & BC_BUSY))
continue;
if ((bp->b_oflags & BO_DELWRI) == 0)
panic("vflushbuf: not dirty, bp %p", bp);
bp->b_cflags |= BC_BUSY | BC_VFLUSH;
mutex_exit(&bufcache_lock);
/*
* Wait for I/O associated with indirect blocks to complete,
* since there is no way to quickly wait for them below.
*/
if (bp->b_vp == vp || !sync)
(void) bawrite(bp);
else {
error = bwrite(bp);
if (error)
return error;
}
goto loop;
}
mutex_exit(&bufcache_lock);
if (!sync)
return 0;
mutex_enter(vp->v_interlock);
while (vp->v_numoutput != 0)
cv_wait(&vp->v_cv, vp->v_interlock);
dirty = !LIST_EMPTY(&vp->v_dirtyblkhd);
mutex_exit(vp->v_interlock);
if (dirty) {
vprint("vflushbuf: dirty", vp);
goto loop;
}
return 0;
}
/*
* Create a vnode for a block device.
* Used for root filesystem and swap areas.
* Also used for memory file system special devices.
*/
int
bdevvp(dev_t dev, vnode_t **vpp)
{
struct vattr va;
vattr_null(&va);
va.va_type = VBLK;
va.va_rdev = dev;
return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}
/*
* Create a vnode for a character device.
* Used for kernfs and some console handling.
*/
int
cdevvp(dev_t dev, vnode_t **vpp)
{
struct vattr va;
vattr_null(&va);
va.va_type = VCHR;
va.va_rdev = dev;
return vcache_new(dead_rootmount, NULL, &va, NOCRED, NULL, vpp);
}
/*
* Associate a buffer with a vnode. There must already be a hold on
* the vnode.
*/
void
bgetvp(struct vnode *vp, struct buf *bp)
{
KASSERT(bp->b_vp == NULL);
KASSERT(bp->b_objlock == &buffer_lock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(mutex_owned(&bufcache_lock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
KASSERT(!cv_has_waiters(&bp->b_done));
vholdl(vp);
bp->b_vp = vp;
if (vp->v_type == VBLK || vp->v_type == VCHR)
bp->b_dev = vp->v_rdev;
else
bp->b_dev = NODEV;
/*
* Insert onto list for new vnode.
*/
bufinsvn(bp, &vp->v_cleanblkhd);
bp->b_objlock = vp->v_interlock;
}
/*
* Disassociate a buffer from a vnode.
*/
void
brelvp(struct buf *bp)
{
struct vnode *vp = bp->b_vp;
KASSERT(vp != NULL);
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(mutex_owned(&bufcache_lock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
KASSERT(!cv_has_waiters(&bp->b_done));
/*
* Delete from old vnode list, if on one.
*/
if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
bufremvn(bp);
if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST &&
LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
vn_syncer_remove_from_worklist(vp);
bp->b_objlock = &buffer_lock;
bp->b_vp = NULL;
holdrelel(vp);
}
/*
* Reassign a buffer from one vnode list to another.
* The list reassignment must be within the same vnode.
* Used to assign file specific control information
* (indirect blocks) to the list to which they belong.
*/
void
reassignbuf(struct buf *bp, struct vnode *vp)
{
struct buflists *listheadp;
int delayx;
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT((bp->b_cflags & BC_BUSY) != 0);
/*
* Delete from old vnode list, if on one.
*/
if (LIST_NEXT(bp, b_vnbufs) != NOLIST)
bufremvn(bp);
/*
* If dirty, put on list of dirty buffers;
* otherwise insert onto list of clean buffers.
*/
if ((bp->b_oflags & BO_DELWRI) == 0) {
listheadp = &vp->v_cleanblkhd;
if ((vp->v_iflag & (VI_ONWORKLST | VI_PAGES)) == VI_ONWORKLST &&
LIST_FIRST(&vp->v_dirtyblkhd) == NULL)
vn_syncer_remove_from_worklist(vp);
} else {
listheadp = &vp->v_dirtyblkhd;
if ((vp->v_iflag & VI_ONWORKLST) == 0) {
switch (vp->v_type) {
case VDIR:
delayx = dirdelay;
break;
case VBLK:
if (spec_node_getmountedfs(vp) != NULL) {
delayx = metadelay;
break;
}
/* fall through */
default:
delayx = filedelay;
break;
}
if (!vp->v_mount ||
(vp->v_mount->mnt_flag & MNT_ASYNC) == 0)
vn_syncer_add_to_worklist(vp, delayx);
}
}
bufinsvn(bp, listheadp);
}
/*
* Lookup a vnode by device number and return it referenced.
*/
int
vfinddev(dev_t dev, enum vtype type, vnode_t **vpp)
{
return (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, vpp) == 0);
}
/*
* Revoke all the vnodes corresponding to the specified minor number
* range (endpoints inclusive) of the specified major.
*/
void
vdevgone(int maj, int minl, int minh, enum vtype type)
{
vnode_t *vp;
dev_t dev;
int mn;
for (mn = minl; mn <= minh; mn++) {
dev = makedev(maj, mn);
/*
* Notify anyone trying to get at this device that it
* has been detached, and then revoke it.
*/
switch (type) {
case VBLK:
bdev_detached(dev);
break;
case VCHR:
cdev_detached(dev);
break;
default:
panic("invalid specnode type: %d", type);
}
/*
* Passing 0 as flags, instead of VDEAD_NOWAIT, means
* spec_node_lookup_by_dev will wait for vnodes it
* finds concurrently being revoked before returning.
*/
while (spec_node_lookup_by_dev(type, dev, 0, &vp) == 0) {
VOP_REVOKE(vp, REVOKEALL);
vrele(vp);
}
}
}
/*
* The filesystem synchronizer mechanism - syncer.
*
* It is useful to delay writes of file data and filesystem metadata for
* a certain amount of time so that quickly created and deleted files need
* not waste disk bandwidth being created and removed. To implement this,
* vnodes are appended to a "workitem" queue.
*
* Most pending metadata should not wait for more than ten seconds. Thus,
* file systems mounted on block devices (i.e. their metadata) are delayed
* only about a third of the time that file data is delayed. Similarly,
* directory updates are more critical, so they are delayed only about
* half the time that file data is delayed.
*
* There are SYNCER_MAXDELAY queues that are processed in a round-robin
* manner at a rate of one each second (driven off the filesystem syncer
* thread). The syncer_delayno variable indicates the next queue that is
* to be processed. Items that need to be processed soon are placed in
* this queue:
*
* syncer_workitem_pending[syncer_delayno]
*
* A delay of e.g. fifteen seconds is done by placing the request fifteen
* entries later in the queue:
*
* syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
*
* The VI_ONWORKLST flag indicates that a vnode is currently on the queue.
*/
#define SYNCER_MAXDELAY 32
typedef TAILQ_HEAD(synclist, vnode_impl) synclist_t;
static void vn_syncer_add1(struct vnode *, int);
static void sysctl_vfs_syncfs_setup(struct sysctllog **);
/*
* Defines and variables for the syncer process.
*/
int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */
time_t syncdelay = 30; /* max time to delay syncing data */
time_t filedelay = 30; /* time to delay syncing files */
time_t dirdelay = 15; /* time to delay syncing directories */
time_t metadelay = 10; /* time to delay syncing metadata */
time_t lockdelay = 1; /* time to delay if locking fails */
static kmutex_t syncer_data_lock; /* short term lock on data structs */
static int syncer_delayno = 0;
static long syncer_last;
static synclist_t * syncer_workitem_pending;
static void
vn_initialize_syncerd(void)
{
int i;
syncer_last = SYNCER_MAXDELAY + 2;
sysctl_vfs_syncfs_setup(NULL);
syncer_workitem_pending =
kmem_alloc(syncer_last * sizeof (struct synclist), KM_SLEEP);
for (i = 0; i < syncer_last; i++)
TAILQ_INIT(&syncer_workitem_pending[i]);
mutex_init(&syncer_data_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Return delay factor appropriate for the given file system. For
* WAPBL we use the sync vnode to burst out metadata updates: sync
* those file systems more frequently.
*/
static inline int
sync_delay(struct mount *mp)
{
return mp->mnt_wapbl != NULL ? metadelay : syncdelay;
}
/*
* Compute the next slot index from delay.
*/
static inline int
sync_delay_slot(int delayx)
{
if (delayx > syncer_maxdelay - 2)
delayx = syncer_maxdelay - 2;
return (syncer_delayno + delayx) % syncer_last;
}
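/*
* Example: with SYNCER_MAXDELAY == 32 (so syncer_last == 34) and
* syncer_delayno currently 30, a delay of 10 seconds selects slot
* (30 + 10) % 34 == 6; a request for 60 seconds is first clamped to
* syncer_maxdelay - 2 == 30 and lands in slot (30 + 30) % 34 == 26.
*/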
/*
* Add an item to the syncer work queue.
*/
static void
vn_syncer_add1(struct vnode *vp, int delayx)
{
synclist_t *slp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(&syncer_data_lock));
if (vp->v_iflag & VI_ONWORKLST) {
/*
* Remove in order to adjust the position of the vnode.
* Note: called from sched_sync(), which will not hold
* interlock, therefore we cannot modify v_iflag here.
*/
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_REMOVE(slp, vip, vi_synclist);
} else {
KASSERT(mutex_owned(vp->v_interlock));
vp->v_iflag |= VI_ONWORKLST;
}
vip->vi_synclist_slot = sync_delay_slot(delayx);
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_INSERT_TAIL(slp, vip, vi_synclist);
}
void
vn_syncer_add_to_worklist(struct vnode *vp, int delayx)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(vp->v_interlock));
mutex_enter(&syncer_data_lock);
vn_syncer_add1(vp, delayx);
SDT_PROBE3(vfs, syncer, worklist, vnode__add,
vp, delayx, vip->vi_synclist_slot);
mutex_exit(&syncer_data_lock);
}
/*
* Remove an item from the syncer work queue.
*/
void
vn_syncer_remove_from_worklist(struct vnode *vp)
{
synclist_t *slp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_iflag & VI_ONWORKLST) {
mutex_enter(&syncer_data_lock);
SDT_PROBE1(vfs, syncer, worklist, vnode__remove, vp);
vp->v_iflag &= ~VI_ONWORKLST;
slp = &syncer_workitem_pending[vip->vi_synclist_slot];
TAILQ_REMOVE(slp, vip, vi_synclist);
mutex_exit(&syncer_data_lock);
}
}
/*
* Add this mount point to the syncer.
*/
void
vfs_syncer_add_to_worklist(struct mount *mp)
{
static int start, incr, next;
int vdelay;
KASSERT(mutex_owned(mp->mnt_updating));
KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) == 0);
/*
* We attempt to scatter the mount points on the list
* so that they will go off at evenly distributed times
* even if all the filesystems are mounted at once.
*/
next += incr;
if (next == 0 || next > syncer_maxdelay) {
start /= 2;
incr /= 2;
if (start == 0) {
start = syncer_maxdelay / 2;
incr = syncer_maxdelay;
}
next = start;
}
mp->mnt_iflag |= IMNT_ONWORKLIST;
vdelay = sync_delay(mp);
mp->mnt_synclist_slot = vdelay > 0 ? next % vdelay : 0;
SDT_PROBE3(vfs, syncer, worklist, mount__add,
mp, vdelay, mp->mnt_synclist_slot);
}
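/*
* With SYNCER_MAXDELAY == 32, the start/incr/next scheme above hands
* successive mounts the values 16, 8, 24, 4, 12, 20, 28, 2, and so on:
* a binary subdivision that spreads sync slots evenly even when many
* file systems are mounted back to back. The slot actually used is
* next % vdelay, where vdelay is metadelay or syncdelay from
* sync_delay() above.
*/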
/*
* Remove the mount point from the syncer.
*/
void
vfs_syncer_remove_from_worklist(struct mount *mp)
{
KASSERT(mutex_owned(mp->mnt_updating));
KASSERT((mp->mnt_iflag & IMNT_ONWORKLIST) != 0);
SDT_PROBE1(vfs, syncer, worklist, mount__remove, mp);
mp->mnt_iflag &= ~IMNT_ONWORKLIST;
}
/*
* Try lazy sync, return true on success.
*/
static bool
lazy_sync_vnode(struct vnode *vp)
{
bool synced;
int error;
KASSERT(mutex_owned(&syncer_data_lock));
synced = false;
if ((error = vcache_tryvget(vp)) == 0) {
mutex_exit(&syncer_data_lock);
if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT)) == 0) {
synced = true;
SDT_PROBE1(vfs, syncer, sync, vnode__start, vp);
error = VOP_FSYNC(vp, curlwp->l_cred,
FSYNC_LAZY, 0, 0);
SDT_PROBE2(vfs, syncer, sync, vnode__done, vp, error);
vput(vp);
} else {
SDT_PROBE2(vfs, syncer, sync, vnode__fail__lock,
vp, error);
vrele(vp);
}
mutex_enter(&syncer_data_lock);
} else {
SDT_PROBE2(vfs, syncer, sync, vnode__fail__vget, vp, error);
}
return synced;
}
/*
* System filesystem synchronizer daemon.
*/
void
sched_sync(void *arg)
{
mount_iterator_t *iter;
synclist_t *slp;
struct vnode_impl *vi;
struct vnode *vp;
struct mount *mp;
time_t starttime, endtime;
int vdelay, oslot, nslot, delayx;
bool synced;
int error;
for (;;) {
starttime = time_second;
SDT_PROBE1(vfs, syncer, sync, start, starttime);
/*
* Sync mounts whose dirty time has expired.
*/
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_trynext(iter)) != NULL) {
if ((mp->mnt_iflag & IMNT_ONWORKLIST) == 0 ||
mp->mnt_synclist_slot != syncer_delayno) {
SDT_PROBE1(vfs, syncer, sync, mount__skip,
mp);
continue;
}
vdelay = sync_delay(mp);
oslot = mp->mnt_synclist_slot;
nslot = sync_delay_slot(vdelay);
mp->mnt_synclist_slot = nslot;
SDT_PROBE4(vfs, syncer, worklist, mount__update,
mp, vdelay, oslot, nslot);
SDT_PROBE1(vfs, syncer, sync, mount__start, mp);
error = VFS_SYNC(mp, MNT_LAZY, curlwp->l_cred);
SDT_PROBE2(vfs, syncer, sync, mount__done,
mp, error);
}
mountlist_iterator_destroy(iter);
mutex_enter(&syncer_data_lock);
/*
* Push files whose dirty time has expired.
*/
slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno >= syncer_last)
syncer_delayno = 0;
while ((vi = TAILQ_FIRST(slp)) != NULL) {
vp = VIMPL_TO_VNODE(vi);
synced = lazy_sync_vnode(vp);
/*
* XXX The vnode may have been recycled, in which
* case it may have a new identity.
*/
vi = TAILQ_FIRST(slp);
if (vi != NULL && VIMPL_TO_VNODE(vi) == vp) {
/*
* Put us back on the worklist. The worklist
* routine will remove us from our current
* position and then add us back in at a later
* position.
*
* Try again sooner rather than later if
* we were unable to lock the vnode. Lock
* failure should not prevent us from doing
* the sync "soon".
*
* If we did lock the vnode and yet arrive here,
* it's likely that a lazy sync is in progress
* and the vnode still has dirty metadata.
* syncdelay is mainly to get this vnode out
* of the way so we do not consider it again
* "soon" in this loop, so the delay time is
* not critical as long as it is not "soon".
* While write-back strategy is the file
* system's domain, we expect write-back to
* occur no later than syncdelay seconds
* into the future.
*/
delayx = synced ? syncdelay : lockdelay;
oslot = vi->vi_synclist_slot;
vn_syncer_add1(vp, delayx);
nslot = vi->vi_synclist_slot;
SDT_PROBE4(vfs, syncer, worklist,
vnode__update,
vp, delayx, oslot, nslot);
}
}
endtime = time_second;
SDT_PROBE2(vfs, syncer, sync, done, starttime, endtime);
/*
* If it has taken us less than a second to process the
* current work, then wait. Otherwise start right over
* again. We can still lose time if any single round
* takes more than two seconds, but it does not really
* matter as we are just trying to generally pace the
* filesystem activity.
*/
if (endtime == starttime) {
kpause("syncer", false, hz, &syncer_data_lock);
}
mutex_exit(&syncer_data_lock);
}
}
static void
sysctl_vfs_syncfs_setup(struct sysctllog **clog)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sync",
SYSCTL_DESCR("syncer options"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "delay",
SYSCTL_DESCR("max time to delay syncing data"),
NULL, 0, &syncdelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "filedelay",
SYSCTL_DESCR("time to delay syncing files"),
NULL, 0, &filedelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "dirdelay",
SYSCTL_DESCR("time to delay syncing directories"),
NULL, 0, &dirdelay, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_QUAD, "metadelay",
SYSCTL_DESCR("time to delay syncing metadata"),
NULL, 0, &metadelay, 0,
CTL_CREATE, CTL_EOL);
}
/*
* sysctl helper routine to return list of supported fstypes
*/
int
sysctl_vfs_generic_fstypes(SYSCTLFN_ARGS)
{
char bf[sizeof(((struct statvfs *)NULL)->f_fstypename)];
char *where = oldp;
struct vfsops *v;
size_t needed, left, slen;
int error, first;
if (newp != NULL)
return (EPERM);
if (namelen != 0)
return (EINVAL);
first = 1;
error = 0;
needed = 0;
left = *oldlenp;
sysctl_unlock();
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (where == NULL)
needed += strlen(v->vfs_name) + 1;
else {
memset(bf, 0, sizeof(bf));
if (first) {
strncpy(bf, v->vfs_name, sizeof(bf));
first = 0;
} else {
bf[0] = ' ';
strncpy(bf + 1, v->vfs_name, sizeof(bf) - 1);
}
bf[sizeof(bf)-1] = '\0';
slen = strlen(bf);
if (left < slen + 1)
break;
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
/* +1 to copy out the trailing NUL byte */
error = copyout(bf, where, slen + 1);
mutex_enter(&vfs_list_lock);
v->vfs_refcount--;
if (error)
break;
where += slen;
needed += slen;
left -= slen;
}
}
mutex_exit(&vfs_list_lock);
sysctl_relock();
*oldlenp = needed;
return (error);
}
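/*
* The result is one space-separated string, e.g. (illustrative)
* "ffs nfs msdos cd9660 tmpfs". It is built one name at a time so
* that vfs_list_lock can be dropped around each copyout().
*/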
int kinfo_vdebug = 1;
int kinfo_vgetfailed;
#define KINFO_VNODESLOP 10
/*
* Dump vnode list (via sysctl).
* Copyout address of vnode followed by vnode.
*/
int
sysctl_kern_vnode(SYSCTLFN_ARGS)
{
char *where = oldp;
size_t *sizep = oldlenp;
struct mount *mp;
vnode_t *vp, vbuf;
mount_iterator_t *iter;
struct vnode_iterator *marker;
char *bp = where;
char *ewhere;
int error;
if (namelen != 0)
return (EOPNOTSUPP);
if (newp != NULL)
return (EPERM);
#define VPTRSZ sizeof(vnode_t *)
#define VNODESZ sizeof(vnode_t)
if (where == NULL) {
*sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
return (0);
}
ewhere = where + *sizep;
sysctl_unlock();
mountlist_iterator_init(&iter);
while ((mp = mountlist_iterator_next(iter)) != NULL) {
vfs_vnode_iterator_init(mp, &marker);
while ((vp = vfs_vnode_iterator_next(marker, NULL, NULL))) {
if (bp + VPTRSZ + VNODESZ > ewhere) {
vrele(vp);
vfs_vnode_iterator_destroy(marker);
mountlist_iterator_destroy(iter);
sysctl_relock();
*sizep = bp - where;
return (ENOMEM);
}
memcpy(&vbuf, vp, VNODESZ);
if ((error = copyout(&vp, bp, VPTRSZ)) ||
(error = copyout(&vbuf, bp + VPTRSZ, VNODESZ))) {
vrele(vp);
vfs_vnode_iterator_destroy(marker);
mountlist_iterator_destroy(iter);
sysctl_relock();
return (error);
}
vrele(vp);
bp += VPTRSZ + VNODESZ;
}
vfs_vnode_iterator_destroy(marker);
}
mountlist_iterator_destroy(iter);
sysctl_relock();
*sizep = bp - where;
return (0);
}
/*
* Set vnode attributes to VNOVAL
*/
void
vattr_null(struct vattr *vap)
{
memset(vap, 0, sizeof(*vap));
vap->va_type = VNON;
/*
* Assign individually so that it is safe even if size and
* sign of each member are varied.
*/
vap->va_mode = VNOVAL;
vap->va_nlink = VNOVAL;
vap->va_uid = VNOVAL;
vap->va_gid = VNOVAL;
vap->va_fsid = VNOVAL;
vap->va_fileid = VNOVAL;
vap->va_size = VNOVAL;
vap->va_blocksize = VNOVAL;
vap->va_atime.tv_sec =
vap->va_mtime.tv_sec =
vap->va_ctime.tv_sec =
vap->va_birthtime.tv_sec = VNOVAL;
vap->va_atime.tv_nsec =
vap->va_mtime.tv_nsec =
vap->va_ctime.tv_nsec =
vap->va_birthtime.tv_nsec = VNOVAL;
vap->va_gen = VNOVAL;
vap->va_flags = VNOVAL;
vap->va_rdev = VNOVAL;
vap->va_bytes = VNOVAL;
}
/*
* Vnode state to string.
*/
const char *
vstate_name(enum vnode_state state)
{
switch (state) {
case VS_ACTIVE:
return "ACTIVE";
case VS_MARKER:
return "MARKER";
case VS_LOADING:
return "LOADING";
case VS_LOADED:
return "LOADED";
case VS_BLOCKED:
return "BLOCKED";
case VS_RECLAIMING:
return "RECLAIMING";
case VS_RECLAIMED:
return "RECLAIMED";
default:
return "ILLEGAL";
}
}
/*
* Print a description of a vnode (common part).
*/
static void
vprint_common(struct vnode *vp, const char *prefix,
void (*pr)(const char *, ...) __printflike(1, 2))
{
int n;
char bf[96];
const uint8_t *cp;
vnode_impl_t *vip;
const char * const vnode_tags[] = { VNODE_TAGS };
const char * const vnode_types[] = { VNODE_TYPES };
const char vnode_flagbits[] = VNODE_FLAGBITS;
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
#define ARRAY_PRINT(idx, arr) \
((unsigned int)(idx) < ARRAY_SIZE(arr) ? (arr)[(idx)] : "UNKNOWN")
vip = VNODE_TO_VIMPL(vp);
snprintb(bf, sizeof(bf),
vnode_flagbits, vp->v_iflag | vp->v_vflag | vp->v_uflag);
(*pr)("vnode %p flags %s\n", vp, bf);
(*pr)("%stag %s(%d) type %s(%d) mount %p typedata %p\n", prefix,
ARRAY_PRINT(vp->v_tag, vnode_tags), vp->v_tag,
ARRAY_PRINT(vp->v_type, vnode_types), vp->v_type,
vp->v_mount, vp->v_mountedhere);
(*pr)("%susecount %d writecount %d holdcount %d\n", prefix,
vrefcnt(vp), vp->v_writecount, vp->v_holdcnt);
(*pr)("%ssize %" PRIx64 " writesize %" PRIx64 " numoutput %d\n",
prefix, vp->v_size, vp->v_writesize, vp->v_numoutput);
(*pr)("%sdata %p lock %p\n", prefix, vp->v_data, &vip->vi_lock);
(*pr)("%sstate %s key(%p %zd)", prefix, vstate_name(vip->vi_state),
vip->vi_key.vk_mount, vip->vi_key.vk_key_len);
n = vip->vi_key.vk_key_len;
cp = vip->vi_key.vk_key;
while (n-- > 0)
(*pr)(" %02x", *cp++);
(*pr)("\n");
(*pr)("%slrulisthd %p\n", prefix, vip->vi_lrulisthd);
#undef ARRAY_PRINT
#undef ARRAY_SIZE
}
/*
* Print out a description of a vnode.
*/
void
vprint(const char *label, struct vnode *vp)
{
if (label != NULL)
printf("%s: ", label);
vprint_common(vp, "\t", printf);
if (vp->v_data != NULL) {
printf("\t");
VOP_PRINT(vp);
}
}
/*
* Given a file system name, look up the vfsops for that
* file system, or return NULL if file system isn't present
* in the kernel.
*/
struct vfsops *
vfs_getopsbyname(const char *name)
{
struct vfsops *v;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (strcmp(v->vfs_name, name) == 0)
break;
}
if (v != NULL)
v->vfs_refcount++;
mutex_exit(&vfs_list_lock);
return (v);
}
void
copy_statvfs_info(struct statvfs *sbp, const struct mount *mp)
{
const struct statvfs *mbp;
if (sbp == (mbp = &mp->mnt_stat))
return;
(void)memcpy(&sbp->f_fsidx, &mbp->f_fsidx, sizeof(sbp->f_fsidx));
sbp->f_fsid = mbp->f_fsid;
sbp->f_owner = mbp->f_owner;
sbp->f_flag = mbp->f_flag;
sbp->f_syncwrites = mbp->f_syncwrites;
sbp->f_asyncwrites = mbp->f_asyncwrites;
sbp->f_syncreads = mbp->f_syncreads;
sbp->f_asyncreads = mbp->f_asyncreads;
(void)memcpy(sbp->f_spare, mbp->f_spare, sizeof(mbp->f_spare));
(void)memcpy(sbp->f_fstypename, mbp->f_fstypename,
sizeof(sbp->f_fstypename));
(void)memcpy(sbp->f_mntonname, mbp->f_mntonname,
sizeof(sbp->f_mntonname));
(void)memcpy(sbp->f_mntfromname, mp->mnt_stat.f_mntfromname,
sizeof(sbp->f_mntfromname));
(void)memcpy(sbp->f_mntfromlabel, mp->mnt_stat.f_mntfromlabel,
sizeof(sbp->f_mntfromlabel));
sbp->f_namemax = mbp->f_namemax;
}
int
set_statvfs_info(const char *onp, int ukon, const char *fromp, int ukfrom,
const char *vfsname, struct mount *mp, struct lwp *l)
{
int error;
size_t size;
struct statvfs *sfs = &mp->mnt_stat;
int (*fun)(const void *, void *, size_t, size_t *);
(void)strlcpy(mp->mnt_stat.f_fstypename, vfsname,
sizeof(mp->mnt_stat.f_fstypename));
if (onp) {
struct cwdinfo *cwdi = l->l_proc->p_cwdi;
fun = (ukon == UIO_SYSSPACE) ? copystr : copyinstr;
if (cwdi->cwdi_rdir != NULL) {
size_t len;
char *bp;
char *path = PNBUF_GET();
bp = path + MAXPATHLEN;
*--bp = '\0';
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_rdir, rootvnode, &bp,
path, MAXPATHLEN / 2, 0, l);
rw_exit(&cwdi->cwdi_lock);
if (error) {
PNBUF_PUT(path);
return error;
}
len = strlen(bp);
if (len > sizeof(sfs->f_mntonname) - 1)
len = sizeof(sfs->f_mntonname) - 1;
(void)strncpy(sfs->f_mntonname, bp, len);
PNBUF_PUT(path);
if (len < sizeof(sfs->f_mntonname) - 1) {
error = (*fun)(onp, &sfs->f_mntonname[len],
sizeof(sfs->f_mntonname) - len - 1, &size);
if (error)
return error;
size += len;
} else {
size = len;
}
} else {
error = (*fun)(onp, &sfs->f_mntonname,
sizeof(sfs->f_mntonname) - 1, &size);
if (error)
return error;
}
(void)memset(sfs->f_mntonname + size, 0,
sizeof(sfs->f_mntonname) - size);
}
if (fromp) {
fun = (ukfrom == UIO_SYSSPACE) ? copystr : copyinstr;
error = (*fun)(fromp, sfs->f_mntfromname,
sizeof(sfs->f_mntfromname) - 1, &size);
if (error)
return error;
(void)memset(sfs->f_mntfromname + size, 0,
sizeof(sfs->f_mntfromname) - size);
}
return 0;
}
/*
* Knob to control the precision of file timestamps:
*
* 0 = seconds only; nanoseconds zeroed.
* 1 = seconds and nanoseconds, accurate within 1/HZ.
* 2 = seconds and nanoseconds, truncated to microseconds.
* >=3 = seconds and nanoseconds, maximum precision.
*/
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
int vfs_timestamp_precision __read_mostly = TSP_NSEC;
void
vfs_timestamp(struct timespec *tsp)
{
struct timeval tv;
switch (vfs_timestamp_precision) {
case TSP_SEC:
tsp->tv_sec = time_second;
tsp->tv_nsec = 0;
break;
case TSP_HZ:
getnanotime(tsp);
break;
case TSP_USEC:
microtime(&tv);
TIMEVAL_TO_TIMESPEC(&tv, tsp);
break;
case TSP_NSEC:
default:
nanotime(tsp);
break;
}
}
/*
* The purpose of this routine is to remove granularity from accmode_t,
* reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
* VADMIN and VAPPEND.
*
* If it returns 0, the caller is supposed to continue with the usual
* access checks using 'accmode' as modified by this routine. If it
* returns nonzero value, the caller is supposed to return that value
* as errno.
*
* Note that after this routine runs, accmode may be zero.
*/
int
vfs_unixify_accmode(accmode_t *accmode)
{
/*
* There is no way to specify explicit "deny" rule using
* file mode or POSIX.1e ACLs.
*/
if (*accmode & VEXPLICIT_DENY) {
*accmode = 0;
return (0);
}
/*
* None of these can be translated into usual access bits.
* Also, the common case for NFSv4 ACLs is to not contain
* either of these bits. Caller should check for VWRITE
* on the containing directory instead.
*/
if (*accmode & (VDELETE_CHILD | VDELETE))
return (EPERM);
if (*accmode & VADMIN_PERMS) {
*accmode &= ~VADMIN_PERMS;
*accmode |= VADMIN;
}
/*
* There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
* or VSYNCHRONIZE using file mode or POSIX.1e ACL.
*/
*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
return (0);
}
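/*
* Example (assuming the usual VADMIN_PERMS definition, which includes
* VWRITE_ACL): a request for VREAD | VWRITE_ACL comes back as
* VREAD | VADMIN, while any request containing VDELETE or
* VDELETE_CHILD is rejected with EPERM.
*/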
time_t rootfstime; /* recorded root fs time, if known */
void
setrootfstime(time_t t)
{
rootfstime = t;
}
static const uint8_t vttodt_tab[ ] = {
[VNON] = DT_UNKNOWN,
[VREG] = DT_REG,
[VDIR] = DT_DIR,
[VBLK] = DT_BLK,
[VCHR] = DT_CHR,
[VLNK] = DT_LNK,
[VSOCK] = DT_SOCK,
[VFIFO] = DT_FIFO,
[VBAD] = DT_UNKNOWN
};
uint8_t
vtype2dt(enum vtype vt)
{
CTASSERT(VBAD == __arraycount(vttodt_tab) - 1);
return vttodt_tab[vt];
}
int
VFS_MOUNT(struct mount *mp, const char *a, void *b, size_t *c)
{
int mpsafe = mp->mnt_iflag & IMNT_MPSAFE;
int error;
/*
* Note: The first time through, the vfs_mount function may set
* IMNT_MPSAFE, so we have to cache it on entry in order to
* avoid leaking a kernel lock.
*
* XXX Maybe the MPSAFE bit should be set in struct vfsops and
* not in struct mount.
*/
if (!mpsafe) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_mount))(mp, a, b, c);
if (!mpsafe) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_START(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_start))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_UNMOUNT(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_unmount))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_ROOT(struct mount *mp, int lktype, struct vnode **a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_root))(mp, lktype, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_QUOTACTL(struct mount *mp, struct quotactl_args *args)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_quotactl))(mp, args);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_STATVFS(struct mount *mp, struct statvfs *a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_statvfs))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_SYNC(struct mount *mp, int a, struct kauth_cred *b)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_sync))(mp, a, b);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_FHTOVP(struct mount *mp, struct fid *a, int b, struct vnode **c)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_fhtovp))(mp, a, b, c);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_VPTOFH(struct vnode *vp, struct fid *a, size_t *b)
{
int error;
if ((vp->v_vflag & VV_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(vp->v_mount->mnt_op->vfs_vptofh))(vp, a, b);
if ((vp->v_vflag & VV_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_SNAPSHOT(struct mount *mp, struct vnode *a, struct timespec *b)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_snapshot))(mp, a, b);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
int
VFS_EXTATTRCTL(struct mount *mp, int a, struct vnode *b, int c, const char *d)
{
int error;
KERNEL_LOCK(1, NULL); /* XXXSMP check ffs */
error = (*(mp->mnt_op->vfs_extattrctl))(mp, a, b, c, d);
KERNEL_UNLOCK_ONE(NULL); /* XXX */
return error;
}
int
VFS_SUSPENDCTL(struct mount *mp, int a)
{
int error;
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_LOCK(1, NULL);
}
error = (*(mp->mnt_op->vfs_suspendctl))(mp, a);
if ((mp->mnt_iflag & IMNT_MPSAFE) == 0) {
KERNEL_UNLOCK_ONE(NULL);
}
return error;
}
#if defined(DDB) || defined(DEBUGPRINT)
static const char buf_flagbits[] = BUF_FLAGBITS;
void
vfs_buf_print(struct buf *bp, int full, void (*pr)(const char *, ...))
{
char bf[1024];
(*pr)(" vp %p lblkno 0x%"PRIx64" blkno 0x%"PRIx64" rawblkno 0x%"
PRIx64 " dev 0x%x\n",
bp->b_vp, bp->b_lblkno, bp->b_blkno, bp->b_rawblkno, bp->b_dev);
snprintb(bf, sizeof(bf),
buf_flagbits, bp->b_flags | bp->b_oflags | bp->b_cflags);
(*pr)(" error %d flags %s\n", bp->b_error, bf);
(*pr)(" bufsize 0x%lx bcount 0x%lx resid 0x%lx\n",
bp->b_bufsize, bp->b_bcount, bp->b_resid);
(*pr)(" data %p saveaddr %p\n",
bp->b_data, bp->b_saveaddr);
(*pr)(" iodone %p objlock %p\n", bp->b_iodone, bp->b_objlock);
}
void
vfs_vnode_print(struct vnode *vp, int full, void (*pr)(const char *, ...))
{
uvm_object_printit(&vp->v_uobj, full, pr);
(*pr)("\n");
vprint_common(vp, "", pr);
if (full) {
struct buf *bp;
(*pr)("clean bufs:\n");
LIST_FOREACH(bp, &vp->v_cleanblkhd, b_vnbufs) {
(*pr)(" bp %p\n", bp);
vfs_buf_print(bp, full, pr);
}
(*pr)("dirty bufs:\n");
LIST_FOREACH(bp, &vp->v_dirtyblkhd, b_vnbufs) {
(*pr)(" bp %p\n", bp);
vfs_buf_print(bp, full, pr);
}
}
}
void
vfs_vnode_lock_print(void *vlock, int full, void (*pr)(const char *, ...))
{
struct mount *mp;
vnode_impl_t *vip;
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
if (&vip->vi_lock == vlock ||
VIMPL_TO_VNODE(vip)->v_interlock == vlock)
vfs_vnode_print(VIMPL_TO_VNODE(vip), full, pr);
}
}
}
void
vfs_mount_print_all(int full, void (*pr)(const char *, ...))
{
struct mount *mp;
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp))
vfs_mount_print(mp, full, pr);
}
void
vfs_mount_print(struct mount *mp, int full, void (*pr)(const char *, ...))
{
char sbuf[256];
(*pr)("vnodecovered = %p data = %p\n",
mp->mnt_vnodecovered, mp->mnt_data);
(*pr)("fs_bshift %d dev_bshift = %d\n",
mp->mnt_fs_bshift, mp->mnt_dev_bshift);
snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_flag);
(*pr)("flag = %s\n", sbuf);
snprintb(sbuf, sizeof(sbuf), __IMNT_FLAG_BITS, mp->mnt_iflag);
(*pr)("iflag = %s\n", sbuf);
(*pr)("refcnt = %d updating @ %p\n", mp->mnt_refcnt, mp->mnt_updating);
(*pr)("statvfs cache:\n");
(*pr)("\tbsize = %lu\n", mp->mnt_stat.f_bsize);
(*pr)("\tfrsize = %lu\n", mp->mnt_stat.f_frsize);
(*pr)("\tiosize = %lu\n", mp->mnt_stat.f_iosize);
(*pr)("\tblocks = %"PRIu64"\n", mp->mnt_stat.f_blocks);
(*pr)("\tbfree = %"PRIu64"\n", mp->mnt_stat.f_bfree);
(*pr)("\tbavail = %"PRIu64"\n", mp->mnt_stat.f_bavail);
(*pr)("\tbresvd = %"PRIu64"\n", mp->mnt_stat.f_bresvd);
(*pr)("\tfiles = %"PRIu64"\n", mp->mnt_stat.f_files);
(*pr)("\tffree = %"PRIu64"\n", mp->mnt_stat.f_ffree);
(*pr)("\tfavail = %"PRIu64"\n", mp->mnt_stat.f_favail);
(*pr)("\tfresvd = %"PRIu64"\n", mp->mnt_stat.f_fresvd);
(*pr)("\tf_fsidx = { 0x%"PRIx32", 0x%"PRIx32" }\n",
mp->mnt_stat.f_fsidx.__fsid_val[0],
mp->mnt_stat.f_fsidx.__fsid_val[1]);
(*pr)("\towner = %"PRIu32"\n", mp->mnt_stat.f_owner);
(*pr)("\tnamemax = %lu\n", mp->mnt_stat.f_namemax);
snprintb(sbuf, sizeof(sbuf), __MNT_FLAG_BITS, mp->mnt_stat.f_flag);
(*pr)("\tflag = %s\n", sbuf);
(*pr)("\tsyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_syncwrites);
(*pr)("\tasyncwrites = %" PRIu64 "\n", mp->mnt_stat.f_asyncwrites);
(*pr)("\tsyncreads = %" PRIu64 "\n", mp->mnt_stat.f_syncreads);
(*pr)("\tasyncreads = %" PRIu64 "\n", mp->mnt_stat.f_asyncreads);
(*pr)("\tfstypename = %s\n", mp->mnt_stat.f_fstypename);
(*pr)("\tmntonname = %s\n", mp->mnt_stat.f_mntonname);
(*pr)("\tmntfromname = %s\n", mp->mnt_stat.f_mntfromname);
{
int cnt = 0;
vnode_t *vp;
vnode_impl_t *vip;
(*pr)("locked vnodes =");
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (VOP_ISLOCKED(vp)) {
if ((++cnt % 6) == 0) {
(*pr)(" %p,\n\t", vp);
} else {
(*pr)(" %p,", vp);
}
}
}
(*pr)("\n");
}
if (full) {
int cnt = 0;
vnode_t *vp;
vnode_impl_t *vip;
(*pr)("all vnodes =");
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (!TAILQ_NEXT(vip, vi_mntvnodes)) {
(*pr)(" %p", vp);
} else if ((++cnt % 6) == 0) {
(*pr)(" %p,\n\t", vp);
} else {
(*pr)(" %p,", vp);
}
}
(*pr)("\n");
}
}
/*
* List all of the locked vnodes in the system.
*/
void printlockedvnodes(void);
void
printlockedvnodes(void)
{
struct mount *mp;
vnode_t *vp;
vnode_impl_t *vip;
printf("Locked vnodes\n");
for (mp = _mountlist_next(NULL); mp; mp = _mountlist_next(mp)) {
TAILQ_FOREACH(vip, &mp->mnt_vnodelist, vi_mntvnodes) {
vp = VIMPL_TO_VNODE(vip);
if (VOP_ISLOCKED(vp))
vprint(NULL, vp);
}
}
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: ufs_extattr.c,v 1.55 2024/02/10 18:43:53 andvar Exp $ */
/*-
* Copyright (c) 1999-2002 Robert N. M. Watson
* Copyright (c) 2002-2003 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed by Robert Watson for the TrustedBSD Project.
*
* This software was developed for the FreeBSD Project in part by Network
* Associates Laboratories, the Security Research Division of Network
* Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
* as part of the DARPA CHATS research program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
/*
* Support for file system extended attributes on the UFS1 file system.
*
* Extended attributes are defined in the form name=value, where name is
* a nul-terminated string in the style of a file name, and value is a
* binary blob of zero or more bytes. The UFS1 extended attribute service
* layers support for extended attributes onto a backing file, in the style
* of the quota implementation, meaning that it requires no underlying format
* changes to the file system. This design choice exchanges simplicity,
* usability, and easy deployment for performance.
*/
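/*
 * Rough sketch of the backing file layout used below (see
 * ufs_extattr_get_header() and ufs_extattr_set()): a struct
 * ufs_extattr_fileheader at offset 0, followed by one fixed-size slot
 * per inode, each slot holding a struct ufs_extattr_header plus up to
 * uef_size bytes of attribute data, indexed by inode number.
 */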
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_extattr.c,v 1.55 2024/02/10 18:43:53 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ffs.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/reboot.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <sys/fcntl.h>
#include <sys/lwp.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/lock.h>
#include <sys/dirent.h>
#include <sys/extattr.h>
#include <sys/sysctl.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
int ufs_extattr_sync = 1;
int ufs_extattr_autocreate = 1024;
static int ufs_extattr_valid_attrname(int attrnamespace,
const char *attrname);
static int ufs_extattr_enable_with_open(struct ufsmount *ump,
struct vnode *vp, int attrnamespace, const char *attrname,
struct lwp *l);
static int ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct vnode *backing_vnode,
struct lwp *l);
static int ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct lwp *l);
static int ufs_extattr_get(struct vnode *vp, int attrnamespace,
const char *name, struct uio *uio, size_t *size,
kauth_cred_t cred, struct lwp *l);
static int ufs_extattr_list(struct vnode *vp, int attrnamespace,
struct uio *uio, size_t *size, int flag,
kauth_cred_t cred, struct lwp *l);
static int ufs_extattr_set(struct vnode *vp, int attrnamespace,
const char *name, struct uio *uio, kauth_cred_t cred,
struct lwp *l);
static int ufs_extattr_rm(struct vnode *vp, int attrnamespace,
const char *name, kauth_cred_t cred, struct lwp *l);
static struct ufs_extattr_list_entry *ufs_extattr_find_attr(struct ufsmount *,
int, const char *);
static int ufs_extattr_get_header(struct vnode *,
struct ufs_extattr_list_entry *,
struct ufs_extattr_header *, off_t *);
/*
* Per-FS attribute lock protecting attribute operations.
* XXX Right now there is a lot of lock contention due to having a single
* lock per-FS; really, this should be far more fine-grained.
*/
static void
ufs_extattr_uepm_lock(struct ufsmount *ump)
{
/*
* XXX This needs to be recursive for the following reasons:
* - it is taken in ufs_extattr_vnode_inactive
* - which is called from VOP_INACTIVE
* - which can be triggered by any vrele, vput, or vn_close
* - several of these can happen while it's held
*/
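	/*
	 * The recursion is implemented with uepm_lockcnt rather than a
	 * recursive mutex: nested acquisitions by the owning thread only
	 * adjust the count, and only the outermost enter/exit touches the
	 * mutex itself.
	 */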
if (mutex_owned(&ump->um_extattr.uepm_lock)) {
ump->um_extattr.uepm_lockcnt++;
return;
}
mutex_enter(&ump->um_extattr.uepm_lock);
}
static void
ufs_extattr_uepm_unlock(struct ufsmount *ump)
{
if (ump->um_extattr.uepm_lockcnt != 0) {
KASSERT(mutex_owned(&ump->um_extattr.uepm_lock));
ump->um_extattr.uepm_lockcnt--;
return;
}
mutex_exit(&ump->um_extattr.uepm_lock);
}
/*-
* Determine whether the name passed is a valid name for an actual
* attribute.
*
* Invalid currently consists of:
* NULL pointer for attrname
* zero-length attrname (used to retrieve application attribute list)
*/
static int
ufs_extattr_valid_attrname(int attrnamespace, const char *attrname)
{
if (attrname == NULL)
return 0;
if (strlen(attrname) == 0)
return 0;
return 1;
}
/*
 * Autocreate backing storage for an attribute.
 */
static int
ufs_extattr_autocreate_attr(struct vnode *vp, int attrnamespace,
const char *attrname, struct lwp *l, struct ufs_extattr_list_entry **uelep)
{
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct vnode *backing_vp;
struct pathbuf *pb;
char *path;
struct ufs_extattr_fileheader uef;
struct ufs_extattr_list_entry *uele;
int error;
path = PNBUF_GET();
/*
* We only support system and user namespace autocreation
*/
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
(void)snprintf(path, PATH_MAX, "%s/%s/%s/%s",
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR,
UFS_EXTATTR_SUBDIR_SYSTEM, attrname);
break;
case EXTATTR_NAMESPACE_USER:
(void)snprintf(path, PATH_MAX, "%s/%s/%s/%s",
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR,
UFS_EXTATTR_SUBDIR_USER, attrname);
break;
default:
PNBUF_PUT(path);
*uelep = NULL;
return EINVAL;
break;
}
/*
* Release extended attribute mount lock, otherwise
* we can deadlock with another thread that would lock
* vp after we unlock it below, and call
* ufs_extattr_uepm_lock(ump), for instance
* in ufs_getextattr().
*/
ufs_extattr_uepm_unlock(ump);
/*
* XXX unlock/lock should only be done when setting extattr
* on backing store or one of its parent directory
* including root, but we always do it for now.
*/
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
VOP_UNLOCK(vp);
pb = pathbuf_create(path);
/*
* Since we do not hold ufs_extattr_uepm_lock anymore,
* another thread may race with us for backend creation,
* but only one can succeed here thanks to O_EXCL.
*
* backing_vp is the backing store.
*/
error = vn_open(NULL, pb, 0, O_CREAT|O_EXCL|O_RDWR, 0600,
&backing_vp, NULL, NULL);
/*
* Reacquire the lock on the vnode
*/
KASSERT(VOP_ISLOCKED(vp) == 0);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
ufs_extattr_uepm_lock(ump);
if (error != 0) {
pathbuf_destroy(pb);
PNBUF_PUT(path);
*uelep = NULL;
return error;
}
KASSERT(backing_vp != NULL);
KASSERT(VOP_ISLOCKED(backing_vp) == LK_EXCLUSIVE);
pathbuf_destroy(pb);
PNBUF_PUT(path);
uef.uef_magic = UFS_EXTATTR_MAGIC;
uef.uef_version = UFS_EXTATTR_VERSION;
uef.uef_size = ufs_extattr_autocreate;
error = vn_rdwr(UIO_WRITE, backing_vp, &uef, sizeof(uef), 0,
UIO_SYSSPACE, IO_NODELOCKED|IO_APPEND,
l->l_cred, NULL, l);
VOP_UNLOCK(backing_vp);
if (error != 0) {
printf("%s: write uef header failed for `%s' (%d)\n",
__func__, attrname, error);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return error;
}
/*
* Now enable attribute.
*/
	error = ufs_extattr_enable(ump, attrnamespace, attrname,
	    backing_vp, l);
KASSERT(VOP_ISLOCKED(backing_vp) == 0);
if (error != 0) {
printf("%s: enable `%s' failed (%d)\n",
__func__, attrname, error);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return error;
}
uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
if (uele == NULL) {
printf("%s: attribute `%s' created but not found!\n",
__func__, attrname);
vn_close(backing_vp, FREAD|FWRITE, l->l_cred);
*uelep = NULL;
return ESRCH; /* really internal error */
}
printf("%s: EA backing store autocreated for %s\n",
mp->mnt_stat.f_mntonname, attrname);
*uelep = uele;
return 0;
}
/*
* Locate an attribute given a name and mountpoint.
* Must be holding uepm lock for the mount point.
*/
static struct ufs_extattr_list_entry *
ufs_extattr_find_attr(struct ufsmount *ump, int attrnamespace,
const char *attrname)
{
struct ufs_extattr_list_entry *search_attribute;
for (search_attribute = LIST_FIRST(&ump->um_extattr.uepm_list);
search_attribute != NULL;
search_attribute = LIST_NEXT(search_attribute, uele_entries)) {
if (!(strncmp(attrname, search_attribute->uele_attrname,
UFS_EXTATTR_MAXEXTATTRNAME)) &&
(attrnamespace == search_attribute->uele_attrnamespace)) {
return search_attribute;
}
}
	return NULL;
}
/*
* Initialize per-FS structures supporting extended attributes. Do not
* start extended attributes yet.
*/
void
ufs_extattr_uepm_init(struct ufs_extattr_per_mount *uepm)
{
uepm->uepm_flags = 0;
uepm->uepm_lockcnt = 0;
LIST_INIT(&uepm->uepm_list);
mutex_init(&uepm->uepm_lock, MUTEX_DEFAULT, IPL_NONE);
uepm->uepm_flags |= UFS_EXTATTR_UEPM_INITIALIZED;
}
/*
* Destroy per-FS structures supporting extended attributes. Assumes
* that EAs have already been stopped, and will panic if not.
*/
void
ufs_extattr_uepm_destroy(struct ufs_extattr_per_mount *uepm)
{
if (!(uepm->uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
panic("ufs_extattr_uepm_destroy: not initialized");
if ((uepm->uepm_flags & UFS_EXTATTR_UEPM_STARTED))
panic("ufs_extattr_uepm_destroy: called while still started");
/*
* It's not clear that either order for the next three lines is
* ideal, and it should never be a problem if this is only called
* during unmount, and with vfs_busy().
*/
uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
uepm->uepm_flags &= ~UFS_EXTATTR_UEPM_INITIALIZED;
mutex_destroy(&uepm->uepm_lock);
}
/*
* Start extended attribute support on an FS.
*/
int
ufs_extattr_start(struct mount *mp, struct lwp *l)
{
struct ufsmount *ump;
int error = 0;
ump = VFSTOUFS(mp);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
ufs_extattr_uepm_init(&ump->um_extattr);
ufs_extattr_uepm_lock(ump);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED)) {
error = EOPNOTSUPP;
goto unlock;
}
if (ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED) {
error = EBUSY;
goto unlock;
}
ump->um_extattr.uepm_flags |= UFS_EXTATTR_UEPM_STARTED;
ump->um_extattr.uepm_ucred = l->l_cred;
kauth_cred_hold(ump->um_extattr.uepm_ucred);
unlock:
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Helper routine: given a locked parent directory and filename, return
* the locked vnode of the inode associated with the name. Will not
* follow symlinks, may return any type of vnode. Lock on parent will
* be released even in the event of a failure. In the event that the
* target is the parent (i.e., "."), there will be two references and
* one lock, requiring the caller to possibly special-case.
*/
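/*
 * Note that in the "." case described above the caller gets start_dvp
 * back with an extra reference and the same lock held;
 * ufs_extattr_iterate_directory() below handles that case by dropping
 * the extra reference with vrele().
 */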
static int
ufs_extattr_lookup(struct vnode *start_dvp, int lockparent,
const char *dirname,
struct vnode **vp, struct lwp *l)
{
struct vop_lookup_v2_args vargs;
struct componentname cnp;
struct vnode *target_vp;
char *pnbuf;
int error;
KASSERT(VOP_ISLOCKED(start_dvp) == LK_EXCLUSIVE);
pnbuf = PNBUF_GET();
memset(&cnp, 0, sizeof(cnp));
cnp.cn_nameiop = LOOKUP;
cnp.cn_flags = ISLASTCN | lockparent;
cnp.cn_cred = l->l_cred;
cnp.cn_nameptr = pnbuf;
error = copystr(dirname, pnbuf, MAXPATHLEN, &cnp.cn_namelen);
if (error) {
if (lockparent == 0) {
VOP_UNLOCK(start_dvp);
}
PNBUF_PUT(pnbuf);
printf("%s: copystr failed (%d)\n", __func__, error);
return error;
}
cnp.cn_namelen--; /* trim nul termination */
vargs.a_desc = NULL;
vargs.a_dvp = start_dvp;
vargs.a_vpp = &target_vp;
vargs.a_cnp = &cnp;
error = ufs_lookup(&vargs);
PNBUF_PUT(pnbuf);
if (error) {
if (lockparent == 0) {
VOP_UNLOCK(start_dvp);
}
return error;
}
#if 0
if (target_vp == start_dvp)
panic("%s: target_vp == start_dvp", __func__);
#endif
if (target_vp != start_dvp) {
error = vn_lock(target_vp, LK_EXCLUSIVE);
if (lockparent == 0)
VOP_UNLOCK(start_dvp);
if (error) {
vrele(target_vp);
return error;
}
}
KASSERT(VOP_ISLOCKED(target_vp) == LK_EXCLUSIVE);
*vp = target_vp;
return 0;
}
/*
* Enable an EA using the passed filesystem, backing vnode, attribute name,
* namespace, and proc. Will perform a VOP_OPEN() on the vp, so expects vp
* to be locked when passed in. The vnode will be returned unlocked,
* regardless of success/failure of the function. As a result, the caller
* will always need to vrele(), but not vput().
*/
static int
ufs_extattr_enable_with_open(struct ufsmount *ump, struct vnode *vp,
int attrnamespace, const char *attrname, struct lwp *l)
{
int error;
error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred);
if (error) {
printf("%s: VOP_OPEN(): failed (%d)\n", __func__, error);
VOP_UNLOCK(vp);
return error;
}
mutex_enter(vp->v_interlock);
vp->v_writecount++;
mutex_exit(vp->v_interlock);
vref(vp);
VOP_UNLOCK(vp);
error = ufs_extattr_enable(ump, attrnamespace, attrname, vp, l);
if (error != 0)
vn_close(vp, FREAD|FWRITE, l->l_cred);
return error;
}
/*
* Given a locked directory vnode, iterate over the names in the directory
* and use ufs_extattr_lookup() to retrieve locked vnodes of potential
* attribute files. Then invoke ufs_extattr_enable_with_open() on each
* to attempt to start the attribute. Leaves the directory locked on
* exit.
*/
static int
ufs_extattr_iterate_directory(struct ufsmount *ump, struct vnode *dvp,
int attrnamespace, struct lwp *l)
{
struct vop_readdir_args vargs;
struct statvfs *sbp = &ump->um_mountp->mnt_stat;
struct dirent *dp, *edp;
struct vnode *attr_vp;
struct uio auio;
struct iovec aiov;
char *dirbuf;
int error, eofflag = 0;
if (dvp->v_type != VDIR)
return ENOTDIR;
dirbuf = kmem_alloc(UFS_DIRBLKSIZ, KM_SLEEP);
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
auio.uio_offset = 0;
UIO_SETUP_SYSSPACE(&auio);
vargs.a_desc = NULL;
vargs.a_vp = dvp;
vargs.a_uio = &auio;
vargs.a_cred = l->l_cred;
vargs.a_eofflag = &eofflag;
vargs.a_ncookies = NULL;
vargs.a_cookies = NULL;
while (!eofflag) {
auio.uio_resid = UFS_DIRBLKSIZ;
aiov.iov_base = dirbuf;
aiov.iov_len = UFS_DIRBLKSIZ;
error = ufs_readdir(&vargs);
if (error) {
printf("%s: ufs_readdir (%d)\n", __func__, error);
return error;
}
/*
* XXXRW: While in UFS, we always get UFS_DIRBLKSIZ returns from
* the directory code on success, on other file systems this
* may not be the case. For portability, we should check the
* read length on return from ufs_readdir().
*/
edp = (struct dirent *)&dirbuf[UFS_DIRBLKSIZ];
for (dp = (struct dirent *)dirbuf; dp < edp; ) {
if (dp->d_reclen == 0)
break;
/* Skip "." and ".." */
if (dp->d_name[0] == '.' &&
(dp->d_name[1] == '\0' ||
(dp->d_name[1] == '.' && dp->d_name[2] == '\0')))
goto next;
error = ufs_extattr_lookup(dvp, LOCKPARENT,
dp->d_name, &attr_vp, l);
if (error == ENOENT) {
goto next; /* keep silent */
} else if (error) {
printf("%s: lookup `%s' (%d)\n", __func__,
dp->d_name, error);
} else if (attr_vp == dvp) {
vrele(attr_vp);
} else if (attr_vp->v_type != VREG) {
vput(attr_vp);
} else {
error = ufs_extattr_enable_with_open(ump,
attr_vp, attrnamespace, dp->d_name, l);
vrele(attr_vp);
if (error) {
printf("%s: enable `%s' (%d)\n",
__func__, dp->d_name, error);
} else if (bootverbose) {
printf("%s: EA %s loaded\n",
sbp->f_mntonname, dp->d_name);
}
}
next:
dp = (struct dirent *) ((char *)dp + dp->d_reclen);
if (dp >= edp)
break;
}
}
kmem_free(dirbuf, UFS_DIRBLKSIZ);
return 0;
}
static int
ufs_extattr_subdir(struct lwp *l, struct mount *mp, struct vnode *attr_dvp,
const char *subdir, int namespace)
{
int error;
struct vnode *attr_sub;
error = ufs_extattr_lookup(attr_dvp, LOCKPARENT, subdir, &attr_sub, l);
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
if (error) {
printf("%s: Can't find `%s/%s/%s' (%d)\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR, subdir, error);
return error;
}
KASSERT(VOP_ISLOCKED(attr_sub) == LK_EXCLUSIVE);
error = ufs_extattr_iterate_directory(VFSTOUFS(mp),
attr_sub, namespace, l);
if (error) {
printf("%s: ufs_extattr_iterate_directory `%s/%s/%s' (%d)\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR, subdir, error);
}
KASSERT(VOP_ISLOCKED(attr_sub) == LK_EXCLUSIVE);
vput(attr_sub);
return error;
}
/*
* Auto-start of extended attributes, to be executed (optionally) at
* mount-time.
*/
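/*
 * In terms of the macros used below, the expected on-disk layout is a
 * UFS_EXTATTR_FSROOTSUBDIR directory at the file system root containing
 * UFS_EXTATTR_SUBDIR_SYSTEM and UFS_EXTATTR_SUBDIR_USER subdirectories,
 * each holding one backing file per attribute name.
 */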
int
ufs_extattr_autostart(struct mount *mp, struct lwp *l)
{
struct vnode *rvp, *attr_dvp;
int error;
/*
* Does UFS_EXTATTR_FSROOTSUBDIR exist off the filesystem root?
* If so, automatically start EA's.
*/
error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp);
if (error) {
printf("%s: VFS_ROOT() (%d)\n", __func__, error);
return error;
}
KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
error = ufs_extattr_lookup(rvp, 0,
UFS_EXTATTR_FSROOTSUBDIR, &attr_dvp, l);
if (error) {
/* rvp ref'd but now unlocked */
KASSERT(VOP_ISLOCKED(rvp) == 0);
vrele(rvp);
printf("%s: lookup `%s/%s' (%d)\n", __func__,
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, error);
return error;
}
if (rvp == attr_dvp) {
/* Should never happen. */
KASSERT(VOP_ISLOCKED(rvp) == LK_EXCLUSIVE);
vrele(attr_dvp);
vput(rvp);
printf("%s: `/' == `%s/%s' (%d)\n", __func__,
mp->mnt_stat.f_mntonname, UFS_EXTATTR_FSROOTSUBDIR, EINVAL);
return EINVAL;
}
KASSERT(VOP_ISLOCKED(rvp) == 0);
vrele(rvp);
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
if (attr_dvp->v_type != VDIR) {
printf("%s: `%s/%s' is not a directory\n",
__func__, mp->mnt_stat.f_mntonname,
UFS_EXTATTR_FSROOTSUBDIR);
goto return_vput_attr_dvp;
}
error = ufs_extattr_start(mp, l);
if (error) {
printf("%s: ufs_extattr_start failed (%d)\n", __func__,
error);
goto return_vput_attr_dvp;
}
/*
* Look for two subdirectories: UFS_EXTATTR_SUBDIR_SYSTEM,
* UFS_EXTATTR_SUBDIR_USER. For each, iterate over the sub-directory,
* and start with appropriate type. Failures in either don't
* result in an over-all failure. attr_dvp is left locked to
* be cleaned up on exit.
*/
error = ufs_extattr_subdir(l, mp, attr_dvp, UFS_EXTATTR_SUBDIR_SYSTEM,
EXTATTR_NAMESPACE_SYSTEM);
error = ufs_extattr_subdir(l, mp, attr_dvp, UFS_EXTATTR_SUBDIR_USER,
EXTATTR_NAMESPACE_USER);
/* Mask startup failures in sub-directories. */
error = 0;
return_vput_attr_dvp:
KASSERT(VOP_ISLOCKED(attr_dvp) == LK_EXCLUSIVE);
vput(attr_dvp);
return error;
}
/*
* Stop extended attribute support on an FS.
*/
void
ufs_extattr_stop(struct mount *mp, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct ufsmount *ump = VFSTOUFS(mp);
ufs_extattr_uepm_lock(ump);
/*
* If we haven't been started, no big deal. Just short-circuit
* the processing work.
*/
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
goto unlock;
}
while (LIST_FIRST(&ump->um_extattr.uepm_list) != NULL) {
uele = LIST_FIRST(&ump->um_extattr.uepm_list);
ufs_extattr_disable(ump, uele->uele_attrnamespace,
uele->uele_attrname, l);
}
ump->um_extattr.uepm_flags &= ~UFS_EXTATTR_UEPM_STARTED;
kauth_cred_free(ump->um_extattr.uepm_ucred);
ump->um_extattr.uepm_ucred = NULL;
unlock:
ufs_extattr_uepm_unlock(ump);
}
/*
* Enable a named attribute on the specified filesystem; provide an
* unlocked backing vnode to hold the attribute data.
*/
static int
ufs_extattr_enable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct vnode *backing_vnode, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct iovec aiov;
struct uio auio;
int error = 0;
if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
return EINVAL;
if (backing_vnode->v_type != VREG)
return EINVAL;
attribute = kmem_zalloc(sizeof(*attribute), KM_SLEEP);
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED)) {
error = EOPNOTSUPP;
goto free_exit;
}
if (ufs_extattr_find_attr(ump, attrnamespace, attrname)) {
error = EEXIST;
goto free_exit;
}
strncpy(attribute->uele_attrname, attrname,
UFS_EXTATTR_MAXEXTATTRNAME);
attribute->uele_attrnamespace = attrnamespace;
memset(&attribute->uele_fileheader, 0,
sizeof(struct ufs_extattr_fileheader));
attribute->uele_backing_vnode = backing_vnode;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
aiov.iov_base = (void *) &attribute->uele_fileheader;
aiov.iov_len = sizeof(struct ufs_extattr_fileheader);
auio.uio_resid = sizeof(struct ufs_extattr_fileheader);
auio.uio_offset = (off_t) 0;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
vn_lock(backing_vnode, LK_SHARED | LK_RETRY);
error = VOP_READ(backing_vnode, &auio, IO_NODELOCKED,
ump->um_extattr.uepm_ucred);
if (error)
goto unlock_free_exit;
if (auio.uio_resid != 0) {
printf("%s: malformed attribute header\n", __func__);
error = EINVAL;
goto unlock_free_exit;
}
/*
* Try to determine the byte order of the attribute file.
*/
if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
attribute->uele_flags |= UELE_F_NEEDSWAP;
attribute->uele_fileheader.uef_magic =
ufs_rw32(attribute->uele_fileheader.uef_magic,
UELE_NEEDSWAP(attribute));
if (attribute->uele_fileheader.uef_magic != UFS_EXTATTR_MAGIC) {
printf("%s: invalid attribute header magic\n",
__func__);
error = EINVAL;
goto unlock_free_exit;
}
}
attribute->uele_fileheader.uef_version =
ufs_rw32(attribute->uele_fileheader.uef_version,
UELE_NEEDSWAP(attribute));
attribute->uele_fileheader.uef_size =
ufs_rw32(attribute->uele_fileheader.uef_size,
UELE_NEEDSWAP(attribute));
if (attribute->uele_fileheader.uef_version != UFS_EXTATTR_VERSION) {
printf("%s: incorrect attribute header version %d != %d\n",
__func__, attribute->uele_fileheader.uef_version,
UFS_EXTATTR_VERSION);
error = EINVAL;
goto unlock_free_exit;
}
LIST_INSERT_HEAD(&ump->um_extattr.uepm_list, attribute, uele_entries);
VOP_UNLOCK(backing_vnode);
return 0;
unlock_free_exit:
VOP_UNLOCK(backing_vnode);
free_exit:
kmem_free(attribute, sizeof(*attribute));
return error;
}
/*
* Disable extended attribute support on an FS.
*/
static int
ufs_extattr_disable(struct ufsmount *ump, int attrnamespace,
const char *attrname, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
int error = 0;
if (!ufs_extattr_valid_attrname(attrnamespace, attrname))
return EINVAL;
uele = ufs_extattr_find_attr(ump, attrnamespace, attrname);
if (!uele)
return ENODATA;
LIST_REMOVE(uele, uele_entries);
error = vn_close(uele->uele_backing_vnode, FREAD|FWRITE, l->l_cred);
kmem_free(uele, sizeof(*uele));
return error;
}
/*
* VFS call to manage extended attributes in UFS. If filename_vp is
* non-NULL, it must be passed in locked, and regardless of errors in
* processing, will be unlocked.
*/
int
ufs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
int attrnamespace, const char *attrname)
{
struct lwp *l = curlwp;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
/*
* Only privileged processes can configure extended attributes.
*/
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_EXTATTR,
0, mp, NULL, NULL);
if (error) {
if (filename_vp != NULL)
VOP_UNLOCK(filename_vp);
return error;
}
switch(cmd) {
case UFS_EXTATTR_CMD_START:
case UFS_EXTATTR_CMD_STOP:
case UFS_EXTATTR_CMD_ENABLE:
case UFS_EXTATTR_CMD_DISABLE:
if (filename_vp != NULL) {
VOP_UNLOCK(filename_vp);
return EINVAL;
}
if (attrname != NULL)
return EINVAL;
break;
default:
return EINVAL;
}
switch(cmd) {
case UFS_EXTATTR_CMD_START:
error = ufs_extattr_autostart(mp, l);
return error;
case UFS_EXTATTR_CMD_STOP:
ufs_extattr_stop(mp, l);
return 0;
case UFS_EXTATTR_CMD_ENABLE:
/*
* ufs_extattr_enable_with_open() will always unlock the
* vnode, regardless of failure.
*/
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_enable_with_open(ump, filename_vp,
attrnamespace, attrname, l);
ufs_extattr_uepm_unlock(ump);
return error;
case UFS_EXTATTR_CMD_DISABLE:
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_disable(ump, attrnamespace, attrname, l);
ufs_extattr_uepm_unlock(ump);
return error;
default:
return EINVAL;
}
}
/*
* Read extended attribute header for a given vnode and attribute.
* Backing vnode should be locked and unlocked by caller.
*/
static int
ufs_extattr_get_header(struct vnode *vp, struct ufs_extattr_list_entry *uele,
struct ufs_extattr_header *ueh, off_t *bap)
{
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct inode *ip = VTOI(vp);
off_t base_offset;
struct iovec aiov;
struct uio aio;
int error;
/*
* Find base offset of header in file based on file header size, and
* data header size + maximum data size, indexed by inode number.
*/
base_offset = sizeof(struct ufs_extattr_fileheader) +
ip->i_number * (sizeof(struct ufs_extattr_header) +
uele->uele_fileheader.uef_size);
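	/*
	 * For example (hypothetical numbers): with uef_size of 1024 and
	 * inode number 5, this inode's attribute header lives at byte
	 * offset sizeof(fileheader) + 5 * (sizeof(header) + 1024).
	 */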
/*
* Read in the data header to see if the data is defined, and if so
* how much.
*/
memset(ueh, 0, sizeof(struct ufs_extattr_header));
aiov.iov_base = ueh;
aiov.iov_len = sizeof(struct ufs_extattr_header);
aio.uio_iov = &aiov;
aio.uio_iovcnt = 1;
aio.uio_rw = UIO_READ;
aio.uio_offset = base_offset;
aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&aio);
error = VOP_READ(uele->uele_backing_vnode, &aio,
IO_NODELOCKED, ump->um_extattr.uepm_ucred);
if (error)
return error;
/*
* Attribute headers are kept in file system byte order.
* XXX What about the blob of data?
*/
ueh->ueh_flags = ufs_rw32(ueh->ueh_flags, UELE_NEEDSWAP(uele));
ueh->ueh_len = ufs_rw32(ueh->ueh_len, UELE_NEEDSWAP(uele));
ueh->ueh_i_gen = ufs_rw32(ueh->ueh_i_gen, UELE_NEEDSWAP(uele));
/* Defined? */
if ((ueh->ueh_flags & UFS_EXTATTR_ATTR_FLAG_INUSE) == 0)
return ENODATA;
/* Valid for the current inode generation? */
if (ueh->ueh_i_gen != ip->i_gen) {
/*
* The inode itself has a different generation number
* than the uele data. For now, the best solution
* is to coerce this to undefined, and let it get cleaned
* up by the next write or extattrctl clean.
*/
printf("%s: %s: inode gen inconsistency (%u, %jd)\n",
__func__, mp->mnt_stat.f_mntonname, ueh->ueh_i_gen,
(intmax_t)ip->i_gen);
return ENODATA;
}
/* Local size consistency check. */
if (ueh->ueh_len > uele->uele_fileheader.uef_size)
return ENXIO;
/* Return base offset */
if (bap != NULL)
*bap = base_offset;
return 0;
}
/*
* Vnode operation to retrieve a named extended attribute.
*/
int
ufs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_get(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_uio, ap->a_size, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with retrieving a named attribute--assumes that
* the attribute lock has already been grabbed.
*/
static int
ufs_extattr_get(struct vnode *vp, int attrnamespace, const char *name,
struct uio *uio, size_t *size, kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
off_t base_offset;
size_t len, old_len;
int error = 0;
if (strlen(name) == 0)
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VREAD);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute)
return ENODATA;
/*
* Allow only offsets of zero to encourage the read/replace
* extended attribute semantic. Otherwise we can't guarantee
* atomicity, as we don't provide locks for extended attributes.
*/
if (uio != NULL && uio->uio_offset != 0)
return ENXIO;
/*
* Don't need to get a lock on the backing file if the getattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode, LK_SHARED | LK_RETRY);
error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
if (error)
goto vopunlock_exit;
/* Return full data size if caller requested it. */
if (size != NULL)
*size = ueh.ueh_len;
/* Return data if the caller requested it. */
if (uio != NULL) {
/* Allow for offset into the attribute data. */
uio->uio_offset = base_offset + sizeof(struct
ufs_extattr_header);
/*
* Figure out maximum to transfer -- use buffer size and
* local data limit.
*/
len = MIN(uio->uio_resid, ueh.ueh_len);
old_len = uio->uio_resid;
uio->uio_resid = len;
error = VOP_READ(attribute->uele_backing_vnode, uio,
IO_NODELOCKED, ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
uio->uio_resid = old_len - (len - uio->uio_resid);
}
vopunlock_exit:
if (uio != NULL)
uio->uio_offset = 0;
if (attribute->uele_backing_vnode != vp)
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Vnode operation to list extended attribute for a vnode
*/
int
ufs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
INOUT struct uio *a_uio;
OUT size_t *a_size;
IN int flag;
IN kauth_cred_t a_cred;
struct proc *a_p;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_list(ap->a_vp, ap->a_attrnamespace,
ap->a_uio, ap->a_size, ap->a_flag, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with retrieving list of attributes--assumes that
* the attribute lock has already been grabbed.
*/
static int
ufs_extattr_list(struct vnode *vp, int attrnamespace,
struct uio *uio, size_t *size, int flag,
kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
size_t listsize = 0;
int error = 0;
/*
* XXX: We can move this inside the loop and iterate on individual
* attributes.
*/
error = extattr_check_cred(vp, attrnamespace, cred, VREAD);
if (error)
return error;
LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries) {
unsigned char attrnamelen;
if (uele->uele_attrnamespace != attrnamespace)
continue;
error = ufs_extattr_get_header(vp, uele, &ueh, NULL);
if (error == ENODATA)
continue;
if (error != 0)
return error;
/*
* Don't need to get a lock on the backing file if
* the listattr is being applied to the backing file,
* as the lock is already held.
*/
if (uele->uele_backing_vnode != vp)
vn_lock(uele->uele_backing_vnode, LK_SHARED | LK_RETRY);
/*
* +1 for trailing NUL (listxattr flavor)
* or leading name length (extattr_list_file flavor)
*/
attrnamelen = strlen(uele->uele_attrname);
listsize += attrnamelen + 1;
/* Return data if the caller requested it. */
if (uio != NULL) {
			/*
			 * We support two flavors: NUL-terminated
			 * strings (a la listxattr), or non
			 * NUL-terminated strings prefixed with a one
			 * byte name length (for extattr_list_file).
			 * The EXTATTR_LIST_LENPREFIX flag selects the
			 * second behavior.
			 */
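			/*
			 * For example, a (hypothetical) attribute named
			 * "md5" would be emitted as "md5\0" in the
			 * first flavor and as "\3md5" (no NUL) in the
			 * second.
			 */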
if (flag & EXTATTR_LIST_LENPREFIX) {
uint8_t len = (uint8_t)attrnamelen;
/* Copy leading name length */
error = uiomove(&len, sizeof(len), uio);
if (error != 0)
break;
} else {
/* Include trailing NULL */
attrnamelen++;
}
error = uiomove(uele->uele_attrname,
(size_t)attrnamelen, uio);
if (error != 0)
break;
}
if (uele->uele_backing_vnode != vp)
VOP_UNLOCK(uele->uele_backing_vnode);
if (error != 0)
return error;
}
if (uio != NULL)
uio->uio_offset = 0;
/* Return full data size if caller requested it. */
if (size != NULL)
*size = listsize;
return 0;
}
/*
* Vnode operation to remove a named attribute.
*/
int
ufs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
error = ufs_extattr_rm(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Vnode operation to set a named attribute.
*/
int
ufs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
IN struct vnode *a_vp;
IN int a_attrnamespace;
IN const char *a_name;
INOUT struct uio *a_uio;
IN kauth_cred_t a_cred;
};
*/
{
struct mount *mp = ap->a_vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
int error;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return EOPNOTSUPP;
ufs_extattr_uepm_lock(ump);
/*
* XXX: No longer a supported way to delete extended attributes.
*/
if (ap->a_uio == NULL) {
ufs_extattr_uepm_unlock(ump);
return EINVAL;
}
error = ufs_extattr_set(ap->a_vp, ap->a_attrnamespace, ap->a_name,
ap->a_uio, ap->a_cred, curlwp);
ufs_extattr_uepm_unlock(ump);
return error;
}
/*
* Real work associated with setting a vnode's extended attributes;
* assumes that the attribute lock has already been grabbed.
*/
static int
ufs_extattr_set(struct vnode *vp, int attrnamespace, const char *name,
struct uio *uio, kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct iovec local_aiov;
struct uio local_aio;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct inode *ip = VTOI(vp);
off_t base_offset;
int error = 0, ioflag;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
if (!ufs_extattr_valid_attrname(attrnamespace, name))
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VWRITE);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute) {
error = ufs_extattr_autocreate_attr(vp, attrnamespace,
name, l, &attribute);
if (error == EEXIST) {
/* Another thread raced us for backend creation */
error = 0;
attribute =
ufs_extattr_find_attr(ump, attrnamespace, name);
}
if (error || !attribute)
return ENODATA;
}
/*
* Early rejection of invalid offsets/length.
* Reject: any offset but 0 (replace)
* Any size greater than attribute size limit
*/
if (uio->uio_offset != 0 ||
uio->uio_resid > attribute->uele_fileheader.uef_size)
return ENXIO;
/*
* Find base offset of header in file based on file header size, and
* data header size + maximum data size, indexed by inode number.
*/
base_offset = sizeof(struct ufs_extattr_fileheader) +
ip->i_number * (sizeof(struct ufs_extattr_header) +
attribute->uele_fileheader.uef_size);
/*
* Write out a data header for the data.
*/
ueh.ueh_len = ufs_rw32((uint32_t) uio->uio_resid,
UELE_NEEDSWAP(attribute));
ueh.ueh_flags = ufs_rw32(UFS_EXTATTR_ATTR_FLAG_INUSE,
UELE_NEEDSWAP(attribute));
ueh.ueh_i_gen = ufs_rw32(ip->i_gen, UELE_NEEDSWAP(attribute));
local_aiov.iov_base = &ueh;
local_aiov.iov_len = sizeof(struct ufs_extattr_header);
local_aio.uio_iov = &local_aiov;
local_aio.uio_iovcnt = 1;
local_aio.uio_rw = UIO_WRITE;
local_aio.uio_offset = base_offset;
local_aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&local_aio);
/*
* Don't need to get a lock on the backing file if the setattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode,
LK_EXCLUSIVE | LK_RETRY);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
if (local_aio.uio_resid != 0) {
error = ENXIO;
goto vopunlock_exit;
}
/*
* Write out user data.
* XXX NOT ATOMIC WITH RESPECT TO THE HEADER.
*/
uio->uio_offset = base_offset + sizeof(struct ufs_extattr_header);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, uio, ioflag,
ump->um_extattr.uepm_ucred);
vopunlock_exit:
uio->uio_offset = 0;
if (attribute->uele_backing_vnode != vp)
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Real work associated with removing an extended attribute from a vnode.
* Assumes the attribute lock has already been grabbed.
*/
static int
ufs_extattr_rm(struct vnode *vp, int attrnamespace, const char *name,
kauth_cred_t cred, struct lwp *l)
{
struct ufs_extattr_list_entry *attribute;
struct ufs_extattr_header ueh;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
struct iovec local_aiov;
struct uio local_aio;
off_t base_offset;
int error = 0, ioflag;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
if (!ufs_extattr_valid_attrname(attrnamespace, name))
return EINVAL;
error = extattr_check_cred(vp, attrnamespace, cred, VWRITE);
if (error)
return error;
attribute = ufs_extattr_find_attr(ump, attrnamespace, name);
if (!attribute)
return ENODATA;
/*
* Don't need to get a lock on the backing file if the getattr is
* being applied to the backing file, as the lock is already held.
*/
if (attribute->uele_backing_vnode != vp)
vn_lock(attribute->uele_backing_vnode, LK_EXCLUSIVE | LK_RETRY);
error = ufs_extattr_get_header(vp, attribute, &ueh, &base_offset);
if (error)
goto vopunlock_exit;
/* Flag it as not in use. */
ueh.ueh_flags = 0; /* No need to byte swap 0 */
ueh.ueh_len = 0; /* ...ditto... */
local_aiov.iov_base = &ueh;
local_aiov.iov_len = sizeof(struct ufs_extattr_header);
local_aio.uio_iov = &local_aiov;
local_aio.uio_iovcnt = 1;
local_aio.uio_rw = UIO_WRITE;
local_aio.uio_offset = base_offset;
local_aio.uio_resid = sizeof(struct ufs_extattr_header);
UIO_SETUP_SYSSPACE(&local_aio);
ioflag = IO_NODELOCKED;
if (ufs_extattr_sync)
ioflag |= IO_SYNC;
error = VOP_WRITE(attribute->uele_backing_vnode, &local_aio, ioflag,
ump->um_extattr.uepm_ucred);
if (error)
goto vopunlock_exit;
if (local_aio.uio_resid != 0)
error = ENXIO;
vopunlock_exit:
VOP_UNLOCK(attribute->uele_backing_vnode);
return error;
}
/*
* Called by UFS when an inode is no longer active and should have its
* attributes stripped.
*/
void
ufs_extattr_vnode_inactive(struct vnode *vp, struct lwp *l)
{
struct ufs_extattr_list_entry *uele;
struct mount *mp = vp->v_mount;
struct ufsmount *ump = VFSTOUFS(mp);
	/*
	 * If the per-mount EA state is not yet initialized, we cannot
	 * take the uepm lock. We should not have any active vnodes on
	 * the fs if it is not yet initialized but is going to be, so
	 * this can go unlocked.
	 */
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_INITIALIZED))
return;
if (!(ump->um_extattr.uepm_flags & UFS_EXTATTR_UEPM_STARTED))
return;
	ufs_extattr_uepm_lock(ump);
	LIST_FOREACH(uele, &ump->um_extattr.uepm_list, uele_entries)
ufs_extattr_rm(vp, uele->uele_attrnamespace,
uele->uele_attrname, lwp0.l_cred, l);
ufs_extattr_uepm_unlock(ump);
}
void
ufs_extattr_init(void)
{
}
void
ufs_extattr_done(void)
{
}
/* $NetBSD: usbdi.c,v 1.253 2024/04/05 18:57:10 riastradh Exp $ */
/*
* Copyright (c) 1998, 2012, 2015 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Lennart Augustsson (lennart@augustsson.net) at
* Carlstedt Research & Technology, Matthew R. Green (mrg@eterna23.net),
* and Nick Hudson.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: usbdi.c,v 1.253 2024/04/05 18:57:10 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_usb.h"
#include "opt_compat_netbsd.h"
#include "usb_dma.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/device.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/bus.h>
#include <sys/cpu.h>
#include <dev/usb/usb.h>
#include <dev/usb/usbdi.h>
#include <dev/usb/usbdi_util.h>
#include <dev/usb/usbdivar.h>
#include <dev/usb/usb_mem.h>
#include <dev/usb/usb_quirks.h>
#include <dev/usb/usb_sdt.h>
#include <dev/usb/usbhist.h>
/* UTF-8 encoding stuff */
#include <fs/unicode.h>
SDT_PROBE_DEFINE5(usb, device, pipe, open,
"struct usbd_interface *"/*iface*/,
"uint8_t"/*address*/,
"uint8_t"/*flags*/,
"int"/*ival*/,
"struct usbd_pipe *"/*pipe*/);
SDT_PROBE_DEFINE7(usb, device, pipe, open__intr,
"struct usbd_interface *"/*iface*/,
"uint8_t"/*address*/,
"uint8_t"/*flags*/,
"int"/*ival*/,
"usbd_callback"/*cb*/,
"void *"/*cookie*/,
"struct usbd_pipe *"/*pipe*/);
SDT_PROBE_DEFINE2(usb, device, pipe, transfer__start,
"struct usbd_pipe *"/*pipe*/,
"struct usbd_xfer *"/*xfer*/);
SDT_PROBE_DEFINE3(usb, device, pipe, transfer__done,
"struct usbd_pipe *"/*pipe*/,
"struct usbd_xfer *"/*xfer*/,
"usbd_status"/*err*/);
SDT_PROBE_DEFINE2(usb, device, pipe, start,
"struct usbd_pipe *"/*pipe*/,
"struct usbd_xfer *"/*xfer*/);
SDT_PROBE_DEFINE1(usb, device, pipe, close, "struct usbd_pipe *"/*pipe*/);
SDT_PROBE_DEFINE1(usb, device, pipe, abort__start,
"struct usbd_pipe *"/*pipe*/);
SDT_PROBE_DEFINE1(usb, device, pipe, abort__done,
"struct usbd_pipe *"/*pipe*/);
SDT_PROBE_DEFINE1(usb, device, pipe, clear__endpoint__stall,
"struct usbd_pipe *"/*pipe*/);
SDT_PROBE_DEFINE1(usb, device, pipe, clear__endpoint__toggle,
"struct usbd_pipe *"/*pipe*/);
SDT_PROBE_DEFINE5(usb, device, xfer, create,
"struct usbd_xfer *"/*xfer*/,
"struct usbd_pipe *"/*pipe*/,
"size_t"/*len*/,
"unsigned int"/*flags*/,
"unsigned int"/*nframes*/);
SDT_PROBE_DEFINE1(usb, device, xfer, start, "struct usbd_xfer *"/*xfer*/);
SDT_PROBE_DEFINE1(usb, device, xfer, preabort, "struct usbd_xfer *"/*xfer*/);
SDT_PROBE_DEFINE1(usb, device, xfer, abort, "struct usbd_xfer *"/*xfer*/);
SDT_PROBE_DEFINE1(usb, device, xfer, timeout, "struct usbd_xfer *"/*xfer*/);
SDT_PROBE_DEFINE2(usb, device, xfer, done,
"struct usbd_xfer *"/*xfer*/,
"usbd_status"/*status*/);
SDT_PROBE_DEFINE1(usb, device, xfer, destroy, "struct usbd_xfer *"/*xfer*/);
SDT_PROBE_DEFINE5(usb, device, request, start,
"struct usbd_device *"/*dev*/,
"usb_device_request_t *"/*req*/,
"size_t"/*len*/,
"int"/*flags*/,
"uint32_t"/*timeout*/);
SDT_PROBE_DEFINE7(usb, device, request, done,
"struct usbd_device *"/*dev*/,
"usb_device_request_t *"/*req*/,
"size_t"/*actlen*/,
"int"/*flags*/,
"uint32_t"/*timeout*/,
"void *"/*data*/,
"usbd_status"/*status*/);
Static void usbd_ar_pipe(struct usbd_pipe *);
Static void usbd_start_next(struct usbd_pipe *);
Static usbd_status usbd_open_pipe_ival
(struct usbd_interface *, uint8_t, uint8_t, struct usbd_pipe **, int);
static void *usbd_alloc_buffer(struct usbd_xfer *, uint32_t);
static void usbd_free_buffer(struct usbd_xfer *);
static struct usbd_xfer *usbd_alloc_xfer(struct usbd_device *, unsigned int);
static void usbd_free_xfer(struct usbd_xfer *);
static void usbd_xfer_timeout(void *);
static void usbd_xfer_timeout_task(void *);
static bool usbd_xfer_probe_timeout(struct usbd_xfer *);
static void usbd_xfer_cancel_timeout_async(struct usbd_xfer *);
#if defined(USB_DEBUG)
void
usbd_dump_iface(struct usbd_interface *iface)
{
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "iface %#jx", (uintptr_t)iface, 0, 0, 0);
if (iface == NULL)
return;
USBHIST_LOG(usbdebug, " device = %#jx idesc = %#jx index = %jd",
(uintptr_t)iface->ui_dev, (uintptr_t)iface->ui_idesc,
iface->ui_index, 0);
USBHIST_LOG(usbdebug, " altindex=%jd",
iface->ui_altindex, 0, 0, 0);
}
void
usbd_dump_device(struct usbd_device *dev)
{
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "dev = %#jx", (uintptr_t)dev, 0, 0, 0);
if (dev == NULL)
return;
USBHIST_LOG(usbdebug, " bus = %#jx default_pipe = %#jx",
(uintptr_t)dev->ud_bus, (uintptr_t)dev->ud_pipe0, 0, 0);
USBHIST_LOG(usbdebug, " address = %jd config = %jd depth = %jd ",
dev->ud_addr, dev->ud_config, dev->ud_depth, 0);
USBHIST_LOG(usbdebug, " speed = %jd self_powered = %jd "
"power = %jd langid = %jd",
dev->ud_speed, dev->ud_selfpowered, dev->ud_power, dev->ud_langid);
}
void
usbd_dump_endpoint(struct usbd_endpoint *endp)
{
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "endp = %#jx", (uintptr_t)endp, 0, 0, 0);
if (endp == NULL)
return;
USBHIST_LOG(usbdebug, " edesc = %#jx refcnt = %jd",
(uintptr_t)endp->ue_edesc, endp->ue_refcnt, 0, 0);
if (endp->ue_edesc)
USBHIST_LOG(usbdebug, " bEndpointAddress=0x%02jx",
endp->ue_edesc->bEndpointAddress, 0, 0, 0);
}
void
usbd_dump_queue(struct usbd_pipe *pipe)
{
struct usbd_xfer *xfer;
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "pipe = %#jx", (uintptr_t)pipe, 0, 0, 0);
SIMPLEQ_FOREACH(xfer, &pipe->up_queue, ux_next) {
USBHIST_LOG(usbdebug, " xfer = %#jx", (uintptr_t)xfer,
0, 0, 0);
}
}
void
usbd_dump_pipe(struct usbd_pipe *pipe)
{
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "pipe = %#jx", (uintptr_t)pipe, 0, 0, 0);
if (pipe == NULL)
return;
usbd_dump_iface(pipe->up_iface);
usbd_dump_device(pipe->up_dev);
usbd_dump_endpoint(pipe->up_endpoint);
USBHIST_LOG(usbdebug, "(usbd_dump_pipe)", 0, 0, 0, 0);
USBHIST_LOG(usbdebug, " running = %jd aborting = %jd",
pipe->up_running, pipe->up_aborting, 0, 0);
USBHIST_LOG(usbdebug, " intrxfer = %#jx, repeat = %jd, "
"interval = %jd", (uintptr_t)pipe->up_intrxfer, pipe->up_repeat,
pipe->up_interval, 0);
}
#endif
usbd_status
usbd_open_pipe(struct usbd_interface *iface, uint8_t address,
uint8_t flags, struct usbd_pipe **pipe)
{
return (usbd_open_pipe_ival(iface, address, flags, pipe,
USBD_DEFAULT_INTERVAL));
}
usbd_status
usbd_open_pipe_ival(struct usbd_interface *iface, uint8_t address,
uint8_t flags, struct usbd_pipe **pipe, int ival)
{
struct usbd_pipe *p = NULL;
struct usbd_endpoint *ep = NULL /* XXXGCC */;
bool piperef = false;
usbd_status err;
int i;
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "iface = %#jx address = %#jx flags = %#jx",
(uintptr_t)iface, address, flags, 0);
/*
* Block usbd_set_interface so we have a snapshot of the
* interface endpoints. They will remain stable until we drop
* the reference in usbd_close_pipe (or on failure here).
*/
err = usbd_iface_piperef(iface);
if (err)
goto out;
piperef = true;
/* Find the endpoint at this address. */
for (i = 0; i < iface->ui_idesc->bNumEndpoints; i++) {
ep = &iface->ui_endpoints[i];
if (ep->ue_edesc == NULL) {
err = USBD_IOERROR;
goto out;
}
if (ep->ue_edesc->bEndpointAddress == address)
break;
}
if (i == iface->ui_idesc->bNumEndpoints) {
err = USBD_BAD_ADDRESS;
goto out;
}
/* Set up the pipe with this endpoint. */
err = usbd_setup_pipe_flags(iface->ui_dev, iface, ep, ival, &p, flags);
if (err)
goto out;
/* Success! */
*pipe = p;
p = NULL; /* handed off to caller */
piperef = false; /* handed off to pipe */
SDT_PROBE5(usb, device, pipe, open,
iface, address, flags, ival, p);
err = USBD_NORMAL_COMPLETION;
out: if (p)
usbd_close_pipe(p);
if (piperef)
usbd_iface_pipeunref(iface);
return err;
}
usbd_status
usbd_open_pipe_intr(struct usbd_interface *iface, uint8_t address,
uint8_t flags, struct usbd_pipe **pipe,
void *priv, void *buffer, uint32_t len,
usbd_callback cb, int ival)
{
usbd_status err;
struct usbd_xfer *xfer;
struct usbd_pipe *ipipe;
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "address = %#jx flags = %#jx len = %jd",
address, flags, len, 0);
err = usbd_open_pipe_ival(iface, address,
USBD_EXCLUSIVE_USE | (flags & USBD_MPSAFE),
&ipipe, ival);
if (err)
return err;
err = usbd_create_xfer(ipipe, len, flags, 0, &xfer);
if (err)
goto bad1;
usbd_setup_xfer(xfer, priv, buffer, len, flags, USBD_NO_TIMEOUT, cb);
ipipe->up_intrxfer = xfer;
ipipe->up_repeat = 1;
err = usbd_transfer(xfer);
*pipe = ipipe;
if (err != USBD_IN_PROGRESS)
goto bad3;
SDT_PROBE7(usb, device, pipe, open__intr,
iface, address, flags, ival, cb, priv, ipipe);
return USBD_NORMAL_COMPLETION;
bad3:
ipipe->up_intrxfer = NULL;
ipipe->up_repeat = 0;
usbd_destroy_xfer(xfer);
bad1:
usbd_close_pipe(ipipe);
return err;
}
void
usbd_close_pipe(struct usbd_pipe *pipe)
{
USBHIST_FUNC(); USBHIST_CALLED(usbdebug);
KASSERT(pipe != NULL);
usbd_lock_pipe(pipe);
	SDT_PROBE1(usb, device, pipe, close, pipe);
	if (!SIMPLEQ_EMPTY(&pipe->up_queue)) {
printf("WARNING: pipe closed with active xfers on addr %d\n",
pipe->up_dev->ud_addr);
usbd_ar_pipe(pipe);
}
KASSERT(SIMPLEQ_EMPTY(&pipe->up_queue));
pipe->up_methods->upm_close(pipe);
usbd_unlock_pipe(pipe);
cv_destroy(&pipe->up_callingcv);
	if (pipe->up_intrxfer)
		usbd_destroy_xfer(pipe->up_intrxfer);
usb_rem_task_wait(pipe->up_dev, &pipe->up_async_task, USB_TASKQ_DRIVER,
NULL);
usbd_endpoint_release(pipe->up_dev, pipe->up_endpoint);
	if (pipe->up_iface)
		usbd_iface_pipeunref(pipe->up_iface);
kmem_free(pipe, pipe->up_dev->ud_bus->ub_pipesize);
}
usbd_status
usbd_transfer(struct usbd_xfer *xfer)
{
struct usbd_pipe *pipe = xfer->ux_pipe;
usbd_status err;
unsigned int size, flags;
USBHIST_FUNC(); USBHIST_CALLARGS(usbdebug,
"xfer = %#jx, flags = %#jx, pipe = %#jx, running = %jd",
(uintptr_t)xfer, xfer->ux_flags, (uintptr_t)pipe, pipe->up_running);
	KASSERT(xfer->ux_status == USBD_NOT_STARTED);
	SDT_PROBE1(usb, device, xfer, start, xfer);
#ifdef USB_DEBUG
if (usbdebug > 5)
usbd_dump_queue(pipe);
#endif
xfer->ux_done = 0;
KASSERT(xfer->ux_length == 0 || xfer->ux_buf != NULL);
size = xfer->ux_length;
flags = xfer->ux_flags;
if (size != 0) {
/*
* Use the xfer buffer if none specified in transfer setup.
* isoc transfers always use the xfer buffer, i.e.
* ux_buffer is always NULL for isoc.
*/
if (xfer->ux_buffer == NULL) {
xfer->ux_buffer = xfer->ux_buf;
}
/*
* If not using the xfer buffer copy data to the
* xfer buffer for OUT transfers of >0 length
*/
		if (xfer->ux_buffer != xfer->ux_buf) {
			KASSERT(xfer->ux_buf);
			if (!usbd_xfer_isread(xfer)) {
				memcpy(xfer->ux_buf, xfer->ux_buffer, size);
}
}
}
	if (pipe->up_dev->ud_bus->ub_usepolling == 0)
		usbd_lock_pipe(pipe);
if (pipe->up_aborting) {
/*
* XXX For synchronous transfers this is fine. What to
* do for asynchronous transfers? The callback is
* never run, not even with status USBD_CANCELLED.
*/
KASSERT(pipe->up_dev->ud_bus->ub_usepolling == 0);
usbd_unlock_pipe(pipe);
USBHIST_LOG(usbdebug, "<- done xfer %#jx, aborting",
(uintptr_t)xfer, 0, 0, 0);
SDT_PROBE2(usb, device, xfer, done, xfer, USBD_CANCELLED);
return USBD_CANCELLED;
}
/* xfer is not valid after the transfer method unless synchronous */
SDT_PROBE2(usb, device, pipe, transfer__start, pipe, xfer);
do {
#ifdef DIAGNOSTIC
xfer->ux_state = XFER_ONQU;
#endif
SIMPLEQ_INSERT_TAIL(&pipe->up_queue, xfer, ux_next);
if (pipe->up_running && pipe->up_serialise) {
err = USBD_IN_PROGRESS;
} else {
pipe->up_running = 1;
err = USBD_NORMAL_COMPLETION;
}
if (err)
break;
err = pipe->up_methods->upm_transfer(xfer);
} while (0);
	SDT_PROBE3(usb, device, pipe, transfer__done, pipe, xfer, err);
	if (pipe->up_dev->ud_bus->ub_usepolling == 0)
		usbd_unlock_pipe(pipe);
	if (err != USBD_IN_PROGRESS && err) {
/*
* The transfer made it onto the pipe queue, but didn't get
* accepted by the HCD for some reason. It needs removing
* from the pipe queue.
*/
USBHIST_LOG(usbdebug, "xfer failed: %jd, reinserting",
err, 0, 0, 0);
		if (pipe->up_dev->ud_bus->ub_usepolling == 0)
			usbd_lock_pipe(pipe);
		SDT_PROBE1(usb, device, xfer, preabort, xfer);
#ifdef DIAGNOSTIC
xfer->ux_state = XFER_BUSY;
#endif
		SIMPLEQ_REMOVE_HEAD(&pipe->up_queue, ux_next);
		if (pipe->up_serialise)
			usbd_start_next(pipe);
		if (pipe->up_dev->ud_bus->ub_usepolling == 0)
			usbd_unlock_pipe(pipe);
}
if (!(flags & USBD_SYNCHRONOUS)) {
USBHIST_LOG(usbdebug, "<- done xfer %#jx, not sync (err %jd)",
(uintptr_t)xfer, err, 0, 0);
KASSERTMSG(err != USBD_NORMAL_COMPLETION,
"asynchronous xfer %p completed synchronously", xfer);
return err;
}
if (err != USBD_IN_PROGRESS) {
USBHIST_LOG(usbdebug, "<- done xfer %#jx, sync (err %jd)",
(uintptr_t)xfer, err, 0, 0);
SDT_PROBE2(usb, device, xfer, done, xfer, err);
return err;
}
/* Sync transfer, wait for completion. */
if (pipe->up_dev->ud_bus->ub_usepolling == 0) usbd_lock_pipe(pipe);
while (!xfer->ux_done) {
if (pipe->up_dev->ud_bus->ub_usepolling)
panic("usbd_transfer: not done");
USBHIST_LOG(usbdebug, "<- sleeping on xfer %#jx",
(uintptr_t)xfer, 0, 0, 0);
err = 0;
if ((flags & USBD_SYNCHRONOUS_SIG) != 0) {
err = cv_wait_sig(&xfer->ux_cv, pipe->up_dev->ud_bus->ub_lock);
} else {
cv_wait(&xfer->ux_cv, pipe->up_dev->ud_bus->ub_lock);
}
if (err) {
if (!xfer->ux_done) {
SDT_PROBE1(usb, device, xfer, abort, xfer);
pipe->up_methods->upm_abort(xfer);
}
break;
}
}
err = xfer->ux_status;
SDT_PROBE2(usb, device, xfer, done, xfer, err);
if (pipe->up_dev->ud_bus->ub_usepolling == 0) usbd_unlock_pipe(pipe);
return err;
}
/* Like usbd_transfer(), but waits for completion. */
usbd_status
usbd_sync_transfer(struct usbd_xfer *xfer)
{
xfer->ux_flags |= USBD_SYNCHRONOUS;
return usbd_transfer(xfer);
}
/* Like usbd_transfer(), but waits for completion and listens for signals. */
usbd_status
usbd_sync_transfer_sig(struct usbd_xfer *xfer)
{
xfer->ux_flags |= USBD_SYNCHRONOUS | USBD_SYNCHRONOUS_SIG;
return usbd_transfer(xfer);
}
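/*
 * Illustrative sketch (not part of this file): a class driver would
 * typically issue a synchronous bulk transfer roughly as follows,
 * assuming it has already opened "pipe" and that "sc", "buf", and
 * "len" are driver-supplied:
 *
 *	struct usbd_xfer *xfer;
 *	usbd_status err;
 *
 *	if (usbd_create_xfer(pipe, len, 0, 0, &xfer))
 *		return ENOMEM;
 *	usbd_setup_xfer(xfer, sc, buf, len, USBD_SHORT_XFER_OK,
 *	    USBD_DEFAULT_TIMEOUT, NULL);
 *	err = usbd_sync_transfer(xfer);
 *	usbd_destroy_xfer(xfer);
 */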
static void *
usbd_alloc_buffer(struct usbd_xfer *xfer, uint32_t size)
{
KASSERT(xfer->ux_buf == NULL);
KASSERT(size != 0);
xfer->ux_bufsize = 0;
#if NUSB_DMA > 0
struct usbd_bus *bus = xfer->ux_bus;
if (bus->ub_usedma) {
usb_dma_t *dmap = &xfer->ux_dmabuf;
KASSERT((bus->ub_dmaflags & USBMALLOC_COHERENT) == 0);
int err = usb_allocmem(bus->ub_dmatag, size, 0, bus->ub_dmaflags, dmap);
if (err) {
return NULL;
}
xfer->ux_buf = KERNADDR(&xfer->ux_dmabuf, 0);
xfer->ux_bufsize = size;
return xfer->ux_buf;
}
#endif
KASSERT(xfer->ux_bus->ub_usedma == false);
xfer->ux_buf = kmem_alloc(size, KM_SLEEP);
xfer->ux_bufsize = size;
return xfer->ux_buf;
}
static void
usbd_free_buffer(struct usbd_xfer *xfer)
{
KASSERT(xfer->ux_buf != NULL);
KASSERT(xfer->ux_bufsize != 0);
void *buf = xfer->ux_buf;
uint32_t size = xfer->ux_bufsize;
xfer->ux_buf = NULL;
xfer->ux_bufsize = 0;
#if NUSB_DMA > 0
struct usbd_bus *bus = xfer->ux_bus;
if (bus->ub_usedma) {
usb_dma_t *dmap = &xfer->ux_dmabuf;
usb_freemem(dmap);
return;
}
#endif
KASSERT(xfer->ux_bus->ub_usedma == false);
kmem_free(buf, size);
}
void *
usbd_get_buffer(struct usbd_xfer *xfer)
{
return xfer->ux_buf;
}
struct usbd_pipe *
usbd_get_pipe0(struct usbd_device *dev)
{
return dev->ud_pipe0;
}
static struct usbd_xfer *
usbd_alloc_xfer(struct usbd_device *dev, unsigned int nframes)
{
struct usbd_xfer *xfer;
USBHIST_FUNC();
ASSERT_SLEEPABLE();
xfer = dev->ud_bus->ub_methods->ubm_allocx(dev->ud_bus, nframes);
if (xfer == NULL)
goto out;
xfer->ux_bus = dev->ud_bus;
callout_init(&xfer->ux_callout, CALLOUT_MPSAFE);
callout_setfunc(&xfer->ux_callout, usbd_xfer_timeout, xfer);
cv_init(&xfer->ux_cv, "usbxfer");
usb_init_task(&xfer->ux_aborttask, usbd_xfer_timeout_task, xfer,
USB_TASKQ_MPSAFE);
out:
USBHIST_CALLARGS(usbdebug, "returns %#jx", (uintptr_t)xfer, 0, 0, 0);
return xfer;
}
static void
usbd_free_xfer(struct usbd_xfer *xfer)
{
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "%#jx", (uintptr_t)xfer, 0, 0, 0);
if (xfer->ux_buf) {
usbd_free_buffer(xfer);
}
/* Wait for any straggling timeout to complete. */
mutex_enter(xfer->ux_bus->ub_lock);
xfer->ux_timeout_reset = false; /* do not resuscitate */
callout_halt(&xfer->ux_callout, xfer->ux_bus->ub_lock);
usb_rem_task_wait(xfer->ux_pipe->up_dev, &xfer->ux_aborttask,
USB_TASKQ_HC, xfer->ux_bus->ub_lock);
mutex_exit(xfer->ux_bus->ub_lock);
cv_destroy(&xfer->ux_cv);
xfer->ux_bus->ub_methods->ubm_freex(xfer->ux_bus, xfer);
}
int
usbd_create_xfer(struct usbd_pipe *pipe, size_t len, unsigned int flags,
unsigned int nframes, struct usbd_xfer **xp)
{
KASSERT(xp != NULL);
void *buf = NULL;
struct usbd_xfer *xfer = usbd_alloc_xfer(pipe->up_dev, nframes);
if (xfer == NULL)
return ENOMEM;
xfer->ux_pipe = pipe;
xfer->ux_flags = flags;
xfer->ux_nframes = nframes;
xfer->ux_methods = pipe->up_methods;
if (len) {
buf = usbd_alloc_buffer(xfer, len);
if (!buf) {
usbd_free_xfer(xfer);
return ENOMEM;
}
}
if (xfer->ux_methods->upm_init) {
int err = xfer->ux_methods->upm_init(xfer);
if (err) {
usbd_free_xfer(xfer);
return err;
}
}
*xp = xfer;
SDT_PROBE5(usb, device, xfer, create,
xfer, pipe, len, flags, nframes);
return 0;
}
void
usbd_destroy_xfer(struct usbd_xfer *xfer)
{
SDT_PROBE1(usb, device, xfer, destroy, xfer);
if (xfer->ux_methods->upm_fini) xfer->ux_methods->upm_fini(xfer);
usbd_free_xfer(xfer);
}
void
usbd_setup_xfer(struct usbd_xfer *xfer, void *priv, void *buffer,
uint32_t length, uint16_t flags, uint32_t timeout, usbd_callback callback)
{
KASSERT(xfer->ux_pipe);
xfer->ux_priv = priv;
xfer->ux_buffer = buffer;
xfer->ux_length = length;
xfer->ux_actlen = 0;
xfer->ux_flags = flags;
xfer->ux_timeout = timeout;
xfer->ux_status = USBD_NOT_STARTED;
xfer->ux_callback = callback;
xfer->ux_rqflags &= ~URQ_REQUEST;
xfer->ux_nframes = 0;
}
void
usbd_setup_default_xfer(struct usbd_xfer *xfer, struct usbd_device *dev,
void *priv, uint32_t timeout, usb_device_request_t *req, void *buffer,
uint32_t length, uint16_t flags, usbd_callback callback)
{
KASSERT(xfer->ux_pipe == dev->ud_pipe0);
xfer->ux_priv = priv;
xfer->ux_buffer = buffer;
xfer->ux_length = length;
xfer->ux_actlen = 0;
xfer->ux_flags = flags;
xfer->ux_timeout = timeout;
xfer->ux_status = USBD_NOT_STARTED;
xfer->ux_callback = callback;
xfer->ux_request = *req;
xfer->ux_rqflags |= URQ_REQUEST;
xfer->ux_nframes = 0;
}
void
usbd_setup_isoc_xfer(struct usbd_xfer *xfer, void *priv, uint16_t *frlengths,
uint32_t nframes, uint16_t flags, usbd_callback callback)
{
xfer->ux_priv = priv;
xfer->ux_buffer = NULL;
xfer->ux_length = 0;
xfer->ux_actlen = 0;
xfer->ux_flags = flags;
xfer->ux_timeout = USBD_NO_TIMEOUT;
xfer->ux_status = USBD_NOT_STARTED;
xfer->ux_callback = callback;
xfer->ux_rqflags &= ~URQ_REQUEST;
xfer->ux_frlengths = frlengths;
xfer->ux_nframes = nframes;
for (size_t i = 0; i < xfer->ux_nframes; i++)
xfer->ux_length += xfer->ux_frlengths[i];
}
void
usbd_get_xfer_status(struct usbd_xfer *xfer, void **priv,
void **buffer, uint32_t *count, usbd_status *status)
{
if (priv != NULL)
*priv = xfer->ux_priv;
if (buffer != NULL)
*buffer = xfer->ux_buffer;
if (count != NULL)
*count = xfer->ux_actlen;
if (status != NULL)
*status = xfer->ux_status;
}
usb_config_descriptor_t *
usbd_get_config_descriptor(struct usbd_device *dev)
{
KASSERT(dev != NULL);
return dev->ud_cdesc;
}
usb_interface_descriptor_t *
usbd_get_interface_descriptor(struct usbd_interface *iface)
{
KASSERT(iface != NULL);
return iface->ui_idesc;
}
usb_device_descriptor_t *
usbd_get_device_descriptor(struct usbd_device *dev)
{
KASSERT(dev != NULL);
return &dev->ud_ddesc;
}
usb_endpoint_descriptor_t *
usbd_interface2endpoint_descriptor(struct usbd_interface *iface, uint8_t index)
{
if (index >= iface->ui_idesc->bNumEndpoints)
return NULL;
return iface->ui_endpoints[index].ue_edesc;
}
/* Some drivers may wish to abort requests on the default pipe,
* but there is no mechanism for getting a handle on it. */
void
usbd_abort_default_pipe(struct usbd_device *device)
{
usbd_abort_pipe(device->ud_pipe0);
}
void
usbd_abort_pipe(struct usbd_pipe *pipe)
{
usbd_suspend_pipe(pipe);
usbd_resume_pipe(pipe);
}
void
usbd_suspend_pipe(struct usbd_pipe *pipe)
{
usbd_lock_pipe(pipe);
usbd_ar_pipe(pipe);
usbd_unlock_pipe(pipe);
}
void
usbd_resume_pipe(struct usbd_pipe *pipe)
{
usbd_lock_pipe(pipe);
KASSERT(SIMPLEQ_EMPTY(&pipe->up_queue));
pipe->up_aborting = 0;
usbd_unlock_pipe(pipe);
}
usbd_status
usbd_clear_endpoint_stall(struct usbd_pipe *pipe)
{
struct usbd_device *dev = pipe->up_dev;
usbd_status err;
USBHIST_FUNC(); USBHIST_CALLED(usbdebug);
SDT_PROBE1(usb, device, pipe, clear__endpoint__stall, pipe);
/*
* Clearing an endpoint stall resets the endpoint toggle, so
* do the same to the HC toggle.
*/
SDT_PROBE1(usb, device, pipe, clear__endpoint__toggle, pipe);
pipe->up_methods->upm_cleartoggle(pipe);
err = usbd_clear_endpoint_feature(dev,
pipe->up_endpoint->ue_edesc->bEndpointAddress, UF_ENDPOINT_HALT);
#if 0
XXX should we do this?
if (!err) {
pipe->state = USBD_PIPE_ACTIVE;
/* XXX activate pipe */
}
#endif
return err;
}
void
usbd_clear_endpoint_stall_task(void *arg)
{
struct usbd_pipe *pipe = arg;
struct usbd_device *dev = pipe->up_dev;
SDT_PROBE1(usb, device, pipe, clear__endpoint__stall, pipe);
SDT_PROBE1(usb, device, pipe, clear__endpoint__toggle, pipe);
pipe->up_methods->upm_cleartoggle(pipe);
(void)usbd_clear_endpoint_feature(dev,
pipe->up_endpoint->ue_edesc->bEndpointAddress, UF_ENDPOINT_HALT);
}
void
usbd_clear_endpoint_stall_async(struct usbd_pipe *pipe)
{
usb_add_task(pipe->up_dev, &pipe->up_async_task, USB_TASKQ_DRIVER);
}
void
usbd_clear_endpoint_toggle(struct usbd_pipe *pipe)
{
SDT_PROBE1(usb, device, pipe, clear__endpoint__toggle, pipe);
pipe->up_methods->upm_cleartoggle(pipe);
}
usbd_status
usbd_endpoint_count(struct usbd_interface *iface, uint8_t *count)
{
KASSERT(iface != NULL);
KASSERT(iface->ui_idesc != NULL);
*count = iface->ui_idesc->bNumEndpoints;
return USBD_NORMAL_COMPLETION;
}
usbd_status
usbd_interface_count(struct usbd_device *dev, uint8_t *count)
{
if (dev->ud_cdesc == NULL)
return USBD_NOT_CONFIGURED;
*count = dev->ud_cdesc->bNumInterface;
return USBD_NORMAL_COMPLETION;
}
void
usbd_interface2device_handle(struct usbd_interface *iface,
struct usbd_device **dev)
{
*dev = iface->ui_dev;
}
usbd_status
usbd_device2interface_handle(struct usbd_device *dev,
uint8_t ifaceno, struct usbd_interface **iface)
{
if (dev->ud_cdesc == NULL)
return USBD_NOT_CONFIGURED;
if (ifaceno >= dev->ud_cdesc->bNumInterface)
return USBD_INVAL;
*iface = &dev->ud_ifaces[ifaceno];
return USBD_NORMAL_COMPLETION;
}
struct usbd_device *
usbd_pipe2device_handle(struct usbd_pipe *pipe)
{
KASSERT(pipe != NULL);
return pipe->up_dev;
}
/* XXXX use altno */
usbd_status
usbd_set_interface(struct usbd_interface *iface, int altidx)
{
bool locked = false;
usb_device_request_t req;
usbd_status err;
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "iface %#jx", (uintptr_t)iface, 0, 0, 0);
err = usbd_iface_lock(iface);
if (err)
goto out;
locked = true;
err = usbd_fill_iface_data(iface->ui_dev, iface->ui_index, altidx);
if (err)
goto out;
req.bmRequestType = UT_WRITE_INTERFACE;
req.bRequest = UR_SET_INTERFACE;
USETW(req.wValue, iface->ui_idesc->bAlternateSetting);
USETW(req.wIndex, iface->ui_idesc->bInterfaceNumber);
USETW(req.wLength, 0);
err = usbd_do_request(iface->ui_dev, &req, 0);
out: /* XXX back out iface data? */
if (locked)
usbd_iface_unlock(iface);
return err;
}
int
usbd_get_no_alts(usb_config_descriptor_t *cdesc, int ifaceno)
{
char *p = (char *)cdesc;
char *end = p + UGETW(cdesc->wTotalLength);
usb_descriptor_t *desc;
usb_interface_descriptor_t *idesc;
int n;
for (n = 0; end - p >= sizeof(*desc); p += desc->bLength) {
desc = (usb_descriptor_t *)p;
if (desc->bLength < sizeof(*desc) || desc->bLength > end - p)
break;
if (desc->bDescriptorType != UDESC_INTERFACE)
continue;
if (desc->bLength < sizeof(*idesc))
break;
idesc = (usb_interface_descriptor_t *)desc;
if (idesc->bInterfaceNumber == ifaceno) {
n++;
if (n == INT_MAX)
break;
}
}
return n;
}
int
usbd_get_interface_altindex(struct usbd_interface *iface)
{
return iface->ui_altindex;
}
usbd_status
usbd_get_interface(struct usbd_interface *iface, uint8_t *aiface)
{
usb_device_request_t req;
req.bmRequestType = UT_READ_INTERFACE;
req.bRequest = UR_GET_INTERFACE;
USETW(req.wValue, 0);
USETW(req.wIndex, iface->ui_idesc->bInterfaceNumber);
USETW(req.wLength, 1);
return usbd_do_request(iface->ui_dev, &req, aiface);
}
/*** Internal routines ***/
/* Dequeue all pipe operations, called with bus lock held. */
Static void
usbd_ar_pipe(struct usbd_pipe *pipe)
{
struct usbd_xfer *xfer;
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "pipe = %#jx", (uintptr_t)pipe, 0, 0, 0);
SDT_PROBE1(usb, device, pipe, abort__start, pipe);
ASSERT_SLEEPABLE();
KASSERT(mutex_owned(pipe->up_dev->ud_bus->ub_lock));
KASSERT(pipe->up_dev->ud_bus->ub_usepolling == 0);
/*
* Allow only one thread at a time to abort the pipe, so we
* don't get confused if upm_abort drops the lock in the middle
* of the abort to wait for hardware completion softints to
* stop using the xfer before returning.
*/
KASSERTMSG(pipe->up_abortlwp == NULL, "pipe->up_abortlwp=%p",
pipe->up_abortlwp);
pipe->up_abortlwp = curlwp;
#ifdef USB_DEBUG
if (usbdebug > 5)
usbd_dump_queue(pipe);
#endif
pipe->up_repeat = 0;
pipe->up_running = 0;
pipe->up_aborting = 1;
while ((xfer = SIMPLEQ_FIRST(&pipe->up_queue)) != NULL) {
USBHIST_LOG(usbdebug, "pipe = %#jx xfer = %#jx "
"(methods = %#jx)", (uintptr_t)pipe, (uintptr_t)xfer,
(uintptr_t)pipe->up_methods, 0);
if (xfer->ux_status == USBD_NOT_STARTED) {
SDT_PROBE1(usb, device, xfer, preabort, xfer);
#ifdef DIAGNOSTIC
xfer->ux_state = XFER_BUSY;
#endif
SIMPLEQ_REMOVE_HEAD(&pipe->up_queue, ux_next);
} else {
/* Make the HC abort it (and invoke the callback). */
SDT_PROBE1(usb, device, xfer, abort, xfer);
pipe->up_methods->upm_abort(xfer);
while (pipe->up_callingxfer == xfer) {
USBHIST_LOG(usbdebug, "wait for callback"
"pipe = %#jx xfer = %#jx",
(uintptr_t)pipe, (uintptr_t)xfer, 0, 0);
cv_wait(&pipe->up_callingcv,
pipe->up_dev->ud_bus->ub_lock);
}
/* XXX only for non-0 usbd_clear_endpoint_stall(pipe); */
}
}
/*
* There may be an xfer callback already in progress which was
* taken off the queue before we got to it. We must wait for
* the callback to finish before returning control to the
* caller.
*/
while (pipe->up_callingxfer) {
USBHIST_LOG(usbdebug, "wait for callback"
"pipe = %#jx xfer = %#jx",
(uintptr_t)pipe, (uintptr_t)pipe->up_callingxfer, 0, 0);
cv_wait(&pipe->up_callingcv, pipe->up_dev->ud_bus->ub_lock);
}
KASSERT(mutex_owned(pipe->up_dev->ud_bus->ub_lock));
KASSERTMSG(pipe->up_abortlwp == curlwp, "pipe->up_abortlwp=%p",
pipe->up_abortlwp);
pipe->up_abortlwp = NULL;
SDT_PROBE1(usb, device, pipe, abort__done, pipe);
}
/* Called with USB lock held. */
void
usb_transfer_complete(struct usbd_xfer *xfer)
{
struct usbd_pipe *pipe = xfer->ux_pipe;
struct usbd_bus *bus = pipe->up_dev->ud_bus;
int sync = xfer->ux_flags & USBD_SYNCHRONOUS;
int erred;
int polling = bus->ub_usepolling;
int repeat = pipe->up_repeat;
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "pipe = %#jx xfer = %#jx status = %jd "
"actlen = %jd", (uintptr_t)pipe, (uintptr_t)xfer, xfer->ux_status,
xfer->ux_actlen);
KASSERT(polling || mutex_owned(pipe->up_dev->ud_bus->ub_lock));
KASSERTMSG(xfer->ux_state == XFER_ONQU, "xfer %p state is %x", xfer,
xfer->ux_state);
KASSERT(pipe != NULL);
/*
* If device is known to miss out ack, then pretend that
* output timeout is a success. Userland should handle
* the logic to verify that the operation succeeded.
*/
if (pipe->up_dev->ud_quirks &&
pipe->up_dev->ud_quirks->uq_flags & UQ_MISS_OUT_ACK &&
xfer->ux_status == USBD_TIMEOUT && !usbd_xfer_isread(xfer)) {
USBHIST_LOG(usbdebug, "Possible output ack miss for xfer %#jx: "
"hiding write timeout to %jd.%jd for %ju bytes written",
(uintptr_t)xfer, curlwp->l_proc->p_pid, curlwp->l_lid,
xfer->ux_length);
xfer->ux_status = USBD_NORMAL_COMPLETION;
xfer->ux_actlen = xfer->ux_length;
}
erred = xfer->ux_status == USBD_CANCELLED ||
xfer->ux_status == USBD_TIMEOUT;
if (!repeat) {
/* Remove request from queue. */
KASSERTMSG(!SIMPLEQ_EMPTY(&pipe->up_queue),
"pipe %p is empty, but xfer %p wants to complete", pipe,
xfer);
KASSERTMSG(xfer == SIMPLEQ_FIRST(&pipe->up_queue),
"xfer %p is not start of queue (%p is at start)", xfer,
SIMPLEQ_FIRST(&pipe->up_queue));
#ifdef DIAGNOSTIC
xfer->ux_state = XFER_BUSY;
#endif
SIMPLEQ_REMOVE_HEAD(&pipe->up_queue, ux_next);
}
USBHIST_LOG(usbdebug, "xfer %#jx: repeat %jd new head = %#jx",
(uintptr_t)xfer, repeat, (uintptr_t)SIMPLEQ_FIRST(&pipe->up_queue),
0);
/* Count completed transfers. */
++pipe->up_dev->ud_bus->ub_stats.uds_requests
[pipe->up_endpoint->ue_edesc->bmAttributes & UE_XFERTYPE];
xfer->ux_done = 1;
if (!xfer->ux_status && xfer->ux_actlen < xfer->ux_length &&
!(xfer->ux_flags & USBD_SHORT_XFER_OK)) {
USBHIST_LOG(usbdebug, "short transfer %jd < %jd",
xfer->ux_actlen, xfer->ux_length, 0, 0);
xfer->ux_status = USBD_SHORT_XFER;
}
USBHIST_LOG(usbdebug, "xfer %#jx doing done %#jx", (uintptr_t)xfer,
(uintptr_t)pipe->up_methods->upm_done, 0, 0);
SDT_PROBE2(usb, device, xfer, done, xfer, xfer->ux_status);
pipe->up_methods->upm_done(xfer);
if (xfer->ux_length != 0 && xfer->ux_buffer != xfer->ux_buf) {
KDASSERTMSG(xfer->ux_actlen <= xfer->ux_length,
"actlen %d length %d", xfer->ux_actlen, xfer->ux_length);
/* Only if IN transfer */
if (usbd_xfer_isread(xfer)) {
memcpy(xfer->ux_buffer, xfer->ux_buf, xfer->ux_actlen);
}
}
USBHIST_LOG(usbdebug, "xfer %#jx doing callback %#jx status %jd",
(uintptr_t)xfer, (uintptr_t)xfer->ux_callback, xfer->ux_status, 0);
if (xfer->ux_callback) {
if (!polling) {
KASSERT(pipe->up_callingxfer == NULL);
pipe->up_callingxfer = xfer;
mutex_exit(pipe->up_dev->ud_bus->ub_lock);
if (!(pipe->up_flags & USBD_MPSAFE)) KERNEL_LOCK(1, curlwp);
}
xfer->ux_callback(xfer, xfer->ux_priv, xfer->ux_status);
if (!polling) {
if (!(pipe->up_flags & USBD_MPSAFE)) KERNEL_UNLOCK_ONE(curlwp);
mutex_enter(pipe->up_dev->ud_bus->ub_lock);
KASSERT(pipe->up_callingxfer == xfer);
pipe->up_callingxfer = NULL;
cv_broadcast(&pipe->up_callingcv);
}
}
if (sync && !polling) {
USBHIST_LOG(usbdebug, "<- done xfer %#jx, wakeup",
(uintptr_t)xfer, 0, 0, 0);
cv_broadcast(&xfer->ux_cv);
}
if (repeat) {
xfer->ux_actlen = 0;
xfer->ux_status = USBD_NOT_STARTED;
} else {
/* XXX should we stop the queue on all errors? */
if (erred && pipe->up_iface != NULL) /* not control pipe */
pipe->up_running = 0;
}
if (pipe->up_running && pipe->up_serialise) usbd_start_next(pipe);
}
/* Called with USB lock held. */
void
usbd_start_next(struct usbd_pipe *pipe)
{
struct usbd_xfer *xfer;
usbd_status err;
USBHIST_FUNC();
KASSERT(pipe != NULL);
KASSERT(pipe->up_methods != NULL);
KASSERT(pipe->up_methods->upm_start != NULL);
KASSERT(pipe->up_serialise == true);
int polling = pipe->up_dev->ud_bus->ub_usepolling;
KASSERT(polling || mutex_owned(pipe->up_dev->ud_bus->ub_lock));
/* Get next request in queue. */
xfer = SIMPLEQ_FIRST(&pipe->up_queue);
USBHIST_CALLARGS(usbdebug, "pipe = %#jx, xfer = %#jx", (uintptr_t)pipe,
(uintptr_t)xfer, 0, 0);
if (xfer == NULL) {
pipe->up_running = 0;
} else {
SDT_PROBE2(usb, device, pipe, start, pipe, xfer);
err = pipe->up_methods->upm_start(xfer);
if (err != USBD_IN_PROGRESS) {
USBHIST_LOG(usbdebug, "error = %jd", err, 0, 0, 0);
pipe->up_running = 0;
/* XXX do what? */
}
}
KASSERT(polling || mutex_owned(pipe->up_dev->ud_bus->ub_lock));
}
usbd_status
usbd_do_request(struct usbd_device *dev, usb_device_request_t *req, void *data)
{
return usbd_do_request_flags(dev, req, data, 0, 0,
USBD_DEFAULT_TIMEOUT);
}
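/*
 * Illustrative sketch (not part of this file): fetching the device
 * descriptor over the default pipe with usbd_do_request(), assuming
 * "dev" is an addressed device:
 *
 *	usb_device_request_t req;
 *	usb_device_descriptor_t ddesc;
 *
 *	req.bmRequestType = UT_READ_DEVICE;
 *	req.bRequest = UR_GET_DESCRIPTOR;
 *	USETW2(req.wValue, UDESC_DEVICE, 0);
 *	USETW(req.wIndex, 0);
 *	USETW(req.wLength, USB_DEVICE_DESCRIPTOR_SIZE);
 *	err = usbd_do_request(dev, &req, &ddesc);
 */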
usbd_status
usbd_do_request_flags(struct usbd_device *dev, usb_device_request_t *req,
void *data, uint16_t flags, int *actlen, uint32_t timeout)
{
size_t len = UGETW(req->wLength);
return usbd_do_request_len(dev, req, len, data, flags, actlen, timeout);
}
usbd_status
usbd_do_request_len(struct usbd_device *dev, usb_device_request_t *req,
size_t len, void *data, uint16_t flags, int *actlen, uint32_t timeout)
{
struct usbd_xfer *xfer;
usbd_status err;
KASSERT(len >= UGETW(req->wLength));
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "dev=%#jx req=%jx flags=%jx len=%jx",
(uintptr_t)dev, (uintptr_t)req, flags, len);
ASSERT_SLEEPABLE();
SDT_PROBE5(usb, device, request, start,
dev, req, len, flags, timeout);
int error = usbd_create_xfer(dev->ud_pipe0, len, 0, 0, &xfer);
if (error) {
SDT_PROBE7(usb, device, request, done,
dev, req, /*actlen*/0, flags, timeout, data, USBD_NOMEM);
return USBD_NOMEM;
}
usbd_setup_default_xfer(xfer, dev, 0, timeout, req, data,
UGETW(req->wLength), flags, NULL);
KASSERT(xfer->ux_pipe == dev->ud_pipe0);
err = usbd_sync_transfer(xfer);
#if defined(USB_DEBUG) || defined(DIAGNOSTIC)
if (xfer->ux_actlen > xfer->ux_length) {
USBHIST_LOG(usbdebug, "overrun addr = %jd type = 0x%02jx",
dev->ud_addr, xfer->ux_request.bmRequestType, 0, 0);
USBHIST_LOG(usbdebug, " req = 0x%02jx val = %jd "
"index = %jd",
xfer->ux_request.bRequest, UGETW(xfer->ux_request.wValue),
UGETW(xfer->ux_request.wIndex), 0);
USBHIST_LOG(usbdebug, " rlen = %jd length = %jd "
"actlen = %jd",
UGETW(xfer->ux_request.wLength),
xfer->ux_length, xfer->ux_actlen, 0);
}
#endif
if (actlen != NULL)
*actlen = xfer->ux_actlen;
SDT_PROBE7(usb, device, request, done,
dev, req, xfer->ux_actlen, flags, timeout, data, err);
usbd_destroy_xfer(xfer);
if (err) {
USBHIST_LOG(usbdebug, "returning err = %jd", err, 0, 0, 0);
}
return err;
}
const struct usbd_quirks *
usbd_get_quirks(struct usbd_device *dev)
{
#ifdef DIAGNOSTIC
if (dev == NULL) {
printf("usbd_get_quirks: dev == NULL\n");
return 0;
}
#endif
return dev->ud_quirks;
}
/* XXX do periodic free() of free list */
/*
* Called from keyboard driver when in polling mode.
*/
void
usbd_dopoll(struct usbd_interface *iface)
{
iface->ui_dev->ud_bus->ub_methods->ubm_dopoll(iface->ui_dev->ud_bus);
}
/*
* This is for keyboard driver as well, which only operates in polling
* mode from the "ask root", etc., prompts and from DDB.
*/
void
usbd_set_polling(struct usbd_device *dev, int on)
{
mutex_enter(dev->ud_bus->ub_lock);
if (on) {
/*
* Enabling polling. If we're enabling for the first
* time, call the softint routine on transition while
* we hold the lock and polling is still disabled, and
* then enable polling -- once polling is enabled, we
* must not hold the lock when we call the softint
* routine.
*/
KASSERT(dev->ud_bus->ub_usepolling < __type_max(char));
if (dev->ud_bus->ub_usepolling == 0)
dev->ud_bus->ub_methods->ubm_softint(dev->ud_bus);
dev->ud_bus->ub_usepolling++;
} else {
/*
* Disabling polling. If we're disabling polling for
* the last time, disable polling first and then call
* the softint routine while we hold the lock -- until
* polling is disabled, we must not hold the lock when
* we call the softint routine.
*/
KASSERT(dev->ud_bus->ub_usepolling > 0);
dev->ud_bus->ub_usepolling--;
if (dev->ud_bus->ub_usepolling == 0)
dev->ud_bus->ub_methods->ubm_softint(dev->ud_bus);
}
mutex_exit(dev->ud_bus->ub_lock);
}
usb_endpoint_descriptor_t *
usbd_get_endpoint_descriptor(struct usbd_interface *iface, uint8_t address)
{
struct usbd_endpoint *ep;
int i;
for (i = 0; i < iface->ui_idesc->bNumEndpoints; i++) {
ep = &iface->ui_endpoints[i];
if (ep->ue_edesc->bEndpointAddress == address)
return iface->ui_endpoints[i].ue_edesc;
}
return NULL;
}
/*
* usbd_ratecheck() can limit the number of error messages that occur.
* When a device is unplugged it may take up to 0.25s for the hub driver
* to notice it. If the driver continuously tries to do I/O operations
* this can generate a large number of messages.
*/
int
usbd_ratecheck(struct timeval *last)
{
static struct timeval errinterval = { 0, 250000 }; /* 0.25 s */
return ratecheck(last, &errinterval);
}
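/*
 * Illustrative sketch (not part of this file): a driver would keep a
 * static timestamp and gate its error message on usbd_ratecheck(),
 * assuming a driver-local "lasterr" and error code "err":
 *
 *	static struct timeval lasterr;
 *
 *	if (usbd_ratecheck(&lasterr))
 *		printf("%s: transfer failed: %s\n",
 *		    device_xname(sc->sc_dev), usbd_errstr(err));
 */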
/*
* Search for a vendor/product pair in an array. The item size is
* given as an argument.
*/
const struct usb_devno *
usb_match_device(const struct usb_devno *tbl, u_int nentries, u_int sz,
uint16_t vendor, uint16_t product)
{
while (nentries-- > 0) {
uint16_t tproduct = tbl->ud_product;
if (tbl->ud_vendor == vendor &&
(tproduct == product || tproduct == USB_PRODUCT_ANY))
return tbl;
tbl = (const struct usb_devno *)((const char *)tbl + sz);
}
return NULL;
}
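/*
 * Illustrative sketch (not part of this file): a driver match routine
 * would normally consult its own device table (here a hypothetical
 * foo_devs[]) via the usb_lookup() wrapper around usb_match_device():
 *
 *	struct usb_attach_arg *uaa = aux;
 *
 *	return usb_lookup(foo_devs, uaa->uaa_vendor, uaa->uaa_product)
 *	    != NULL ? UMATCH_VENDOR_PRODUCT : UMATCH_NONE;
 */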
usbd_status
usbd_get_string(struct usbd_device *dev, int si, char *buf)
{
return usbd_get_string0(dev, si, buf, 1);
}
usbd_status
usbd_get_string0(struct usbd_device *dev, int si, char *buf, int unicode)
{
int swap = dev->ud_quirks->uq_flags & UQ_SWAP_UNICODE;
usb_string_descriptor_t us;
char *s;
int i, n;
uint16_t c;
usbd_status err;
int size;
USBHIST_FUNC(); USBHIST_CALLED(usbdebug);
buf[0] = '\0';
if (si == 0)
return USBD_INVAL;
if (dev->ud_quirks->uq_flags & UQ_NO_STRINGS)
return USBD_STALLED;
if (dev->ud_langid == USBD_NOLANG) {
/* Set up default language */
err = usbd_get_string_desc(dev, USB_LANGUAGE_TABLE, 0, &us,
&size);
if (err || size < 4) {
USBHIST_LOG(usbdebug, "getting lang failed, using 0",
0, 0, 0, 0);
dev->ud_langid = 0; /* Well, just pick something then */
} else {
/* Pick the first language as the default. */
dev->ud_langid = UGETW(us.bString[0]);
}
}
err = usbd_get_string_desc(dev, si, dev->ud_langid, &us, &size);
if (err)
return err;
s = buf;
n = size / 2 - 1;
if (unicode) {
for (i = 0; i < n; i++) {
c = UGETW(us.bString[i]);
if (swap)
c = (c >> 8) | (c << 8);
s += wput_utf8(s, 3, c);
}
*s++ = 0;
}
#ifdef COMPAT_30
else {
for (i = 0; i < n; i++) {
c = UGETW(us.bString[i]);
if (swap)
c = (c >> 8) | (c << 8);
*s++ = (c < 0x80) ? c : '?';
}
*s++ = 0;
}
#endif
return USBD_NORMAL_COMPLETION;
}
/*
* usbd_xfer_trycomplete(xfer)
*
* Try to claim xfer for completion. Return true if successful,
* false if the xfer has been synchronously aborted or has timed
* out.
*
* If this returns true, caller is responsible for setting
* xfer->ux_status and calling usb_transfer_complete. To be used
* in a host controller interrupt handler.
*
* Caller must either hold the bus lock or have the bus in polling
* mode. If this succeeds, caller must proceed to call
* usb_transfer_complete under the bus lock or with polling
* enabled -- must not release and reacquire the bus lock in the
* meantime. Failing to heed this rule may lead to catastrophe
* with abort or timeout.
*/
bool
usbd_xfer_trycomplete(struct usbd_xfer *xfer)
{
struct usbd_bus *bus __diagused = xfer->ux_bus;
KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock));
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "xfer %#jx status %jd",
(uintptr_t)xfer, xfer->ux_status, 0, 0);
/*
* If software has completed it, either by synchronous abort or
* by timeout, too late.
*/
if (xfer->ux_status != USBD_IN_PROGRESS)
return false;
/*
* We are completing the xfer. Cancel the timeout if we can,
* but only asynchronously. See usbd_xfer_cancel_timeout_async
* for why we need not wait for the callout or task here.
*/
usbd_xfer_cancel_timeout_async(xfer);
/* Success! Note: Caller must set xfer->ux_status afterward. */
return true;
}
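/*
 * Illustrative sketch (not part of this file): a host controller
 * interrupt path (here a hypothetical foo_hci driver that has just
 * retired "xfer" with "len" bytes transferred) is expected to use
 * this roughly as follows:
 *
 *	mutex_enter(sc->sc_bus.ub_lock);
 *	if (usbd_xfer_trycomplete(xfer)) {
 *		xfer->ux_actlen = len;
 *		xfer->ux_status = USBD_NORMAL_COMPLETION;
 *		usb_transfer_complete(xfer);
 *	}
 *	mutex_exit(sc->sc_bus.ub_lock);
 */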
/*
* usbd_xfer_abort(xfer)
*
* Try to claim xfer to abort. If successful, mark it completed
* with USBD_CANCELLED and call the bus-specific method to abort
* at the hardware level.
*
* To be called in thread context from struct
* usbd_pipe_methods::upm_abort.
*
* Caller must hold the bus lock.
*/
void
usbd_xfer_abort(struct usbd_xfer *xfer)
{
struct usbd_bus *bus = xfer->ux_bus;
KASSERT(mutex_owned(bus->ub_lock));
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "xfer %#jx status %jd",
(uintptr_t)xfer, xfer->ux_status, 0, 0);
/*
* If host controller interrupt or timer interrupt has
* completed it, too late. But the xfer cannot be
* cancelled already -- only one caller can synchronously
* abort.
*/
KASSERT(xfer->ux_status != USBD_CANCELLED);
if (xfer->ux_status != USBD_IN_PROGRESS)
return;
/*
* Cancel the timeout if we can, but only asynchronously; see
* usbd_xfer_cancel_timeout_async for why we need not wait for
* the callout or task here.
*/
usbd_xfer_cancel_timeout_async(xfer);
/*
* We beat everyone else. Claim the status as cancelled, do
* the bus-specific dance to abort the hardware, and complete
* the xfer.
*/
xfer->ux_status = USBD_CANCELLED;
bus->ub_methods->ubm_abortx(xfer);
usb_transfer_complete(xfer);
}
/*
* usbd_xfer_timeout(xfer)
*
* Called at IPL_SOFTCLOCK when too much time has elapsed waiting
* for xfer to complete. Since we can't abort the xfer at
* IPL_SOFTCLOCK, defer to a usb_task to run it in thread context,
* unless the xfer has completed or aborted concurrently -- and if
* the xfer has also been resubmitted, take care of rescheduling
* the callout.
*/
static void
usbd_xfer_timeout(void *cookie)
{
struct usbd_xfer *xfer = cookie;
struct usbd_bus *bus = xfer->ux_bus;
struct usbd_device *dev = xfer->ux_pipe->up_dev;
/* Acquire the lock so we can transition the timeout state. */
mutex_enter(bus->ub_lock);
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "xfer %#jx status %jd",
(uintptr_t)xfer, xfer->ux_status, 0, 0);
/*
* Use usbd_xfer_probe_timeout to check whether the timeout is
* still valid, or to reschedule the callout if necessary. If
* it is still valid, schedule the task.
*/
if (usbd_xfer_probe_timeout(xfer)) {
USBHIST_LOG(usbdebug, "xfer %#jx schedule timeout task",
(uintptr_t)xfer, 0, 0, 0);
usb_add_task(dev, &xfer->ux_aborttask, USB_TASKQ_HC);
} else {
USBHIST_LOG(usbdebug, "xfer %#jx timeout cancelled",
(uintptr_t)xfer, 0, 0, 0);
}
/*
* Notify usbd_xfer_cancel_timeout_async that we may have
* scheduled the task. This causes callout_invoking to return
* false in usbd_xfer_cancel_timeout_async so that it can tell
* which stage in the callout->task->abort process we're at.
*/
callout_ack(&xfer->ux_callout);
/* All done -- release the lock. */
mutex_exit(bus->ub_lock);
}
/*
* usbd_xfer_timeout_task(xfer)
*
* Called in thread context when too much time has elapsed waiting
* for xfer to complete. Abort the xfer with USBD_TIMEOUT, unless
* it has completed or aborted concurrently -- and if the xfer has
* also been resubmitted, take care of rescheduling the callout.
*/
static void
usbd_xfer_timeout_task(void *cookie)
{
struct usbd_xfer *xfer = cookie;
struct usbd_bus *bus = xfer->ux_bus;
/* Acquire the lock so we can transition the timeout state. */
mutex_enter(bus->ub_lock);
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "xfer %#jx status %jd",
(uintptr_t)xfer, xfer->ux_status, 0, 0);
/*
* Use usbd_xfer_probe_timeout to check whether the timeout is
* still valid, or to reschedule the callout if necessary. If
* it is not valid -- the timeout has been asynchronously
* cancelled, or the xfer has already been resubmitted -- then
* we're done here.
*/
if (!usbd_xfer_probe_timeout(xfer)) {
USBHIST_LOG(usbdebug, "xfer %#jx timeout cancelled",
(uintptr_t)xfer, 0, 0, 0);
goto out;
}
/*
* After this point, no further timeout probing will happen for
* the current incarnation of the timeout, so make the next
* usbd_xfer_schedule_timeout schedule a new callout.
* usbd_xfer_probe_timeout has already processed any reset.
*/
KASSERT(!xfer->ux_timeout_reset);
xfer->ux_timeout_set = false;
/*
* May have completed or been aborted, but we're the only one
* who can time it out. If it has completed or been aborted,
* no need to timeout.
*/
KASSERT(xfer->ux_status != USBD_TIMEOUT);
if (xfer->ux_status != USBD_IN_PROGRESS) {
USBHIST_LOG(usbdebug, "xfer %#jx timeout raced",
(uintptr_t)xfer, 0, 0, 0);
goto out;
}
/*
* We beat everyone else. Claim the status as timed out, do
* the bus-specific dance to abort the hardware, and complete
* the xfer.
*/
USBHIST_LOG(usbdebug, "xfer %#jx timed out",
(uintptr_t)xfer, 0, 0, 0);
xfer->ux_status = USBD_TIMEOUT;
bus->ub_methods->ubm_abortx(xfer);
usb_transfer_complete(xfer);
out: /* All done -- release the lock. */
mutex_exit(bus->ub_lock);
}
/*
* usbd_xfer_probe_timeout(xfer)
*
* Probe the status of xfer's timeout. Acknowledge and process a
* request to reschedule. Return true if the timeout is still
* valid and the caller should take further action (queueing a
* task or aborting the xfer), false if it must stop here.
*/
static bool
usbd_xfer_probe_timeout(struct usbd_xfer *xfer)
{
struct usbd_bus *bus = xfer->ux_bus;
bool valid;
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "xfer %#jx timeout %jdms"
" set %jd reset %jd",
(uintptr_t)xfer, xfer->ux_timeout,
xfer->ux_timeout_set, xfer->ux_timeout_reset);
KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock));
/* The timeout must be set. */
KASSERT(xfer->ux_timeout_set);
/*
* Neither callout nor task may be pending; they execute
* alternately in lock step.
*/
KASSERT(!callout_pending(&xfer->ux_callout));
KASSERT(!usb_task_pending(xfer->ux_pipe->up_dev, &xfer->ux_aborttask));
/* There are a few cases... */
if (bus->ub_methods->ubm_dying(bus)) {
/* Host controller dying. Drop it all on the floor. */
USBHIST_LOG(usbdebug, "xfer %#jx bus dying, not rescheduling",
(uintptr_t)xfer, 0, 0, 0);
xfer->ux_timeout_set = false;
xfer->ux_timeout_reset = false;
valid = false;
} else if (xfer->ux_timeout_reset) {
/*
* The xfer completed _and_ got resubmitted while we
* waited for the lock. Acknowledge the request to
* reschedule, and reschedule it if there is a timeout
* and the bus is not polling.
*/
xfer->ux_timeout_reset = false;
if (xfer->ux_timeout && !bus->ub_usepolling) {
USBHIST_LOG(usbdebug, "xfer %#jx resubmitted,"
" rescheduling timer for %jdms",
(uintptr_t)xfer, xfer->ux_timeout, 0, 0);
KASSERT(xfer->ux_timeout_set);
callout_schedule(&xfer->ux_callout,
mstohz(xfer->ux_timeout));
} else {
/* No more callout or task scheduled. */
USBHIST_LOG(usbdebug, "xfer %#jx resubmitted"
" and completed, not rescheduling",
(uintptr_t)xfer, 0, 0, 0);
xfer->ux_timeout_set = false;
}
valid = false;
} else if (xfer->ux_status != USBD_IN_PROGRESS) {
/*
* The xfer has completed by hardware completion or by
* software abort, and has not been resubmitted, so the
* timeout must be unset, and is no longer valid for
* the caller.
*/
USBHIST_LOG(usbdebug, "xfer %#jx timeout lost race,"
" status=%jd, not rescheduling",
(uintptr_t)xfer, xfer->ux_status, 0, 0);
xfer->ux_timeout_set = false;
valid = false;
} else {
/*
* The xfer has not yet completed, so the timeout is
* valid.
*/
USBHIST_LOG(usbdebug, "xfer %#jx timing out",
(uintptr_t)xfer, 0, 0, 0);
valid = true;
}
/* Any reset must have been processed. */
KASSERT(!xfer->ux_timeout_reset);
/*
* Either we claim the timeout is set, or the callout is idle.
* If the timeout is still set, we may be handing off to the
* task instead, so this is an if but not an iff.
*/
KASSERT(xfer->ux_timeout_set || !callout_pending(&xfer->ux_callout));
/*
* The task must be idle now.
*
* - If the caller is the callout, _and_ the timeout is still
* valid, the caller will schedule it, but it hasn't been
* scheduled yet. (If the timeout is not valid, the task
* should not be scheduled.)
*
* - If the caller is the task, it cannot be scheduled again
* until the callout runs again, which won't happen until we
* next release the lock.
*/
KASSERT(!usb_task_pending(xfer->ux_pipe->up_dev, &xfer->ux_aborttask));
KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock));
return valid;
}
/*
* usbd_xfer_schedule_timeout(xfer)
*
* Ensure that xfer has a timeout. If the callout is already
* queued or the task is already running, request that they
* reschedule the callout. If not, and if we're not polling,
* schedule the callout anew.
*
* To be called in thread context from struct
* usbd_pipe_methods::upm_start.
*/
void
usbd_xfer_schedule_timeout(struct usbd_xfer *xfer)
{
struct usbd_bus *bus = xfer->ux_bus;
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "xfer %#jx timeout %jdms"
" set %jd reset %jd",
(uintptr_t)xfer, xfer->ux_timeout,
xfer->ux_timeout_set, xfer->ux_timeout_reset);
KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock));
KASSERTMSG(xfer->ux_status == USBD_IN_PROGRESS, "xfer=%p status=%d",
xfer, xfer->ux_status);
if (xfer->ux_timeout_set) {
/*
* Callout or task has fired from a prior completed
* xfer but has not yet noticed that the xfer is done.
* Ask it to reschedule itself to ux_timeout.
*/
xfer->ux_timeout_reset = true;
} else if (xfer->ux_timeout && !bus->ub_usepolling) {
/* Callout is not scheduled. Schedule it. */
KASSERT(!callout_pending(&xfer->ux_callout));
callout_schedule(&xfer->ux_callout, mstohz(xfer->ux_timeout));
xfer->ux_timeout_set = true;
}
KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock));
}
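/*
 * Illustrative sketch (not part of this file): a host controller's
 * upm_start method (here a hypothetical foo_device_bulk_start) marks
 * the xfer in progress, hands it to the hardware, and then arms the
 * timeout:
 *
 *	xfer->ux_status = USBD_IN_PROGRESS;
 *	... queue the transfer descriptors to the hardware ...
 *	usbd_xfer_schedule_timeout(xfer);
 *	return USBD_IN_PROGRESS;
 */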
/*
* usbd_xfer_cancel_timeout_async(xfer)
*
* Cancel the callout and the task of xfer, which have not yet run
* to completion, but don't wait for the callout or task to finish
* running.
*
* If they have already fired, at worst they are waiting for the
* bus lock. They will see that the xfer is no longer in progress
* and give up, or they will see that the xfer has been
* resubmitted with a new timeout and reschedule the callout.
*
* If a resubmitted request completed so fast that the callout
* didn't have time to process a timer reset, just cancel the
* timer reset.
*/
static void
usbd_xfer_cancel_timeout_async(struct usbd_xfer *xfer)
{
struct usbd_bus *bus __diagused = xfer->ux_bus;
KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock));
USBHIST_FUNC();
USBHIST_CALLARGS(usbdebug, "xfer %#jx timeout %jdms"
" set %jd reset %jd",
(uintptr_t)xfer, xfer->ux_timeout,
xfer->ux_timeout_set, xfer->ux_timeout_reset);
/*
* If the timer wasn't running anyway, forget about it. This
* can happen if we are completing an isochronous transfer
* which doesn't use the same timeout logic.
*/
if (!xfer->ux_timeout_set) {
USBHIST_LOG(usbdebug, "xfer %#jx timer not running",
(uintptr_t)xfer, 0, 0, 0);
return;
}
xfer->ux_timeout_reset = false;
if (!callout_stop(&xfer->ux_callout)) {
/*
* We stopped the callout before it ran. The timeout
* is no longer set.
*/
USBHIST_LOG(usbdebug, "xfer %#jx timer stopped",
(uintptr_t)xfer, 0, 0, 0);
xfer->ux_timeout_set = false;
} else if (callout_invoking(&xfer->ux_callout)) {
/*
* The callout has begun to run but it has not yet
* acquired the lock and called callout_ack. The task
* cannot be queued yet, and the callout cannot have
* been rescheduled yet.
*
* By the time the callout acquires the lock, we will
* have transitioned from USBD_IN_PROGRESS to a
* completed status, and possibly also resubmitted the
* xfer and set xfer->ux_timeout_reset = true. In both
* cases, the callout will DTRT, so no further action
* is needed here.
*/
USBHIST_LOG(usbdebug, "xfer %#jx timer fired",
(uintptr_t)xfer, 0, 0, 0);
} else if (usb_rem_task(xfer->ux_pipe->up_dev, &xfer->ux_aborttask)) {
/*
* The callout had fired and scheduled the task, but we
* stopped the task before it could run. The timeout
* is therefore no longer set -- the next resubmission
* of the xfer must schedule a new timeout.
*
* The callout should not be pending at this point:
* it is scheduled only under the lock, and only when
* xfer->ux_timeout_set is false, or by the callout or
* task itself when xfer->ux_timeout_reset is true.
*/
USBHIST_LOG(usbdebug, "xfer %#jx task fired",
(uintptr_t)xfer, 0, 0, 0);
xfer->ux_timeout_set = false;
} else {
USBHIST_LOG(usbdebug, "xfer %#jx task stopped",
(uintptr_t)xfer, 0, 0, 0);
}
/*
* The callout cannot be scheduled and the task cannot be
* queued at this point. Either we cancelled them, or they are
* already running and waiting for the bus lock.
*/
KASSERT(!callout_pending(&xfer->ux_callout));
KASSERT(!usb_task_pending(xfer->ux_pipe->up_dev, &xfer->ux_aborttask));
KASSERT(bus->ub_usepolling || mutex_owned(bus->ub_lock));
}
/* $NetBSD: sys_ptrace.c,v 1.12 2022/07/10 14:07:55 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_ptrace.c,v 1.12 2022/07/10 14:07:55 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ptrace.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/exec.h>
#include <sys/pax.h>
#include <sys/ptrace.h>
#include <sys/uio.h>
#include <sys/ras.h>
#include <sys/kmem.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/syscallvar.h>
#include <sys/syscall.h>
#include <sys/module.h>
#include <uvm/uvm_extern.h>
#include <machine/reg.h>
/*
* PTRACE methods
*/
static int
ptrace_copyin_piod(struct ptrace_io_desc *piod, const void *addr, size_t len)
{
if (len != 0 && sizeof(*piod) != len)
return EINVAL;
return copyin(addr, piod, sizeof(*piod));
}
static int
ptrace_copyout_piod(const struct ptrace_io_desc *piod, void *addr, size_t len)
{
if (len != 0 && sizeof(*piod) != len)
return EINVAL;
return copyout(piod, addr, sizeof(*piod));
}
static int
ptrace_copyin_siginfo(struct ptrace_siginfo *psi, const void *addr, size_t len)
{
if (sizeof(*psi) != len)
return EINVAL;
return copyin(addr, psi, sizeof(*psi));
}
static int
ptrace_copyout_siginfo(const struct ptrace_siginfo *psi, void *addr, size_t len)
{
if (sizeof(*psi) != len)
return EINVAL;
return copyout(psi, addr, sizeof(*psi));
}
static int
ptrace_copyout_lwpstatus(const struct ptrace_lwpstatus *pls, void *addr,
size_t len)
{
return copyout(pls, addr, len);
}
static struct ptrace_methods native_ptm = {
.ptm_copyin_piod = ptrace_copyin_piod,
.ptm_copyout_piod = ptrace_copyout_piod,
.ptm_copyin_siginfo = ptrace_copyin_siginfo,
.ptm_copyout_siginfo = ptrace_copyout_siginfo,
.ptm_copyout_lwpstatus = ptrace_copyout_lwpstatus,
.ptm_doregs = process_doregs,
.ptm_dofpregs = process_dofpregs,
.ptm_dodbregs = process_dodbregs,
};
static const struct syscall_package ptrace_syscalls[] = {
{ SYS_ptrace, 0, (sy_call_t *)sys_ptrace },
{ 0, 0, NULL },
};
/*
* Process debugging system call.
*/
int
sys_ptrace(struct lwp *l, const struct sys_ptrace_args *uap, register_t *retval)
{
/* {
syscallarg(int) req;
syscallarg(pid_t) pid;
syscallarg(void *) addr;
syscallarg(int) data;
} */
return do_ptrace(&native_ptm, l, SCARG(uap, req), SCARG(uap, pid),
SCARG(uap, addr), SCARG(uap, data), retval);
}
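/*
 * Illustrative sketch (userland side, not part of this file): a
 * debugger typically attaches to and resumes a traced process (pid)
 * roughly as follows:
 *
 *	if (ptrace(PT_ATTACH, pid, NULL, 0) == -1)
 *		err(EXIT_FAILURE, "PT_ATTACH");
 *	waitpid(pid, &status, 0);
 *	ptrace(PT_CONTINUE, pid, (void *)1, 0);
 *	ptrace(PT_DETACH, pid, (void *)1, 0);
 */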
#define DEPS "ptrace_common"
MODULE(MODULE_CLASS_EXEC, ptrace, DEPS);
static int
ptrace_init(void)
{
int error;
error = syscall_establish(&emul_netbsd, ptrace_syscalls);
return error;
}
static int
ptrace_fini(void)
{
int error;
error = syscall_disestablish(&emul_netbsd, ptrace_syscalls);
return error;
}
static int
ptrace_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = ptrace_init();
break;
case MODULE_CMD_FINI:
error = ptrace_fini();
break;
default:
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: uvm_50.c,v 1.3 2020/09/05 16:30:10 riastradh Exp $ */
/*-
* Copyright (c) 2018 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_50.c,v 1.3 2020/09/05 16:30:10 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#if defined(_KERNEL) || defined(_MODULE)
#if defined(_KERNEL_OPT)
#include "opt_vmswap.h"
#else
#define VMSWAP /* XXX */
#endif
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <uvm/uvm_swap.h>
#include <compat/sys/uvm.h>
static void
swapent50_cvt(void *p, const struct swapent *se)
{
struct swapent50 *sep50 = p;
sep50->se50_dev = se->se_dev;
sep50->se50_flags = se->se_flags;
sep50->se50_nblks = se->se_nblks;
sep50->se50_inuse = se->se_inuse;
sep50->se50_priority = se->se_priority;
KASSERT(sizeof(se->se_path) <= sizeof(sep50->se50_path));
strcpy(sep50->se50_path, se->se_path);
}
static int
compat_uvm_swap_stats50(const struct sys_swapctl_args *uap, register_t *retval)
{
return uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
swapent50_cvt, sizeof(struct swapent50), retval);
}
void
uvm_50_init(void)
{
uvm_swap_stats50 = compat_uvm_swap_stats50;
}
void
uvm_50_fini(void)
{
uvm_swap_stats50 = (void *)enosys;
}
/* $NetBSD: mld6.c,v 1.101 2019/09/25 09:53:38 ozaki-r Exp $ */
/* $KAME: mld6.c,v 1.25 2001/01/16 14:14:18 itojun Exp $ */
/*
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
/*
* Copyright (c) 1988 Stephen Deering.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)igmp.c 8.1 (Berkeley) 7/19/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: mld6.c,v 1.101 2019/09/25 09:53:38 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/sysctl.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/cprng.h>
#include <sys/rwlock.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/icmp6.h>
#include <netinet6/icmp6_private.h>
#include <netinet6/mld6_var.h>
static krwlock_t in6_multilock __cacheline_aligned;
/*
* Protocol constants
*/
/*
* time between repetitions of a node's initial report of interest in a
* multicast address(in seconds)
*/
#define MLD_UNSOLICITED_REPORT_INTERVAL 10
static struct ip6_pktopts ip6_opts;
static void mld_start_listening(struct in6_multi *);
static void mld_stop_listening(struct in6_multi *);
static struct mld_hdr *mld_allocbuf(struct mbuf **, struct in6_multi *, int);
static void mld_sendpkt(struct in6_multi *, int, const struct in6_addr *);
static void mld_starttimer(struct in6_multi *);
static void mld_stoptimer(struct in6_multi *);
static u_long mld_timerresid(struct in6_multi *);
static void in6m_ref(struct in6_multi *);
static void in6m_unref(struct in6_multi *);
static void in6m_destroy(struct in6_multi *);
void
mld_init(void)
{
static u_int8_t hbh_buf[8];
struct ip6_hbh *hbh = (struct ip6_hbh *)hbh_buf;
u_int16_t rtalert_code = htons((u_int16_t)IP6OPT_RTALERT_MLD);
/* ip6h_nxt will be filled in later */
hbh->ip6h_len = 0; /* (8 >> 3) - 1 */
/* XXX: grotty hard coding... */
hbh_buf[2] = IP6OPT_PADN; /* 2 byte padding */
hbh_buf[3] = 0;
hbh_buf[4] = IP6OPT_RTALERT;
hbh_buf[5] = IP6OPT_RTALERT_LEN - 2;
memcpy(&hbh_buf[6], (void *)&rtalert_code, sizeof(u_int16_t));
ip6_opts.ip6po_hbh = hbh;
/* We will specify the hoplimit by a multicast option. */
ip6_opts.ip6po_hlim = -1;
ip6_opts.ip6po_prefer_tempaddr = IP6PO_TEMPADDR_NOTPREFER;
rw_init(&in6_multilock);
}
static void
mld_starttimer(struct in6_multi *in6m)
{
struct timeval now;
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_timer != IN6M_TIMER_UNDEF,
"in6m_timer=%d", in6m->in6m_timer);
microtime(&now);
in6m->in6m_timer_expire.tv_sec = now.tv_sec + in6m->in6m_timer / hz;
in6m->in6m_timer_expire.tv_usec = now.tv_usec +
(in6m->in6m_timer % hz) * (1000000 / hz);
if (in6m->in6m_timer_expire.tv_usec > 1000000) {
in6m->in6m_timer_expire.tv_sec++;
in6m->in6m_timer_expire.tv_usec -= 1000000;
}
/* start or restart the timer */
callout_schedule(&in6m->in6m_timer_ch, in6m->in6m_timer);
}
/*
* mld_stoptimer releases in6_multilock when calling callout_halt.
* The caller must ensure in6m won't be freed while releasing the lock.
*/
static void
mld_stoptimer(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
if (in6m->in6m_timer == IN6M_TIMER_UNDEF)
return;
rw_exit(&in6_multilock);
callout_halt(&in6m->in6m_timer_ch, NULL);
rw_enter(&in6_multilock, RW_WRITER);
in6m->in6m_timer = IN6M_TIMER_UNDEF;
}
static void
mld_timeo(void *arg)
{
struct in6_multi *in6m = arg;
KASSERTMSG(in6m->in6m_refcount > 0, "in6m_refcount=%d",
in6m->in6m_refcount);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
rw_enter(&in6_multilock, RW_WRITER);
if (in6m->in6m_timer == IN6M_TIMER_UNDEF)
goto out;
in6m->in6m_timer = IN6M_TIMER_UNDEF;
switch (in6m->in6m_state) {
case MLD_REPORTPENDING:
mld_start_listening(in6m);
break;
default:
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
break;
}
out:
rw_exit(&in6_multilock);
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
static u_long
mld_timerresid(struct in6_multi *in6m)
{
struct timeval now, diff;
microtime(&now);
if (now.tv_sec > in6m->in6m_timer_expire.tv_sec ||
(now.tv_sec == in6m->in6m_timer_expire.tv_sec &&
now.tv_usec > in6m->in6m_timer_expire.tv_usec)) {
return (0);
}
diff = in6m->in6m_timer_expire;
diff.tv_sec -= now.tv_sec;
diff.tv_usec -= now.tv_usec;
if (diff.tv_usec < 0) {
diff.tv_sec--;
diff.tv_usec += 1000000;
}
/* return the remaining time in milliseconds */
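/* (e.g. an expiry 1.5 seconds away yields 1500; 0 means already expired) */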
return diff.tv_sec * 1000 + diff.tv_usec / 1000;
}
static void
mld_start_listening(struct in6_multi *in6m)
{
struct in6_addr all_in6;
KASSERT(rw_write_held(&in6_multilock));
/*
* RFC2710 page 10:
* The node never sends a Report or Done for the link-scope all-nodes
* address.
* MLD messages are never sent for multicast addresses whose scope is 0
* (reserved) or 1 (node-local).
*/
all_in6 = in6addr_linklocal_allnodes;
if (in6_setscope(&all_in6, in6m->in6m_ifp, NULL)) {
/* XXX: this should not happen! */
in6m->in6m_timer = 0;
in6m->in6m_state = MLD_OTHERLISTENER;
}
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) ||
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) {
in6m->in6m_timer = IN6M_TIMER_UNDEF;
in6m->in6m_state = MLD_OTHERLISTENER;
} else {
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_timer = cprng_fast32() %
(MLD_UNSOLICITED_REPORT_INTERVAL * hz);
in6m->in6m_state = MLD_IREPORTEDLAST;
mld_starttimer(in6m);
}
}
static void
mld_stop_listening(struct in6_multi *in6m)
{
struct in6_addr allnode, allrouter;
KASSERT(rw_lock_held(&in6_multilock));
allnode = in6addr_linklocal_allnodes;
if (in6_setscope(&allnode, in6m->in6m_ifp, NULL)) {
/* XXX: this should not happen! */
return;
}
allrouter = in6addr_linklocal_allrouters;
if (in6_setscope(&allrouter, in6m->in6m_ifp, NULL)) {
/* XXX impossible */
return;
}
if (in6m->in6m_state == MLD_IREPORTEDLAST &&
(!IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &allnode)) &&
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) >
IPV6_ADDR_SCOPE_INTFACELOCAL) {
mld_sendpkt(in6m, MLD_LISTENER_DONE, &allrouter);
}
}
void
mld_input(struct mbuf *m, int off)
{
struct ip6_hdr *ip6;
struct mld_hdr *mldh;
struct ifnet *ifp;
struct in6_multi *in6m = NULL;
struct in6_addr mld_addr, all_in6;
u_long timer = 0; /* timer value in the MLD query header */
struct psref psref;
ifp = m_get_rcvif_psref(m, &psref);
if (__predict_false(ifp == NULL))
goto out;
IP6_EXTHDR_GET(mldh, struct mld_hdr *, m, off, sizeof(*mldh));
if (mldh == NULL) {
ICMP6_STATINC(ICMP6_STAT_TOOSHORT);
goto out_nodrop;
}
ip6 = mtod(m, struct ip6_hdr *);
/* source address validation */
if (!IN6_IS_ADDR_LINKLOCAL(&ip6->ip6_src)) {
/*
* RFC3590 allows the IPv6 unspecified address as the source
* address of MLD report and done messages. However, as this
* same document says, this special rule is for snooping
* switches and the RFC requires routers to discard MLD packets
* with the unspecified source address. The RFC only talks
* about hosts receiving an MLD query or report in Security
* Considerations, but this is probably the correct intention.
* RFC3590 does not talk about other cases than link-local and
* the unspecified source addresses, but we believe the same
* rule should be applied.
* As a result, we only allow link-local addresses as the
* source address; otherwise, simply discard the packet.
*/
#if 0
/*
* XXX: do not log in an input path to avoid log flooding,
* though RFC3590 says "SHOULD log" if the source of a query
* is the unspecified address.
*/
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufm[INET6_ADDRSTRLEN];
log(LOG_INFO,
"mld_input: src %s is not link-local (grp=%s)\n",
IN6_PRINT(ip6bufs,&ip6->ip6_src),
IN6_PRINT(ip6bufm, &mldh->mld_addr));
#endif
goto out;
}
/*
* make a copy for local work (in6_setscope() may modify the 1st arg)
*/
mld_addr = mldh->mld_addr;
if (in6_setscope(&mld_addr, ifp, NULL)) {
/* XXX: this should not happen! */
goto out;
}
/*
* In the MLD specification, there are 3 states and a flag.
*
* In Non-Listener state, we simply don't have a membership record.
* In Delaying Listener state, our timer is running (in6m->in6m_timer)
* In Idle Listener state, our timer is not running
* (in6m->in6m_timer==IN6M_TIMER_UNDEF)
*
* The flag is in6m->in6m_state, it is set to MLD_OTHERLISTENER if
* we have heard a report from another member, or MLD_IREPORTEDLAST
* if we sent the last report.
*/
switch (mldh->mld_type) {
case MLD_LISTENER_QUERY: {
struct in6_multi *next;
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) &&
!IN6_IS_ADDR_MULTICAST(&mld_addr))
break; /* print error or log stat? */
all_in6 = in6addr_linklocal_allnodes;
if (in6_setscope(&all_in6, ifp, NULL)) {
/* XXX: this should not happen! */
break;
}
/*
* - Start the timers in all of our membership records
* that the query applies to for the interface on
* which the query arrived excl. those that belong
* to the "all-nodes" group (ff02::1).
* - Restart any timer that is already running but has
* a value longer than the requested timeout.
* - Use the value specified in the query message as
* the maximum timeout.
*/
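/*
* mld_maxdelay is the maximum response delay in milliseconds; it is
* converted to a random number of clock ticks further below.
*/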
timer = ntohs(mldh->mld_maxdelay);
rw_enter(&in6_multilock, RW_WRITER);
/*
* mld_stoptimer and mld_sendpkt release in6_multilock
* temporarily, so we have to prevent in6m from being freed
* while releasing the lock by having an extra reference to it.
*
* Also in6_purge_multi might remove items from the list of the
* ifp while releasing the lock. Fortunately in6_purge_multi is
* never executed as long as we have a psref of the ifp.
*/
LIST_FOREACH_SAFE(in6m, &ifp->if_multiaddrs, in6m_entry, next) {
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, &all_in6) ||
IPV6_ADDR_MC_SCOPE(&in6m->in6m_addr) <
IPV6_ADDR_SCOPE_LINKLOCAL)
continue;
if (in6m->in6m_state == MLD_REPORTPENDING)
continue; /* we are not yet ready */
if (!IN6_IS_ADDR_UNSPECIFIED(&mld_addr) &&
!IN6_ARE_ADDR_EQUAL(&mld_addr, &in6m->in6m_addr))
continue;
if (timer == 0) {
in6m_ref(in6m);
/* send a report immediately */
mld_stoptimer(in6m);
mld_sendpkt(in6m, MLD_LISTENER_REPORT, NULL);
in6m->in6m_state = MLD_IREPORTEDLAST;
in6m_unref(in6m); /* May free in6m */
} else if (in6m->in6m_timer == IN6M_TIMER_UNDEF ||
mld_timerresid(in6m) > timer) {
in6m->in6m_timer =
1 + (cprng_fast32() % timer) * hz / 1000;
mld_starttimer(in6m);
}
}
rw_exit(&in6_multilock);
break;
}
case MLD_LISTENER_REPORT:
/*
* For fast leave to work, we have to know that we are the
* last person to send a report for this group. Reports
* can potentially get looped back if we are a multicast
* router, so discard reports sourced by me.
* Note that it is impossible to check IFF_LOOPBACK flag of
* ifp for this purpose, since ip6_mloopback passes the physical
* interface to looutput.
*/
if (m->m_flags & M_LOOP) /* XXX: grotty flag, but efficient */
break;
if (!IN6_IS_ADDR_MULTICAST(&mldh->mld_addr))
break;
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
rw_enter(&in6_multilock, RW_WRITER);
in6m = in6_lookup_multi(&mld_addr, ifp);
if (in6m) {
in6m_ref(in6m);
mld_stoptimer(in6m); /* transit to idle state */
in6m->in6m_state = MLD_OTHERLISTENER; /* clear flag */
in6m_unref(in6m);
in6m = NULL; /* in6m might be freed */
}
rw_exit(&in6_multilock);
break;
default: /* this is impossible */
#if 0
/*
* this case should be impossible because of filtering in
* icmp6_input(). But we explicitly disabled this part
* just in case.
*/
log(LOG_ERR, "mld_input: illegal type(%d)", mldh->mld_type);
#endif
break;
}
out:
m_freem(m);
out_nodrop:
m_put_rcvif_psref(ifp, &psref);
}
/*
* XXX mld_sendpkt must be called with in6_multilock held. It
* temporarily releases in6_multilock around the call to ip6_output
* to avoid locking against itself in ip6_output.
*/
static void
mld_sendpkt(struct in6_multi *in6m, int type, const struct in6_addr *dst)
{
struct mbuf *mh;
struct mld_hdr *mldh;
struct ip6_hdr *ip6 = NULL;
struct ip6_moptions im6o;
struct in6_ifaddr *ia = NULL;
struct ifnet *ifp = in6m->in6m_ifp;
int ignflags;
struct psref psref;
int bound;
KASSERT(rw_write_held(&in6_multilock));
/*
* At first, find a link local address on the outgoing interface
* to use as the source address of the MLD packet.
* We do not reject tentative addresses for MLD report to deal with
* the case where we first join a link-local address.
*/
ignflags = (IN6_IFF_NOTREADY|IN6_IFF_ANYCAST) & ~IN6_IFF_TENTATIVE;
bound = curlwp_bind();
ia = in6ifa_ifpforlinklocal_psref(ifp, ignflags, &psref);
if (ia == NULL) {
curlwp_bindx(bound);
return;
}
if ((ia->ia6_flags & IN6_IFF_TENTATIVE)) {
ia6_release(ia, &psref);
ia = NULL;
}
/* Allocate two mbufs to store IPv6 header and MLD header */
mldh = mld_allocbuf(&mh, in6m, type);
if (mldh == NULL) {
ia6_release(ia, &psref);
curlwp_bindx(bound);
return;
}
/* fill src/dst here */
ip6 = mtod(mh, struct ip6_hdr *);
ip6->ip6_src = ia ? ia->ia_addr.sin6_addr : in6addr_any;
ip6->ip6_dst = dst ? *dst : in6m->in6m_addr;
ia6_release(ia, &psref);
curlwp_bindx(bound);
mldh->mld_addr = in6m->in6m_addr;
in6_clearscope(&mldh->mld_addr); /* XXX */
mldh->mld_cksum = in6_cksum(mh, IPPROTO_ICMPV6, sizeof(struct ip6_hdr),
sizeof(struct mld_hdr));
/* construct multicast option */
memset(&im6o, 0, sizeof(im6o));
im6o.im6o_multicast_if_index = if_get_index(ifp);
im6o.im6o_multicast_hlim = 1;
/*
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
im6o.im6o_multicast_loop = (ip6_mrouter != NULL);
/* increment output statistics */
ICMP6_STATINC(ICMP6_STAT_OUTHIST + type);
icmp6_ifstat_inc(ifp, ifs6_out_msg);
switch (type) {
case MLD_LISTENER_QUERY:
icmp6_ifstat_inc(ifp, ifs6_out_mldquery);
break;
case MLD_LISTENER_REPORT:
icmp6_ifstat_inc(ifp, ifs6_out_mldreport);
break;
case MLD_LISTENER_DONE:
icmp6_ifstat_inc(ifp, ifs6_out_mlddone);
break;
}
/* XXX we cannot call ip6_output with holding in6_multilock */
rw_exit(&in6_multilock);
ip6_output(mh, &ip6_opts, NULL, ia ? 0 : IPV6_UNSPECSRC,
&im6o, NULL, NULL);
rw_enter(&in6_multilock, RW_WRITER);
}
static struct mld_hdr *
mld_allocbuf(struct mbuf **mh, struct in6_multi *in6m, int type)
{
struct mbuf *md;
struct mld_hdr *mldh;
struct ip6_hdr *ip6;
/*
* Allocate mbufs to store ip6 header and MLD header.
* We allocate 2 mbufs and make chain in advance because
* it is more convenient when inserting the hop-by-hop option later.
*/
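/*
* The resulting chain is mh (struct ip6_hdr, 40 bytes) followed by
* md (struct mld_hdr, 24 bytes), i.e. a 64-byte packet before the
* hop-by-hop option is inserted by ip6_output.
*/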
MGETHDR(*mh, M_DONTWAIT, MT_HEADER);
if (*mh == NULL)
return NULL;
MGET(md, M_DONTWAIT, MT_DATA);
if (md == NULL) {
m_free(*mh);
*mh = NULL;
return NULL;
}
(*mh)->m_next = md;
md->m_next = NULL;
m_reset_rcvif((*mh));
(*mh)->m_pkthdr.len = sizeof(struct ip6_hdr) + sizeof(struct mld_hdr);
(*mh)->m_len = sizeof(struct ip6_hdr);
m_align(*mh, sizeof(struct ip6_hdr));
/* fill in the ip6 header */
ip6 = mtod(*mh, struct ip6_hdr *);
memset(ip6, 0, sizeof(*ip6));
ip6->ip6_flow = 0;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be set later */
ip6->ip6_nxt = IPPROTO_ICMPV6;
/* ip6_hlim will be set by im6o.im6o_multicast_hlim */
/* ip6_src/dst will be set by mld_sendpkt() or mld_sendbuf() */
/* fill in the MLD header as much as possible */
md->m_len = sizeof(struct mld_hdr);
mldh = mtod(md, struct mld_hdr *);
memset(mldh, 0, sizeof(struct mld_hdr));
mldh->mld_type = type;
return mldh;
}
static void
in6m_ref(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
in6m->in6m_refcount++;
}
static void
in6m_unref(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
if (--in6m->in6m_refcount == 0)
in6m_destroy(in6m);
}
/*
* Add an address to the list of IP6 multicast addresses for a given interface.
*/
struct in6_multi *
in6_addmulti(struct in6_addr *maddr6, struct ifnet *ifp, int *errorp,
int timer)
{
struct sockaddr_in6 sin6;
struct in6_multi *in6m;
*errorp = 0;
rw_enter(&in6_multilock, RW_WRITER);
/*
* See if address already in list.
*/
in6m = in6_lookup_multi(maddr6, ifp);
if (in6m != NULL) {
/*
* Found it; just increment the reference count.
*/
in6m->in6m_refcount++;
} else {
/*
* New address; allocate a new multicast record
* and link it into the interface's multicast list.
*/
in6m = malloc(sizeof(*in6m), M_IPMADDR, M_NOWAIT|M_ZERO);
if (in6m == NULL) {
*errorp = ENOBUFS;
goto out;
}
in6m->in6m_addr = *maddr6;
in6m->in6m_ifp = ifp;
in6m->in6m_refcount = 1;
in6m->in6m_timer = IN6M_TIMER_UNDEF;
callout_init(&in6m->in6m_timer_ch, CALLOUT_MPSAFE);
callout_setfunc(&in6m->in6m_timer_ch, mld_timeo, in6m);
LIST_INSERT_HEAD(&ifp->if_multiaddrs, in6m, in6m_entry);
/*
* Ask the network driver to update its multicast reception
* filter appropriately for the new address.
*/
sockaddr_in6_init(&sin6, maddr6, 0, 0, 0);
*errorp = if_mcast_op(ifp, SIOCADDMULTI, sin6tosa(&sin6));
if (*errorp) {
callout_destroy(&in6m->in6m_timer_ch);
LIST_REMOVE(in6m, in6m_entry);
free(in6m, M_IPMADDR);
in6m = NULL;
goto out;
}
in6m->in6m_timer = timer;
if (in6m->in6m_timer > 0) {
in6m->in6m_state = MLD_REPORTPENDING;
mld_starttimer(in6m);
goto out;
}
/*
* Let MLD6 know that we have joined a new IP6 multicast
* group.
*/
mld_start_listening(in6m);
}
out:
rw_exit(&in6_multilock);
return in6m;
}
static void
in6m_destroy(struct in6_multi *in6m)
{
struct sockaddr_in6 sin6;
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_refcount == 0, "in6m_refcount=%d",
in6m->in6m_refcount);
/*
* Unlink from list if it's listed. This must be done before
* mld_stop_listening because it releases in6_multilock and that allows
* someone to look up the removing in6m from the list and add a
* reference to the entry unexpectedly.
*/
if (in6_lookup_multi(&in6m->in6m_addr, in6m->in6m_ifp) != NULL)
LIST_REMOVE(in6m, in6m_entry);
/*
* No remaining claims to this record; let MLD6 know
* that we are leaving the multicast group.
*/
mld_stop_listening(in6m);
/*
* Delete all references of this multicasting group from
* the membership arrays
*/
in6_purge_mcast_references(in6m);
/*
* Notify the network driver to update its multicast
* reception filter.
*/
sockaddr_in6_init(&sin6, &in6m->in6m_addr, 0, 0, 0);
if_mcast_op(in6m->in6m_ifp, SIOCDELMULTI, sin6tosa(&sin6));
/* Tell mld_timeo we're halting the timer */
in6m->in6m_timer = IN6M_TIMER_UNDEF;
rw_exit(&in6_multilock);
callout_halt(&in6m->in6m_timer_ch, NULL);
callout_destroy(&in6m->in6m_timer_ch);
free(in6m, M_IPMADDR);
rw_enter(&in6_multilock, RW_WRITER);
}
/*
* Delete a multicast address record.
*/
void
in6_delmulti_locked(struct in6_multi *in6m)
{
KASSERT(rw_write_held(&in6_multilock));
KASSERTMSG(in6m->in6m_refcount > 0, "in6m_refcount=%d",
in6m->in6m_refcount);
/*
* The caller should have a reference to in6m. So we don't need to care
* of releasing the lock in mld_stoptimer.
*/
mld_stoptimer(in6m);
if (--in6m->in6m_refcount == 0)
in6m_destroy(in6m);
}
void
in6_delmulti(struct in6_multi *in6m)
{
rw_enter(&in6_multilock, RW_WRITER);
in6_delmulti_locked(in6m);
rw_exit(&in6_multilock);
}
/*
* Look up the in6_multi record for a given IP6 multicast address
* on a given interface. If no matching record is found, "in6m"
* returns NULL.
*/
struct in6_multi *
in6_lookup_multi(const struct in6_addr *addr, const struct ifnet *ifp)
{
struct in6_multi *in6m;
KASSERT(rw_lock_held(&in6_multilock));
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
if (IN6_ARE_ADDR_EQUAL(&in6m->in6m_addr, addr))
break;
}
return in6m;
}
void
in6_lookup_and_delete_multi(const struct in6_addr *addr,
const struct ifnet *ifp)
{
struct in6_multi *in6m;
rw_enter(&in6_multilock, RW_WRITER);
in6m = in6_lookup_multi(addr, ifp);
if (in6m != NULL)
in6_delmulti_locked(in6m);
rw_exit(&in6_multilock);
}
bool
in6_multi_group(const struct in6_addr *addr, const struct ifnet *ifp)
{
bool ingroup;
rw_enter(&in6_multilock, RW_READER);
ingroup = in6_lookup_multi(addr, ifp) != NULL;
rw_exit(&in6_multilock);
return ingroup;
}
/*
* Purge in6_multi records associated to the interface.
*/
void
in6_purge_multi(struct ifnet *ifp)
{
struct in6_multi *in6m, *next;
rw_enter(&in6_multilock, RW_WRITER);
LIST_FOREACH_SAFE(in6m, &ifp->if_multiaddrs, in6m_entry, next) {
LIST_REMOVE(in6m, in6m_entry);
/*
* Normally multicast addresses are already purged at this
* point. Remaining references aren't accessible via ifp,
* so what we can do here is to prevent ifp from being
* accessed via in6m by removing it from the list of ifp.
*/
mld_stoptimer(in6m);
}
rw_exit(&in6_multilock);
}
void
in6_multi_lock(int op)
{
rw_enter(&in6_multilock, op);
}
void
in6_multi_unlock(void)
{
rw_exit(&in6_multilock);
}
bool
in6_multi_locked(int op)
{
switch (op) {
case RW_READER:
return rw_read_held(&in6_multilock);
case RW_WRITER:
return rw_write_held(&in6_multilock);
default:
return rw_lock_held(&in6_multilock);
}
}
struct in6_multi_mship *
in6_joingroup(struct ifnet *ifp, struct in6_addr *addr, int *errorp, int timer)
{
struct in6_multi_mship *imm;
imm = malloc(sizeof(*imm), M_IPMADDR, M_NOWAIT|M_ZERO);
if (imm == NULL) {
*errorp = ENOBUFS;
return NULL;
}
imm->i6mm_maddr = in6_addmulti(addr, ifp, errorp, timer);
if (!imm->i6mm_maddr) {
/* *errorp is already set */
free(imm, M_IPMADDR);
return NULL;
}
return imm;
}
int
in6_leavegroup(struct in6_multi_mship *imm)
{
struct in6_multi *in6m;
rw_enter(&in6_multilock, RW_WRITER);
in6m = imm->i6mm_maddr;
imm->i6mm_maddr = NULL;
if (in6m != NULL) {
in6_delmulti_locked(in6m);
}
rw_exit(&in6_multilock);
free(imm, M_IPMADDR);
return 0;
}
/*
* DEPRECATED: keep it just to avoid breaking old sysctl users.
*/
static int
in6_mkludge_sysctl(SYSCTLFN_ARGS)
{
if (namelen != 1)
return EINVAL;
*oldlenp = 0;
return 0;
}
static int
in6_multicast_sysctl(SYSCTLFN_ARGS)
{
struct ifnet *ifp;
struct ifaddr *ifa;
struct in6_ifaddr *ia6;
struct in6_multi *in6m;
uint32_t tmp;
int error;
size_t written;
struct psref psref, psref_ia;
int bound, s;
if (namelen != 1)
return EINVAL;
rw_enter(&in6_multilock, RW_READER);
bound = curlwp_bind();
ifp = if_get_byindex(name[0], &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
rw_exit(&in6_multilock);
return ENODEV;
}
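/*
* Each record returned to userland consists of the interface's first
* IPv6 address, the multicast group address and its reference count,
* i.e. 2 * sizeof(struct in6_addr) + sizeof(uint32_t) bytes per entry.
* When oldp is NULL only the required buffer size is reported.
*/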
if (oldp == NULL) {
*oldlenp = 0;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
*oldlenp += 2 * sizeof(struct in6_addr) +
sizeof(uint32_t);
}
}
pserialize_read_exit(s);
if_put(ifp, &psref);
curlwp_bindx(bound);
rw_exit(&in6_multilock);
return 0;
}
error = 0;
written = 0;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref_ia);
pserialize_read_exit(s);
ia6 = ifatoia6(ifa);
LIST_FOREACH(in6m, &ifp->if_multiaddrs, in6m_entry) {
if (written + 2 * sizeof(struct in6_addr) +
sizeof(uint32_t) > *oldlenp)
goto done;
/*
* XXX return the first IPv6 address to keep backward
* compatibility; however, multicast addresses no longer
* belong to any IPv6 address, so this should be
* unnecessary.
*/
error = sysctl_copyout(l, &ia6->ia_addr.sin6_addr,
oldp, sizeof(struct in6_addr));
if (error)
goto done;
oldp = (char *)oldp + sizeof(struct in6_addr);
written += sizeof(struct in6_addr);
error = sysctl_copyout(l, &in6m->in6m_addr,
oldp, sizeof(struct in6_addr));
if (error)
goto done;
oldp = (char *)oldp + sizeof(struct in6_addr);
written += sizeof(struct in6_addr);
tmp = in6m->in6m_refcount;
error = sysctl_copyout(l, &tmp, oldp, sizeof(tmp));
if (error)
goto done;
oldp = (char *)oldp + sizeof(tmp);
written += sizeof(tmp);
}
s = pserialize_read_enter();
break;
}
pserialize_read_exit(s);
done:
ifa_release(ifa, &psref_ia);
if_put(ifp, &psref);
curlwp_bindx(bound);
rw_exit(&in6_multilock);
*oldlenp = written;
return error;
}
void
in6_sysctl_multicast_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "multicast",
SYSCTL_DESCR("Multicast information"),
in6_multicast_sysctl, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "multicast_kludge",
SYSCTL_DESCR("multicast kludge information"),
in6_mkludge_sysctl, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: strlcat.c,v 1.4 2013/01/23 07:57:27 matt Exp $ */
/* $OpenBSD: strlcat.c,v 1.10 2003/04/12 21:56:39 millert Exp $ */
/*
* Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND TODD C. MILLER DISCLAIMS ALL
* WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL TODD C. MILLER BE LIABLE
* FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
* OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
* CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#if !defined(_KERNEL) && !defined(_STANDALONE)
#if HAVE_NBTOOL_CONFIG_H
#include "nbtool_config.h"
#endif
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
__RCSID("$NetBSD: strlcat.c,v 1.4 2013/01/23 07:57:27 matt Exp $");
#endif /* LIBC_SCCS and not lint */
#ifdef _LIBC
#include "namespace.h"
#endif
#include <sys/types.h>
#include <assert.h>
#include <string.h>
#ifdef _LIBC
# ifdef __weak_alias
__weak_alias(strlcat, _strlcat)
# endif
#endif
#else
#include <lib/libkern/libkern.h>
#endif /* !_KERNEL && !_STANDALONE */
#if !HAVE_STRLCAT
/*
* Appends src to string dst of size siz (unlike strncat, siz is the
* full size of dst, not space left). At most siz-1 characters
* will be copied. Always NUL terminates (unless siz <= strlen(dst)).
* Returns strlen(src) + MIN(siz, strlen(initial dst)).
* If retval >= siz, truncation occurred.
*/
size_t
strlcat(char *dst, const char *src, size_t siz)
{
#if 1
char *d = dst;
const char *s = src;
size_t n = siz;
size_t dlen;
_DIAGASSERT(dst != NULL);
_DIAGASSERT(src != NULL);
/* Find the end of dst and adjust bytes left but don't go past end */
while (n-- != 0 && *d != '\0')
d++;
dlen = d - dst;
n = siz - dlen;
if (n == 0)
return(dlen + strlen(s));
while (*s != '\0') {
if (n != 1) {
*d++ = *s;
n--;
}
s++;
}
*d = '\0';
return(dlen + (s - src)); /* count does not include NUL */
#else
_DIAGASSERT(dst != NULL);
_DIAGASSERT(src != NULL);
/*
* Find length of string in dst (maxing out at siz).
*/
size_t dlen = strnlen(dst, siz);
/*
* Copy src into any remaining space in dst (truncating if needed).
* Note strlcpy(dst, src, 0) returns strlen(src).
*/
return dlen + strlcpy(dst + dlen, src, siz - dlen);
#endif
}
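/*
* Illustrative example: with char buf[8] = "foo",
* strlcat(buf, "barbaz", sizeof(buf)) leaves buf containing "foobarb"
* and returns 9 (3 + 6); since 9 >= 8, the caller can detect that
* truncation occurred.
*/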
#endif
/* $NetBSD: clockctl_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Emmanuel Dreyfus.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clockctl_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/time.h>
#include <sys/conf.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <sys/clockctl.h>
#include <compat/sys/clockctl.h>
#include <compat/sys/time_types.h>
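/*
* Translate the old clockctl ioctls, whose arguments use the
* timeval50/timespec50 layouts from before the switch to 64-bit
* time_t, into the current structures and hand them to the native
* handlers.
*/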
int
compat50_clockctlioctl(dev_t dev, u_long cmd, void *data, int flags,
struct lwp *l)
{
int error = 0;
const struct cdevsw *cd = cdevsw_lookup(dev);
if (cd == NULL || cd->d_ioctl == NULL)
return ENXIO;
switch (cmd) {
case CLOCKCTL_OSETTIMEOFDAY: {
struct timeval50 tv50;
struct timeval tv;
struct clockctl50_settimeofday *args = data;
error = copyin(args->tv, &tv50, sizeof(tv50));
if (error)
return (error);
timeval50_to_timeval(&tv50, &tv);
error = settimeofday1(&tv, false, args->tzp, l, false);
break;
}
case CLOCKCTL_OADJTIME: {
struct timeval atv, oldatv;
struct timeval50 atv50;
struct clockctl50_adjtime *args = data;
if (args->delta) {
error = copyin(args->delta, &atv50, sizeof(atv50));
if (error)
return (error);
timeval50_to_timeval(&atv50, &atv);
}
adjtime1(args->delta ? &atv : NULL,
args->olddelta ? &oldatv : NULL, l->l_proc);
if (args->olddelta) {
timeval_to_timeval50(&oldatv, &atv50);
error = copyout(&atv50, args->olddelta, sizeof(atv50));
}
break;
}
case CLOCKCTL_OCLOCK_SETTIME: {
struct timespec50 tp50;
struct timespec tp;
struct clockctl50_clock_settime *args = data;
error = copyin(args->tp, &tp50, sizeof(tp50));
if (error)
return (error);
timespec50_to_timespec(&tp50, &tp);
error = clock_settime1(l->l_proc, args->clock_id, &tp, true);
break;
}
case CLOCKCTL_ONTP_ADJTIME: {
if (vec_ntp_timestatus == NULL) {
error = ENOTTY;
break;
}
/* The ioctl number changed but the data did not change. */
error = (cd->d_ioctl)(dev, CLOCKCTL_NTP_ADJTIME,
data, flags, l);
break;
}
default:
error = ENOTTY;
}
return (error);
}
void
clockctl_50_init(void)
{
MODULE_HOOK_SET(clockctl_ioctl_50_hook, compat50_clockctlioctl);
}
void
clockctl_50_fini(void)
{
MODULE_HOOK_UNSET(clockctl_ioctl_50_hook);
}
/* $NetBSD: ffs_inode.c,v 1.131 2020/07/31 04:07:30 chs Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_inode.c 8.13 (Berkeley) 4/21/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_inode.c,v 1.131 2020/07/31 04:07:30 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/trace.h>
#include <sys/vnode.h>
#include <sys/wapbl.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
static int ffs_indirtrunc(struct inode *, daddr_t, daddr_t, daddr_t, int,
int64_t *);
/*
* Update the access, modified, and inode change times as specified
* by the IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively.
* The IN_MODIFIED flag is used to specify that the inode needs to be
* updated but that the times have already been set. The access
* and modified times are taken from the second and third parameters;
* the inode change time is always taken from the current time. If
* UPDATE_WAIT flag is set, or UPDATE_DIROP is set then wait for the
* disk write of the inode to complete.
*/
int
ffs_update(struct vnode *vp, const struct timespec *acc,
const struct timespec *mod, int updflags)
{
struct fs *fs;
struct buf *bp;
struct inode *ip;
int error;
void *cp;
int waitfor, flags;
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return (0);
ip = VTOI(vp);
FFS_ITIMES(ip, acc, mod, NULL);
if (updflags & UPDATE_CLOSE)
flags = ip->i_flag & (IN_MODIFIED | IN_ACCESSED);
else
flags = ip->i_flag & IN_MODIFIED;
if (flags == 0)
return (0);
fs = ip->i_fs;
if ((flags & IN_MODIFIED) != 0 &&
(vp->v_mount->mnt_flag & MNT_ASYNC) == 0) {
waitfor = updflags & UPDATE_WAIT;
if ((updflags & UPDATE_DIROP) != 0)
waitfor |= UPDATE_WAIT;
} else
waitfor = 0;
/*
* Ensure that uid and gid are correct. This is a temporary
* fix until fsck has been changed to do the update.
*/
if (fs->fs_magic == FS_UFS1_MAGIC && /* XXX */
fs->fs_old_inodefmt < FS_44INODEFMT) { /* XXX */
ip->i_ffs1_ouid = ip->i_uid; /* XXX */
ip->i_ffs1_ogid = ip->i_gid; /* XXX */
} /* XXX */
error = bread(ip->i_devvp, FFS_FSBTODB(fs, ino_to_fsba(fs, ip->i_number)),
(int)fs->fs_bsize, B_MODIFY, &bp);
if (error) {
return (error);
}
ip->i_flag &= ~(IN_MODIFIED | IN_ACCESSED);
/* Keep unlinked inode list up to date */
KDASSERTMSG(DIP(ip, nlink) == ip->i_nlink,
"DIP(ip, nlink) [%d] == ip->i_nlink [%d]",
DIP(ip, nlink), ip->i_nlink);
if (ip->i_mode) {
if (ip->i_nlink > 0) {
UFS_WAPBL_UNREGISTER_INODE(ip->i_ump->um_mountp,
ip->i_number, ip->i_mode);
} else {
UFS_WAPBL_REGISTER_INODE(ip->i_ump->um_mountp,
ip->i_number, ip->i_mode);
}
}
if (fs->fs_magic == FS_UFS1_MAGIC) {
cp = (char *)bp->b_data +
(ino_to_fsbo(fs, ip->i_number) * DINODE1_SIZE);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode1_swap(ip->i_din.ffs1_din,
(struct ufs1_dinode *)cp);
else
#endif
memcpy(cp, ip->i_din.ffs1_din, DINODE1_SIZE);
} else {
cp = (char *)bp->b_data +
(ino_to_fsbo(fs, ip->i_number) * DINODE2_SIZE);
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs))
ffs_dinode2_swap(ip->i_din.ffs2_din,
(struct ufs2_dinode *)cp);
else
#endif
memcpy(cp, ip->i_din.ffs2_din, DINODE2_SIZE);
}
if (waitfor) {
return (bwrite(bp));
} else {
bdwrite(bp);
return (0);
}
}
#define SINGLE 0 /* index of single indirect block */
#define DOUBLE 1 /* index of double indirect block */
#define TRIPLE 2 /* index of triple indirect block */
/*
* Truncate the inode oip to at most length size, freeing the
* disk blocks.
*/
int
ffs_truncate(struct vnode *ovp, off_t length, int ioflag, kauth_cred_t cred)
{
daddr_t lastblock;
struct inode *oip = VTOI(ovp);
struct mount *omp = ovp->v_mount;
daddr_t bn, lastiblock[UFS_NIADDR], indir_lbn[UFS_NIADDR];
daddr_t blks[UFS_NDADDR + UFS_NIADDR], oldblks[UFS_NDADDR + UFS_NIADDR];
struct fs *fs;
int extblocks;
int offset, pgoffset, level;
int64_t blocksreleased = 0, datablocks;
int i, aflag, nblocks;
int error, allerror = 0;
off_t osize;
int sync;
struct ufsmount *ump = oip->i_ump;
void *dcookie;
long bsize;
bool wapbl = omp->mnt_wapbl != NULL;
UFS_WAPBL_JLOCK_ASSERT(ump->um_mountp);
if (ovp->v_type == VCHR || ovp->v_type == VBLK ||
ovp->v_type == VFIFO || ovp->v_type == VSOCK) {
KASSERT(oip->i_size == 0);
return 0;
}
if (length < 0)
return (EINVAL);
/*
* Historically clients did not have to specify which data
* they were truncating. So, if not specified, we assume
* traditional behavior, e.g., just the normal data.
*/
if ((ioflag & (IO_EXT | IO_NORMAL)) == 0)
ioflag |= IO_NORMAL;
fs = oip->i_fs;
#define i_din2 i_din.ffs2_din
extblocks = 0;
datablocks = DIP(oip, blocks);
if (fs->fs_magic == FS_UFS2_MAGIC && oip->i_din2->di_extsize > 0) {
extblocks = btodb(ffs_fragroundup(fs, oip->i_din2->di_extsize));
datablocks -= extblocks;
}
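/*
* extblocks counts the fragments backing the UFS2 extended attribute
* area; they are accounted for and truncated separately from the
* normal data blocks below.
*/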
if ((ioflag & IO_EXT) && extblocks > 0) {
if (length != 0)
panic("ffs_truncate: partial trunc of extdata");
{
#ifdef QUOTA
(void) chkdq(oip, -extblocks, NOCRED, FORCE);
#endif
osize = oip->i_din2->di_extsize;
oip->i_din2->di_blocks -= extblocks;
oip->i_din2->di_extsize = 0;
for (i = 0; i < UFS_NXADDR; i++) {
binvalbuf(ovp, -1 - i);
oldblks[i] = oip->i_din2->di_extb[i];
oip->i_din2->di_extb[i] = 0;
}
oip->i_flag |= IN_CHANGE;
if ((error = ffs_update(ovp, NULL, NULL, 0)))
return (error);
for (i = 0; i < UFS_NXADDR; i++) {
if (oldblks[i] == 0)
continue;
bsize = ffs_sblksize(fs, osize, i);
if (wapbl) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(omp,
FFS_FSBTODB(fs, oldblks[i]), bsize, NULL);
if (error)
return error;
} else
ffs_blkfree(fs, oip->i_devvp, oldblks[i],
bsize, oip->i_number);
}
extblocks = 0;
}
}
if ((ioflag & IO_NORMAL) == 0)
return (0);
if (ovp->v_type == VLNK &&
(oip->i_size < ump->um_maxsymlinklen ||
(ump->um_maxsymlinklen == 0 && datablocks == 0))) {
KDASSERT(length == 0);
memset(SHORTLINK(oip), 0, (size_t)oip->i_size);
oip->i_size = 0;
DIP_ASSIGN(oip, size, 0);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (ffs_update(ovp, NULL, NULL, 0));
}
if (oip->i_size == length) {
/* still do a uvm_vnp_setsize() as writesize may be larger */
uvm_vnp_setsize(ovp, length);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
return (ffs_update(ovp, NULL, NULL, 0));
}
if (length > ump->um_maxfilesize)
return (EFBIG);
if ((oip->i_flags & SF_SNAPSHOT) != 0)
ffs_snapremove(ovp);
osize = oip->i_size;
aflag = ioflag & IO_SYNC ? B_SYNC : 0;
/*
* Lengthen the size of the file. We must ensure that the
* last byte of the file is allocated. Since the smallest
* value of osize is 0, length will be at least 1.
*/
if (osize < length) {
if (ffs_lblkno(fs, osize) < UFS_NDADDR &&
ffs_lblkno(fs, osize) != ffs_lblkno(fs, length) &&
ffs_blkroundup(fs, osize) != osize) {
off_t eob;
eob = ffs_blkroundup(fs, osize);
uvm_vnp_setwritesize(ovp, eob);
error = ufs_balloc_range(ovp, osize, eob - osize,
cred, aflag);
if (error) {
(void) ffs_truncate(ovp, osize,
ioflag & IO_SYNC, cred);
return error;
}
if (ioflag & IO_SYNC) {
rw_enter(ovp->v_uobj.vmobjlock, RW_WRITER);
VOP_PUTPAGES(ovp,
trunc_page(osize & fs->fs_bmask),
round_page(eob), PGO_CLEANIT | PGO_SYNCIO |
PGO_JOURNALLOCKED);
}
}
uvm_vnp_setwritesize(ovp, length);
error = ufs_balloc_range(ovp, length - 1, 1, cred, aflag);
if (error) {
(void) ffs_truncate(ovp, osize, ioflag & IO_SYNC, cred);
return (error);
}
uvm_vnp_setsize(ovp, length);
oip->i_flag |= IN_CHANGE | IN_UPDATE;
KASSERT(ovp->v_size == oip->i_size);
return (ffs_update(ovp, NULL, NULL, 0));
}
/*
* When truncating a regular file down to a non-block-aligned size,
* we must zero the part of last block which is past the new EOF.
* We must synchronously flush the zeroed pages to disk
* since the new pages will be invalidated as soon as we
* inform the VM system of the new, smaller size.
* We must do this before acquiring the GLOCK, since fetching
* the pages will acquire the GLOCK internally.
* So there is a window where another thread could see a whole
* zeroed page past EOF, but that's life.
*/
offset = ffs_blkoff(fs, length);
pgoffset = length & PAGE_MASK;
if (ovp->v_type == VREG && (pgoffset != 0 || offset != 0) &&
osize > length) {
daddr_t lbn;
voff_t eoz;
int size;
if (offset != 0) {
error = ufs_balloc_range(ovp, length - 1, 1, cred,
aflag);
if (error)
return error;
}
lbn = ffs_lblkno(fs, length);
size = ffs_blksize(fs, oip, lbn);
eoz = MIN(MAX(ffs_lblktosize(fs, lbn) + size, round_page(pgoffset)),
osize);
ubc_zerorange(&ovp->v_uobj, length, eoz - length,
UBC_VNODE_FLAGS(ovp));
if (round_page(eoz) > round_page(length)) {
rw_enter(ovp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(ovp, round_page(length),
round_page(eoz),
PGO_CLEANIT | PGO_DEACTIVATE | PGO_JOURNALLOCKED |
((ioflag & IO_SYNC) ? PGO_SYNCIO : 0));
if (error)
return error;
}
}
genfs_node_wrlock(ovp);
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
uvm_vnp_setsize(ovp, length);
/*
* Calculate index into inode's block list of
* last direct and indirect blocks (if any)
* which we want to keep. Lastblock is -1 when
* the file is truncated to 0.
*/
lastblock = ffs_lblkno(fs, length + fs->fs_bsize - 1) - 1;
lastiblock[SINGLE] = lastblock - UFS_NDADDR;
lastiblock[DOUBLE] = lastiblock[SINGLE] - FFS_NINDIR(fs);
lastiblock[TRIPLE] = lastiblock[DOUBLE] - FFS_NINDIR(fs) * FFS_NINDIR(fs);
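/*
* For example, with an 8 KB block size, truncating to length 16384
* gives lastblock = 1, so direct blocks 0 and 1 are kept; truncating
* to 0 gives lastblock = -1 and everything is released.
*/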
nblocks = btodb(fs->fs_bsize);
/*
* Update file and block pointers on disk before we start freeing
* blocks. If we crash before free'ing blocks below, the blocks
* will be returned to the free list. lastiblock values are also
* normalized to -1 for calls to ffs_indirtrunc below.
*/
sync = 0;
for (level = TRIPLE; level >= SINGLE; level--) {
blks[UFS_NDADDR + level] = DIP(oip, ib[level]);
if (lastiblock[level] < 0 && blks[UFS_NDADDR + level] != 0) {
sync = 1;
DIP_ASSIGN(oip, ib[level], 0);
lastiblock[level] = -1;
}
}
for (i = 0; i < UFS_NDADDR; i++) {
blks[i] = DIP(oip, db[i]);
if (i > lastblock && blks[i] != 0) {
sync = 1;
DIP_ASSIGN(oip, db[i], 0);
}
}
oip->i_flag |= IN_CHANGE | IN_UPDATE;
if (sync) {
error = ffs_update(ovp, NULL, NULL, UPDATE_WAIT);
if (error && !allerror)
allerror = error;
}
/*
* Having written the new inode to disk, save its new configuration
* and put back the old block pointers long enough to process them.
* Note that we save the new block configuration so we can check it
* when we are done.
*/
for (i = 0; i < UFS_NDADDR; i++) {
bn = DIP(oip, db[i]);
DIP_ASSIGN(oip, db[i], blks[i]);
blks[i] = bn;
}
for (i = 0; i < UFS_NIADDR; i++) {
bn = DIP(oip, ib[i]);
DIP_ASSIGN(oip, ib[i], blks[UFS_NDADDR + i]);
blks[UFS_NDADDR + i] = bn;
}
oip->i_size = osize;
DIP_ASSIGN(oip, size, osize);
error = vtruncbuf(ovp, lastblock + 1, 0, 0);
if (error && !allerror)
allerror = error;
/*
* Indirect blocks first.
*/
indir_lbn[SINGLE] = -UFS_NDADDR;
indir_lbn[DOUBLE] = indir_lbn[SINGLE] - FFS_NINDIR(fs) - 1;
indir_lbn[TRIPLE] = indir_lbn[DOUBLE] - FFS_NINDIR(fs) * FFS_NINDIR(fs) - 1;
for (level = TRIPLE; level >= SINGLE; level--) {
bn = ffs_getib(fs, oip, level);
if (bn != 0) {
if (lastiblock[level] < 0 &&
oip->i_ump->um_mountp->mnt_wapbl) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp,
FFS_FSBTODB(fs, bn), fs->fs_bsize,
&dcookie);
if (error)
goto out;
} else {
dcookie = NULL;
}
error = ffs_indirtrunc(oip, indir_lbn[level],
FFS_FSBTODB(fs, bn), lastiblock[level], level,
&blocksreleased);
if (error) {
if (dcookie) {
UFS_WAPBL_UNREGISTER_DEALLOCATION(
oip->i_ump->um_mountp, dcookie);
}
goto out;
}
if (lastiblock[level] < 0) {
if (!dcookie)
ffs_blkfree(fs, oip->i_devvp, bn,
fs->fs_bsize, oip->i_number);
DIP_ASSIGN(oip, ib[level], 0);
blocksreleased += nblocks;
}
}
if (lastiblock[level] >= 0)
goto done;
}
/*
* All whole direct blocks or frags.
*/
for (i = UFS_NDADDR - 1; i > lastblock; i--) {
bn = ffs_getdb(fs, oip, i);
if (bn == 0)
continue;
bsize = ffs_blksize(fs, oip, i);
if ((oip->i_ump->um_mountp->mnt_wapbl) &&
(ovp->v_type != VREG)) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp,
FFS_FSBTODB(fs, bn), bsize, NULL);
if (error)
goto out;
} else
ffs_blkfree(fs, oip->i_devvp, bn, bsize, oip->i_number);
DIP_ASSIGN(oip, db[i], 0);
blocksreleased += btodb(bsize);
}
if (lastblock < 0)
goto done;
/*
* Finally, look for a change in size of the
* last direct block; release any frags.
*/
bn = ffs_getdb(fs, oip, lastblock);
if (bn != 0) {
long oldspace, newspace;
/*
* Calculate amount of space we're giving
* back as old block size minus new block size.
*/
oldspace = ffs_blksize(fs, oip, lastblock);
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
newspace = ffs_blksize(fs, oip, lastblock);
if (newspace == 0)
panic("itrunc: newspace"); if (oldspace - newspace > 0) {
/*
* Block number of space to be free'd is
* the old block # plus the number of frags
* required for the storage we're keeping.
*/
bn += ffs_numfrags(fs, newspace);
if ((oip->i_ump->um_mountp->mnt_wapbl) &&
(ovp->v_type != VREG)) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
oip->i_ump->um_mountp, FFS_FSBTODB(fs, bn),
oldspace - newspace, NULL);
if (error)
goto out;
} else
ffs_blkfree(fs, oip->i_devvp, bn,
oldspace - newspace, oip->i_number);
blocksreleased += btodb(oldspace - newspace);
}
}
done:
for (level = SINGLE; level <= TRIPLE; level++)
KASSERTMSG((blks[UFS_NDADDR + level] == DIP(oip, ib[level])),
"itrunc1 blk mismatch: %jx != %jx",
(uintmax_t)blks[UFS_NDADDR + level],
(uintmax_t)DIP(oip, ib[level]));
for (i = 0; i < UFS_NDADDR; i++)
KASSERTMSG((blks[i] == DIP(oip, db[i])),
"itrunc2 blk mismatch: %jx != %jx",
(uintmax_t)blks[i], (uintmax_t)DIP(oip, db[i]));
KASSERTMSG((length != 0 || extblocks || LIST_EMPTY(&ovp->v_cleanblkhd)),
"itrunc3: zero length and nonempty cleanblkhd");
KASSERTMSG((length != 0 || extblocks || LIST_EMPTY(&ovp->v_dirtyblkhd)),
"itrunc3: zero length and nonempty dirtyblkhd");
out:
/*
* Set length back to old size if deallocation failed. Some indirect
* blocks were deallocated creating a hole, but that is okay.
*/
if (error == EAGAIN) {
if (!allerror)
allerror = error;
length = osize;
uvm_vnp_setsize(ovp, length);
}
/*
* Put back the real size.
*/
oip->i_size = length;
DIP_ASSIGN(oip, size, length);
DIP_ADD(oip, blocks, -blocksreleased);
genfs_node_unlock(ovp);
oip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ovp, NULL, NULL, 0);
#if defined(QUOTA) || defined(QUOTA2)
(void) chkdq(oip, -blocksreleased, NOCRED, 0);
#endif
KASSERT(ovp->v_type != VREG || ovp->v_size == oip->i_size);
return (allerror);
}
/*
* Release blocks associated with the inode ip and stored in the indirect
* block bn. Blocks are free'd in LIFO order up to (but not including)
* lastbn. If level is greater than SINGLE, the block is an indirect block
* and recursive calls to indirtrunc must be used to cleanse other indirect
* blocks.
*
* NB: triple indirect blocks are untested.
*/
static int
ffs_indirtrunc(struct inode *ip, daddr_t lbn, daddr_t dbn, daddr_t lastbn,
int level, int64_t *countp)
{
int i;
struct buf *bp;
struct fs *fs = ip->i_fs;
int32_t *bap1 = NULL;
int64_t *bap2 = NULL;
struct vnode *vp;
daddr_t nb, nlbn, last;
char *copy = NULL;
int64_t factor;
int64_t nblocks;
int error = 0, allerror = 0;
const int needswap = UFS_FSNEEDSWAP(fs);
const int wapbl = (ip->i_ump->um_mountp->mnt_wapbl != NULL);
void *dcookie;
#define RBAP(ip, i) (((ip)->i_ump->um_fstype == UFS1) ? \
ufs_rw32(bap1[i], needswap) : ufs_rw64(bap2[i], needswap))
#define BAP_ASSIGN(ip, i, value) \
do { \
if ((ip)->i_ump->um_fstype == UFS1) \
bap1[i] = (value); \
else \
bap2[i] = (value); \
} while(0)
/*
* Calculate index in current block of last
* block to be kept. -1 indicates the entire
* block so we need not calculate the index.
*/
factor = 1;
for (i = SINGLE; i < level; i++)
factor *= FFS_NINDIR(fs);
last = lastbn;
if (lastbn > 0)
last /= factor;
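/*
* factor is 1 for a single indirect block, FFS_NINDIR(fs) for a
* double indirect block and FFS_NINDIR(fs)^2 for a triple indirect
* block, i.e. the number of file blocks addressed by each entry at
* this level.
*/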
nblocks = btodb(fs->fs_bsize);
/*
* Get buffer of block pointers, zero those entries corresponding
* to blocks to be free'd, and update the on-disk copy first. Since
* double (triple) indirect blocks are freed before single (double)
* indirect blocks, calls to bmap on these blocks will fail. However,
* we already have
* the on disk address, so we have to set the b_blkno field
* explicitly instead of letting bread do everything for us.
*/
vp = ITOV(ip);
error = ffs_getblk(vp, lbn, FFS_NOBLK, fs->fs_bsize, false, &bp);
if (error)
return error;
if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
/* Braces must be here in case trace evaluates to nothing. */
trace(TR_BREADHIT, pack(vp, fs->fs_bsize), lbn);
} else {
trace(TR_BREADMISS, pack(vp, fs->fs_bsize), lbn);
curlwp->l_ru.ru_inblock++; /* pay for read */
bp->b_flags |= B_READ;
bp->b_flags &= ~B_COWDONE; /* we change blkno below */
if (bp->b_bcount > bp->b_bufsize)
panic("ffs_indirtrunc: bad buffer size");
bp->b_blkno = dbn;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
error = biowait(bp);
if (error == 0)
error = fscow_run(bp, true);
}
if (error) {
brelse(bp, 0);
return error;
}
/*
* Clear reference to blocks to be removed on disk, before actually
* reclaiming them, so that fsck is more likely to be able to recover
* the filesystem if the system goes down during the truncate process.
* This assumes the truncate process would not fail, contrary
* to the wapbl case.
*/
if (ip->i_ump->um_fstype == UFS1)
bap1 = (int32_t *)bp->b_data;
else
bap2 = (int64_t *)bp->b_data;
if (lastbn >= 0 && !wapbl) {
copy = kmem_alloc(fs->fs_bsize, KM_SLEEP);
memcpy((void *)copy, bp->b_data, (u_int)fs->fs_bsize);
for (i = last + 1; i < FFS_NINDIR(fs); i++)
BAP_ASSIGN(ip, i, 0);
error = bwrite(bp);
if (error)
allerror = error;
if (ip->i_ump->um_fstype == UFS1)
bap1 = (int32_t *)copy;
else
bap2 = (int64_t *)copy;
}
/*
* Recursively free totally unused blocks.
*/
for (i = FFS_NINDIR(fs) - 1, nlbn = lbn + 1 - i * factor; i > last;
i--, nlbn += factor) {
nb = RBAP(ip, i);
if (nb == 0)
continue;
if ((ip->i_ump->um_mountp->mnt_wapbl) &&
((level > SINGLE) || (ITOV(ip)->v_type != VREG))) {
error = UFS_WAPBL_REGISTER_DEALLOCATION(
ip->i_ump->um_mountp,
FFS_FSBTODB(fs, nb), fs->fs_bsize,
&dcookie);
if (error)
goto out;
} else {
dcookie = NULL;
}
if (level > SINGLE) {
error = ffs_indirtrunc(ip, nlbn, FFS_FSBTODB(fs, nb),
(daddr_t)-1, level - 1, countp);
if (error) {
if (dcookie) {
UFS_WAPBL_UNREGISTER_DEALLOCATION(
ip->i_ump->um_mountp, dcookie);
}
goto out;
}
}
if (!dcookie)
ffs_blkfree(fs, ip->i_devvp, nb, fs->fs_bsize,
ip->i_number);
BAP_ASSIGN(ip, i, 0);
*countp += nblocks;
}
/*
* Recursively free blocks on the now last partial indirect block.
*/
if (level > SINGLE && lastbn >= 0) {
last = lastbn % factor;
nb = RBAP(ip, i);
if (nb != 0) {
error = ffs_indirtrunc(ip, nlbn, FFS_FSBTODB(fs, nb),
last, level - 1, countp);
if (error)
goto out;
}
}
out:
if (error && !allerror)
allerror = error;
if (copy != NULL) {
kmem_free(copy, fs->fs_bsize);
} else if (lastbn < 0 && error == 0) {
/* all freed, release without writing back */
brelse(bp, BC_INVAL);
} else if (wapbl) {
/* only partially freed, write the updated block */
error = bwrite(bp);
if (!allerror)
allerror = error;
}
return (allerror);
}
void
ffs_itimes(struct inode *ip, const struct timespec *acc,
const struct timespec *mod, const struct timespec *cre)
{
struct timespec now;
if (!(ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY))) {
return;
}
vfs_timestamp(&now);
if (ip->i_flag & IN_ACCESS) {
if (acc == NULL)
acc = &now;
DIP_ASSIGN(ip, atime, acc->tv_sec);
DIP_ASSIGN(ip, atimensec, acc->tv_nsec);
}
if (ip->i_flag & (IN_UPDATE | IN_MODIFY)) {
if ((ip->i_flags & SF_SNAPSHOT) == 0) {
if (mod == NULL)
mod = &now;
DIP_ASSIGN(ip, mtime, mod->tv_sec);
DIP_ASSIGN(ip, mtimensec, mod->tv_nsec);
}
ip->i_modrev++;
}
if (ip->i_flag & (IN_CHANGE | IN_MODIFY)) {
if (cre == NULL)
cre = &now;
DIP_ASSIGN(ip, ctime, cre->tv_sec);
DIP_ASSIGN(ip, ctimensec, cre->tv_nsec);
}
if (ip->i_flag & (IN_ACCESS | IN_MODIFY))
ip->i_flag |= IN_ACCESSED;
if (ip->i_flag & (IN_UPDATE | IN_CHANGE))
ip->i_flag |= IN_MODIFIED;
ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY);
}
/* $NetBSD: dtrace_bsd.h,v 1.9 2018/04/19 21:19:07 christos Exp $ */
/*-
* Copyright (c) 2007-2008 John Birrell (jb@freebsd.org)
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $FreeBSD: src/sys/sys/dtrace_bsd.h,v 1.3.2.1 2009/08/03 08:13:06 kensmith Exp $
*
* This file contains BSD shims for Sun's DTrace code.
*/
#ifndef _SYS_DTRACE_BSD_H
#define _SYS_DTRACE_BSD_H
#if defined(_KERNEL_OPT)
#include "opt_dtrace.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/proc.h>
/* Forward definitions: */
struct mbuf;
struct trapframe;
struct lwp;
struct vattr;
struct vnode;
struct ucred;
/*
* Cyclic clock function type definition used to hook the cyclic
* subsystem into the appropriate timer interrupt.
*/
typedef void (*cyclic_clock_func_t)(struct clockframe *);
extern cyclic_clock_func_t cyclic_clock_func[];
/*
* The dtrace module handles traps that occur during a DTrace probe.
* This type definition is used in the trap handler to provide a
* hook for the dtrace module to register its handler with.
*/
typedef int (*dtrace_trap_func_t)(struct trapframe *, u_int);
int dtrace_trap(struct trapframe *, u_int);
extern dtrace_trap_func_t dtrace_trap_func;
/* Used by the machine dependent trap() code. */
typedef int (*dtrace_invop_func_t)(uintptr_t, uintptr_t *, uintptr_t);
typedef void (*dtrace_doubletrap_func_t)(void);
/* Global variables in trap.c */
extern dtrace_invop_func_t dtrace_invop_func;
extern dtrace_doubletrap_func_t dtrace_doubletrap_func;
/* Virtual time hook function type. */
typedef void (*dtrace_vtime_switch_func_t)(struct lwp *);
extern int dtrace_vtime_active;
extern dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
/* The fasttrap module hooks into fork, exec and exit. */
typedef void (*dtrace_fork_func_t)(struct proc *, struct proc *);
typedef void (*dtrace_execexit_func_t)(struct proc *);
/* Global variable in kern_fork.c */
extern dtrace_fork_func_t dtrace_fasttrap_fork;
/* Global variable in kern_exec.c */
extern dtrace_execexit_func_t dtrace_fasttrap_exec;
/* Global variable in kern_exit.c */
extern dtrace_execexit_func_t dtrace_fasttrap_exit;
/* The dtmalloc provider hooks into malloc. */
typedef void (*dtrace_malloc_probe_func_t)(u_int32_t, uintptr_t arg0,
uintptr_t arg1, uintptr_t arg2, uintptr_t arg3, uintptr_t arg4);
extern dtrace_malloc_probe_func_t dtrace_malloc_probe;
/* dtnfsclient NFSv3 access cache provider hooks. */
typedef void (*dtrace_nfsclient_accesscache_flush_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_accesscache_flush_probe_func_t
dtrace_nfsclient_accesscache_flush_done_probe;
typedef void (*dtrace_nfsclient_accesscache_get_probe_func_t)(uint32_t,
struct vnode *, uid_t, uint32_t);
extern dtrace_nfsclient_accesscache_get_probe_func_t
dtrace_nfsclient_accesscache_get_hit_probe,
dtrace_nfsclient_accesscache_get_miss_probe;
typedef void (*dtrace_nfsclient_accesscache_load_probe_func_t)(uint32_t,
struct vnode *, uid_t, uint32_t, int);
extern dtrace_nfsclient_accesscache_load_probe_func_t
dtrace_nfsclient_accesscache_load_done_probe;
/* dtnfsclient NFSv[23] attribute cache provider hooks. */
typedef void (*dtrace_nfsclient_attrcache_flush_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_attrcache_flush_probe_func_t
dtrace_nfsclient_attrcache_flush_done_probe;
typedef void (*dtrace_nfsclient_attrcache_get_hit_probe_func_t)(uint32_t,
struct vnode *, struct vattr *);
extern dtrace_nfsclient_attrcache_get_hit_probe_func_t
dtrace_nfsclient_attrcache_get_hit_probe;
typedef void (*dtrace_nfsclient_attrcache_get_miss_probe_func_t)(uint32_t,
struct vnode *);
extern dtrace_nfsclient_attrcache_get_miss_probe_func_t
dtrace_nfsclient_attrcache_get_miss_probe;
typedef void (*dtrace_nfsclient_attrcache_load_probe_func_t)(uint32_t,
struct vnode *, struct vattr *, int);
extern dtrace_nfsclient_attrcache_load_probe_func_t
dtrace_nfsclient_attrcache_load_done_probe;
/* dtnfsclient NFSv[23] RPC provider hooks. */
typedef void (*dtrace_nfsclient_nfs23_start_probe_func_t)(uint32_t,
struct vnode *, struct mbuf *, struct ucred *, int);
extern dtrace_nfsclient_nfs23_start_probe_func_t
dtrace_nfsclient_nfs23_start_probe;
typedef void (*dtrace_nfsclient_nfs23_done_probe_func_t)(uint32_t,
struct vnode *, struct mbuf *, struct ucred *, int, int);
extern dtrace_nfsclient_nfs23_done_probe_func_t
dtrace_nfsclient_nfs23_done_probe;
/*
* OpenSolaris compatible time functions returning nanoseconds.
* On OpenSolaris these return hrtime_t which we define as uint64_t.
*/
uint64_t dtrace_gethrtime(void);
uint64_t dtrace_gethrestime(void);
/* sizes based on DTrace structure requirements */
#define KDTRACE_PROC_SIZE 64
#define KDTRACE_PROC_ZERO 8
#define KDTRACE_THREAD_SIZE 256
#define KDTRACE_THREAD_ZERO 64
/*
* Functions for managing the opaque DTrace memory areas for
* processes and lwps.
*/
static __inline size_t kdtrace_proc_size(void);
static __inline void kdtrace_proc_ctor(void *, struct proc *);
static __inline void kdtrace_proc_dtor(void *, struct proc *);
static __inline size_t kdtrace_thread_size(void);
static __inline void kdtrace_thread_ctor(void *, struct lwp *);
static __inline void kdtrace_thread_dtor(void *, struct lwp *);
/* Return the DTrace process data size compiled in the kernel hooks. */
static __inline size_t
kdtrace_proc_size(void)
{
return KDTRACE_PROC_SIZE;
}
/* Return the DTrace thread data size compiled in the kernel hooks. */
static __inline size_t
kdtrace_thread_size(void)
{
return KDTRACE_THREAD_SIZE;
}
static __inline void
kdtrace_proc_ctor(void *arg, struct proc *p)
{
#ifdef KDTRACE_HOOKS
p->p_dtrace = kmem_zalloc(KDTRACE_PROC_SIZE, KM_SLEEP);
#endif
}
static __inline void
kdtrace_proc_dtor(void *arg, struct proc *p)
{
#ifdef KDTRACE_HOOKS
if (p->p_dtrace != NULL) {
kmem_free(p->p_dtrace, KDTRACE_PROC_SIZE);
p->p_dtrace = NULL;
}
#endif
}
static __inline void
kdtrace_thread_ctor(void *arg, struct lwp *l)
{
#ifdef KDTRACE_HOOKS
l->l_dtrace = kmem_zalloc(KDTRACE_THREAD_SIZE, KM_SLEEP);
#endif
}
static __inline void
kdtrace_thread_dtor(void *arg, struct lwp *l)
{
#ifdef KDTRACE_HOOKS
if (l->l_dtrace != NULL) {
kmem_free(l->l_dtrace, KDTRACE_THREAD_SIZE);
l->l_dtrace = NULL;
}
#endif
}
#endif /* _SYS_DTRACE_BSD_H */
/* $NetBSD: overlay_vfsops.c,v 1.73 2022/11/04 11:20:39 hannken Exp $ */
/*
* Copyright (c) 1999, 2000 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Overlay Layer
* (See overlay_vnops.c for a description of what this does.)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: overlay_vfsops.c,v 1.73 2022/11/04 11:20:39 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/module.h>
#include <miscfs/overlay/overlay.h>
#include <miscfs/genfs/layer_extern.h>
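/*
 * Module declaration: a VFS-class module that depends on the generic
 * "layerfs" module.
 */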
MODULE(MODULE_CLASS_VFS, overlay, "layerfs");
VFS_PROTOS(ov);
#define NOVERLAYNODECACHE 16
/*
* Mount overlay layer
*/
int
ov_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0;
struct overlay_args *args = data;
struct vnode *lowerrootvp, *vp;
struct overlay_mount *nmp;
struct layer_mount *lmp;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_mount(mp = %p)\n", mp);
#endif
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
lmp = MOUNTTOLAYERMOUNT(mp);
if (lmp == NULL)
return EIO;
args->la.target = NULL;
*data_len = sizeof *args;
return 0;
}
/*
* Update is not supported
*/
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/*
* Find lower node
*/
lowerrootvp = mp->mnt_vnodecovered;
vref(lowerrootvp);
if ((error = vn_lock(lowerrootvp, LK_EXCLUSIVE))) {
vrele(lowerrootvp);
return (error);
}
/*
* First cut at fixing up upper mount point
*/
nmp = kmem_zalloc(sizeof(struct overlay_mount), KM_SLEEP);
mp->mnt_data = nmp;
/*
* Make sure that the mount point is sufficiently initialized
* that the node create call will work.
*/
vfs_getnewfsid(mp);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(nmp, sizeof(struct overlay_mount));
return error;
}
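	/*
	 * Describe how overlay nodes are created above the lower file
	 * system: node size, vnode tag, bypass routine and vnode
	 * operations vector.
	 */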
nmp->ovm_size = sizeof (struct overlay_node);
nmp->ovm_tag = VT_OVERLAY;
nmp->ovm_bypass = layer_bypass;
nmp->ovm_vnodeop_p = overlay_vnodeop_p;
/*
* Fix up overlay node for root vnode
*/
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
/*
* Make sure the fixup worked
*/
if (error) {
vrele(lowerrootvp);
kmem_free(nmp, sizeof(struct overlay_mount));
return error;
}
/*
* Keep a held reference to the root vnode.
* It is vrele'd in ov_unmount.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
nmp->ovm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->la.target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error)
return error;
	if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
		mp->mnt_flag |= MNT_LOCAL;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_mount: lower %s, alias at %s\n",
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
return 0;
}
/*
* Free reference to overlay layer
*/
int
ov_unmount(struct mount *mp, int mntflags)
{
struct vnode *overlay_rootvp = MOUNTTOOVERLAYMOUNT(mp)->ovm_rootvp;
struct overlay_mount *omp;
int error;
int flags = 0;
#ifdef OVERLAYFS_DIAGNOSTIC
printf("ov_unmount(mp = %p)\n", mp);
#endif
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
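	/*
	 * Refuse to unmount if the root vnode is still actively
	 * referenced, unless a forced unmount was requested.
	 */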
if (vrefcnt(overlay_rootvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, overlay_rootvp, flags)) != 0)
return (error);
#ifdef OVERLAYFS_DIAGNOSTIC
vprint("alias root of lower", overlay_rootvp);
#endif
/*
* Blow it away for future re-use
*/
vgone(overlay_rootvp);
/*
* Finally, throw away the overlay_mount structure
*/
omp = mp->mnt_data;
kmem_free(omp, sizeof(struct overlay_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc overlay_vnodeop_opv_desc;
const struct vnodeopv_desc * const ov_vnodeopv_descs[] = {
&overlay_vnodeop_opv_desc,
NULL,
};
struct vfsops overlay_vfsops = {
.vfs_name = MOUNT_OVERLAY,
.vfs_min_mount_data = sizeof (struct overlay_args),
.vfs_mount = ov_mount,
.vfs_start = layerfs_start,
.vfs_unmount = ov_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = ov_vnodeopv_descs
};
SYSCTL_SETUP(overlay_sysctl_setup, "overlay fs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "overlay",
SYSCTL_DESCR("Overlay file system"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
}
static int
overlay_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&overlay_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&overlay_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: kern_exit.c,v 1.298 2023/10/08 12:38:58 ad Exp $ */
/*-
* Copyright (c) 1998, 1999, 2006, 2007, 2008, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_exit.c 8.10 (Berkeley) 2/23/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_exit.c,v 1.298 2023/10/08 12:38:58 ad Exp $");
#include "opt_ktrace.h"
#include "opt_dtrace.h"
#include "opt_sysv.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <sys/fstrans.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>
#include <sys/ptrace.h>
#include <sys/acct.h>
#include <sys/filedesc.h>
#include <sys/ras.h>
#include <sys/signalvar.h>
#include <sys/sched.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/ktrace.h>
#include <sys/cpu.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/sdt.h>
#include <sys/psref.h>
#include <uvm/uvm_extern.h>
#ifdef DEBUG_EXIT
int debug_exit = 0;
#define DPRINTF(x) if (debug_exit) printf x
#else
#define DPRINTF(x)
#endif
static int find_stopped_child(struct proc *, idtype_t, id_t, int,
struct proc **, struct wrusage *, siginfo_t *);
static void proc_free(struct proc *, struct wrusage *);
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE1(proc, kernel, , exit, "int");
/*
* Fill in the appropriate signal information, and signal the parent.
*/
/* XXX noclone works around a gcc 4.5 bug on arm */
static void __noclone
exit_psignal(struct proc *p, struct proc *pp, ksiginfo_t *ksi)
{
KSI_INIT(ksi);
if ((ksi->ksi_signo = P_EXITSIG(p)) == SIGCHLD) {
if (p->p_xsig) {
if (p->p_sflag & PS_COREDUMP)
ksi->ksi_code = CLD_DUMPED;
else
ksi->ksi_code = CLD_KILLED;
ksi->ksi_status = p->p_xsig;
} else {
ksi->ksi_code = CLD_EXITED;
ksi->ksi_status = p->p_xexit;
}
} else {
ksi->ksi_code = SI_USER;
ksi->ksi_status = p->p_xsig;
}
/*
* We fill those in, even for non-SIGCHLD.
* It's safe to access p->p_cred unlocked here.
*/
ksi->ksi_pid = p->p_pid;
ksi->ksi_uid = kauth_cred_geteuid(p->p_cred);
/* XXX: is this still valid? */
ksi->ksi_utime = p->p_stats->p_ru.ru_utime.tv_sec;
ksi->ksi_stime = p->p_stats->p_ru.ru_stime.tv_sec;
}
/*
* exit --
* Death of process.
*/
int
sys_exit(struct lwp *l, const struct sys_exit_args *uap, register_t *retval)
{
/* {
syscallarg(int) rval;
} */
struct proc *p = l->l_proc;
/* Don't call exit1() multiple times in the same process. */
mutex_enter(p->p_lock);
if (p->p_sflag & PS_WEXIT) {
mutex_exit(p->p_lock);
lwp_exit(l);
}
/* exit1() will release the mutex. */
exit1(l, SCARG(uap, rval), 0);
/* NOTREACHED */
return (0);
}
/*
* Exit: deallocate address space and other resources, change proc state
* to zombie, and unlink proc from allproc and parent's lists. Save exit
* status and rusage for wait(). Check for child processes and orphan them.
*
* Must be called with p->p_lock held. Does not return.
*/
void
exit1(struct lwp *l, int exitcode, int signo)
{
struct proc *p, *child, *next_child, *old_parent, *new_parent;
struct pgrp *pgrp;
ksiginfo_t ksi;
ksiginfoq_t kq;
int wakeinit;
p = l->l_proc;
/* Verify that we hold no locks other than p->p_lock. */
LOCKDEBUG_BARRIER(p->p_lock, 0);
/* XXX Temporary: something is leaking kernel_lock. */
KERNEL_UNLOCK_ALL(l, NULL);
KASSERT(mutex_owned(p->p_lock));
KASSERT(p->p_vmspace != NULL);
if (__predict_false(p == initproc)) {
panic("init died (signal %d, exit %d)", signo, exitcode);
}
p->p_sflag |= PS_WEXIT;
/*
* Force all other LWPs to exit before we do. Only then can we
* begin to tear down the rest of the process state.
*/
if (p->p_nlwps > 1) {
exit_lwps(l);
}
ksiginfo_queue_init(&kq);
/*
* If we have been asked to stop on exit, do so now.
*/
if (__predict_false(p->p_sflag & PS_STOPEXIT)) {
KASSERT(l->l_blcnt == 0);
sigclearall(p, &contsigmask, &kq);
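		/*
		 * proc_lock must be taken before p_lock; if we cannot
		 * get it opportunistically, drop p_lock and reacquire
		 * both in the correct order.
		 */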
if (!mutex_tryenter(&proc_lock)) {
mutex_exit(p->p_lock);
mutex_enter(&proc_lock);
mutex_enter(p->p_lock);
}
p->p_waited = 0;
p->p_pptr->p_nstopchild++;
p->p_stat = SSTOP;
mutex_exit(&proc_lock);
lwp_lock(l);
p->p_nrlwps--;
l->l_stat = LSSTOP;
lwp_unlock(l);
mutex_exit(p->p_lock);
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
mutex_enter(p->p_lock);
}
/*
	 * Discard any remaining signals and mark the process as dying so
	 * that it will no longer be found by, e.g., signal delivery.
*/
sigfillset(&p->p_sigctx.ps_sigignore);
sigclearall(p, NULL, &kq);
p->p_stat = SDYING;
/*
* Perform any required thread cleanup. Do this early so
* anyone wanting to look us up by our global thread ID
* will fail to find us.
*
* N.B. this will unlock p->p_lock on our behalf.
*/
lwp_thread_cleanup(l);
ksiginfo_queue_drain(&kq);
/* Destroy any lwpctl info. */
if (p->p_lwpctl != NULL)
lwp_ctl_exit();
/*
* Drain all remaining references that procfs, ptrace and others may
* have on the process.
*/
rw_enter(&p->p_reflock, RW_WRITER);
DPRINTF(("%s: %d.%d exiting.\n", __func__, p->p_pid, l->l_lid));
ptimers_free(p, TIMERS_ALL);
#if defined(__HAVE_RAS)
ras_purgeall();
#endif
/*
* Close open files, release open-file table and free signal
* actions. This may block!
*/
fd_free();
cwdfree(p->p_cwdi);
p->p_cwdi = NULL;
doexithooks(p);
sigactsfree(p->p_sigacts);
/*
* Write out accounting data.
*/
(void)acct_process(l);
#ifdef KTRACE
/*
* Release trace file.
*/
if (p->p_tracep != NULL) {
mutex_enter(&ktrace_lock);
ktrderef(p);
mutex_exit(&ktrace_lock);
}
#endif
p->p_xexit = exitcode;
p->p_xsig = signo;
/*
* If emulation has process exit hook, call it now.
* Set the exit status now so that the exit hook has
* an opportunity to tweak it (COMPAT_LINUX requires
* this for thread group emulation)
*/
if (p->p_emul->e_proc_exit)
(*p->p_emul->e_proc_exit)(p);
/*
* Free the VM resources we're still holding on to.
* We must do this from a valid thread because doing
* so may block. This frees vmspace, which we don't
	 * need anymore.  The only remaining LWP is the one
	 * we are running at this moment; nothing runs in
	 * userland anymore.
*/
ruspace(p); /* Update our vm resource use */
uvm_proc_exit(p);
/*
* Stop profiling.
*/
if (__predict_false((p->p_stflag & PST_PROFIL) != 0)) {
mutex_spin_enter(&p->p_stmutex);
stopprofclock(p);
mutex_spin_exit(&p->p_stmutex);
}
/*
* If parent is waiting for us to exit or exec, PL_PPWAIT is set; we
* wake up the parent early to avoid deadlock. We can do this once
* the VM resources are released.
*/
mutex_enter(&proc_lock);
if (p->p_lflag & PL_PPWAIT) {
lwp_t *lp;
l->l_lwpctl = NULL; /* was on loan from blocked parent */
p->p_lflag &= ~PL_PPWAIT;
lp = p->p_vforklwp;
p->p_vforklwp = NULL;
lp->l_vforkwaiting = false;
cv_broadcast(&lp->l_waitcv);
}
if (SESS_LEADER(p)) {
struct vnode *vprele = NULL, *vprevoke = NULL;
struct session *sp = p->p_session;
struct tty *tp;
if (sp->s_ttyvp) {
/*
* Controlling process.
* Signal foreground pgrp,
* drain controlling terminal
* and revoke access to controlling terminal.
*/
tp = sp->s_ttyp;
mutex_spin_enter(&tty_lock);
if (tp->t_session == sp) {
/* we can't guarantee the revoke will do this */
pgrp = tp->t_pgrp;
tp->t_pgrp = NULL;
tp->t_session = NULL;
mutex_spin_exit(&tty_lock);
if (pgrp != NULL) {
pgsignal(pgrp, SIGHUP, 1);
}
mutex_exit(&proc_lock);
(void) ttywait(tp);
mutex_enter(&proc_lock);
/* The tty could have been revoked. */
vprevoke = sp->s_ttyvp;
} else
mutex_spin_exit(&tty_lock);
vprele = sp->s_ttyvp;
sp->s_ttyvp = NULL;
/*
* s_ttyp is not zero'd; we use this to indicate
* that the session once had a controlling terminal.
* (for logging and informational purposes)
*/
}
sp->s_leader = NULL;
if (vprevoke != NULL || vprele != NULL) {
if (vprevoke != NULL) {
/* Releases proc_lock. */
proc_sessrele(sp);
VOP_REVOKE(vprevoke, REVOKEALL);
} else
mutex_exit(&proc_lock);
if (vprele != NULL)
vrele(vprele);
mutex_enter(&proc_lock);
}
}
fixjobc(p, p->p_pgrp, 0);
/* Release fstrans private data. */
fstrans_lwp_dtor(l);
/*
* Finalize the last LWP's specificdata, as well as the
* specificdata for the proc itself.
*/
lwp_finispecific(l);
proc_finispecific(p);
/*
* Reset p_opptr pointer of all former children which got
* traced by another process and were reparented. We reset
* it to NULL here; the trace detach code then reparents
* the child to initproc. We only check allproc list, since
* eventual former children on zombproc list won't reference
* p_opptr anymore.
*/
if (__predict_false(p->p_slflag & PSL_CHTRACED)) {
struct proc *q;
PROCLIST_FOREACH(q, &allproc) {
if (q->p_opptr == p)
q->p_opptr = NULL;
}
PROCLIST_FOREACH(q, &zombproc) {
if (q->p_opptr == p)
q->p_opptr = NULL;
}
}
/*
* Give orphaned children to init(8).
*/
child = LIST_FIRST(&p->p_children);
wakeinit = (child != NULL);
for (; child != NULL; child = next_child) {
next_child = LIST_NEXT(child, p_sibling);
/*
* Traced processes are killed since their existence
* means someone is screwing up. Since we reset the
* trace flags, the logic in sys_wait4() would not be
* triggered to reparent the process to its
* original parent, so we must do this here.
*/
if (__predict_false(child->p_slflag & PSL_TRACED)) {
mutex_enter(p->p_lock);
child->p_slflag &=
~(PSL_TRACED|PSL_SYSCALL);
mutex_exit(p->p_lock);
if (child->p_opptr != child->p_pptr) {
struct proc *t = child->p_opptr;
proc_reparent(child, t ? t : initproc);
child->p_opptr = NULL;
} else
proc_reparent(child, initproc);
killproc(child, "orphaned traced process");
} else
proc_reparent(child, initproc);
}
/*
* Move proc from allproc to zombproc, it's now nearly ready to be
* collected by parent.
*/
LIST_REMOVE(l, l_list);
LIST_REMOVE(p, p_list);
LIST_INSERT_HEAD(&zombproc, p, p_list);
/*
* Mark the process as dead. We must do this before we signal
* the parent.
*/
p->p_stat = SDEAD;
/*
	 * Let anyone watching this DTrace probe know that we're
	 * on our way out.
*/
SDT_PROBE(proc, kernel, , exit,
((p->p_sflag & PS_COREDUMP) ? CLD_DUMPED :
(p->p_xsig ? CLD_KILLED : CLD_EXITED)),
0,0,0,0);
/* Put in front of parent's sibling list for parent to collect it */
old_parent = p->p_pptr;
old_parent->p_nstopchild++;
if (LIST_FIRST(&old_parent->p_children) != p) {
/* Put child where it can be found quickly */
LIST_REMOVE(p, p_sibling);
LIST_INSERT_HEAD(&old_parent->p_children, p, p_sibling);
}
/*
* Notify parent that we're gone. If parent has the P_NOCLDWAIT
* flag set, notify init instead (and hope it will handle
* this situation).
*/
if (old_parent->p_flag & (PK_NOCLDWAIT|PK_CLDSIGIGN)) {
proc_reparent(p, initproc);
wakeinit = 1;
/*
		 * If this was the last child of our parent, notify the
		 * parent so that, if it was wait(2)ing, it will
		 * continue.
*/
if (LIST_FIRST(&old_parent->p_children) == NULL)
cv_broadcast(&old_parent->p_waitcv);
}
/* Reload parent pointer, since p may have been reparented above */
new_parent = p->p_pptr;
if (__predict_false(p->p_exitsig != 0)) {
exit_psignal(p, new_parent, &ksi);
kpsignal(new_parent, &ksi, NULL);
}
/* Calculate the final rusage info. */
calcru(p, &p->p_stats->p_ru.ru_utime, &p->p_stats->p_ru.ru_stime,
NULL, NULL);
callout_destroy(&l->l_timeout_ch);
/*
* Release any PCU resources before becoming a zombie.
*/
pcu_discard_all(l);
/*
* Notify other processes tracking us with a knote that
* we're exiting.
*
* N.B. we do this here because the process is now SDEAD,
* and thus cannot have any more knotes attached. Also,
* knote_proc_exit() expects that p->p_lock is already
* held (and will assert so).
*/
mutex_enter(p->p_lock);
if (!SLIST_EMPTY(&p->p_klist)) {
knote_proc_exit(p);
}
/* Free the LWP ID */
proc_free_lwpid(p, l->l_lid);
lwp_drainrefs(l);
lwp_lock(l);
l->l_prflag &= ~LPR_DETACHED;
l->l_stat = LSZOMB;
lwp_unlock(l);
KASSERT(curlwp == l);
KASSERT(p->p_nrlwps == 1);
KASSERT(p->p_nlwps == 1);
p->p_stat = SZOMB;
p->p_nrlwps--;
p->p_nzlwps++;
p->p_ndlwps = 0;
mutex_exit(p->p_lock);
/*
* Signal the parent to collect us, and drop the proclist lock.
* Drop debugger/procfs lock; no new references can be gained.
*/
rw_exit(&p->p_reflock);
cv_broadcast(&p->p_pptr->p_waitcv);
mutex_exit(&proc_lock);
if (wakeinit)
cv_broadcast(&initproc->p_waitcv);
/*
* NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
*/
/*
* Give machine-dependent code a chance to free any MD LWP
* resources. This must be done before uvm_lwp_exit(), in
* case these resources are in the PCB.
*/
cpu_lwp_free(l, 1);
/* Switch away into oblivion. */
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
panic("exit1");
}
void
exit_lwps(struct lwp *l)
{
proc_t *p = l->l_proc;
lwp_t *l2;
retry:
KASSERT(mutex_owned(p->p_lock));
/*
	 * Interrupt LWPs in interruptible sleep, unsuspend suspended
* LWPs and then wait for everyone else to finish.
*/
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
if (l2 == l)
continue;
lwp_lock(l2);
l2->l_flag |= LW_WEXIT;
lwp_need_userret(l2);
if ((l2->l_stat == LSSLEEP && (l2->l_flag & LW_SINTR)) ||
l2->l_stat == LSSUSPENDED || l2->l_stat == LSSTOP) {
l2->l_flag &= ~LW_DBGSUSPEND;
/* setrunnable() will release the lock. */
setrunnable(l2);
continue;
}
lwp_unlock(l2);
}
/*
	 * Wait for every LWP to exit.  Note: LWPs can become suspended or
	 * go back to sleep behind us, or new LWPs may even be created.
	 * Therefore, a full retry is required on error.
*/
while (p->p_nlwps > 1) {
if (lwp_wait(l, 0, NULL, true)) {
goto retry;
}
}
KASSERT(p->p_nlwps == 1);
}
int
do_sys_waitid(idtype_t idtype, id_t id, int *pid, int *status, int options,
struct wrusage *wru, siginfo_t *si)
{
proc_t *child;
int error;
	if (wru != NULL)
		memset(wru, 0, sizeof(*wru));
	if (si != NULL)
		memset(si, 0, sizeof(*si));
mutex_enter(&proc_lock);
error = find_stopped_child(curproc, idtype, id, options, &child,
wru, si);
if (child == NULL) {
mutex_exit(&proc_lock);
*pid = 0;
*status = 0;
return error;
}
*pid = child->p_pid;
if (child->p_stat == SZOMB) {
/* Child is exiting */
*status = P_WAITSTATUS(child);
/* proc_free() will release the proc_lock. */
if (options & WNOWAIT) {
mutex_exit(&proc_lock);
} else {
proc_free(child, wru);
}
} else {
/* Don't mark SIGCONT if we are being stopped */
*status = (child->p_xsig == SIGCONT && child->p_stat != SSTOP) ?
W_CONTCODE() : W_STOPCODE(child->p_xsig);
mutex_exit(&proc_lock);
}
return 0;
}
int
do_sys_wait(int *pid, int *status, int options, struct rusage *ru)
{
idtype_t idtype;
id_t id;
int ret;
struct wrusage wru;
/*
* Translate the special pid values into the (idtype, pid)
* pair for wait6. The WAIT_MYPGRP case is handled by
* find_stopped_child() on its own.
*/
if (*pid == WAIT_ANY) {
idtype = P_ALL;
id = 0;
} else if (*pid < 0) {
idtype = P_PGID;
id = (id_t)-*pid;
} else {
idtype = P_PID;
id = (id_t)*pid;
}
options |= WEXITED | WTRAPPED;
ret = do_sys_waitid(idtype, id, pid, status, options, ru ? &wru : NULL,
NULL);
if (ru) *ru = wru.wru_self;
return ret;
}
int
sys___wait450(struct lwp *l, const struct sys___wait450_args *uap,
register_t *retval)
{
/* {
syscallarg(int) pid;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct rusage *) rusage;
} */
int error, status, pid = SCARG(uap, pid);
struct rusage ru;
error = do_sys_wait(&pid, &status, SCARG(uap, options),
SCARG(uap, rusage) != NULL ? &ru : NULL);
retval[0] = pid;
if (pid == 0) {
return error;
}
	if (SCARG(uap, status)) {
		error = copyout(&status, SCARG(uap, status), sizeof(status));
	}
	if (SCARG(uap, rusage) && error == 0) {
		error = copyout(&ru, SCARG(uap, rusage), sizeof(ru));
	}
return error;
}
int
sys_wait6(struct lwp *l, const struct sys_wait6_args *uap, register_t *retval)
{
/* {
syscallarg(idtype_t) idtype;
syscallarg(id_t) id;
syscallarg(int *) status;
syscallarg(int) options;
syscallarg(struct wrusage *) wru;
syscallarg(siginfo_t *) si;
} */
struct wrusage wru, *wrup;
siginfo_t si, *sip;
idtype_t idtype;
int pid;
id_t id;
int error, status;
idtype = SCARG(uap, idtype);
id = SCARG(uap, id);
if (SCARG(uap, wru) != NULL)
wrup = &wru;
else
wrup = NULL;
if (SCARG(uap, info) != NULL)
sip = &si;
else
sip = NULL;
/*
* We expect all callers of wait6() to know about WEXITED and
* WTRAPPED.
*/
error = do_sys_waitid(idtype, id, &pid, &status, SCARG(uap, options),
wrup, sip);
retval[0] = pid; /* tell userland who it was */
#if 0
/*
* should we copyout if there was no process, hence no useful data?
* We don't for an old style wait4() (etc) but I believe
* FreeBSD does for wait6(), so a tossup... Go with FreeBSD for now.
*/
if (pid == 0)
return error;
#endif
if (SCARG(uap, status) != NULL && error == 0)
error = copyout(&status, SCARG(uap, status), sizeof(status));
if (SCARG(uap, wru) != NULL && error == 0)
error = copyout(&wru, SCARG(uap, wru), sizeof(wru));
if (SCARG(uap, info) != NULL && error == 0)
error = copyout(&si, SCARG(uap, info), sizeof(si));
return error;
}
/*
* Find a process that matches the provided criteria, and fill siginfo
* and resources if found.
* Returns:
* -1: Not found, abort early
* 0: Not matched
* 1: Matched, there might be more matches
* 2: This is the only match
*/
static int
match_process(const struct proc *pp, struct proc **q, idtype_t idtype, id_t id,
int options, struct wrusage *wrusage, siginfo_t *siginfo)
{
struct rusage *rup;
struct proc *p = *q;
int rv = 1;
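	/*
	 * Select the candidate process according to idtype and take its
	 * p_lock; rv is bumped to 2 when the match is necessarily
	 * unique (P_PID).
	 */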
switch (idtype) {
case P_ALL:
mutex_enter(p->p_lock);
break;
case P_PID:
if (p->p_pid != (pid_t)id) {
p = *q = proc_find_raw((pid_t)id);
if (p == NULL || p->p_stat == SIDL || p->p_pptr != pp) {
*q = NULL;
return -1;
}
}
mutex_enter(p->p_lock);
rv++;
break;
case P_PGID:
if (p->p_pgid != (pid_t)id)
return 0;
mutex_enter(p->p_lock);
break;
case P_SID:
if (p->p_session->s_sid != (pid_t)id)
return 0;
mutex_enter(p->p_lock);
break;
case P_UID:
mutex_enter(p->p_lock);
		if (kauth_cred_geteuid(p->p_cred) != (uid_t)id) {
			mutex_exit(p->p_lock);
			return 0;
		}
break;
case P_GID:
mutex_enter(p->p_lock);
		if (kauth_cred_getegid(p->p_cred) != (gid_t)id) {
			mutex_exit(p->p_lock);
			return 0;
		}
break;
case P_CID:
case P_PSETID:
case P_CPUID:
/* XXX: Implement me */
default:
return 0;
}
	if ((options & WEXITED) == 0 && p->p_stat == SZOMB) {
		mutex_exit(p->p_lock);
		return 0;
	}
if (siginfo != NULL) {
siginfo->si_errno = 0;
/*
* SUSv4 requires that the si_signo value is always
		 * SIGCHLD.  Obey it even though the rfork(2) interface
		 * allows a different signal to be requested for child
		 * exit notification.
*/
siginfo->si_signo = SIGCHLD;
/*
* This is still a rough estimate. We will fix the
* cases TRAPPED, STOPPED, and CONTINUED later.
*/
if (p->p_sflag & PS_COREDUMP) {
siginfo->si_code = CLD_DUMPED;
siginfo->si_status = p->p_xsig;
} else if (p->p_xsig) {
siginfo->si_code = CLD_KILLED;
siginfo->si_status = p->p_xsig;
} else {
siginfo->si_code = CLD_EXITED;
siginfo->si_status = p->p_xexit;
}
siginfo->si_pid = p->p_pid;
siginfo->si_uid = kauth_cred_geteuid(p->p_cred);
siginfo->si_utime = p->p_stats->p_ru.ru_utime.tv_sec;
siginfo->si_stime = p->p_stats->p_ru.ru_stime.tv_sec;
}
/*
	 * There should be no reason to limit resource usage info to
	 * exited processes only.  A snapshot of the resources used
	 * by a stopped process may be exactly what is needed.
*/
if (wrusage != NULL) {
rup = &wrusage->wru_self;
*rup = p->p_stats->p_ru;
calcru(p, &rup->ru_utime, &rup->ru_stime, NULL, NULL);
rup = &wrusage->wru_children;
*rup = p->p_stats->p_cru;
calcru(p, &rup->ru_utime, &rup->ru_stime, NULL, NULL);
}
mutex_exit(p->p_lock);
return rv;
}
/*
* Determine if there are existing processes being debugged
* that used to be (and sometime later will be again) children
* of a specific parent (while matching wait criteria)
*/
static bool
debugged_child_exists(idtype_t idtype, id_t id, int options, siginfo_t *si,
const struct proc *parent)
{
struct proc *pp;
/*
* If we are searching for a specific pid, we can optimise a little
*/
if (idtype == P_PID) {
/*
* Check the specific process to see if its real parent is us
*/
pp = proc_find_raw((pid_t)id);
if (pp != NULL && pp->p_stat != SIDL && pp->p_opptr == parent) {
/*
* using P_ALL here avoids match_process() doing the
* same work that we just did, but incorrectly for
* this scenario.
*/
if (match_process(parent, &pp, P_ALL, id, options,
NULL, si))
return true;
}
return false;
}
/*
* For the hard cases, just look everywhere to see if some
* stolen (reparented) process is really our lost child.
* Then check if that process could satisfy the wait conditions.
*/
/*
* XXX inefficient, but hopefully fairly rare.
* XXX should really use a list of reparented processes.
*/
	PROCLIST_FOREACH(pp, &allproc) {
		if (pp->p_stat == SIDL)		/* XXX impossible ?? */
			continue;
if (pp->p_opptr == parent &&
match_process(parent, &pp, idtype, id, options, NULL, si))
return true;
}
	PROCLIST_FOREACH(pp, &zombproc) {
		if (pp->p_stat == SIDL)		/* XXX impossible ?? */
			continue;
if (pp->p_opptr == parent &&
match_process(parent, &pp, idtype, id, options, NULL, si))
return true;
}
return false;
}
/*
* Scan list of child processes for a child process that has stopped or
* exited. Used by sys_wait4 and 'compat' equivalents.
*
* Must be called with the proc_lock held, and may release while waiting.
*/
static int
find_stopped_child(struct proc *parent, idtype_t idtype, id_t id, int options,
struct proc **child_p, struct wrusage *wru, siginfo_t *si)
{
struct proc *child, *dead;
int error;
	KASSERT(mutex_owned(&proc_lock));

	if (options & ~WALLOPTS) {
		*child_p = NULL;
		return EINVAL;
	}
if ((options & WSELECTOPTS) == 0) {
/*
* We will be unable to find any matching processes,
* because there are no known events to look for.
* Prefer to return error instead of blocking
* indefinitely.
*/
*child_p = NULL;
return EINVAL;
}
if ((pid_t)id == WAIT_MYPGRP && (idtype == P_PID || idtype == P_PGID)) {
id = (id_t)parent->p_pgid;
idtype = P_PGID;
}
for (;;) {
error = ECHILD;
dead = NULL;
LIST_FOREACH(child, &parent->p_children, p_sibling) {
int rv = match_process(parent, &child, idtype, id,
options, wru, si);
if (rv == -1)
break;
if (rv == 0)
continue;
/*
			 * Wait for processes with p_exitsig != SIGCHLD
			 * only if WALTSIG is set; wait for processes
			 * with p_exitsig == SIGCHLD only if WALTSIG
			 * is clear.
*/
			if (((options & WALLSIG) == 0) &&
			    (options & WALTSIG ? child->p_exitsig == SIGCHLD
			    : P_EXITSIG(child) != SIGCHLD)) {
				if (rv == 2) {
					child = NULL;
					break;
				}
				continue;
			}
error = 0;
			if ((options & WNOZOMBIE) == 0) {
				if (child->p_stat == SZOMB)
					break;
if (child->p_stat == SDEAD) {
/*
* We may occasionally arrive here
* after receiving a signal, but
* immediately before the child
* process is zombified. The wait
* will be short, so avoid returning
* to userspace.
*/
dead = child;
}
}
			if ((options & WCONTINUED) != 0 &&
			    child->p_xsig == SIGCONT &&
			    (child->p_sflag & PS_CONTINUED)) {
				if ((options & WNOWAIT) == 0) {
					child->p_sflag &= ~PS_CONTINUED;
					child->p_waited = 1;
					parent->p_nstopchild--;
				}
if (si) {
si->si_status = child->p_xsig;
si->si_code = CLD_CONTINUED;
}
break;
}
			if ((options & (WTRAPPED|WSTOPPED)) != 0 &&
			    child->p_stat == SSTOP &&
			    child->p_waited == 0 &&
			    ((child->p_slflag & PSL_TRACED) ||
			    options & (WUNTRACED|WSTOPPED))) {
if ((options & WNOWAIT) == 0) {
child->p_waited = 1;
parent->p_nstopchild--;
}
if (si) {
si->si_status = child->p_xsig;
si->si_code =
(child->p_slflag & PSL_TRACED) ?
CLD_TRAPPED : CLD_STOPPED;
}
break;
}
if (parent->p_nstopchild == 0 || rv == 2) {
child = NULL;
break;
}
}
/*
* If we found nothing, but we are the bereaved parent
* of a stolen child, look and see if that child (or
* one of them) meets our search criteria. If so, then
		 * we cannot succeed, but we can hang (wait...), or,
		 * if WNOHANG is set, return 0 instead of ECHILD.
*/
		if (child == NULL && error == ECHILD &&
		    (parent->p_slflag & PSL_CHTRACED) &&
		    debugged_child_exists(idtype, id, options, si, parent))
			error = 0;
if (child != NULL || error != 0 ||
((options & WNOHANG) != 0 && dead == NULL)) {
*child_p = child;
return error;
}
/*
* Wait for another child process to stop.
*/
error = cv_wait_sig(&parent->p_waitcv, &proc_lock);
if (error != 0) {
*child_p = NULL;
return error;
}
}
}
/*
* Free a process after parent has taken all the state info. Must be called
* with the proclist lock held, and will release before returning.
*
* *ru is returned to the caller, and must be freed by the caller.
*/
static void
proc_free(struct proc *p, struct wrusage *wru)
{
struct proc *parent = p->p_pptr;
struct lwp *l;
ksiginfo_t ksi;
kauth_cred_t cred1, cred2;
uid_t uid;
	KASSERT(mutex_owned(&proc_lock));
	KASSERT(p->p_nlwps == 1);
	KASSERT(p->p_nzlwps == 1);
	KASSERT(p->p_nrlwps == 0);
	KASSERT(p->p_stat == SZOMB);
/*
* If we got the child via ptrace(2) or procfs, and
* the parent is different (meaning the process was
* attached, rather than run as a child), then we need
* to give it back to the old parent, and send the
* parent the exit signal. The rest of the cleanup
* will be done when the old parent waits on the child.
*/
if ((p->p_slflag & PSL_TRACED) != 0 && p->p_opptr != parent) {
mutex_enter(p->p_lock);
p->p_slflag &= ~(PSL_TRACED|PSL_SYSCALL);
mutex_exit(p->p_lock);
parent = (p->p_opptr == NULL) ? initproc : p->p_opptr;
proc_reparent(p, parent);
p->p_opptr = NULL;
		if (p->p_exitsig != 0) {
			exit_psignal(p, parent, &ksi);
			kpsignal(parent, &ksi, NULL);
		}
cv_broadcast(&parent->p_waitcv);
mutex_exit(&proc_lock);
return;
}
sched_proc_exit(parent, p);
/*
* Add child times of exiting process onto its own times.
* This cannot be done any earlier else it might get done twice.
*/
l = LIST_FIRST(&p->p_lwps);
ruadd(&p->p_stats->p_ru, &l->l_ru);
ruadd(&p->p_stats->p_ru, &p->p_stats->p_cru);
ruadd(&parent->p_stats->p_cru, &p->p_stats->p_ru);
	if (wru != NULL) {
		wru->wru_self = p->p_stats->p_ru;
		wru->wru_children = p->p_stats->p_cru;
	}
p->p_xsig = 0;
p->p_xexit = 0;
/*
* At this point we are going to start freeing the final resources.
* If anyone tries to access the proc structure after here they will
* get a shock - bits are missing. Attempt to make it hard! We
* don't bother with any further locking past this point.
*/
p->p_stat = SIDL; /* not even a zombie any more */
LIST_REMOVE(p, p_list); /* off zombproc */
parent->p_nstopchild--;
LIST_REMOVE(p, p_sibling);
/*
* Let pid be reallocated.
*/
proc_free_pid(p->p_pid);
atomic_dec_uint(&nprocs);
/*
* Unlink process from its process group.
* Releases the proc_lock.
*/
proc_leavepgrp(p);
/*
* Delay release until after lwp_free.
*/
cred2 = l->l_cred;
/*
* Free the last LWP's resources.
*
* lwp_free ensures the LWP is no longer running on another CPU.
*/
lwp_free(l, false, true);
/*
* Now no one except us can reach the process p.
*/
/*
* Decrement the count of procs running with this uid.
*/
cred1 = p->p_cred;
uid = kauth_cred_getuid(cred1);
(void)chgproccnt(uid, -1);
/*
* Release substructures.
*/
lim_free(p->p_limit);
pstatsfree(p->p_stats);
kauth_cred_free(cred1);
kauth_cred_free(cred2);
/*
* Release reference to text vnode
*/
	if (p->p_textvp)
		vrele(p->p_textvp);
kmem_strfree(p->p_path);
mutex_destroy(&p->p_auxlock);
mutex_obj_free(p->p_lock);
mutex_destroy(&p->p_stmutex);
cv_destroy(&p->p_waitcv);
cv_destroy(&p->p_lwpcv);
rw_destroy(&p->p_reflock);
proc_free_mem(p);
}
/*
* Change the parent of a process for tracing purposes.
*/
void
proc_changeparent(struct proc *t, struct proc *p)
{
SET(t->p_slflag, PSL_TRACED);
t->p_opptr = t->p_pptr;
if (t->p_pptr == p)
return;
struct proc *parent = t->p_pptr;
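	/*
	 * t->p_lock is already held.  Take the old parent's p_lock as
	 * well, in a consistent (address) order to avoid deadlock.
	 */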
if (parent->p_lock < t->p_lock) {
		if (!mutex_tryenter(parent->p_lock)) {
			mutex_exit(t->p_lock);
			mutex_enter(parent->p_lock);
			mutex_enter(t->p_lock);
		}
	} else if (parent->p_lock > t->p_lock) {
		mutex_enter(parent->p_lock);
	}
parent->p_slflag |= PSL_CHTRACED;
proc_reparent(t, p);
	if (parent->p_lock != t->p_lock)
		mutex_exit(parent->p_lock);
}
/*
* make process 'parent' the new parent of process 'child'.
*
* Must be called with proc_lock held.
*/
void
proc_reparent(struct proc *child, struct proc *parent)
{

	KASSERT(mutex_owned(&proc_lock));

	if (child->p_pptr == parent)
		return;
if (child->p_stat == SZOMB || child->p_stat == SDEAD ||
(child->p_stat == SSTOP && !child->p_waited)) {
child->p_pptr->p_nstopchild--;
parent->p_nstopchild++;
}
	if (parent == initproc) {
		child->p_exitsig = SIGCHLD;
		child->p_ppid = parent->p_pid;
	}
	LIST_REMOVE(child, p_sibling);
	LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
}
/* $NetBSD: ufs_bmap.c,v 1.54 2022/11/17 06:40:40 chs Exp $ */
/*
* Copyright (c) 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_bmap.c 8.8 (Berkeley) 8/11/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_bmap.c,v 1.54 2022/11/17 06:40:40 chs Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/trace.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
static bool
ufs_issequential(const struct ufsmount *ump, daddr_t daddr0, daddr_t daddr1)
{
	/* for ufs, blocks in a hole are not 'contiguous'. */
if (daddr0 == 0)
return false;
return (daddr0 + ump->um_seqinc == daddr1);
}
/*
* Bmap converts the logical block number of a file to its physical block
* number on the disk. The conversion is done by using the logical block
* number to index into the array of block pointers described by the dinode.
*/
int
ufs_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
int error;
/*
* Check for underlying vnode requests and ensure that logical
* to physical mapping is requested.
*/
if (ap->a_vpp != NULL)
*ap->a_vpp = VTOI(ap->a_vp)->i_devvp;
if (ap->a_bnp == NULL)
return (0);
error = ufs_bmaparray(ap->a_vp, ap->a_bn, ap->a_bnp, NULL, NULL,
ap->a_runp, ufs_issequential);
return error;
}
/*
* Indirect blocks are now on the vnode for the file. They are given negative
* logical block numbers. Indirect blocks are addressed by the negative
* address of the first data block to which they point. Double indirect blocks
* are addressed by one less than the address of the first indirect block to
* which they point. Triple indirect blocks are addressed by one less than
* the address of the first double indirect block to which they point.
*
* ufs_bmaparray does the bmap conversion, and if requested returns the
* array of logical blocks which must be traversed to get to a block.
* Each entry contains the offset into that block that gets you to the
* next block and the disk address of the block (if it is assigned).
*/
int
ufs_bmaparray(struct vnode *vp, daddr_t bn, daddr_t *bnp, struct indir *ap,
int *nump, int *runp, ufs_issequential_callback_t is_sequential)
{
struct inode *ip;
struct buf *bp, *cbp;
struct ufsmount *ump;
struct mount *mp;
struct indir a[UFS_NIADDR + 1], *xap;
daddr_t daddr;
daddr_t metalbn;
int error, maxrun = 0, num;
ip = VTOI(vp);
mp = vp->v_mount;
ump = ip->i_ump;
KASSERTMSG(((ap == NULL) == (nump == NULL)),
"ufs_bmaparray: invalid arguments: ap = %p, nump = %p", ap, nump);
if (runp) {
/*
* XXX
* If MAXBSIZE is the largest transfer the disks can handle,
* we probably want maxrun to be 1 block less so that we
* don't create a block larger than the device can handle.
*/
*runp = 0;
maxrun = MAXPHYS / mp->mnt_stat.f_iosize - 1;
}
if (bn >= 0 && bn < UFS_NDADDR) {
		if (nump != NULL)
			*nump = 0;
if (ump->um_fstype == UFS1)
daddr = ufs_rw32(ip->i_ffs1_db[bn],
UFS_MPNEEDSWAP(ump));
else
daddr = ufs_rw64(ip->i_ffs2_db[bn],
UFS_MPNEEDSWAP(ump));
*bnp = blkptrtodb(ump, daddr);
/*
* Since this is FFS independent code, we are out of
* scope for the definitions of BLK_NOCOPY and
* BLK_SNAP, but we do know that they will fall in
* the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts
* are made to read a BLK_NOCOPY or BLK_SNAP block.
*/
		if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT
		    && daddr > 0 && daddr < ump->um_seqinc) {
*bnp = -1;
} else if (*bnp == 0) {
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
== SF_SNAPSHOT) {
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
} else {
*bnp = -1;
}
} else if (runp) {
if (ump->um_fstype == UFS1) {
for (++bn; bn < UFS_NDADDR && *runp < maxrun &&
is_sequential(ump,
ufs_rw32(ip->i_ffs1_db[bn - 1],
UFS_MPNEEDSWAP(ump)),
ufs_rw32(ip->i_ffs1_db[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
} else {
for (++bn; bn < UFS_NDADDR && *runp < maxrun &&
is_sequential(ump,
ufs_rw64(ip->i_ffs2_db[bn - 1],
UFS_MPNEEDSWAP(ump)),
ufs_rw64(ip->i_ffs2_db[bn],
UFS_MPNEEDSWAP(ump)));
++bn, ++*runp);
}
}
return (0);
} else if (bn < 0 && bn >= -UFS_NXADDR) {
		KASSERT(ump->um_fstype == UFS2 &&
		    (ump->um_flags & UFS_EA) != 0);
		daddr = ufs_rw64(ip->i_ffs2_extb[-1 - bn],
		    UFS_MPNEEDSWAP(ump));
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0)
*bnp = -1;
return 0;
}
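	/*
	 * Indirect block lookup: fall back to local scratch storage
	 * when the caller did not supply an indir array or level count.
	 */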
xap = ap == NULL ? a : ap;
if (!nump)
nump = #
if ((error = ufs_getlbns(vp, bn, xap, nump)) != 0)
return (error);
num = *nump;
/* Get disk address out of indirect block array */
if (ump->um_fstype == UFS1)
daddr = ufs_rw32(ip->i_ffs1_ib[xap->in_off],
UFS_MPNEEDSWAP(ump));
else
daddr = ufs_rw64(ip->i_ffs2_ib[xap->in_off],
UFS_MPNEEDSWAP(ump));
for (bp = NULL, ++xap; --num; ++xap) {
/*
* Exit the loop if there is no disk address assigned yet and
* the indirect block isn't in the cache, or if we were
* looking for an indirect block and we've found it.
*/
metalbn = xap->in_lbn;
if (metalbn == bn)
break;
if (daddr == 0) {
mutex_enter(&bufcache_lock);
cbp = incore(vp, metalbn);
mutex_exit(&bufcache_lock);
if (cbp == NULL)
break;
}
/*
* If we get here, we've either got the block in the cache
* or we have a disk address for it, go fetch it.
*/
		if (bp)
			brelse(bp, 0);
xap->in_exists = 1;
bp = getblk(vp, metalbn, mp->mnt_stat.f_iosize, 0, 0);
if (bp == NULL) {
/*
			 * getblk() above returns NULL only if we are
			 * the pagedaemon.  See the implementation of
			 * getblk for details.
*/
return (ENOMEM);
}
if (bp->b_oflags & (BO_DONE | BO_DELWRI)) {
trace(TR_BREADHIT, pack(vp, size), metalbn);
} else {
KASSERTMSG((daddr != 0),
"ufs_bmaparray: indirect block not in cache");
trace(TR_BREADMISS, pack(vp, size), metalbn);
bp->b_blkno = blkptrtodb(ump, daddr);
bp->b_flags |= B_READ;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
curlwp->l_ru.ru_inblock++; /* XXX */
			if ((error = biowait(bp)) != 0) {
				brelse(bp, 0);
				return (error);
			}
}
if (ump->um_fstype == UFS1) {
daddr = ufs_rw32(((u_int32_t *)bp->b_data)[xap->in_off],
UFS_MPNEEDSWAP(ump));
			if (num == 1 && daddr && runp) {
				for (bn = xap->in_off + 1;
				    bn < MNINDIR(ump) && *runp < maxrun &&
				    is_sequential(ump,
				    ufs_rw32(((int32_t *)bp->b_data)[bn-1],
					UFS_MPNEEDSWAP(ump)),
				    ufs_rw32(((int32_t *)bp->b_data)[bn],
					UFS_MPNEEDSWAP(ump)));
				    ++bn, ++*runp);
			}
} else {
daddr = ufs_rw64(((u_int64_t *)bp->b_data)[xap->in_off],
UFS_MPNEEDSWAP(ump));
			if (num == 1 && daddr && runp) {
				for (bn = xap->in_off + 1;
				    bn < MNINDIR(ump) && *runp < maxrun &&
				    is_sequential(ump,
				    ufs_rw64(((int64_t *)bp->b_data)[bn-1],
					UFS_MPNEEDSWAP(ump)),
				    ufs_rw64(((int64_t *)bp->b_data)[bn],
					UFS_MPNEEDSWAP(ump)));
				    ++bn, ++*runp);
			}
}
}
	if (bp)
		brelse(bp, 0);
/*
* Since this is FFS independent code, we are out of scope for the
* definitions of BLK_NOCOPY and BLK_SNAP, but we do know that they
* will fall in the range 1..um_seqinc, so we use that test and
* return a request for a zeroed out buffer if attempts are made
* to read a BLK_NOCOPY or BLK_SNAP block.
*/
	if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) == SF_SNAPSHOT
	    && daddr > 0 && daddr < ump->um_seqinc) {
		*bnp = -1;
return (0);
}
*bnp = blkptrtodb(ump, daddr);
if (*bnp == 0) {
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL))
== SF_SNAPSHOT) {
*bnp = blkptrtodb(ump, bn * ump->um_seqinc);
} else {
*bnp = -1;
}
}
return (0);
}
/*
* Create an array of logical block number/offset pairs which represent the
* path of indirect blocks required to access a data block. The first "pair"
* contains the logical block number of the appropriate single, double or
* triple indirect block and the offset into the inode indirect block array.
* Note, the logical block number of the inode single/double/triple indirect
* block appears twice in the array, once with the offset into the i_ffs1_ib and
* once with the offset into the page itself.
*/
int
ufs_getlbns(struct vnode *vp, daddr_t bn, struct indir *ap, int *nump)
{
daddr_t metalbn, realbn;
struct ufsmount *ump;
int64_t blockcnt;
int lbc;
int i, numlevels, off;
ump = VFSTOUFS(vp->v_mount);
	if (nump)
		*nump = 0;
numlevels = 0;
realbn = bn;
if (bn < 0)
bn = -bn;
KASSERT(bn >= UFS_NDADDR);
/*
* Determine the number of levels of indirection. After this loop
* is done, blockcnt indicates the number of data blocks possible
* at the given level of indirection, and UFS_NIADDR - i is the number
* of levels of indirection needed to locate the requested block.
*/
bn -= UFS_NDADDR;
for (lbc = 0, i = UFS_NIADDR;; i--, bn -= blockcnt) {
if (i == 0)
return (EFBIG);
lbc += ump->um_lognindir;
blockcnt = (int64_t)1 << lbc;
if (bn < blockcnt)
break;
}
/* Calculate the address of the first meta-block. */
metalbn = -((realbn >= 0 ? realbn : -realbn) - bn + UFS_NIADDR - i);
/*
* At each iteration, off is the offset into the bap array which is
* an array of disk addresses at the current level of indirection.
* The logical block number and the offset in that block are stored
* into the argument array.
*/
ap->in_lbn = metalbn;
ap->in_off = off = UFS_NIADDR - i;
ap->in_exists = 0;
ap++;
for (++numlevels; i <= UFS_NIADDR; i++) {
/* If searching for a meta-data block, quit when found. */
if (metalbn == realbn)
break;
lbc -= ump->um_lognindir;
off = (bn >> lbc) & (MNINDIR(ump) - 1);
++numlevels;
ap->in_lbn = metalbn;
ap->in_off = off;
ap->in_exists = 0;
++ap;
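		/*
		 * Advance to the next level of indirection, following
		 * the negative-lbn addressing scheme described above
		 * ufs_bmaparray().
		 */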
metalbn -= -1 + ((int64_t)off << lbc);
}
	if (nump)
		*nump = numlevels;
return (0);
}
/* $NetBSD: kern_kthread.c,v 1.49 2023/09/23 14:40:42 ad Exp $ */
/*-
* Copyright (c) 1998, 1999, 2007, 2009, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_kthread.c,v 1.49 2023/09/23 14:40:42 ad Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/mutex.h>
#include <sys/sched.h>
#include <sys/kmem.h>
#include <sys/msan.h>
#include <uvm/uvm_extern.h>
static kmutex_t kthread_lock;
static kcondvar_t kthread_cv;
void
kthread_sysinit(void)
{
mutex_init(&kthread_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&kthread_cv, "kthrwait");
}
/*
 * kthread_create: create a kernel thread, that is, a system-only LWP.
*/
int
kthread_create(pri_t pri, int flag, struct cpu_info *ci,
void (*func)(void *), void *arg, lwp_t **lp, const char *fmt, ...)
{
lwp_t *l;
vaddr_t uaddr;
int error, lc;
va_list ap;
KASSERT((flag & KTHREAD_INTR) == 0 || (flag & KTHREAD_MPSAFE) != 0);
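	/*
	 * Allocate the uarea.  Only idle LWPs (KTHREAD_IDLE without
	 * KTHREAD_INTR) pass the target CPU to the allocator.
	 */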
uaddr = uvm_uarea_system_alloc(
(flag & (KTHREAD_INTR|KTHREAD_IDLE)) == KTHREAD_IDLE ? ci : NULL);
if (uaddr == 0) {
return ENOMEM;
}
kmsan_orig((void *)uaddr, USPACE, KMSAN_TYPE_POOL, __RET_ADDR);
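	/*
	 * Timeshared kthreads use SCHED_OTHER; all others get the
	 * real-time round-robin class (SCHED_RR).
	 */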
if ((flag & KTHREAD_TS) != 0) {
lc = SCHED_OTHER;
} else {
lc = SCHED_RR;
}
error = lwp_create(&lwp0, &proc0, uaddr, LWP_DETACHED, NULL,
0, func, arg, &l, lc, &lwp0.l_sigmask, &lwp0.l_sigstk);
if (error) {
uvm_uarea_system_free(uaddr);
return error;
}
	if (fmt != NULL) {
		l->l_name = kmem_alloc(MAXCOMLEN, KM_SLEEP);
va_start(ap, fmt);
vsnprintf(l->l_name, MAXCOMLEN, fmt, ap);
va_end(ap);
}
/*
* Set parameters.
*/
if (pri == PRI_NONE) {
if ((flag & KTHREAD_TS) != 0) {
/* Maximum user priority level. */
pri = MAXPRI_USER;
} else {
/* Minimum kernel priority level. */
pri = PRI_KTHREAD;
}
}
mutex_enter(proc0.p_lock);
lwp_lock(l);
lwp_changepri(l, pri);
	if (ci != NULL) {
		if (ci != l->l_cpu) {
			lwp_unlock_to(l, ci->ci_schedstate.spc_lwplock);
			lwp_lock(l);
			l->l_cpu = ci;
		}
l->l_pflag |= LP_BOUND;
}
	if ((flag & KTHREAD_MUSTJOIN) != 0) {
		KASSERT(lp != NULL);
		l->l_pflag |= LP_MUSTJOIN;
	}
	if ((flag & KTHREAD_INTR) != 0) {
		l->l_pflag |= LP_INTR;
	}
if ((flag & KTHREAD_MPSAFE) == 0) {
l->l_pflag &= ~LP_MPSAFE;
}
/*
* Set the new LWP running, unless the caller has requested
* otherwise.
*/
KASSERT(l->l_stat == LSIDL);
if ((flag & KTHREAD_IDLE) == 0) {
setrunnable(l);
/* LWP now unlocked */
} else {
lwp_unlock(l);
}
mutex_exit(proc0.p_lock);
/* All done! */
	if (lp != NULL) {
		*lp = l;
	}
return 0;
}
/*
* Cause a kernel thread to exit. Assumes the exiting thread is the
* current context.
*/
void
kthread_exit(int ecode)
{
const char *name;
lwp_t *l = curlwp;
/* If the kernel lock is held, we need to drop it now. */
if ((l->l_pflag & LP_MPSAFE) == 0) {
KERNEL_UNLOCK_LAST(l);
}
/* We can't do much with the exit code, so just report it. */
if (ecode != 0) {
if ((name = l->l_name) == NULL)
name = "unnamed";
printf("WARNING: kthread `%s' (%d) exits with status %d\n",
name, l->l_lid, ecode);
}
/* Barrier for joining. */
if (l->l_pflag & LP_MUSTJOIN) {
bool *exitedp;
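		/*
		 * kthread_join() hands us a pointer to its local
		 * "exited" flag via l_private; wait for it to appear,
		 * set it, and wake the joiner.
		 */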
mutex_enter(&kthread_lock);
while ((exitedp = l->l_private) == NULL) {
cv_wait(&kthread_cv, &kthread_lock);
}
KASSERT(!*exitedp);
*exitedp = true;
cv_broadcast(&kthread_cv);
mutex_exit(&kthread_lock);
}
	/* And exit. */
lwp_exit(l);
panic("kthread_exit");
}
/*
 * Wait for a kthread to exit, like pthread_join().
*/
int
kthread_join(lwp_t *l)
{
bool exited = false;
KASSERT((l->l_flag & LW_SYSTEM) != 0);
KASSERT((l->l_pflag & LP_MUSTJOIN) != 0);
/*
* - Ask the kthread to write to `exited'.
* - After this, touching l is forbidden -- it may be freed.
* - Wait until the kthread has written to `exited'.
*/
mutex_enter(&kthread_lock);
KASSERT(l->l_private == NULL);
l->l_private = &exited;
cv_broadcast(&kthread_cv);
while (!exited) {
cv_wait(&kthread_cv, &kthread_lock);
}
mutex_exit(&kthread_lock);
return 0;
}
/*
* kthread_fpu_enter()
*
* Allow the current lwp, which must be a kthread, to use the FPU.
* Return a cookie that must be passed to kthread_fpu_exit when
* done. Must be used only in thread context. Recursive -- you
* can call kthread_fpu_enter several times in a row as long as
* you pass the cookies in reverse order to kthread_fpu_exit.
*/
int
kthread_fpu_enter(void)
{
struct lwp *l = curlwp;
int s;
KASSERTMSG(!cpu_intr_p(),
"%s is not allowed in interrupt context", __func__);
KASSERTMSG(!cpu_softintr_p(),
"%s is not allowed in interrupt context", __func__);
/*
* Remember whether this thread already had FPU access, and
* mark this thread as having FPU access.
*/
lwp_lock(l);
KASSERTMSG(l->l_flag & LW_SYSTEM,
"%s is allowed only in kthreads", __func__);
s = l->l_flag & LW_SYSTEM_FPU;
l->l_flag |= LW_SYSTEM_FPU;
lwp_unlock(l);
/* Take MD steps to enable the FPU if necessary. */
if (s == 0)
kthread_fpu_enter_md();
return s;
}
/*
* kthread_fpu_exit(s)
*
* Restore the current lwp's FPU access to what it was before the
* matching call to kthread_fpu_enter() that returned s. Must be
* used only in thread context.
*/
void
kthread_fpu_exit(int s)
{
struct lwp *l = curlwp;
KASSERT(s == (s & LW_SYSTEM_FPU));
KASSERTMSG(!cpu_intr_p(),
"%s is not allowed in interrupt context", __func__);
KASSERTMSG(!cpu_softintr_p(),
"%s is not allowed in interrupt context", __func__);
lwp_lock(l);
KASSERTMSG(l->l_flag & LW_SYSTEM,
"%s is allowed only in kthreads", __func__);
KASSERT(l->l_flag & LW_SYSTEM_FPU);
l->l_flag ^= s ^ LW_SYSTEM_FPU;
lwp_unlock(l);
/* Take MD steps to zero and disable the FPU if necessary. */
if (s == 0)
kthread_fpu_exit_md();
}
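/*
 * Illustrative sketch (not part of the original file): nested use of
 * kthread_fpu_enter()/kthread_fpu_exit().  As described above, the
 * cookies must be passed back in reverse order.  example_fpu_work() is
 * a hypothetical name.
 */
#if 0
static void
example_fpu_work(void)
{
	int s1, s2;

	s1 = kthread_fpu_enter();
	/* ... FPU-using code ... */
	s2 = kthread_fpu_enter();	/* recursion is allowed */
	/* ... more FPU-using code ... */
	kthread_fpu_exit(s2);
	kthread_fpu_exit(s1);
}
#endif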
/* $NetBSD: umap_vnops.c,v 1.62 2021/10/20 03:08:18 thorpej Exp $ */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)umap_vnops.c 8.6 (Berkeley) 5/22/95
*/
/*
* Umap Layer
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_vnops.c,v 1.62 2021/10/20 03:08:18 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/buf.h>
#include <sys/kauth.h>
#include <miscfs/umapfs/umap.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/layer_extern.h>
/*
* Note: If the LAYERFS_MBYPASSDEBUG flag is set, it is possible
* that the debug printing will bomb out, because kauth routines
* do not handle NOCRED or FSCRED like other credentials and end
* up dereferencing an inappropriate pointer.
*
* That should be fixed in kauth rather than here.
*/
int umap_lookup(void *);
int umap_getattr(void *);
int umap_print(void *);
int umap_rename(void *);
/*
* Global vfs data structures
*/
/*
* XXX - strategy, bwrite are hand coded currently. They should
* go away with a merged buffer/block cache.
*
*/
int (**umap_vnodeop_p)(void *);
const struct vnodeopv_entry_desc umap_vnodeop_entries[] = {
{ &vop_default_desc, umap_bypass },
{ &vop_lookup_desc, umap_lookup },
{ &vop_getattr_desc, umap_getattr },
{ &vop_print_desc, umap_print },
{ &vop_rename_desc, umap_rename },
{ &vop_fsync_desc, layer_fsync },
{ &vop_inactive_desc, layer_inactive },
{ &vop_reclaim_desc, layer_reclaim },
{ &vop_open_desc, layer_open },
{ &vop_close_desc, layer_close },
{ &vop_setattr_desc, layer_setattr },
{ &vop_access_desc, layer_access },
{ &vop_accessx_desc, genfs_accessx },
{ &vop_remove_desc, layer_remove },
{ &vop_revoke_desc, layer_revoke },
{ &vop_rmdir_desc, layer_rmdir },
{ &vop_bmap_desc, layer_bmap },
{ &vop_getpages_desc, layer_getpages },
{ &vop_putpages_desc, layer_putpages },
{ NULL, NULL }
};
const struct vnodeopv_desc umapfs_vnodeop_opv_desc =
{ &umap_vnodeop_p, umap_vnodeop_entries };
/*
* This is the 08-June-1999 bypass routine.
* See layer_vnops.c:layer_bypass for more details.
*/
int
umap_bypass(void *v)
{
struct vop_generic_args /* {
struct vnodeop_desc *a_desc;
<other random data follows, presumably>
} */ *ap = v;
int (**our_vnodeop_p)(void *);
kauth_cred_t *credpp = NULL, credp = 0;
kauth_cred_t savecredp = 0, savecompcredp = 0;
kauth_cred_t compcredp = 0;
struct vnode **this_vp_p;
int error;
struct vnode *old_vps[VDESC_MAX_VPS], *vp0;
struct vnode **vps_p[VDESC_MAX_VPS];
struct vnode ***vppp;
struct vnodeop_desc *descp = ap->a_desc;
int reles, i, flags;
struct componentname **compnamepp = 0;
#ifdef DIAGNOSTIC
/*
* We require at least one vp.
*/
if (descp->vdesc_vp_offsets == NULL ||
descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
panic("%s: no vp's in map.\n", __func__);
#endif
vps_p[0] =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
vp0 = *vps_p[0];
flags = MOUNTTOUMAPMOUNT(vp0->v_mount)->umapm_flags;
our_vnodeop_p = vp0->v_op;
if (flags & LAYERFS_MBYPASSDEBUG)
printf("%s: %s\n", __func__, descp->vdesc_name);
/*
* Map the vnodes going in.
* Later, we'll invoke the operation based on
* the first mapped vnode's operation vector.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
vps_p[i] = this_vp_p =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i],
ap);
/*
* We're not guaranteed that any but the first vnode
* are of our type. Check for and don't map any
* that aren't. (We must always map first vp or vclean fails.)
*/
if (i && (*this_vp_p == NULL ||
(*this_vp_p)->v_op != our_vnodeop_p)) {
old_vps[i] = NULL;
} else {
old_vps[i] = *this_vp_p;
*(vps_p[i]) = UMAPVPTOLOWERVP(*this_vp_p);
/*
* XXX - Several operations have the side effect
* of vrele'ing their vp's. We must account for
* that. (This should go away in the future.)
*/
if (reles & VDESC_VP0_WILLRELE)
vref(*this_vp_p);
}
}
/*
* Fix the credentials. (That's the purpose of this layer.)
*/
if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) {
credpp = VOPARG_OFFSETTO(kauth_cred_t*,
descp->vdesc_cred_offset, ap);
/* Save old values */
savecredp = *credpp;
if (savecredp != NOCRED && savecredp != FSCRED)
*credpp = kauth_cred_dup(savecredp);
credp = *credpp;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: user was %d, group %d\n",
kauth_cred_geteuid(credp), kauth_cred_getegid(credp));
/* Map all ids in the credential structure. */
umap_mapids(vp0->v_mount, credp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: user now %d, group %d\n",
kauth_cred_geteuid(credp), kauth_cred_getegid(credp));
}
/* BSD often keeps a credential in the componentname structure
* for speed. If there is one, it better get mapped, too.
*/
if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) {
compnamepp = VOPARG_OFFSETTO(struct componentname**,
descp->vdesc_componentname_offset, ap);
savecompcredp = (*compnamepp)->cn_cred;
if (savecompcredp != NOCRED && savecompcredp != FSCRED)
(*compnamepp)->cn_cred = kauth_cred_dup(savecompcredp);
compcredp = (*compnamepp)->cn_cred;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(vp0->v_mount, compcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
}
/*
* Call the operation on the lower layer
* with the modified argument structure.
*/
error = VCALL(*vps_p[0], descp->vdesc_offset, ap);
/*
* Maintain the illusion of call-by-value
* by restoring vnodes in the argument structure
* to their original value.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
if (old_vps[i]) {
*(vps_p[i]) = old_vps[i];
if (reles & VDESC_VP0_WILLRELE)
vrele(*(vps_p[i]));
}
}
/*
* Map the possible out-going vpp
* (Assumes that the lower layer always returns
* a VREF'ed vpp unless it gets an error.)
*/
if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !error) {
vppp = VOPARG_OFFSETTO(struct vnode***,
descp->vdesc_vpp_offset, ap);
/*
* Only vop_lookup, vop_create, vop_makedir, vop_mknod
* and vop_symlink return vpp's. vop_lookup doesn't call bypass
* as a lookup on "." would generate a locking error.
* So all the calls which get us here have an unlocked vpp. :-)
*/
error = layer_node_create(old_vps[0]->v_mount, **vppp, *vppp);
if (error) {
vrele(**vppp);
**vppp = NULL;
}
}
/*
* Free duplicate cred structure and restore old one.
*/
if (descp->vdesc_cred_offset != VDESC_NO_OFFSET) {
if ((flags & LAYERFS_MBYPASSDEBUG) && credp &&
kauth_cred_geteuid(credp) != 0)
printf("umap_bypass: returning-user was %d\n",
kauth_cred_geteuid(credp));
if (savecredp != NOCRED && savecredp != FSCRED && credpp) {
kauth_cred_free(credp);
*credpp = savecredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && credpp &&
kauth_cred_geteuid(*credpp) != 0)
printf("umap_bypass: returning-user now %d\n\n",
kauth_cred_geteuid(savecredp));
}
}
if (descp->vdesc_componentname_offset != VDESC_NO_OFFSET) {
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_bypass: returning-component-user was %d\n",
kauth_cred_geteuid(compcredp));
if (savecompcredp != NOCRED && savecompcredp != FSCRED) {
kauth_cred_free(compcredp);
(*compnamepp)->cn_cred = savecompcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && savecompcredp &&
kauth_cred_geteuid(savecompcredp) != 0)
printf("umap_bypass: returning-component-user now %d\n",
kauth_cred_geteuid(savecompcredp));
}
}
return (error);
}
/*
* This is based on the 08-June-1999 bypass routine.
* See layer_vnops.c:layer_bypass for more details.
*/
int
umap_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnodeop_desc *a_desc;
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t savecompcredp = NULL;
kauth_cred_t compcredp = NULL;
struct vnode *dvp, *vp, *ldvp;
struct mount *mp;
int error;
int flags, cnf = cnp->cn_flags;
dvp = ap->a_dvp;
mp = dvp->v_mount;
if ((cnf & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
return (EROFS);
flags = MOUNTTOUMAPMOUNT(mp)->umapm_flags;
ldvp = UMAPVPTOLOWERVP(dvp);
if (flags & LAYERFS_MBYPASSDEBUG)
printf("umap_lookup\n");
/*
* Fix the credentials. (That's the purpose of this layer.)
*
* BSD often keeps a credential in the componentname structure
* for speed. If there is one, it better get mapped, too.
*/
if ((savecompcredp = cnp->cn_cred)) {
compcredp = kauth_cred_dup(savecompcredp);
cnp->cn_cred = compcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(mp, compcredp);
}
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
ap->a_dvp = ldvp;
error = VCALL(ldvp, ap->a_desc->vdesc_offset, ap);
vp = *ap->a_vpp;
*ap->a_vpp = NULL;
if (error == EJUSTRETURN && (cnf & ISLASTCN) &&
(dvp->v_mount->mnt_flag & MNT_RDONLY) &&
(cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
error = EROFS;
/* Do locking fixup as appropriate. See layer_lookup() for info */
if (ldvp == vp) {
*ap->a_vpp = dvp;
vref(dvp);
vrele(vp);
} else if (vp != NULL) {
error = layer_node_create(mp, vp, ap->a_vpp);
if (error) {
vrele(vp);
}
}
/*
* Free duplicate cred structure and restore old one.
*/
if ((flags & LAYERFS_MBYPASSDEBUG) && compcredp &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_lookup: returning-component-user was %d\n",
kauth_cred_geteuid(compcredp));
if (savecompcredp != NOCRED && savecompcredp != FSCRED) {
if (compcredp)
kauth_cred_free(compcredp);
cnp->cn_cred = savecompcredp;
if ((flags & LAYERFS_MBYPASSDEBUG) && savecompcredp &&
kauth_cred_geteuid(savecompcredp) != 0)
printf("umap_lookup: returning-component-user now %d\n",
kauth_cred_geteuid(savecompcredp));
}
return (error);
}
/*
* We handle getattr to change the fsid.
*/
int
umap_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
uid_t uid;
gid_t gid;
int error, tmpid, nentries, gnentries, flags;
u_long (*mapdata)[2];
u_long (*gmapdata)[2];
struct vnode **vp1p;
const struct vnodeop_desc *descp = ap->a_desc;
if ((error = umap_bypass(ap)) != 0)
return (error);
/* Requires that arguments be restored. */
ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
flags = MOUNTTOUMAPMOUNT(ap->a_vp->v_mount)->umapm_flags;
/*
* Umap needs to map the uid and gid returned by a stat
* into the proper values for this site. This involves
* finding the returned uid in the mapping information,
* translating it into the uid on the other end,
* and filling in the proper field in the vattr
* structure pointed to by ap->a_vap. The group
* is easier, since currently all groups will be
* translated to the NULLGROUP.
*/
/* Find entry in map */
uid = ap->a_vap->va_uid;
gid = ap->a_vap->va_gid;
if ((flags & LAYERFS_MBYPASSDEBUG))
printf("umap_getattr: mapped uid = %d, mapped gid = %d\n", uid,
gid);
vp1p = VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
nentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_nentries;
mapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_mapdata);
gnentries = MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gnentries;
gmapdata = (MOUNTTOUMAPMOUNT((*vp1p)->v_mount)->info_gmapdata);
/* Reverse map the uid for the vnode. Since it's a reverse
map, we can't use umap_mapids() to do it. */
tmpid = umap_reverse_findid(uid, mapdata, nentries);
if (tmpid != -1) {
ap->a_vap->va_uid = (uid_t) tmpid;
if ((flags & LAYERFS_MBYPASSDEBUG))
printf("umap_getattr: original uid = %d\n", uid);
} else
ap->a_vap->va_uid = (uid_t) NOBODY;
/* Reverse map the gid for the vnode. */
tmpid = umap_reverse_findid(gid, gmapdata, gnentries);
if (tmpid != -1) {
ap->a_vap->va_gid = (gid_t) tmpid;
if ((flags & LAYERFS_MBYPASSDEBUG))
printf("umap_getattr: original gid = %d\n", gid);
} else
ap->a_vap->va_gid = (gid_t) NULLGROUP;
return (0);
}
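/*
 * Illustrative sketch (not part of the original file): the shape of a
 * reverse lookup over an N x 2 id-mapping table of the kind used above.
 * example_reverse_findid() is a hypothetical stand-in for
 * umap_reverse_findid(); which column holds the lower-layer id and
 * which holds the original id is an assumption here.
 */
#if 0
static int
example_reverse_findid(u_long id, u_long map[][2], int nentries)
{
	int i;

	for (i = 0; i < nentries; i++) {
		/* assume map[i][0] = mapped (lower) id, map[i][1] = original id */
		if (map[i][0] == id)
			return (int)map[i][1];
	}
	return -1;	/* not found; the caller substitutes NOBODY/NULLGROUP */
}
#endif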
int
umap_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
printf("\ttag VT_UMAPFS, vp=%p, lowervp=%p\n", vp,
UMAPVPTOLOWERVP(vp));
return (0);
}
int
umap_rename(void *v)
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
int error, flags;
struct componentname *compnamep;
kauth_cred_t compcredp, savecompcredp;
struct vnode *vp;
struct vnode *tvp;
/*
* Rename is irregular, having two componentname structures.
* We need to map the cred in the second structure,
* and then bypass takes care of the rest.
*/
vp = ap->a_fdvp;
flags = MOUNTTOUMAPMOUNT(vp->v_mount)->umapm_flags;
compnamep = ap->a_tcnp;
compcredp = compnamep->cn_cred;
savecompcredp = compcredp;
compcredp = compnamep->cn_cred = kauth_cred_dup(savecompcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_rename: rename component credit user was %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
/* Map all ids in the credential structure. */
umap_mapids(vp->v_mount, compcredp);
if ((flags & LAYERFS_MBYPASSDEBUG) &&
kauth_cred_geteuid(compcredp) != 0)
printf("umap_rename: rename component credit user now %d, group %d\n",
kauth_cred_geteuid(compcredp), kauth_cred_getegid(compcredp));
tvp = ap->a_tvp;
if (tvp) {
if (tvp->v_mount != vp->v_mount)
tvp = NULL;
else
vref(tvp);
}
error = umap_bypass(ap);
if (tvp) {
if (error == 0)
VTOLAYER(tvp)->layer_flags |= LAYERFS_REMOVED;
vrele(tvp);
}
/* Restore the additional mapped componentname cred structure. */
kauth_cred_free(compcredp);
compnamep->cn_cred = savecompcredp;
return error;
}
/* $NetBSD: kern_ktrace_vfs.c,v 1.3 2021/06/29 22:40:53 dholland Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_ktrace_vfs.c,v 1.3 2021/06/29 22:40:53 dholland Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/ktrace.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
/*
* ktrace system call, the part of the ktrace framework that
* explicitly interacts with VFS
*/
/* ARGSUSED */
int
sys_ktrace(struct lwp *l, const struct sys_ktrace_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) fname;
syscallarg(int) ops;
syscallarg(int) facs;
syscallarg(int) pid;
} */
struct vnode *vp = NULL;
file_t *fp = NULL;
struct pathbuf *pb;
int error = 0;
int fd;
if (ktrenter(l))
return EAGAIN;
if (KTROP(SCARG(uap, ops)) != KTROP_CLEAR) {
/*
* an operation which requires a file argument.
*/
error = pathbuf_copyin(SCARG(uap, fname), &pb);
if (error) {
ktrexit(l);
return (error);
}
error = vn_open(NULL, pb, 0, FREAD|FWRITE, 0, &vp, NULL, NULL);
if (error != 0) {
pathbuf_destroy(pb);
ktrexit(l);
return (error);
}
pathbuf_destroy(pb);
VOP_UNLOCK(vp);
if (vp->v_type != VREG) {
vn_close(vp, FREAD|FWRITE, l->l_cred);
ktrexit(l);
return (EACCES);
}
/*
* This uses up a file descriptor slot in the
* tracing process for the duration of this syscall.
* This is not expected to be a problem.
*/
if ((error = fd_allocfile(&fp, &fd)) != 0) {
vn_close(vp, FWRITE, l->l_cred);
ktrexit(l);
return error;
}
fp->f_flag = FWRITE;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_vnode = vp;
vp = NULL;
}
error = ktrace_common(l, SCARG(uap, ops), SCARG(uap, facs),
SCARG(uap, pid), &fp);
if (KTROP(SCARG(uap, ops)) != KTROP_CLEAR)
fd_abort(curproc, fp, fd);
return (error);
}
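/*
 * Illustrative sketch (not part of the original file): how a userland
 * program might invoke the ktrace(2) system call handled above.  The
 * choice of trace points is only an example, and the exact set of
 * headers follows the ktrace(2) manual page.
 */
#if 0
#include <sys/param.h>
#include <sys/ktrace.h>
#include <unistd.h>

static int
example_trace_self(void)
{
	/* Start tracing this process's syscalls into ./ktrace.out. */
	if (ktrace("ktrace.out", KTROP_SET,
	    KTRFAC_SYSCALL | KTRFAC_SYSRET, getpid()) == -1)
		return -1;

	/* ... traced activity ... */

	/* Stop tracing. */
	return ktrace("ktrace.out", KTROP_CLEAR,
	    KTRFAC_SYSCALL | KTRFAC_SYSRET, getpid());
}
#endif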
/* $NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $ */
/* NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_pageout.c 8.5 (Berkeley) 2/14/94
* from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
#if defined(PDSIM)
#include "pdsim.h"
#else /* defined(PDSIM) */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pdpolicy_impl.h>
#include <uvm/uvm_stat.h>
#endif /* defined(PDSIM) */
/*
* per-CPU queue of pending page status changes. 128 entries makes for a
* 1kB queue on _LP64 and has been found to be a reasonable compromise that
* keeps lock contention events and wait times low, while not using too much
* memory nor allowing global state to fall too far behind.
*/
#if !defined(CLOCK_PDQ_SIZE)
#define CLOCK_PDQ_SIZE 128
#endif /* !defined(CLOCK_PDQ_SIZE) */
#define PQ_INACTIVE 0x00000010 /* page is in inactive list */
#define PQ_ACTIVE 0x00000020 /* page is in active list */
#if !defined(CLOCK_INACTIVEPCT)
#define CLOCK_INACTIVEPCT 33
#endif /* !defined(CLOCK_INACTIVEPCT) */
struct uvmpdpol_globalstate {
kmutex_t lock; /* lock on state */
/* <= compiler pads here */
struct pglist s_activeq /* allocated pages, in use */
__aligned(COHERENCY_UNIT);
struct pglist s_inactiveq; /* pages between the clock hands */
int s_active;
int s_inactive;
int s_inactarg;
struct uvm_pctparam s_anonmin;
struct uvm_pctparam s_filemin;
struct uvm_pctparam s_execmin;
struct uvm_pctparam s_anonmax;
struct uvm_pctparam s_filemax;
struct uvm_pctparam s_execmax;
struct uvm_pctparam s_inactivepct;
};
struct uvmpdpol_scanstate {
bool ss_anonreact, ss_filereact, ss_execreact;
struct vm_page ss_marker;
};
static void uvmpdpol_pageactivate_locked(struct vm_page *);
static void uvmpdpol_pagedeactivate_locked(struct vm_page *);
static void uvmpdpol_pagedequeue_locked(struct vm_page *);
static bool uvmpdpol_pagerealize_locked(struct vm_page *);
static struct uvm_cpu *uvmpdpol_flush(void);
static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
static struct uvmpdpol_scanstate pdpol_scanstate;
PDPOL_EVCNT_DEFINE(reactexec)
PDPOL_EVCNT_DEFINE(reactfile)
PDPOL_EVCNT_DEFINE(reactanon)
static void
clock_tune(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
s->s_active + s->s_inactive);
if (s->s_inactarg <= uvmexp.freetarg) {
s->s_inactarg = uvmexp.freetarg + 1;
}
}
void
uvmpdpol_scaninit(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
int t;
bool anonunder, fileunder, execunder;
bool anonover, fileover, execover;
bool anonreact, filereact, execreact;
int64_t freepg, anonpg, filepg, execpg;
/*
* decide which types of pages we want to reactivate instead of freeing
* to keep usage within the minimum and maximum usage limits.
* uvm_availmem() will sync the counters.
*/
freepg = uvm_availmem(false);
anonpg = cpu_count_get(CPU_COUNT_ANONCLEAN) +
cpu_count_get(CPU_COUNT_ANONDIRTY) +
cpu_count_get(CPU_COUNT_ANONUNKNOWN);
execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
filepg = cpu_count_get(CPU_COUNT_FILECLEAN) +
cpu_count_get(CPU_COUNT_FILEDIRTY) +
cpu_count_get(CPU_COUNT_FILEUNKNOWN) -
execpg;
mutex_enter(&s->lock);
t = s->s_active + s->s_inactive + freepg;
anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
anonreact = anonunder || (!anonover && (fileover || execover));
filereact = fileunder || (!fileover && (anonover || execover));
execreact = execunder || (!execover && (anonover || fileover));
if (filereact && execreact && (anonreact || uvm_swapisfull())) {
anonreact = filereact = execreact = false;
}
ss->ss_anonreact = anonreact;
ss->ss_filereact = filereact;
ss->ss_execreact = execreact;
memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
ss->ss_marker.flags = PG_MARKER;
TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
mutex_exit(&s->lock);
}
void
uvmpdpol_scanfini(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
mutex_enter(&s->lock);
TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
mutex_exit(&s->lock);
}
struct vm_page *
uvmpdpol_selectvictim(krwlock_t **plock)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
struct vm_page *pg;
krwlock_t *lock;
mutex_enter(&s->lock);
while (/* CONSTCOND */ 1) {
struct vm_anon *anon;
struct uvm_object *uobj;
pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
if (pg == NULL) {
break;
}
KASSERT((pg->flags & PG_MARKER) == 0);
uvmexp.pdscans++;
/*
* acquire interlock to stabilize page identity.
* if we have caught the page in a state of flux
* deal with it and retry.
*/
mutex_enter(&pg->interlock);
if (uvmpdpol_pagerealize_locked(pg)) {
mutex_exit(&pg->interlock);
continue;
}
/*
* now prepare to move on to the next page.
*/
TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
pdqueue);
TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
&ss->ss_marker, pdqueue);
/*
* enforce the minimum thresholds on different
* types of memory usage. if reusing the current
* page would reduce that type of usage below its
* minimum, reactivate the page instead and move
* on to the next page.
*/
anon = pg->uanon;
uobj = pg->uobject;
if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactexec);
continue;
}
if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
!UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactfile);
continue;
}
if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
PDPOL_EVCNT_INCR(reactanon);
continue;
}
/*
* try to lock the object that owns the page.
*
* with the page interlock held, we can drop s->lock, which
* could otherwise serve as a barrier to us getting the
* object locked, because the owner of the object's lock may
* be blocked on s->lock (i.e. a deadlock).
*
* whatever happens, uvmpd_trylockowner() will release the
* interlock. with the interlock dropped we can then
* re-acquire our own lock. the order is:
*
* object -> pdpol -> interlock.
*/
mutex_exit(&s->lock);
lock = uvmpd_trylockowner(pg);
/* pg->interlock now released */
mutex_enter(&s->lock);
if (lock == NULL) {
/* didn't get it - try the next page. */
continue;
}
/*
* move referenced pages back to active queue and skip to
* next page.
*/
if (pmap_is_referenced(pg)) {
mutex_enter(&pg->interlock);
uvmpdpol_pageactivate_locked(pg);
mutex_exit(&pg->interlock);
uvmexp.pdreact++;
rw_exit(lock);
continue;
}
/* we have a potential victim. */
*plock = lock;
break;
}
mutex_exit(&s->lock);
return pg;
}
void
uvmpdpol_balancequeue(int swap_shortage)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
int inactive_shortage;
struct vm_page *p, marker;
krwlock_t *lock;
/*
* we have done the scan to get free pages. now we work on meeting
* our inactive target.
*/
memset(&marker, 0, sizeof(marker));
marker.flags = PG_MARKER;
mutex_enter(&s->lock);
TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
for (;;) {
inactive_shortage =
pdpol_state.s_inactarg - pdpol_state.s_inactive;
if (inactive_shortage <= 0 && swap_shortage <= 0) {
break;
}
p = TAILQ_NEXT(&marker, pdqueue);
if (p == NULL) {
break;
}
KASSERT((p->flags & PG_MARKER) == 0);
/*
* acquire interlock to stabilize page identity.
* if we have caught the page in a state of flux
* deal with it and retry.
*/
mutex_enter(&p->interlock);
if (uvmpdpol_pagerealize_locked(p)) {
mutex_exit(&p->interlock);
continue;
}
/*
* now prepare to move on to the next page.
*/
TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
pdqueue);
/*
* try to lock the object that owns the page. see comments
* in uvmpdol_selectvictim().
*/
mutex_exit(&s->lock);
lock = uvmpd_trylockowner(p);
/* p->interlock now released */
mutex_enter(&s->lock);
if (lock == NULL) {
/* didn't get it - try the next page. */
continue;
}
/*
* if there's a shortage of swap slots, try to free it.
*/
if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
(p->flags & PG_BUSY) == 0) {
if (uvmpd_dropswap(p)) {
swap_shortage--;
}
}
/*
* if there's a shortage of inactive pages, deactivate.
*/
if (inactive_shortage > 0) {
pmap_clear_reference(p);
mutex_enter(&p->interlock);
uvmpdpol_pagedeactivate_locked(p);
mutex_exit(&p->interlock);
uvmexp.pddeact++;
inactive_shortage--;
}
rw_exit(lock);
}
TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
mutex_exit(&s->lock);
}
static void
uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
(PQ_INTENT_D | PQ_INTENT_SET));
if (pg->pqflags & PQ_ACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
KASSERT(pdpol_state.s_active > 0);
pdpol_state.s_active--;
}
if ((pg->pqflags & PQ_INACTIVE) == 0) {
KASSERT(pg->wire_count == 0);
TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
pdpol_state.s_inactive++;
}
pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET);
pg->pqflags |= PQ_INACTIVE;
}
void
uvmpdpol_pagedeactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
/*
* we have to clear the reference bit now, as when it comes time to
* realize the intent we won't have the object locked any more.
*/
pmap_clear_reference(pg);
uvmpdpol_set_intent(pg, PQ_INTENT_I);
}
static void
uvmpdpol_pageactivate_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
(PQ_INTENT_D | PQ_INTENT_SET));
uvmpdpol_pagedequeue_locked(pg);
TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
pdpol_state.s_active++;
pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET);
pg->pqflags |= PQ_ACTIVE;
}
void
uvmpdpol_pageactivate(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_A);
}
static void
uvmpdpol_pagedequeue_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
if (pg->pqflags & PQ_ACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
KASSERT(pdpol_state.s_active > 0);
pdpol_state.s_active--;
} else if (pg->pqflags & PQ_INACTIVE) {
TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
KASSERT(pdpol_state.s_inactive > 0);
pdpol_state.s_inactive--;
}
pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET);
}
void
uvmpdpol_pagedequeue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_D);
}
void
uvmpdpol_pageenqueue(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, false));
KASSERT(mutex_owned(&pg->interlock));
uvmpdpol_set_intent(pg, PQ_INTENT_E);
}
void
uvmpdpol_anfree(struct vm_anon *an)
{
}
bool
uvmpdpol_pageisqueued_p(struct vm_page *pg)
{
uint32_t pqflags;
/*
* if there's an intent set, we have to consider it. otherwise,
* return the actual state. we may be called unlocked for the
* purpose of assertions, which is safe due to the page lifecycle.
*/
pqflags = atomic_load_relaxed(&pg->pqflags);
if ((pqflags & PQ_INTENT_SET) != 0) {
return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
} else {
return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
}
}
bool
uvmpdpol_pageactivate_p(struct vm_page *pg)
{
uint32_t pqflags;
/* consider intent in preference to actual state. */
pqflags = atomic_load_relaxed(&pg->pqflags);
if ((pqflags & PQ_INTENT_SET) != 0) {
pqflags &= PQ_INTENT_MASK;
return pqflags != PQ_INTENT_A && pqflags != PQ_INTENT_E;
} else {
/*
* TODO: Enabling this may be too much of a big hammer,
* since we do get useful information from activations.
* Think about it more and maybe come up with a heuristic
* or something.
*
* return (pqflags & PQ_ACTIVE) == 0;
*/
return true;
}
}
void
uvmpdpol_estimatepageable(int *active, int *inactive)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
/*
* Don't take any locks here. This can be called from DDB, and in
* any case the numbers are stale the instant the lock is dropped,
* so it just doesn't matter.
*/
if (active) {
*active = s->s_active;
}
if (inactive) {
*inactive = s->s_inactive;
}
}
#if !defined(PDSIM)
static int
min_check(struct uvm_pctparam *pct, int t)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
int total = t;
if (pct != &s->s_anonmin) {
total += uvm_pctparam_get(&s->s_anonmin);
}
if (pct != &s->s_filemin) {
total += uvm_pctparam_get(&s->s_filemin);
}
if (pct != &s->s_execmin) {
total += uvm_pctparam_get(&s->s_execmin);
}
if (total > 95) {
return EINVAL;
}
return 0;
}
#endif /* !defined(PDSIM) */
void
uvmpdpol_init(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&s->s_activeq);
TAILQ_INIT(&s->s_inactiveq);
uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
uvm_pctparam_init(&s->s_anonmin, 10, min_check);
uvm_pctparam_init(&s->s_filemin, 10, min_check);
uvm_pctparam_init(&s->s_execmin, 5, min_check);
uvm_pctparam_init(&s->s_anonmax, 80, NULL);
uvm_pctparam_init(&s->s_filemax, 50, NULL);
uvm_pctparam_init(&s->s_execmax, 30, NULL);
}
void
uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
{
ucpu->pdq =
kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
ucpu->pdqhead = CLOCK_PDQ_SIZE;
ucpu->pdqtail = CLOCK_PDQ_SIZE;
}
void
uvmpdpol_reinit(void)
{
}
bool
uvmpdpol_needsscan_p(void)
{
/*
* this must be an unlocked check: can be called from interrupt.
*/
return pdpol_state.s_inactive < pdpol_state.s_inactarg;
}
void
uvmpdpol_tune(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
mutex_enter(&s->lock);
clock_tune();
mutex_exit(&s->lock);
}
/*
* uvmpdpol_pagerealize_locked: take the intended state set on a page and
* make it real. return true if any work was done.
*/
static bool
uvmpdpol_pagerealize_locked(struct vm_page *pg)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
KASSERT(mutex_owned(&s->lock));
KASSERT(mutex_owned(&pg->interlock));
switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
case PQ_INTENT_A | PQ_INTENT_SET:
case PQ_INTENT_E | PQ_INTENT_SET:
uvmpdpol_pageactivate_locked(pg);
return true;
case PQ_INTENT_I | PQ_INTENT_SET:
uvmpdpol_pagedeactivate_locked(pg);
return true;
case PQ_INTENT_D | PQ_INTENT_SET:
uvmpdpol_pagedequeue_locked(pg);
return true;
default:
return false;
}
}
/*
* uvmpdpol_flush: return the current uvm_cpu with all of its pending
* updates flushed to the global queues. this routine may block, and
* so can switch cpu. the idea is to empty the queue on whatever cpu
* we finally end up on.
*/
static struct uvm_cpu *
uvmpdpol_flush(void)
{
struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
struct uvm_cpu *ucpu;
struct vm_page *pg;
KASSERT(kpreempt_disabled());
mutex_enter(&s->lock);
for (;;) {
/*
* prefer scanning forwards (even though mutex_enter() is
* serializing) so as to not defeat any prefetch logic in
* the CPU. that means elsewhere enqueuing backwards, like
* a stack, but not so important there as pages are being
* added singularly.
*
* prefetch the next "struct vm_page" while working on the
* current one. this has a measurable and very positive
* effect in reducing the amount of time spent here under
* the global lock.
*/
ucpu = curcpu()->ci_data.cpu_uvm;
KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
break;
}
pg = ucpu->pdq[ucpu->pdqhead++];
if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
}
mutex_enter(&pg->interlock);
pg->pqflags &= ~PQ_INTENT_QUEUED;
(void)uvmpdpol_pagerealize_locked(pg);
mutex_exit(&pg->interlock);
}
mutex_exit(&s->lock);
return ucpu;
}
/*
* uvmpdpol_pagerealize: realize any intent set on the page. in this
* implementation, that means putting the page on a per-CPU queue to be
* dealt with later.
*/
void
uvmpdpol_pagerealize(struct vm_page *pg)
{
struct uvm_cpu *ucpu;
/*
* drain the per-CPU queue if full, then enter the page.
*/
kpreempt_disable();
ucpu = curcpu()->ci_data.cpu_uvm;
if (__predict_false(ucpu->pdqhead == 0)) {
ucpu = uvmpdpol_flush();
}
ucpu->pdq[--(ucpu->pdqhead)] = pg;
kpreempt_enable();
}
/*
* uvmpdpol_idle: called from the system idle loop. periodically purge any
* pending updates back to the global queues.
*/
void
uvmpdpol_idle(struct uvm_cpu *ucpu)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
struct vm_page *pg;
KASSERT(kpreempt_disabled());
/*
* if no pages in the queue, we have nothing to do.
*/
if (ucpu->pdqhead == ucpu->pdqtail) {
ucpu->pdqtime = getticks();
return;
}
/*
* don't do this more than ~8 times a second as it would needlessly
* exert pressure.
*/
if (getticks() - ucpu->pdqtime < (hz >> 3)) {
return;
}
/*
* the idle LWP can't block, so we have to try for the lock. if we
* get it, purge the per-CPU pending update queue. continually
* check for a pending resched: in that case exit immediately.
*/
if (mutex_tryenter(&s->lock)) {
while (ucpu->pdqhead != ucpu->pdqtail) {
pg = ucpu->pdq[ucpu->pdqhead];
if (!mutex_tryenter(&pg->interlock)) {
break;
}
ucpu->pdqhead++;
pg->pqflags &= ~PQ_INTENT_QUEUED;
(void)uvmpdpol_pagerealize_locked(pg);
mutex_exit(&pg->interlock);
if (curcpu()->ci_want_resched) {
break;
}
}
if (ucpu->pdqhead == ucpu->pdqtail) {
ucpu->pdqtime = getticks();
}
mutex_exit(&s->lock);
}
}
#if !defined(PDSIM)
#include <sys/sysctl.h> /* XXX SYSCTL_DESCR */
void
uvmpdpol_sysctlsetup(void)
{
struct uvmpdpol_globalstate *s = &pdpol_state;
uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for anonymous application data"));
uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for cached file data"));
uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
SYSCTL_DESCR("Percentage of physical memory reserved "
"for cached executable data"));
uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for "
"anonymous application data"));
uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for cached "
"file data"));
uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
SYSCTL_DESCR("Percentage of physical memory which will "
"be reclaimed from other usage for cached "
"executable data"));
uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
SYSCTL_DESCR("Percentage of inactive queue of "
"the entire (active + inactive) queue"));
}
#endif /* !defined(PDSIM) */
#if defined(PDSIM)
void
pdsim_dump(const char *id)
{
#if defined(DEBUG)
/* XXX */
#endif /* defined(DEBUG) */
}
#endif /* defined(PDSIM) */
/* $NetBSD: strncpy.c,v 1.4 2018/02/04 01:13:45 mrg Exp $ */
/*-
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Chris Torek.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(LIBC_SCCS) && !defined(lint)
#if 0
static char sccsid[] = "@(#)strncpy.c 8.1 (Berkeley) 6/4/93";
#else
__RCSID("$NetBSD: strncpy.c,v 1.4 2018/02/04 01:13:45 mrg Exp $");
#endif
#endif /* LIBC_SCCS and not lint */
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#else
#include <lib/libkern/libkern.h>
#endif
#ifdef _FORTIFY_SOURCE
#undef strncpy
#endif
/*
* Copy src to dst, truncating or null-padding to always copy n bytes.
* Return dst.
*/
char *
strncpy(char *dst, const char *src, size_t n)
{
if (n != 0) {
char *d = dst;
const char *s = src;
do {
if ((*d++ = *s++) == 0) {
/* NUL pad the remaining n-1 bytes */
while (--n != 0)
*d++ = 0;
break;
}
} while (--n != 0);
}
return (dst);
}
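/*
 * Illustrative sketch (not part of the original file): the two
 * behaviours described in the comment above.  Note that when src does
 * not fit, dst is left without a terminating NUL.
 */
#if 0
static void
example_strncpy(void)
{
	char small[4], big[8];

	/* Truncation: small becomes 'a','b','c','d' -- no NUL. */
	strncpy(small, "abcdef", sizeof(small));

	/* Padding: big becomes "ab" followed by six NUL bytes. */
	strncpy(big, "ab", sizeof(big));
}
#endif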
/* $NetBSD: in_cksum.c,v 1.22 2008/01/25 21:12:15 joerg Exp $ */
/*
* Copyright (c) 1988, 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_cksum.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in_cksum.c,v 1.22 2008/01/25 21:12:15 joerg Exp $");
#include <sys/param.h>
#include <netinet/in.h>
int
in_cksum(struct mbuf *m, int len)
{
KASSERT(len >= 0);
return cpu_in_cksum(m, len, 0, 0);
}
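/*
 * Illustrative sketch (not part of the original file): the 16-bit ones'
 * complement sum that cpu_in_cksum() computes, written for a flat,
 * even-sized buffer instead of an mbuf chain.  example_cksum() is a
 * hypothetical helper, not a kernel interface.
 */
#if 0
static uint16_t
example_cksum(const uint16_t *buf, size_t len /* bytes, assumed even */)
{
	uint32_t sum = 0;

	for (; len >= 2; len -= 2)
		sum += *buf++;
	/* Fold the carries back into the low 16 bits. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}
#endif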
/* $NetBSD: sys_getrandom.c,v 1.2 2021/12/28 13:22:43 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* getrandom() system call
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_getrandom.c,v 1.2 2021/12/28 13:22:43 riastradh Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cprng.h>
#include <sys/entropy.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/random.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscallargs.h>
#include <sys/uio.h>
#include <crypto/nist_hash_drbg/nist_hash_drbg.h>
#define RANDOM_BUFSIZE 512
int
dogetrandom(struct uio *uio, unsigned int flags)
{
uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES] = {0};
struct nist_hash_drbg drbg;
uint8_t *buf;
int extractflags = 0;
int error;
KASSERT((flags & ~(GRND_RANDOM|GRND_INSECURE|GRND_NONBLOCK)) == 0);
KASSERT((flags & (GRND_RANDOM|GRND_INSECURE)) !=
(GRND_RANDOM|GRND_INSECURE));
/* Get a buffer for transfers. */
buf = kmem_alloc(RANDOM_BUFSIZE, KM_SLEEP);
/*
* Fast path: for short reads other than from /dev/random, if
* seeded or if INSECURE, just draw from per-CPU cprng_strong.
*/
if (uio->uio_resid <= RANDOM_BUFSIZE &&
!ISSET(flags, GRND_RANDOM) &&
(entropy_ready() || ISSET(flags, GRND_INSECURE))) {
/* Generate data and transfer it out. */
cprng_strong(user_cprng, buf, uio->uio_resid, 0);
error = uiomove(buf, uio->uio_resid, uio);
goto out;
}
/*
* Try to get a seed from the entropy pool. Fail if we would
* block. If GRND_INSECURE, always return something even if it
* is partial entropy; if !GRND_INSECURE, set ENTROPY_HARDFAIL
* in order to tell entropy_extract not to bother drawing
* anything from a partial pool if we can't get full entropy.
*/
if (!ISSET(flags, GRND_NONBLOCK) && !ISSET(flags, GRND_INSECURE))
extractflags |= ENTROPY_WAIT|ENTROPY_SIG;
if (!ISSET(flags, GRND_INSECURE))
extractflags |= ENTROPY_HARDFAIL;
error = entropy_extract(seed, sizeof seed, extractflags);
if (error && !ISSET(flags, GRND_INSECURE))
goto out;
/* Instantiate the DRBG. */
if (nist_hash_drbg_instantiate(&drbg, seed, sizeof seed, NULL, 0,
NULL, 0))
panic("nist_hash_drbg_instantiate");
/* Promptly zero the seed. */
explicit_memset(seed, 0, sizeof seed);
/* Generate data. */
error = 0;
while (uio->uio_resid) {
size_t n = MIN(uio->uio_resid, RANDOM_BUFSIZE);
/*
* Clamp /dev/random output to the entropy capacity and
* seed size. Programs can't rely on long reads.
*/
if (ISSET(flags, GRND_RANDOM)) {
n = MIN(n, ENTROPY_CAPACITY);
n = MIN(n, sizeof seed);
/*
* Guarantee never to return more than one
* buffer in this case to minimize bookkeeping.
*/
CTASSERT(ENTROPY_CAPACITY <= RANDOM_BUFSIZE);
CTASSERT(sizeof seed <= RANDOM_BUFSIZE);
}
/*
* Try to generate a block of data, but if we've hit
* the DRBG reseed interval, reseed.
*/
if (nist_hash_drbg_generate(&drbg, buf, n, NULL, 0)) {
/*
* Get a fresh seed without blocking -- we have
* already generated some output so it is not
* useful to block. This can fail only if the
* request is obscenely large, so it is OK for
* either /dev/random or /dev/urandom to fail:
* we make no promises about gigabyte-sized
* reads happening all at once.
*/
error = entropy_extract(seed, sizeof seed,
ENTROPY_HARDFAIL);
if (error)
break;
/* Reseed and try again. */
if (nist_hash_drbg_reseed(&drbg, seed, sizeof seed,
NULL, 0))
panic("nist_hash_drbg_reseed");
/* Promptly zero the seed. */
explicit_memset(seed, 0, sizeof seed);
/* If it fails now, that's a bug. */
if (nist_hash_drbg_generate(&drbg, buf, n, NULL, 0))
panic("nist_hash_drbg_generate");
}
/* Transfer n bytes out. */
error = uiomove(buf, n, uio);
if (error)
break;
/*
* If this is /dev/random, stop here, return what we
* have, and force the next read to reseed. Programs
* can't rely on /dev/random for long reads.
*/
if (ISSET(flags, GRND_RANDOM)) {
error = 0;
break;
}
/* Now's a good time to yield if needed. */
preempt_point();
/* Check for interruption after at least 256 bytes. */
CTASSERT(RANDOM_BUFSIZE >= 256);
if (__predict_false(curlwp->l_flag & LW_PENDSIG) &&
sigispending(curlwp, 0)) {
error = EINTR;
break;
}
}
out: /* Zero the buffer and free it. */
explicit_memset(buf, 0, RANDOM_BUFSIZE);
kmem_free(buf, RANDOM_BUFSIZE);
return error;
}
int
sys_getrandom(struct lwp *l, const struct sys_getrandom_args *uap,
register_t *retval)
{
/* {
syscallarg(void *) buf;
syscallarg(size_t) buflen;
syscallarg(unsigned) flags;
} */
void *buf = SCARG(uap, buf);
size_t buflen = SCARG(uap, buflen);
int flags = SCARG(uap, flags);
int error;
/* Set up an iov and uio to read into the user's buffer. */
struct iovec iov = { .iov_base = buf, .iov_len = buflen };
struct uio uio = {
.uio_iov = &iov,
.uio_iovcnt = 1,
.uio_offset = 0,
.uio_resid = buflen,
.uio_rw = UIO_READ,
.uio_vmspace = curproc->p_vmspace,
};
/* Validate the flags. */
if (flags & ~(GRND_RANDOM|GRND_INSECURE|GRND_NONBLOCK)) {
/* Unknown flags. */
error = EINVAL;
goto out;
}
if ((flags & (GRND_RANDOM|GRND_INSECURE)) ==
(GRND_RANDOM|GRND_INSECURE)) {
/* Nonsensical combination. */
error = EINVAL;
goto out;
}
/* Do it. */
error = dogetrandom(&uio, flags);
out: /*
* If we transferred anything, return the number of bytes
* transferred and suppress error; otherwise return the error.
*/
*retval = buflen - uio.uio_resid;
if (*retval)
error = 0;
return error;
}
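/*
 * Illustrative userland sketch (not part of this kernel source): a
 * minimal caller of the getrandom(2) syscall implemented above.  It
 * assumes the declaration lives in <sys/random.h>, as on current
 * NetBSD; the buffer size and error handling are arbitrary.
 * GRND_NONBLOCK makes the call fail with EAGAIN instead of sleeping
 * while the entropy pool is still filling.
 */
#include <sys/types.h>
#include <sys/random.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	unsigned char key[32];
	ssize_t n;

	/* Ask for 32 bytes; with GRND_NONBLOCK this never sleeps. */
	n = getrandom(key, sizeof(key), GRND_NONBLOCK);
	if (n == -1)
		err(1, "getrandom");
	printf("got %zd random bytes\n", n);
	return 0;
}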
/* $NetBSD: syscallvar.h,v 1.12 2018/04/19 21:19:07 christos Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_SYSCALLVAR_H_
#define _SYS_SYSCALLVAR_H_
#ifndef _KERNEL
#error nothing of interest to userspace here
#endif
#if defined(_KERNEL) && defined(_KERNEL_OPT)
#include "opt_dtrace.h"
#endif
#include <sys/systm.h>
#include <sys/proc.h>
extern struct emul emul_netbsd;
struct syscall_package {
u_short sp_code;
u_short sp_flags;
sy_call_t *sp_call;
};
void syscall_init(void);
int syscall_establish(const struct emul *, const struct syscall_package *);
int syscall_disestablish(const struct emul *, const struct syscall_package *);
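/*
 * Illustrative sketch (not part of this header): a kernel module
 * normally lists its syscalls in a table terminated by an empty entry
 * and hands it to syscall_establish() at attach time; the matching
 * syscall_disestablish() call removes them again at unload.  The
 * syscall number and handler below are hypothetical.
 *
 *	static const struct syscall_package example_syscalls[] = {
 *		{ SYS_example, 0, (sy_call_t *)sys_example },
 *		{ 0, 0, NULL },
 *	};
 *
 *	error = syscall_establish(&emul_netbsd, example_syscalls);
 *	...
 *	error = syscall_disestablish(&emul_netbsd, example_syscalls);
 */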
static __inline int
sy_call(const struct sysent *sy, struct lwp *l, const void *uap,
register_t *rval)
{
int error;
l->l_sysent = sy;
error = (*sy->sy_call)(l, uap, rval);
l->l_sysent = NULL;
return error;
}
static __inline int
sy_invoke(const struct sysent *sy, struct lwp *l, const void *uap,
register_t *rval, int code)
{
const bool do_trace = l->l_proc->p_trace_enabled &&
(sy->sy_flags & SYCALL_INDIRECT) == 0;
int error;
#ifdef KDTRACE_HOOKS
#define KDTRACE_ENTRY(a) (a)
#else
#define KDTRACE_ENTRY(a) (0)
#endif
if (__predict_true(!(do_trace || KDTRACE_ENTRY(sy->sy_entry))) ||
(error = trace_enter(code, sy, uap)) == 0) {
rval[0] = 0;
#if !defined(__mips__) && !defined(__m68k__)
/*
* Due to the mips userland code for SYS_break needing v1 to be
* preserved, we can't clear this on mips.
*/
rval[1] = 0;
#endif
error = sy_call(sy, l, uap, rval);
}
if (__predict_false(do_trace || KDTRACE_ENTRY(sy->sy_return))) {
trace_exit(code, sy, uap, rval, error);
}
return error;
}
/* inclusion in the kernel currently depends on SYSCALL_DEBUG */
extern const char * const syscallnames[];
extern const char * const altsyscallnames[];
#endif /* _SYS_SYSCALLVAR_H_ */
/* $NetBSD: raw_cb.c,v 1.24 2017/09/25 01:56:22 ozaki-r Exp $ */
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_cb.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_cb.c,v 1.24 2017/09/25 01:56:22 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <netinet/in.h>
/*
* Routines to manage the raw protocol control blocks.
*
* TODO:
* hash lookups by protocol family/protocol + address family
* take care of unique address problems per AF?
* redo address binding to allow wildcards
*/
static u_long raw_sendspace = RAWSNDQ;
static u_long raw_recvspace = RAWRCVQ;
/*
* Allocate a nominal amount of buffer space for the socket.
*/
int
raw_attach(struct socket *so, int proto, struct rawcbhead *rawcbhead)
{
struct rawcb *rp;
int error;
/*
* It is assumed that raw_attach() is called after space has been
* allocated for the rawcb; consumer protocols may simply allocate
* type struct rawcb, or a wrapper data structure that begins with a
* struct rawcb.
*/
rp = sotorawcb(so);
KASSERT(rp != NULL);
sosetlock(so);
if ((error = soreserve(so, raw_sendspace, raw_recvspace)) != 0) {
return error;
}
rp->rcb_socket = so;
rp->rcb_proto.sp_family = so->so_proto->pr_domain->dom_family;
rp->rcb_proto.sp_protocol = proto;
LIST_INSERT_HEAD(rawcbhead, rp, rcb_list);
KASSERT(solocked(so));
return 0;
}
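/*
 * Illustrative sketch (hypothetical consumer): a protocol using this
 * framework embeds struct rawcb at the start of its own control block,
 * allocates and installs it before calling raw_attach(), and records
 * the allocation size for raw_detach(), e.g.:
 *
 *	struct example_rawcb {
 *		struct rawcb er_rcb;	-- must be first
 *		int er_private;
 *	};
 *
 *	erp = kmem_zalloc(sizeof(*erp), KM_SLEEP);
 *	erp->er_rcb.rcb_len = sizeof(*erp);
 *	so->so_pcb = erp;
 *	error = raw_attach(so, proto, &example_rawcbhead);
 */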
/*
* Detach the raw connection block and discard socket resources.
*/
void
raw_detach(struct socket *so)
{
struct rawcb *rp = sotorawcb(so);
KASSERT(rp != NULL);
KASSERT(solocked(so));
const size_t rcb_len = rp->rcb_len;
/* Remove the last reference. */
LIST_REMOVE(rp, rcb_list);
so->so_pcb = NULL;
/* Note: sofree() drops the socket's lock. */
sofree(so);
kmem_free(rp, rcb_len);
if (so->so_lock != softnet_lock) {
so->so_lock = softnet_lock;
mutex_obj_hold(softnet_lock);
}
mutex_enter(softnet_lock);
}
/*
* Disconnect and possibly release resources.
*/
void
raw_disconnect(struct rawcb *rp)
{
struct socket *so = rp->rcb_socket;
if (so->so_state & SS_NOFDREF) {
raw_detach(so);
}
}
/* $NetBSD: subr_hash.c,v 1.12 2021/06/13 14:58:49 simonb Exp $ */
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_subr.c 8.4 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_hash.c,v 1.12 2021/06/13 14:58:49 simonb Exp $");
#include <sys/param.h>
#include <sys/bitops.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/pslist.h>
#include <sys/rwlock.h>
#include <sys/sysctl.h>
static int hashstat_sysctl(SYSCTLFN_PROTO);
static size_t
hash_list_size(enum hashtype htype)
{
LIST_HEAD(, generic) *hashtbl_list;
SLIST_HEAD(, generic) *hashtbl_slist;
TAILQ_HEAD(, generic) *hashtbl_tailq;
struct pslist_head *hashtbl_pslist;
size_t esize;
switch (htype) {
case HASH_LIST:
esize = sizeof(*hashtbl_list);
break;
case HASH_PSLIST:
esize = sizeof(*hashtbl_pslist);
break;
case HASH_SLIST:
esize = sizeof(*hashtbl_slist);
break;
case HASH_TAILQ:
esize = sizeof(*hashtbl_tailq);
break;
default:
panic("hashdone: invalid table type");
}
return esize;
}
/*
* General routine to allocate a hash table.
* Allocate enough memory to hold at least `elements' list-head pointers.
* Return a pointer to the allocated space and set *hashmask to a pattern
* suitable for masking a value to use as an index into the returned array.
*/
void *
hashinit(u_int elements, enum hashtype htype, bool waitok, u_long *hashmask)
{
LIST_HEAD(, generic) *hashtbl_list;
SLIST_HEAD(, generic) *hashtbl_slist;
TAILQ_HEAD(, generic) *hashtbl_tailq;
struct pslist_head *hashtbl_pslist;
u_long hashsize, i;
size_t esize;
void *p;
KASSERT(elements > 0);
#define MAXELEMENTS (1U << ((sizeof(elements) * NBBY) - 1))
if (elements > MAXELEMENTS)
elements = MAXELEMENTS;
hashsize = 1UL << (ilog2(elements - 1) + 1);
esize = hash_list_size(htype);
p = kmem_alloc(hashsize * esize, waitok ? KM_SLEEP : KM_NOSLEEP);
if (p == NULL)
return NULL;
switch (htype) {
case HASH_LIST:
hashtbl_list = p;
for (i = 0; i < hashsize; i++)
LIST_INIT(&hashtbl_list[i]);
break;
case HASH_PSLIST:
hashtbl_pslist = p;
for (i = 0; i < hashsize; i++)
PSLIST_INIT(&hashtbl_pslist[i]);
break;
case HASH_SLIST:
hashtbl_slist = p;
for (i = 0; i < hashsize; i++)
SLIST_INIT(&hashtbl_slist[i]);
break;
case HASH_TAILQ:
hashtbl_tailq = p;
for (i = 0; i < hashsize; i++)
TAILQ_INIT(&hashtbl_tailq[i]);
break;
}
*hashmask = hashsize - 1;
return p;
}
/*
* Free memory from a hash table previously allocated via hashinit().
*/
void
hashdone(void *hashtbl, enum hashtype htype, u_long hashmask)
{
const size_t esize = hash_list_size(htype);
kmem_free(hashtbl, esize * (hashmask + 1));
}
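/*
 * Illustrative sketch (hypothetical consumer): size the table for the
 * expected element count, index it with the returned mask, and release
 * it with hashdone() using the same type and mask.  hash32_buf() from
 * <sys/hash.h> is just one possible hash function.
 *
 *	static LIST_HEAD(example_head, example_node) *example_tbl;
 *	static u_long example_mask;
 *
 *	example_tbl = hashinit(128, HASH_LIST, true, &example_mask);
 *	...
 *	head = &example_tbl[hash32_buf(key, keylen, HASH32_BUF_INIT) &
 *	    example_mask];
 *	...
 *	hashdone(example_tbl, HASH_LIST, example_mask);
 */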
/*
* Support for hash statistics (vmstat -H / vmstat -h hashname).
*/
struct hashstat {
const char *hs_name;
hashstat_func_t hs_func;
TAILQ_ENTRY(hashstat) hs_next;
};
TAILQ_HEAD(, hashstat) hashstat_list =
TAILQ_HEAD_INITIALIZER(hashstat_list);
static krwlock_t hashstat_lock;
void
hashstat_register(const char *name, hashstat_func_t func)
{
struct hashstat *hs;
hs = kmem_alloc(sizeof(*hs), KM_SLEEP);
hs->hs_name = name;
hs->hs_func = func;
rw_enter(&hashstat_lock, RW_WRITER);
TAILQ_INSERT_TAIL(&hashstat_list, hs, hs_next);
rw_exit(&hashstat_lock);
}
/*
* sysctl support for returning kernel hash statistics.
*
* We (ab)use CTL_DESCRIBE and CTL_QUERY:
* When passed an OID of CTL_DESCRIBE, return a list and description
* of the available hashes.
* When passed an OID of CTL_QUERY, use the hash name passed in the
* "new" hash input as the name of a single hash to return stats on.
*/
static int
hashstat_sysctl(SYSCTLFN_ARGS)
{
struct hashstat_sysctl hs;
struct hashstat *hash;
char queryname[SYSCTL_NAMELEN];
size_t written;
bool fill, query;
int error;
if (oldp == NULL) {
*oldlenp = 0;
TAILQ_FOREACH(hash, &hashstat_list, hs_next)
*oldlenp += sizeof(hs);
return 0;
}
error = 0;
written = 0;
if (namelen > 0 && name[0] == CTL_DESCRIBE)
fill = false;
else
fill = true;
if (namelen > 0 && name[0] == CTL_QUERY) {
const struct hashstat_sysctl *h = newp;
size_t s;
if (h == NULL) {
/* Can't QUERY one hash without supplying the hash name. */
return EINVAL;
}
query = true;
error = sysctl_copyinstr(l, h->hash_name, queryname,
sizeof(queryname), &s);
if (error)
return error;
} else {
query = false;
}
sysctl_unlock();
rw_enter(&hashstat_lock, RW_READER);
TAILQ_FOREACH(hash, &hashstat_list, hs_next) {
if (query && (strcmp(hash->hs_name, queryname) != 0)) {
continue;
}
memset(&hs, 0, sizeof(hs));
error = hash->hs_func(&hs, fill);
if (error)
break;
error = sysctl_copyout(l, &hs, oldp, sizeof(hs));
if (error)
break;
written += sizeof(hs);
oldp = (char *)oldp + sizeof(hs);
}
rw_exit(&hashstat_lock);
sysctl_relock();
if (query && written == 0) /* query not found? */
error = ENOENT;
*oldlenp = written;
return error;
}
SYSCTL_SETUP(sysctl_hash_setup, "sysctl hash stats setup")
{
rw_init(&hashstat_lock); /* as good a place as any for this */
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT,
"hashstat", SYSCTL_DESCR("kernel hash statistics"),
hashstat_sysctl, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: subr_iostat.c,v 1.26 2024/05/04 13:33:18 mlelstv Exp $ */
/* NetBSD: subr_disk.c,v 1.69 2005/05/29 22:24:15 christos Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_iostat.c,v 1.26 2024/05/04 13:33:18 mlelstv Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/iostat.h>
#include <sys/sysctl.h>
#include <sys/rwlock.h>
/*
* Function prototypes for sysctl nodes
*/
static int sysctl_hw_disknames(SYSCTLFN_PROTO);
static int sysctl_hw_iostatnames(SYSCTLFN_PROTO);
static int sysctl_hw_iostats(SYSCTLFN_PROTO);
static int
iostati_getnames(int disk_only, char *oldp, size_t *oldlenp, const void *newp,
u_int namelen);
/*
* A global list of all drives attached to the system. May grow or
* shrink over time.
*/
struct iostatlist_head iostatlist = TAILQ_HEAD_INITIALIZER(iostatlist);
int iostat_count; /* number of drives in global drivelist */
krwlock_t iostatlist_lock;
static void sysctl_io_stats_setup(struct sysctllog **);
/*
* Initialise the iostat subsystem.
*/
void
iostat_init(void)
{
rw_init(&iostatlist_lock);
sysctl_io_stats_setup(NULL);
}
/*
* Searches the iostatlist for the iostat corresponding to the
* name provided.
*/
struct io_stats *
iostat_find(const char *name)
{
struct io_stats *iostatp;
KASSERT(name != NULL);
rw_enter(&iostatlist_lock, RW_READER);
TAILQ_FOREACH(iostatp, &iostatlist, io_link) {
if (strcmp(iostatp->io_name, name) == 0) {
break;
}
}
rw_exit(&iostatlist_lock);
return iostatp;
}
/*
* Allocate and initialise memory for the i/o statistics.
*/
struct io_stats *
iostat_alloc(int32_t type, void *parent, const char *name)
{
struct io_stats *stats;
stats = kmem_zalloc(sizeof(*stats), KM_SLEEP);
stats->io_type = type;
stats->io_parent = parent;
(void)strlcpy(stats->io_name, name, sizeof(stats->io_name));
/*
* Set the attached timestamp.
*/
getmicrouptime(&stats->io_attachtime);
/*
* Link into the drivelist.
*/
rw_enter(&iostatlist_lock, RW_WRITER);
TAILQ_INSERT_TAIL(&iostatlist, stats, io_link);
iostat_count++;
rw_exit(&iostatlist_lock);
return stats;
}
/*
* Remove i/o from stats collection.
*/
void
iostat_free(struct io_stats *stats)
{
/*
* Remove from the iostat list.
*/
if (iostat_count == 0)
panic("iostat_free: iostat_count == 0");
rw_enter(&iostatlist_lock, RW_WRITER);
TAILQ_REMOVE(&iostatlist, stats, io_link);
iostat_count--;
rw_exit(&iostatlist_lock);
kmem_free(stats, sizeof(*stats));
}
/*
* Rename i/o stats.
*/
void
iostat_rename(struct io_stats *stats, const char *name)
{
rw_enter(&iostatlist_lock, RW_WRITER);
(void)strlcpy(stats->io_name, name, sizeof(stats->io_name));
rw_exit(&iostatlist_lock);
}
/*
* multiply timeval by unsigned integer and add to result
*/
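/*
 * Worked example: for count = 6 (binary 110), the first iteration
 * skips the add (bit 0 clear) and doubles part to 2*a, the second
 * iteration adds 2*a, and the third adds 4*a, so res gains exactly
 * 6*a.
 */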
static void
timermac(struct timeval *a, uint64_t count, struct timeval *res)
{
struct timeval part = *a;
while (count) {
if (count & 1)
timeradd(res, &part, res);
timeradd(&part, &part, &part);
count >>= 1;
}
}
/*
* Increment the iostat wait counter.
* Accumulate wait time and timesum.
*
* Wait time is spent in the device bufq.
*/
void
iostat_wait(struct io_stats *stats)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_wait >= 0);
getmicrouptime(&dv_time);
timersub(&dv_time, &stats->io_waitstamp, &diff_time);
count = stats->io_wait++;
if (count != 0) {
timermac(&diff_time, count, &stats->io_waitsum);
timeradd(&stats->io_waittime, &diff_time, &stats->io_waittime);
}
stats->io_waitstamp = dv_time;
}
/*
* Decrement the iostat wait counter.
* Increment the iostat busy counter.
* Accumulate wait and busy times and timesums.
*
* Busy time is spent being processed by the device.
*
* Old devices do not yet measure wait time, so skip
* processing it if the counter is still zero.
*/
void
iostat_busy(struct io_stats *stats)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_wait >= 0); /* > 0 when iostat_wait is used */
KASSERT(stats->io_busy >= 0);
getmicrouptime(&dv_time);
timersub(&dv_time, &stats->io_waitstamp, &diff_time);
if (stats->io_wait != 0) {
count = stats->io_wait--;
timermac(&diff_time, count, &stats->io_waitsum);
timeradd(&stats->io_waittime, &diff_time, &stats->io_waittime);
}
stats->io_waitstamp = dv_time;
timersub(&dv_time, &stats->io_busystamp, &diff_time);
count = stats->io_busy++;
if (count != 0) {
timermac(&diff_time, count, &stats->io_busysum);
timeradd(&stats->io_busytime, &diff_time, &stats->io_busytime);
}
stats->io_busystamp = dv_time;
}
/*
* Decrement the iostat busy counter, increment the byte count.
* Accumulate busy time and timesum.
*/
void
iostat_unbusy(struct io_stats *stats, long bcount, int read)
{
struct timeval dv_time, diff_time;
int32_t count;
KASSERT(stats->io_busy > 0);
getmicrouptime(&dv_time);
stats->io_timestamp = dv_time;
/* any op */
timersub(&dv_time, &stats->io_busystamp, &diff_time);
count = stats->io_busy--;
timermac(&diff_time, count, &stats->io_busysum);
timeradd(&stats->io_busytime, &diff_time, &stats->io_busytime);
stats->io_busystamp = dv_time;
if (bcount > 0) {
if (read) {
stats->io_rbytes += bcount;
stats->io_rxfer++;
} else {
stats->io_wbytes += bcount;
stats->io_wxfer++;
}
}
}
/*
* Return non-zero if a device has an I/O request in flight.
*/
bool
iostat_isbusy(struct io_stats *stats)
{
return stats->io_busy != 0;
}
/*
* Increment the seek counter. This does look almost redundant but it
* abstracts the stats gathering.
*/
void
iostat_seek(struct io_stats *stats)
{
stats->io_seek++;
}
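/*
 * Illustrative sketch (hypothetical driver): the usual life cycle of a
 * block device's counters is
 *
 *	sc->sc_stats = iostat_alloc(IOSTAT_DISK, sc, "example0");
 *	...
 *	iostat_wait(sc->sc_stats);	-- buf queued on the driver's bufq
 *	iostat_busy(sc->sc_stats);	-- buf handed to the hardware
 *	iostat_unbusy(sc->sc_stats, bp->b_bcount - bp->b_resid,
 *	    (bp->b_flags & B_READ) != 0);	-- transfer completed
 *	...
 *	iostat_free(sc->sc_stats);	-- at detach time
 */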
static int
sysctl_hw_disknames(SYSCTLFN_ARGS)
{
return iostati_getnames(1, oldp, oldlenp, newp, namelen);
}
static int
sysctl_hw_iostatnames(SYSCTLFN_ARGS)
{
return iostati_getnames(0, oldp, oldlenp, newp, namelen);
}
static int
iostati_getnames(int disk_only, char *oldp, size_t *oldlenp, const void *newp,
u_int namelen)
{
char bf[IOSTATNAMELEN + 1];
char *where = oldp;
struct io_stats *stats;
size_t needed, left, slen;
int error, first;
if (newp != NULL)
return (EPERM);
if (namelen != 0)
return (EINVAL);
first = 1;
error = 0;
needed = 0;
left = *oldlenp;
rw_enter(&iostatlist_lock, RW_READER);
for (stats = TAILQ_FIRST(&iostatlist); stats != NULL;
stats = TAILQ_NEXT(stats, io_link)) {
if ((disk_only == 1) && (stats->io_type != IOSTAT_DISK))
continue;
if (where == NULL)
needed += strlen(stats->io_name) + 1;
else {
memset(bf, 0, sizeof(bf));
if (first) {
strncpy(bf, stats->io_name, sizeof(bf));
/* account for trailing NUL byte */
needed += 1;
first = 0;
} else {
bf[0] = ' ';
strncpy(bf + 1, stats->io_name,
sizeof(bf) - 1);
}
bf[IOSTATNAMELEN] = '\0';
slen = strlen(bf);
if (left < slen + 1)
break;
/* +1 to copy out the trailing NUL byte */
error = copyout(bf, where, slen + 1);
if (error)
break;
where += slen;
needed += slen;
left -= slen;
}
}
rw_exit(&iostatlist_lock);
*oldlenp = needed;
return (error);
}
static int
sysctl_hw_iostats(SYSCTLFN_ARGS)
{
struct io_sysctl sdrive;
struct io_stats *stats;
char *where = oldp;
size_t tocopy, left;
int error;
if (newp != NULL)
return (EPERM);
/*
* The original hw.diskstats call was broken and did not require
* userland to pass in the size of its struct disk_sysctl. This
* was fixed after NetBSD 1.6 was released.
*/
if (namelen == 0)
tocopy = offsetof(struct io_sysctl, busy);
else
tocopy = name[0];
if (where == NULL) {
*oldlenp = iostat_count * tocopy;
return (0);
}
error = 0;
left = *oldlenp;
memset(&sdrive, 0, sizeof(sdrive));
*oldlenp = 0;
rw_enter(&iostatlist_lock, RW_READER);
TAILQ_FOREACH(stats, &iostatlist, io_link) {
if (left < tocopy)
break;
strncpy(sdrive.name, stats->io_name, sizeof(sdrive.name));
sdrive.attachtime_sec = stats->io_attachtime.tv_sec;
sdrive.attachtime_usec = stats->io_attachtime.tv_usec;
sdrive.timestamp_sec = stats->io_busystamp.tv_sec;
sdrive.timestamp_usec = stats->io_busystamp.tv_usec;
sdrive.time_sec = stats->io_busytime.tv_sec;
sdrive.time_usec = stats->io_busytime.tv_usec;
sdrive.seek = stats->io_seek;
sdrive.rxfer = stats->io_rxfer;
sdrive.wxfer = stats->io_wxfer;
sdrive.xfer = stats->io_rxfer + stats->io_wxfer;
sdrive.rbytes = stats->io_rbytes;
sdrive.wbytes = stats->io_wbytes;
sdrive.bytes = stats->io_rbytes + stats->io_wbytes;
sdrive.wait_sec = stats->io_waittime.tv_sec;
sdrive.wait_usec = stats->io_waittime.tv_usec;
sdrive.waitsum_sec = stats->io_waitsum.tv_sec;
sdrive.waitsum_usec = stats->io_waitsum.tv_usec;
sdrive.busysum_sec = stats->io_busysum.tv_sec;
sdrive.busysum_usec = stats->io_busysum.tv_usec;
sdrive.busy = stats->io_busy;
error = copyout(&sdrive, where, uimin(tocopy, sizeof(sdrive)));
if (error)
break;
where += tocopy;
*oldlenp += tocopy;
left -= tocopy;
}
rw_exit(&iostatlist_lock);
return (error);
}
static void
sysctl_io_stats_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "disknames",
SYSCTL_DESCR("List of disk drives present"),
sysctl_hw_disknames, 0, NULL, 0,
CTL_HW, HW_DISKNAMES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "iostatnames",
SYSCTL_DESCR("I/O stats are being collected for these"
" devices"),
sysctl_hw_iostatnames, 0, NULL, 0,
CTL_HW, HW_IOSTATNAMES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "iostats",
SYSCTL_DESCR("Statistics on device I/O operations"),
sysctl_hw_iostats, 0, NULL, 0,
CTL_HW, HW_IOSTATS, CTL_EOL);
}
/* $NetBSD: genfs_vnops.c,v 1.220 2023/03/03 10:02:51 hannken Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_vnops.c,v 1.220 2023/03/03 10:02:51 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/fstrans.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/poll.h>
#include <sys/mman.h>
#include <sys/file.h>
#include <sys/kauth.h>
#include <sys/stat.h>
#include <sys/extattr.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/genfs_node.h>
#include <miscfs/specfs/specdev.h>
static void filt_genfsdetach(struct knote *);
static int filt_genfsread(struct knote *, long);
static int filt_genfsvnode(struct knote *, long);
/*
* Find the end of the first path component in NAME and return its
* length.
*/
int
genfs_parsepath(void *v)
{
struct vop_parsepath_args /* {
struct vnode *a_dvp;
const char *a_name;
size_t *a_retval;
} */ *ap = v;
const char *name = ap->a_name;
size_t pos;
(void)ap->a_dvp;
pos = 0;
while (name[pos] != '\0' && name[pos] != '/') {
pos++;
}
*ap->a_retval = pos;
return 0;
}
int
genfs_poll(void *v)
{
struct vop_poll_args /* {
struct vnode *a_vp;
int a_events;
struct lwp *a_l;
} */ *ap = v;
return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM));
}
int
genfs_seek(void *v)
{
struct vop_seek_args /* {
struct vnode *a_vp;
off_t a_oldoff;
off_t a_newoff;
kauth_cred_t cred;
} */ *ap = v;
if (ap->a_newoff < 0)
return (EINVAL);
return (0);
}
int
genfs_abortop(void *v)
{
struct vop_abortop_args /* {
struct vnode *a_dvp;
struct componentname *a_cnp;
} */ *ap = v;
(void)ap;
return (0);
}
int
genfs_fcntl(void *v)
{
struct vop_fcntl_args /* {
struct vnode *a_vp;
u_int a_command;
void *a_data;
int a_fflag;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
if (ap->a_command == F_SETFL)
return (0);
else
return (EOPNOTSUPP);
}
/*ARGSUSED*/
int
genfs_badop(void *v)
{
panic("genfs: bad op");
}
/*ARGSUSED*/
int
genfs_nullop(void *v)
{
return (0);
}
/*ARGSUSED*/
int
genfs_einval(void *v)
{
return (EINVAL);
}
int
genfs_erofs_link(void *v)
{
/* also for symlink */
struct vop_link_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
VOP_ABORTOP(ap->a_dvp, ap->a_cnp);
return EROFS;
}
/*
* Called when an fs doesn't support a particular vop.
* This takes care to vrele, vput, or vunlock passed in vnodes
* and calls VOP_ABORTOP for a componentname (in non-rename VOP).
*/
int
genfs_eopnotsupp(void *v)
{
struct vop_generic_args /*
struct vnodeop_desc *a_desc;
/ * other random data follows, presumably * /
} */ *ap = v;
struct vnodeop_desc *desc = ap->a_desc;
struct vnode *vp, *vp_last = NULL;
int flags, i, j, offset_cnp, offset_vp;
KASSERT(desc->vdesc_offset != VOP_LOOKUP_DESCOFFSET);
KASSERT(desc->vdesc_offset != VOP_ABORTOP_DESCOFFSET);
/*
* Abort any componentname that lookup potentially left state in.
*
* As is logical, componentnames for VOP_RENAME are handled by
* the caller of VOP_RENAME. Yay, rename!
*/
if (desc->vdesc_offset != VOP_RENAME_DESCOFFSET &&
(offset_vp = desc->vdesc_vp_offsets[0]) != VDESC_NO_OFFSET &&
(offset_cnp = desc->vdesc_componentname_offset) != VDESC_NO_OFFSET) {
struct componentname *cnp;
struct vnode *dvp;
dvp = *VOPARG_OFFSETTO(struct vnode **, offset_vp, ap);
cnp = *VOPARG_OFFSETTO(struct componentname **, offset_cnp, ap);
VOP_ABORTOP(dvp, cnp);
}
flags = desc->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; flags >>= 1, i++) {
if ((offset_vp = desc->vdesc_vp_offsets[i]) == VDESC_NO_OFFSET)
break; /* stop at end of list */
if ((j = flags & VDESC_VP0_WILLPUT)) {
vp = *VOPARG_OFFSETTO(struct vnode **, offset_vp, ap);
/* Skip if NULL */
if (!vp)
continue;
switch (j) {
case VDESC_VP0_WILLPUT:
/* Check for dvp == vp cases */
if (vp == vp_last)
vrele(vp);
else {
vput(vp);
vp_last = vp;
}
break;
case VDESC_VP0_WILLRELE:
vrele(vp);
break;
}
}
}
return (EOPNOTSUPP);
}
/*ARGSUSED*/
int
genfs_ebadf(void *v)
{
return (EBADF);
}
/* ARGSUSED */
int
genfs_enoioctl(void *v)
{
return (EPASSTHROUGH);
}
/*
* Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
int
genfs_revoke(void *v)
{
struct vop_revoke_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
#ifdef DIAGNOSTIC
if ((ap->a_flags & REVOKEALL) == 0)
panic("genfs_revoke: not revokeall");
#endif
vrevoke(ap->a_vp);
return (0);
}
/*
* Lock the node (for deadfs).
*/
int
genfs_deadlock(void *v)
{
struct vop_lock_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int flags = ap->a_flags;
krw_t op;
if (! ISSET(flags, LK_RETRY))
return ENOENT;
if (ISSET(flags, LK_DOWNGRADE)) {
rw_downgrade(&vip->vi_lock);
} else if (ISSET(flags, LK_UPGRADE)) {
KASSERT(ISSET(flags, LK_NOWAIT));
if (!rw_tryupgrade(&vip->vi_lock)) {
return EBUSY;
}
} else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) {
op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER);
if (ISSET(flags, LK_NOWAIT)) {
if (!rw_tryenter(&vip->vi_lock, op))
return EBUSY;
} else {
rw_enter(&vip->vi_lock, op);
}
}
VSTATE_ASSERT_UNLOCKED(vp, VS_RECLAIMED);
return 0;
}
/*
* Unlock the node (for deadfs).
*/
int
genfs_deadunlock(void *v)
{
struct vop_unlock_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
rw_exit(&vip->vi_lock);
return 0;
}
/*
* Lock the node.
*/
int
genfs_lock(void *v)
{
struct vop_lock_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int flags = ap->a_flags;
krw_t op;
if (ISSET(flags, LK_DOWNGRADE)) {
rw_downgrade(&vip->vi_lock);
} else if (ISSET(flags, LK_UPGRADE)) {
KASSERT(ISSET(flags, LK_NOWAIT));
if (!rw_tryupgrade(&vip->vi_lock)) {
return EBUSY;
}
} else if ((flags & (LK_EXCLUSIVE | LK_SHARED)) != 0) {
op = (ISSET(flags, LK_EXCLUSIVE) ? RW_WRITER : RW_READER);
if (ISSET(flags, LK_NOWAIT)) {
if (!rw_tryenter(&vip->vi_lock, op))
return EBUSY;
} else {
rw_enter(&vip->vi_lock, op);
}
}
VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
return 0;
}
/*
* Unlock the node.
*/
int
genfs_unlock(void *v)
{
struct vop_unlock_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
rw_exit(&vip->vi_lock);
return 0;
}
/*
* Return whether or not the node is locked.
*/
int
genfs_islocked(void *v)
{
struct vop_islocked_args /* {
struct vnode *a_vp;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
if (rw_write_held(&vip->vi_lock))
return LK_EXCLUSIVE;
if (rw_read_held(&vip->vi_lock))
return LK_SHARED;
return 0;
}
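/*
 * Illustrative sketch: filesystems that use the vnode lock supplied by
 * vnode_impl typically point the locking entries of their vnodeop
 * tables at the implementations above, e.g. (entries abridged):
 *
 *	{ &vop_lock_desc, genfs_lock },
 *	{ &vop_unlock_desc, genfs_unlock },
 *	{ &vop_islocked_desc, genfs_islocked },
 */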
int
genfs_mmap(void *v)
{
return (0);
}
/*
* VOP_PUTPAGES() for vnodes which never have pages.
*/
int
genfs_null_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(vp->v_uobj.uo_npages == 0);
rw_exit(vp->v_uobj.vmobjlock);
return (0);
}
void
genfs_node_init(struct vnode *vp, const struct genfs_ops *ops)
{
struct genfs_node *gp = VTOG(vp);
rw_init(&gp->g_glock);
gp->g_op = ops;
}
void
genfs_node_destroy(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_destroy(&gp->g_glock);
}
void
genfs_size(struct vnode *vp, off_t size, off_t *eobp, int flags)
{
int bsize;
bsize = 1 << vp->v_mount->mnt_fs_bshift;
*eobp = (size + bsize - 1) & ~(bsize - 1);
}
static void
filt_genfsdetach(struct knote *kn)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
vn_knote_detach(vp, kn);
}
static int
filt_genfsread(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int rv;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, EV_ONESHOT);
return (1);
case 0:
mutex_enter(vp->v_interlock);
kn->kn_data = vp->v_size - ((file_t *)kn->kn_obj)->f_offset;
rv = (kn->kn_data != 0);
mutex_exit(vp->v_interlock);
return rv;
default:
KASSERT(mutex_owned(vp->v_interlock));
kn->kn_data = vp->v_size - ((file_t *)kn->kn_obj)->f_offset;
return (kn->kn_data != 0);
}
}
static int
filt_genfswrite(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
/*
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, EV_ONESHOT);
return (1);
case 0:
mutex_enter(vp->v_interlock);
kn->kn_data = 0;
mutex_exit(vp->v_interlock);
return 1;
default:
KASSERT(mutex_owned(vp->v_interlock));
kn->kn_data = 0;
return 1;
}
}
static int
filt_genfsvnode(struct knote *kn, long hint)
{
struct vnode *vp = (struct vnode *)kn->kn_hook;
int fflags;
switch (hint) {
case NOTE_REVOKE:
KASSERT(mutex_owned(vp->v_interlock));
knote_set_eof(kn, 0);
if ((kn->kn_sfflags & hint) != 0)
kn->kn_fflags |= hint;
return (1);
case 0:
mutex_enter(vp->v_interlock);
fflags = kn->kn_fflags;
mutex_exit(vp->v_interlock);
break;
default:
KASSERT(mutex_owned(vp->v_interlock));
if ((kn->kn_sfflags & hint) != 0)
kn->kn_fflags |= hint;
fflags = kn->kn_fflags;
break;
}
return (fflags != 0);
}
static const struct filterops genfsread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfsread,
};
static const struct filterops genfswrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfswrite,
};
static const struct filterops genfsvnode_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_genfsdetach,
.f_event = filt_genfsvnode,
};
int
genfs_kqfilter(void *v)
{
struct vop_kqfilter_args /* {
struct vnode *a_vp;
struct knote *a_kn;
} */ *ap = v;
struct vnode *vp;
struct knote *kn;
vp = ap->a_vp;
kn = ap->a_kn;
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &genfsread_filtops;
break;
case EVFILT_WRITE:
kn->kn_fop = &genfswrite_filtops;
break;
case EVFILT_VNODE:
kn->kn_fop = &genfsvnode_filtops;
break;
default:
return (EINVAL);
}
kn->kn_hook = vp;
vn_knote_attach(vp, kn);
return (0);
}
void
genfs_node_wrlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_enter(&gp->g_glock, RW_WRITER);
}
void
genfs_node_rdlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_enter(&gp->g_glock, RW_READER);
}
int
genfs_node_rdtrylock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
return rw_tryenter(&gp->g_glock, RW_READER);
}
void
genfs_node_unlock(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
rw_exit(&gp->g_glock);
}
int
genfs_node_wrlocked(struct vnode *vp)
{
struct genfs_node *gp = VTOG(vp);
return rw_write_held(&gp->g_glock);
}
/*
* Common filesystem object access control check routine. Accepts a
* vnode, cred, uid, gid, mode, acl, requested access mode.
* Returns 0 on success, or an errno on failure.
*/
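/*
 * Worked example: with file_mode 0640, a credential whose euid is not
 * file_uid but which is a member of file_gid ends up with dac_granted
 * equal to VREAD, so a VREAD request succeeds while VWRITE fails with
 * EACCES and VADMIN fails with EPERM.
 */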
int
genfs_can_access(vnode_t *vp, kauth_cred_t cred, uid_t file_uid, gid_t file_gid,
mode_t file_mode, struct acl *acl, accmode_t accmode)
{
accmode_t dac_granted;
int error;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
/*
* Look for a normal, non-privileged way to access the file/directory
* as requested. If it exists, go with that.
*/
dac_granted = 0;
/* Check the owner. */
if (kauth_cred_geteuid(cred) == file_uid) {
dac_granted |= VADMIN;
if (file_mode & S_IXUSR)
dac_granted |= VEXEC;
if (file_mode & S_IRUSR)
dac_granted |= VREAD;
if (file_mode & S_IWUSR)
dac_granted |= (VWRITE | VAPPEND);
goto privchk;
}
/* Otherwise, check the groups (first match). */
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error == 0) {
if (file_mode & S_IXGRP)
dac_granted |= VEXEC;
if (file_mode & S_IRGRP)
dac_granted |= VREAD;
if (file_mode & S_IWGRP)
dac_granted |= (VWRITE | VAPPEND);
goto privchk;
}
/* Otherwise, check everyone else. */
if (file_mode & S_IXOTH)
dac_granted |= VEXEC;
if (file_mode & S_IROTH)
dac_granted |= VREAD;
if (file_mode & S_IWOTH)
dac_granted |= (VWRITE | VAPPEND);
privchk:
if ((accmode & dac_granted) == accmode)
return 0;
return (accmode & VADMIN) ? EPERM : EACCES;
}
/*
* Implement a version of genfs_can_access() that understands POSIX.1e ACL
* semantics;
* the access ACL has already been prepared for evaluation by the file system
* and is passed via 'uid', 'gid', and 'acl'. Return 0 on success, else an
* errno value.
*/
int
genfs_can_access_acl_posix1e(vnode_t *vp, kauth_cred_t cred, uid_t file_uid,
gid_t file_gid, mode_t file_mode, struct acl *acl, accmode_t accmode)
{
struct acl_entry *acl_other, *acl_mask;
accmode_t dac_granted;
accmode_t acl_mask_granted;
int group_matched, i;
int error;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
/*
* The owner matches if the effective uid associated with the
* credential matches that of the ACL_USER_OBJ entry. While we're
* doing the first scan, also cache the location of the ACL_MASK and
* ACL_OTHER entries, preventing some future iterations.
*/
acl_mask = acl_other = NULL;
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_USER_OBJ:
if (kauth_cred_geteuid(cred) != file_uid)
break;
dac_granted = 0;
dac_granted |= VADMIN;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
goto out;
case ACL_MASK:
acl_mask = ae;
break;
case ACL_OTHER:
acl_other = ae;
break;
default:
break;
}
}
/*
* An ACL_OTHER entry should always exist in a valid access ACL. If
* it doesn't, then generate a serious failure. For now, this means
* a debugging message and EPERM, but in the future should probably
* be a panic.
*/
if (acl_other == NULL) {
/*
* XXX This should never happen
*/
printf("%s: ACL_OTHER missing\n", __func__);
return EPERM;
}
/*
* Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are
* masked by an ACL_MASK entry, if any. As such, first identify the
* ACL_MASK field, then iterate through identifying potential user
* matches, then group matches. If there is no ACL_MASK, assume that
* the mask allows all requests to succeed.
*/
if (acl_mask != NULL) {
acl_mask_granted = 0;
if (acl_mask->ae_perm & ACL_EXECUTE)
acl_mask_granted |= VEXEC;
if (acl_mask->ae_perm & ACL_READ)
acl_mask_granted |= VREAD;
if (acl_mask->ae_perm & ACL_WRITE)
acl_mask_granted |= (VWRITE | VAPPEND);
} else
acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
/*
* Check ACL_USER ACL entries. There will either be one or no
* matches; if there is one, we accept or reject based on the
* match; otherwise, we continue on to groups.
*/
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_USER:
if (kauth_cred_geteuid(cred) != ae->ae_id)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
}
}
/*
* Group match is best-match, not first-match, so find a "best"
* match. Iterate across, testing each potential group match. Make
* sure we keep track of whether we found a match or not, so that we
* know if we should try again with any available privilege, or if we
* should move on to ACL_OTHER.
*/
group_matched = 0;
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
if ((accmode & dac_granted) == accmode)
return 0;
group_matched = 1;
break;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
if ((accmode & dac_granted) == accmode)
return 0;
group_matched = 1;
break;
default:
break;
}
}
if (group_matched == 1) {
/*
* There was a match, but it did not grant rights via pure
* DAC. Try again, this time with privilege.
*/
for (i = 0; i < acl->acl_cnt; i++) {
struct acl_entry *ae = &acl->acl_entry[i];
switch (ae->ae_tag) {
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error)
break;
dac_granted = 0;
if (ae->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (ae->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (ae->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
dac_granted &= acl_mask_granted;
goto out;
default:
break;
}
}
/*
* Even with privilege, group membership was not sufficient.
* Return failure.
*/
dac_granted = 0;
goto out;
}
/*
* Fall back on ACL_OTHER. ACL_MASK is not applied to ACL_OTHER.
*/
dac_granted = 0;
if (acl_other->ae_perm & ACL_EXECUTE)
dac_granted |= VEXEC;
if (acl_other->ae_perm & ACL_READ)
dac_granted |= VREAD;
if (acl_other->ae_perm & ACL_WRITE)
dac_granted |= (VWRITE | VAPPEND);
out:
if ((accmode & dac_granted) == accmode)
return 0;
return (accmode & VADMIN) ? EPERM : EACCES;
}
static struct {
accmode_t accmode;
int mask;
} accmode2mask[] = {
{ VREAD, ACL_READ_DATA },
{ VWRITE, ACL_WRITE_DATA },
{ VAPPEND, ACL_APPEND_DATA },
{ VEXEC, ACL_EXECUTE },
{ VREAD_NAMED_ATTRS, ACL_READ_NAMED_ATTRS },
{ VWRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS },
{ VDELETE_CHILD, ACL_DELETE_CHILD },
{ VREAD_ATTRIBUTES, ACL_READ_ATTRIBUTES },
{ VWRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES },
{ VDELETE, ACL_DELETE },
{ VREAD_ACL, ACL_READ_ACL },
{ VWRITE_ACL, ACL_WRITE_ACL },
{ VWRITE_OWNER, ACL_WRITE_OWNER },
{ VSYNCHRONIZE, ACL_SYNCHRONIZE },
{ 0, 0 },
};
static int
_access_mask_from_accmode(accmode_t accmode)
{
int access_mask = 0, i;
for (i = 0; accmode2mask[i].accmode != 0; i++) {
if (accmode & accmode2mask[i].accmode)
access_mask |= accmode2mask[i].mask;
}
/*
* VAPPEND is just a modifier for VWRITE; if the caller asked
* for 'VAPPEND | VWRITE', we want to check for ACL_APPEND_DATA only.
*/
if (access_mask & ACL_APPEND_DATA)
access_mask &= ~ACL_WRITE_DATA;
return (access_mask);
}
/*
* Return 0 iff access is allowed, 1 otherwise.
*/
static int
_acl_denies(const struct acl *aclp, int access_mask, kauth_cred_t cred,
int file_uid, int file_gid, int *denied_explicitly)
{
int i, error;
const struct acl_entry *ae;
if (denied_explicitly != NULL)
*denied_explicitly = 0;
KASSERT(aclp->acl_cnt <= ACL_MAX_ENTRIES);
for (i = 0; i < aclp->acl_cnt; i++) {
ae = &(aclp->acl_entry[i]);
if (ae->ae_entry_type != ACL_ENTRY_TYPE_ALLOW &&
ae->ae_entry_type != ACL_ENTRY_TYPE_DENY)
continue;
if (ae->ae_flags & ACL_ENTRY_INHERIT_ONLY)
continue;
switch (ae->ae_tag) {
case ACL_USER_OBJ:
if (kauth_cred_geteuid(cred) != file_uid)
continue;
break;
case ACL_USER:
if (kauth_cred_geteuid(cred) != ae->ae_id)
continue;
break;
case ACL_GROUP_OBJ:
error = kauth_cred_groupmember(cred, file_gid);
if (error > 0)
return error;
if (error != 0)
continue;
break;
case ACL_GROUP:
error = kauth_cred_groupmember(cred, ae->ae_id);
if (error > 0)
return error;
if (error != 0)
continue;
break;
default:
KASSERT(ae->ae_tag == ACL_EVERYONE);
}
if (ae->ae_entry_type == ACL_ENTRY_TYPE_DENY) {
if (ae->ae_perm & access_mask) {
if (denied_explicitly != NULL)
*denied_explicitly = 1;
return (1);
}
}
access_mask &= ~(ae->ae_perm);
if (access_mask == 0)
return (0);
}
if (access_mask == 0)
return (0);
return (1);
}
int
genfs_can_access_acl_nfs4(vnode_t *vp, kauth_cred_t cred, uid_t file_uid,
gid_t file_gid, mode_t file_mode, struct acl *aclp, accmode_t accmode)
{
int denied, explicitly_denied, access_mask, is_directory,
must_be_owner = 0;
file_mode = 0;
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND |
VEXPLICIT_DENY | VREAD_NAMED_ATTRS | VWRITE_NAMED_ATTRS |
VDELETE_CHILD | VREAD_ATTRIBUTES | VWRITE_ATTRIBUTES | VDELETE |
VREAD_ACL | VWRITE_ACL | VWRITE_OWNER | VSYNCHRONIZE)) == 0);
KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE));
if (accmode & VADMIN)
must_be_owner = 1;
/*
* Ignore VSYNCHRONIZE permission.
*/
accmode &= ~VSYNCHRONIZE;
access_mask = _access_mask_from_accmode(accmode);
if (vp && vp->v_type == VDIR)
is_directory = 1;
else
is_directory = 0;
/*
* File owner is always allowed to read and write the ACL
* and basic attributes. This is to prevent a situation
* where a user would change the ACL in a way that prevents them
* from undoing the change.
*/
if (kauth_cred_geteuid(cred) == file_uid)
access_mask &= ~(ACL_READ_ACL | ACL_WRITE_ACL |
ACL_READ_ATTRIBUTES | ACL_WRITE_ATTRIBUTES);
/*
* Ignore append permission for regular files; use write
* permission instead.
*/
if (!is_directory && (access_mask & ACL_APPEND_DATA)) {
access_mask &= ~ACL_APPEND_DATA;
access_mask |= ACL_WRITE_DATA;
}
denied = _acl_denies(aclp, access_mask, cred, file_uid, file_gid,
&explicitly_denied);
if (must_be_owner) {
if (kauth_cred_geteuid(cred) != file_uid)
denied = EPERM;
}
/*
* For VEXEC, ensure that at least one execute bit is set for
* non-directories. We have to check the mode here to stay
* consistent with execve(2). See the test in
* exec_check_permissions().
*/
__acl_nfs4_sync_mode_from_acl(&file_mode, aclp);
if (!denied && !is_directory && (accmode & VEXEC) &&
(file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
denied = EACCES;
if (!denied)
return (0);
/*
* Access failed. Iff it was not denied explicitly and
* VEXPLICIT_DENY flag was specified, allow access.
*/
if ((accmode & VEXPLICIT_DENY) && explicitly_denied == 0)
return (0);
accmode &= ~VEXPLICIT_DENY;
if (accmode & (VADMIN_PERMS | VDELETE_CHILD | VDELETE))
denied = EPERM;
else
denied = EACCES;
return (denied);
}
/*
* Common routine to check if chmod() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not set the "sticky" bit (meaningless, see chmod(2))
* - You must be a member of the group if you're trying to set the
* SGID bit
*
* vp - vnode of the file-system object
* cred - credentials of the invoker
* cur_uid, cur_gid - current uid/gid of the file-system object
* new_mode - new mode for the file-system object
*
* Returns 0 if the change is allowed, or an error value otherwise.
*/
int
genfs_can_chmod(vnode_t *vp, kauth_cred_t cred, uid_t cur_uid,
gid_t cur_gid, mode_t new_mode)
{
int error;
/*
* To modify the permissions on a file, must possess VADMIN
* for that file.
*/
if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred)) != 0)
return (error);
/*
* Unprivileged users can't set the sticky bit on files.
*/
if ((vp->v_type != VDIR) && (new_mode & S_ISTXT))
return (EFTYPE);
/*
* If the invoker is trying to set the SGID bit on the file,
* check group membership.
*/
if (new_mode & S_ISGID) {
int ismember;
error = kauth_cred_ismember_gid(cred, cur_gid,
&ismember);
if (error || !ismember)
return (EPERM);
}
/*
* Deny setting setuid if we are not the file owner.
*/
if ((new_mode & S_ISUID) && cur_uid != kauth_cred_geteuid(cred))
return (EPERM);
return (0);
}
/*
* Common routine to check if chown() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not try to change ownership, and
* - You must be member of the new group
*
* vp - vnode
* cred - credentials of the invoker
* cur_uid, cur_gid - current uid/gid of the file-system object
* new_uid, new_gid - target uid/gid of the file-system object
*
* Returns 0 if the change is allowed, or an error value otherwise.
*/
int
genfs_can_chown(vnode_t *vp, kauth_cred_t cred, uid_t cur_uid,
gid_t cur_gid, uid_t new_uid, gid_t new_gid)
{
int error, ismember;
/*
* To modify the ownership of a file, must possess VADMIN for that
* file.
*/
if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred)) != 0)
return (error);
/*
* You can only change ownership of a file if:
* You own the file and...
*/
if (kauth_cred_geteuid(cred) == cur_uid) {
/*
* You don't try to change ownership, and...
*/
if (new_uid != cur_uid)
return (EPERM);
/*
* You don't try to change group (no-op), or...
*/
if (new_gid == cur_gid)
return (0);
/*
* Your effective gid is the new gid, or...
*/
if (kauth_cred_getegid(cred) == new_gid)
return (0);
/*
* The new gid is one you're a member of.
*/
ismember = 0;
error = kauth_cred_ismember_gid(cred, new_gid,
&ismember);
if (!error && ismember)
return (0);
}
return (EPERM);
}
int
genfs_can_chtimes(vnode_t *vp, kauth_cred_t cred, uid_t owner_uid,
u_int vaflags)
{
int error;
/*
* Grant permission if the caller is the owner of the file, or
* the super-user, or has ACL_WRITE_ATTRIBUTES permission on
* the file. If the time pointer is null, then write
* permission on the file is also sufficient.
*
* From NFSv4.1, draft 21, 6.2.1.3.1, Discussion of Mask Attributes:
* A user having ACL_WRITE_DATA or ACL_WRITE_ATTRIBUTES
* will be allowed to set the times [..] to the current
* server time.
*/
error = VOP_ACCESSX(vp, VWRITE_ATTRIBUTES, cred);
if (error != 0 && (vaflags & VA_UTIMES_NULL) != 0)
error = VOP_ACCESS(vp, VWRITE, cred);
if (error)
return (vaflags & VA_UTIMES_NULL) == 0 ? EPERM : EACCES;
return 0;
}
/*
* Common routine to check if chflags() is allowed.
*
* Policy:
* - You must own the file, and
* - You must not change system flags, and
* - You must not change flags on character/block devices.
*
* vp - vnode
* cred - credentials of the invoker
* owner_uid - uid of the file-system object
* changing_sysflags - true if the invoker wants to change system flags
*/
int
genfs_can_chflags(vnode_t *vp, kauth_cred_t cred,
uid_t owner_uid, bool changing_sysflags)
{
/* The user must own the file. */
if (kauth_cred_geteuid(cred) != owner_uid) {
return EPERM;
}
if (changing_sysflags) {
return EPERM;
}
/*
* Unprivileged users cannot change the flags on devices, even if they
* own them.
*/
if (vp->v_type == VCHR || vp->v_type == VBLK) {
return EPERM;
}
return 0;
}
/*
* Common "sticky" policy.
*
* When a directory is "sticky" (as determined by the caller), this
* function may help implementing the following policy:
* - Renaming a file in it is only possible if the user owns the directory
* or the file being renamed.
* - Deleting a file from it is only possible if the user owns the
* directory or the file being deleted.
*/
int
genfs_can_sticky(vnode_t *vp, kauth_cred_t cred, uid_t dir_uid, uid_t file_uid)
{
	if (kauth_cred_geteuid(cred) != dir_uid &&
kauth_cred_geteuid(cred) != file_uid)
return EPERM;
return 0;
}
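/*
 * Sketch only: a remove or rename path might combine this helper with
 * kauth(9) when the parent directory has the sticky bit set.  "dvp" is
 * the directory vnode, and "dnode"/"node" with their "i_mode"/"i_uid"
 * fields are hypothetical per-node data.
 *
 *	error = 0;
 *	if (dnode->i_mode & S_ISVTX)
 *		error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE,
 *		    vp, dvp, genfs_can_sticky(vp, cred, dnode->i_uid,
 *		    node->i_uid));
 *	if (error)
 *		return error;
 */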
int
genfs_can_extattr(vnode_t *vp, kauth_cred_t cred, accmode_t accmode,
int attrnamespace)
{
/*
	 * Kernel-invoked requests (cred == NOCRED) always succeed.
*/
if (cred == NOCRED)
return 0;
switch (attrnamespace) {
case EXTATTR_NAMESPACE_SYSTEM:
return kauth_authorize_system(cred, KAUTH_SYSTEM_FS_EXTATTR,
0, vp->v_mount, NULL, NULL);
case EXTATTR_NAMESPACE_USER:
return VOP_ACCESS(vp, accmode, cred);
default:
return EPERM;
}
}
int
genfs_access(void *v)
{
struct vop_access_args *ap = v;
KASSERT((ap->a_accmode & ~(VEXEC | VWRITE | VREAD | VADMIN |
VAPPEND)) == 0);
return VOP_ACCESSX(ap->a_vp, ap->a_accmode, ap->a_cred);
}
int
genfs_accessx(void *v)
{
struct vop_accessx_args *ap = v;
int error;
accmode_t accmode = ap->a_accmode;
error = vfs_unixify_accmode(&accmode);
if (error != 0)
return error;
if (accmode == 0)
return 0;
return VOP_ACCESS(ap->a_vp, accmode, ap->a_cred);
}
/*
* genfs_pathconf:
*
 * Standard implementation of POSIX pathconf, reporting limits for a
 * filesystem.
 * Override this per filesystem when the filesystem has smaller
 * limits.
*/
int
genfs_pathconf(void *v)
{
struct vop_pathconf_args *ap = v;
switch (ap->a_name) {
case _PC_PATH_MAX:
*ap->a_retval = PATH_MAX;
return 0;
case _PC_ACL_EXTENDED:
case _PC_ACL_NFS4:
*ap->a_retval = 0;
return 0;
default:
return EINVAL;
}
}
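/*
 * Sketch of a per-filesystem override (the "myfs" names and
 * MYFS_MAXNAMLEN are hypothetical): answer the queries the filesystem
 * knows better and fall back to genfs_pathconf() for the rest.
 *
 *	int
 *	myfs_pathconf(void *v)
 *	{
 *		struct vop_pathconf_args *ap = v;
 *
 *		switch (ap->a_name) {
 *		case _PC_NAME_MAX:
 *			*ap->a_retval = MYFS_MAXNAMLEN;
 *			return 0;
 *		default:
 *			return genfs_pathconf(ap);
 *		}
 *	}
 */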
/* $NetBSD: uvm_fault_i.h,v 1.33 2020/02/23 15:46:43 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_fault_i.h,v 1.1.6.1 1997/12/08 16:07:12 chuck Exp
*/
#ifndef _UVM_UVM_FAULT_I_H_
#define _UVM_UVM_FAULT_I_H_
/*
* uvm_fault_i.h: fault inline functions
*/
void uvmfault_update_stats(struct uvm_faultinfo *);
/*
* uvmfault_unlockmaps: unlock the maps
*/
static __inline void
uvmfault_unlockmaps(struct uvm_faultinfo *ufi, bool write_locked)
{
/*
* ufi can be NULL when this isn't really a fault,
* but merely paging in anon data.
*/
if (ufi == NULL) {
return;
}
#ifndef __HAVE_NO_PMAP_STATS
uvmfault_update_stats(ufi);
#endif
if (write_locked) {
vm_map_unlock(ufi->map);
} else {
vm_map_unlock_read(ufi->map);
}
}
/*
* uvmfault_unlockall: unlock everything passed in.
*
* => maps must be read-locked (not write-locked).
*/
static __inline void
uvmfault_unlockall(struct uvm_faultinfo *ufi, struct vm_amap *amap,
struct uvm_object *uobj)
{
	if (uobj)
		rw_exit(uobj->vmobjlock);
	if (amap)
		amap_unlock(amap);
	uvmfault_unlockmaps(ufi, false);
}
/*
* uvmfault_lookup: lookup a virtual address in a map
*
* => caller must provide a uvm_faultinfo structure with the IN
* params properly filled in
* => we will lookup the map entry (handling submaps) as we go
* => if the lookup is a success we will return with the maps locked
* => if "write_lock" is true, we write_lock the map, otherwise we only
* get a read lock.
* => note that submaps can only appear in the kernel and they are
* required to use the same virtual addresses as the map they
* are referenced by (thus address translation between the main
* map and the submap is unnecessary).
*/
static __inline bool
uvmfault_lookup(struct uvm_faultinfo *ufi, bool write_lock)
{
struct vm_map *tmpmap;
/*
* init ufi values for lookup.
*/
ufi->map = ufi->orig_map;
ufi->size = ufi->orig_size;
/*
* keep going down levels until we are done. note that there can
* only be two levels so we won't loop very long.
*/
for (;;) {
/*
* lock map
*/
if (write_lock) {
vm_map_lock(ufi->map);
} else {
vm_map_lock_read(ufi->map);
}
/*
* lookup
*/
if (!uvm_map_lookup_entry(ufi->map, ufi->orig_rvaddr,
&ufi->entry)) {
uvmfault_unlockmaps(ufi, write_lock);
return(false);
}
/*
* reduce size if necessary
*/
		if (ufi->entry->end - ufi->orig_rvaddr < ufi->size)
			ufi->size = ufi->entry->end - ufi->orig_rvaddr;
/*
* submap? replace map with the submap and lookup again.
* note: VAs in submaps must match VAs in main map.
*/
if (UVM_ET_ISSUBMAP(ufi->entry)) {
tmpmap = ufi->entry->object.sub_map;
if (write_lock) {
vm_map_unlock(ufi->map);
} else {
vm_map_unlock_read(ufi->map);
}
ufi->map = tmpmap;
continue;
}
/*
* got it!
*/
ufi->mapv = ufi->map->timestamp;
return(true);
	}	/* lookup loop */
/*NOTREACHED*/
}
/*
* uvmfault_relock: attempt to relock the same version of the map
*
* => fault data structures should be unlocked before calling.
* => if a success (true) maps will be locked after call.
*/
static __inline bool
uvmfault_relock(struct uvm_faultinfo *ufi)
{
/*
* ufi can be NULL when this isn't really a fault,
* but merely paging in anon data.
*/
if (ufi == NULL) {
return true;
}
cpu_count(CPU_COUNT_FLTRELCK, 1);
/*
* relock map. fail if version mismatch (in which case nothing
* gets locked).
*/
vm_map_lock_read(ufi->map);
if (ufi->mapv != ufi->map->timestamp) {
vm_map_unlock_read(ufi->map);
return(false);
}
cpu_count(CPU_COUNT_FLTRELCKOK, 1);
return(true);
}
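/*
 * Illustrative calling pattern (not from the original header): a fault
 * handler that must sleep for I/O drops all the locks first, performs
 * the I/O, and then tries to pick up where it left off, restarting the
 * fault from scratch if the map version changed in the meantime.
 *
 *	uvmfault_unlockall(ufi, amap, uobj);
 *	... sleep / perform the I/O ...
 *	if (!uvmfault_relock(ufi)) {
 *		... map changed: release resources and restart the fault ...
 *	}
 */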
#endif /* _UVM_UVM_FAULT_I_H_ */
/* $NetBSD: kern_scdebug.c,v 1.2 2019/03/14 19:51:49 palle Exp $ */
/*
* Copyright (c) 2015 Matthew R. Green
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_xxx.c 8.3 (Berkeley) 2/14/95
* from: NetBSD: kern_xxx.c,v 1.74 2017/10/28 00:37:11 pgoyette Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_scdebug.c,v 1.2 2019/03/14 19:51:49 palle Exp $");
#ifdef _KERNEL_OPT
#include "opt_syscall_debug.h"
#include "opt_kernhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/kernhist.h>
/*
* Pull in the indirect syscall functions here.
 * They are only actually used if the port's syscall entry code
 * doesn't special-case SYS_SYSCALL and SYS___SYSCALL.
 *
 * In some cases the generated code for the two functions is identical,
 * but there isn't an MI way of determining that, so we don't try.
*/
#define SYS_SYSCALL sys_syscall
#include "sys_syscall.c"
#undef SYS_SYSCALL
#define SYS_SYSCALL sys___syscall
#include "sys_syscall.c"
#undef SYS_SYSCALL
#ifdef SYSCALL_DEBUG
#define SCDEBUG_CALLS 0x0001 /* show calls */
#define SCDEBUG_RETURNS 0x0002 /* show returns */
#define SCDEBUG_ALL 0x0004 /* even syscalls that are not implemented */
#define SCDEBUG_SHOWARGS 0x0008 /* show arguments to calls */
#define SCDEBUG_KERNHIST 0x0010 /* use kernhist instead of printf */
#ifndef SCDEBUG_DEFAULT
#define SCDEBUG_DEFAULT (SCDEBUG_CALLS|SCDEBUG_RETURNS|SCDEBUG_SHOWARGS)
#endif
int scdebug = SCDEBUG_DEFAULT;
#ifdef KERNHIST
KERNHIST_DEFINE(scdebughist);
#define SCDEBUG_KERNHIST_FUNC(a) KERNHIST_FUNC(a)
#define SCDEBUG_KERNHIST_CALLED(a) KERNHIST_CALLED(a)
#define SCDEBUG_KERNHIST_LOG(a,b,c,d,e,f) KERNHIST_LOG(a,b,c,d,e,f)
#else
#define SCDEBUG_KERNHIST_FUNC(a) {} /* nothing */
#define SCDEBUG_KERNHIST_CALLED(a) {} /* nothing */
#define SCDEBUG_KERNHIST_LOG(a,b,c,d,e,f) {} /* nothing */
/* The non-kernhist support version can elide all this code easily. */
#undef SCDEBUG_KERNHIST
#define SCDEBUG_KERNHIST 0
#endif
#ifdef __HAVE_MINIMAL_EMUL
#define CODE_NOT_OK(code, em) ((int)(code) < 0)
#else
#define CODE_NOT_OK(code, em) (((int)(code) < 0) || \
((int)(code) >= (em)->e_nsysent))
#endif
void
scdebug_call(register_t code, const register_t args[])
{
SCDEBUG_KERNHIST_FUNC("scdebug_call");
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
const struct sysent *sy;
const struct emul *em;
int i;
if ((scdebug & SCDEBUG_CALLS) == 0)
return;
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_CALLED(scdebughist);
em = p->p_emul;
sy = &em->e_sysent[code];
if ((scdebug & SCDEBUG_ALL) == 0 &&
(CODE_NOT_OK(code, em) || sy->sy_call == sys_nosys)) {
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_LOG(scdebughist, "", 0, 0, 0, 0);
return;
}
/*
	 * The kernhist version of scdebug has to be more restrictive
	 * than the plain printf() version, because kernel histories
	 * must avoid these sorts of usage:
	 *
	 *	- the format string *must* be a literal, as it is used
	 *	  at display time in the kernel or userland
	 *	- strings in the format will cause vmstat -u to crash,
	 *	  so avoid %s formats
	 *
	 * To cope with this, we have a fairly long block below to print
	 * the arguments, as the format needs to change for each count,
	 * and we can't just call printf() on each argument until we're
	 * done.
*/
if (scdebug & SCDEBUG_KERNHIST) {
if (CODE_NOT_OK(code, em)) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: OUT OF RANGE (%jd)",
p->p_pid, l->l_lid, code, 0);
} else {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: num %jd call %#jx",
p->p_pid, l->l_lid, code, (uintptr_t)sy->sy_call);
if ((scdebug & SCDEBUG_SHOWARGS) == 0)
return;
if (sy->sy_narg > 7) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-7]: (%jx, %jx, %jx, %jx, ...)",
(long)args[4], (long)args[5],
(long)args[6], (long)args[7]);
} else if (sy->sy_narg > 6) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-6]: (%jx, %jx, %jx)",
(long)args[4], (long)args[5],
(long)args[6], 0);
} else if (sy->sy_narg > 5) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4-5]: (%jx, %jx)",
(long)args[4], (long)args[5], 0, 0);
} else if (sy->sy_narg == 5) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[4]: (%jx)",
(long)args[4], 0, 0, 0);
}
if (sy->sy_narg > 3) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-3]: (%jx, %jx, %jx, %jx, ...)",
(long)args[0], (long)args[1],
(long)args[2], (long)args[3]);
} else if (sy->sy_narg > 2) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-2]: (%jx, %jx, %jx)",
(long)args[0], (long)args[1],
(long)args[2], 0);
} else if (sy->sy_narg > 1) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0-1]: (%jx, %jx)",
(long)args[0], (long)args[1], 0, 0);
} else if (sy->sy_narg == 1) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"args[0]: (%jx)",
(long)args[0], 0, 0, 0);
}
}
return;
}
printf("proc %d (%s): %s num ", p->p_pid, p->p_comm, em->e_name);
if (CODE_NOT_OK(code, em))
printf("OUT OF RANGE (%ld)", (long)code);
else {
printf("%ld call: %s", (long)code, em->e_syscallnames[code]);
if (scdebug & SCDEBUG_SHOWARGS) {
printf("(");
for (i = 0; i < sy->sy_argsize/sizeof(register_t); i++)
printf("%s0x%lx", i == 0 ? "" : ", ",
(long)args[i]);
printf(")");
}
}
printf("\n");
}
void
scdebug_ret(register_t code, int error, const register_t retval[])
{
SCDEBUG_KERNHIST_FUNC("scdebug_ret");
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
const struct sysent *sy;
const struct emul *em;
if ((scdebug & SCDEBUG_RETURNS) == 0)
return;
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_CALLED(scdebughist);
em = p->p_emul;
sy = &em->e_sysent[code];
if ((scdebug & SCDEBUG_ALL) == 0 &&
(CODE_NOT_OK(code, em) || sy->sy_call == sys_nosys)) {
if (scdebug & SCDEBUG_KERNHIST)
SCDEBUG_KERNHIST_LOG(scdebughist, "", 0, 0, 0, 0);
return;
}
if (scdebug & SCDEBUG_KERNHIST) {
if (CODE_NOT_OK(code, em)) {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: OUT OF RANGE (%jd)",
p->p_pid, l->l_lid, code, 0);
} else {
SCDEBUG_KERNHIST_LOG(scdebughist,
"pid %jd:%jd: num %jd",
p->p_pid, l->l_lid, code, 0);
SCDEBUG_KERNHIST_LOG(scdebughist,
"ret: err = %jd, rv = 0x%jx,0x%jx",
error, (long)retval[0], (long)retval[1], 0);
}
return;
}
printf("proc %d (%s): %s num ", p->p_pid, p->p_comm, em->e_name);
if (CODE_NOT_OK(code, em))
printf("OUT OF RANGE (%ld)", (long)code);
else
printf("%ld ret %s: err = %d, rv = 0x%lx,0x%lx", (long)code,
em->e_syscallnames[code], error,
(long)retval[0], (long)retval[1]);
printf("\n");
}
#endif /* SYSCALL_DEBUG */
#ifndef SCDEBUG_KERNHIST_SIZE
#define SCDEBUG_KERNHIST_SIZE 500
#endif
void
scdebug_init(void)
{
#if defined(SYSCALL_DEBUG) && defined(KERNHIST)
/* Setup scdebughist kernel history */
KERNHIST_INIT(scdebughist, SCDEBUG_KERNHIST_SIZE);
#endif
}
/* $NetBSD: kern_acct.c,v 1.99 2021/12/05 04:35:38 msaitoh Exp $ */
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_acct.c 8.8 (Berkeley) 5/14/95
*/
/*-
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_acct.c 8.8 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_acct.c,v 1.99 2021/12/05 04:35:38 msaitoh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/errno.h>
#include <sys/acct.h>
#include <sys/resourcevar.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/kauth.h>
#include <sys/syscallargs.h>
/*
* The routines implemented in this file are described in:
* Leffler, et al.: The Design and Implementation of the 4.3BSD
 * UNIX Operating System (Addison-Wesley, 1989)
* on pages 62-63.
*
* Arguably, to simplify accounting operations, this mechanism should
* be replaced by one in which an accounting log file (similar to /dev/klog)
* is read by a user process, etc. However, that has its own problems.
*/
/*
* Lock to serialize system calls and kernel threads.
*/
krwlock_t acct_lock;
/*
* The global accounting state and related data. Gain the mutex before
* accessing these variables.
*/
static enum {
ACCT_STOP,
ACCT_ACTIVE,
ACCT_SUSPENDED
} acct_state; /* The current accounting state. */
static struct vnode *acct_vp; /* Accounting vnode pointer. */
static kauth_cred_t acct_cred;	/* Credential of the accounting file
				   owner (i.e. root).  Used for
				   accounting file I/O. */
static struct lwp *acct_dkwatcher; /* Free disk space checker. */
/*
* Values associated with enabling and disabling accounting
*/
int acctsuspend = 2; /* stop accounting when < 2% free space left */
int acctresume = 4; /* resume when free space risen to > 4% */
int acctchkfreq = 15; /* frequency (in seconds) to check space */
/*
* Encode_comp_t converts from ticks in seconds and microseconds
* to ticks in 1/AHZ seconds. The encoding is described in
* Leffler, et al., on page 63.
*/
#define MANTSIZE 13 /* 13 bit mantissa. */
#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
static comp_t
encode_comp_t(u_long s, u_long us)
{
int exp, rnd;
exp = 0;
rnd = 0;
s *= AHZ;
s += us / (1000000 / AHZ); /* Maximize precision. */
while (s > MAXFRACT) {
rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */
s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
exp++;
}
/* If we need to round up, do it (and handle overflow correctly). */
if (rnd && (++s > MAXFRACT)) {
s >>= EXPSIZE;
exp++;
}
/* Clean it up and polish it off. */
exp <<= MANTSIZE; /* Shift the exponent into place */
exp += s; /* and add on the mantissa. */
return (exp);
}
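/*
 * Worked example (added for illustration, assuming the usual AHZ of 64):
 * 130 seconds of CPU time is 130 * 64 = 8320 ticks.  8320 exceeds
 * MAXFRACT (8191), so it is shifted right by EXPSIZE bits to 1040 with
 * an exponent of 1; no rounding occurs because the discarded bits are
 * zero.  The encoded comp_t is (1 << MANTSIZE) + 1040 = 9232, which
 * decodes back to 1040 * 8^1 = 8320 ticks.
 */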
static int
acct_chkfree(void)
{
int error;
struct statvfs *sb;
fsblkcnt_t bavail;
sb = kmem_alloc(sizeof(*sb), KM_SLEEP);
error = VFS_STATVFS(acct_vp->v_mount, sb);
if (error != 0) {
kmem_free(sb, sizeof(*sb));
return (error);
}
if (sb->f_bfree < sb->f_bresvd) {
bavail = 0;
} else {
bavail = sb->f_bfree - sb->f_bresvd;
}
switch (acct_state) {
case ACCT_SUSPENDED:
if (bavail > acctresume * sb->f_blocks / 100) {
acct_state = ACCT_ACTIVE;
log(LOG_NOTICE, "Accounting resumed\n");
}
break;
case ACCT_ACTIVE:
if (bavail <= acctsuspend * sb->f_blocks / 100) {
acct_state = ACCT_SUSPENDED;
log(LOG_NOTICE, "Accounting suspended\n");
}
break;
case ACCT_STOP:
break;
}
kmem_free(sb, sizeof(*sb));
return (0);
}
static void
acct_stop(void)
{
int error;
	KASSERT(rw_write_held(&acct_lock));

	if (acct_vp != NULLVP && acct_vp->v_type != VBAD) {
error = vn_close(acct_vp, FWRITE, acct_cred);
#ifdef DIAGNOSTIC
		if (error != 0)
			printf("acct_stop: failed to close, errno = %d\n",
			    error);
#else
__USE(error);
#endif
acct_vp = NULLVP;
}
	if (acct_cred != NULL) {
		kauth_cred_free(acct_cred);
acct_cred = NULL;
}
acct_state = ACCT_STOP;
}
/*
* Periodically check the file system to see if accounting
* should be turned on or off. Beware the case where the vnode
* has been vgone()'d out from underneath us, e.g. when the file
* system containing the accounting file has been forcibly unmounted.
*/
static void
acctwatch(void *arg)
{
int error;
log(LOG_NOTICE, "Accounting started\n");
rw_enter(&acct_lock, RW_WRITER);
while (acct_state != ACCT_STOP) {
if (acct_vp->v_type == VBAD) {
log(LOG_NOTICE, "Accounting terminated\n");
acct_stop();
continue;
}
error = acct_chkfree();
#ifdef DIAGNOSTIC
if (error != 0)
printf("acctwatch: failed to statvfs, error = %d\n",
error);
#else
__USE(error);
#endif
rw_exit(&acct_lock);
error = kpause("actwat", false, acctchkfreq * hz, NULL);
rw_enter(&acct_lock, RW_WRITER);
#ifdef DIAGNOSTIC
if (error != 0 && error != EWOULDBLOCK)
printf("acctwatch: sleep error %d\n", error);
#endif
}
acct_dkwatcher = NULL;
rw_exit(&acct_lock);
kthread_exit(0);
}
void
acct_init(void)
{
acct_state = ACCT_STOP;
acct_vp = NULLVP;
acct_cred = NULL;
rw_init(&acct_lock);
}
/*
* Accounting system call. Written based on the specification and
* previous implementation done by Mark Tinguely.
*/
int
sys_acct(struct lwp *l, const struct sys_acct_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
} */
struct pathbuf *pb;
struct vnode *vp;
int error;
/* Make sure that the caller is root. */
if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_ACCOUNTING,
0, NULL, NULL, NULL)))
return (error);
/*
* If accounting is to be started to a file, open that file for
	 * writing and make sure it's a regular file.
*/
if (SCARG(uap, path) != NULL) {
struct vattr va;
size_t pad;
error = pathbuf_copyin(SCARG(uap, path), &pb);
if (error) {
return error;
}
error = vn_open(NULL, pb, TRYEMULROOT, FWRITE|O_APPEND, 0,
&vp, NULL, NULL);
if (error != 0) {
pathbuf_destroy(pb);
return error;
}
if (vp->v_type != VREG) {
VOP_UNLOCK(vp);
error = EACCES;
goto bad;
}
if ((error = VOP_GETATTR(vp, &va, l->l_cred)) != 0) {
VOP_UNLOCK(vp);
goto bad;
}
if ((pad = (va.va_size % sizeof(struct acct))) != 0) {
u_quad_t size = va.va_size - pad;
#ifdef DIAGNOSTIC
printf("Size of accounting file not a multiple of "
"%lu - incomplete record truncated\n",
(unsigned long)sizeof(struct acct));
#endif
vattr_null(&va);
va.va_size = size;
error = VOP_SETATTR(vp, &va, l->l_cred);
			if (error != 0) {
				VOP_UNLOCK(vp);
goto bad;
}
}
VOP_UNLOCK(vp);
}
rw_enter(&acct_lock, RW_WRITER);
/*
	 * If accounting was previously enabled, kill the old space watcher
	 * and free the credential used for accounting file I/O
	 * (and, if no new file was specified, leave).
*/
acct_stop();
if (SCARG(uap, path) == NULL)
goto out;
/*
* Save the new accounting file vnode and credential,
* and schedule the new free space watcher.
*/
acct_state = ACCT_ACTIVE;
acct_vp = vp;
acct_cred = l->l_cred;
kauth_cred_hold(acct_cred);
pathbuf_destroy(pb);
error = acct_chkfree(); /* Initial guess. */
if (error != 0) {
acct_stop();
goto out;
}
if (acct_dkwatcher == NULL) {
error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
acctwatch, NULL, &acct_dkwatcher, "acctwatch");
if (error != 0)
acct_stop();
}
out:
rw_exit(&acct_lock);
return (error);
bad:
vn_close(vp, FWRITE, l->l_cred);
pathbuf_destroy(pb);
return error;
}
/*
 * Write out process accounting information on process exit.
 * The data to be written out are specified in Leffler, et al.
 * and are enumerated below.  (They're also noted in the system
 * "acct.h" header file.)
*/
int
acct_process(struct lwp *l)
{
struct acct acct;
struct timeval ut, st, tmp;
struct rusage *r;
int t, error = 0;
struct rlimit orlim;
struct proc *p = l->l_proc;
if (acct_state != ACCT_ACTIVE)
return 0;
memset(&acct, 0, sizeof(acct)); /* to zerofill padded data */
rw_enter(&acct_lock, RW_READER);
/* If accounting isn't enabled, don't bother */
if (acct_state != ACCT_ACTIVE)
goto out;
/*
* Temporarily raise the file limit so that accounting can't
* be stopped by the user.
*
* XXX We should think about the CPU limit, too.
*/
lim_privatise(p);
orlim = p->p_rlimit[RLIMIT_FSIZE];
/* Set current and max to avoid illegal values */
p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
p->p_rlimit[RLIMIT_FSIZE].rlim_max = RLIM_INFINITY;
/*
* Get process accounting information.
*/
/* (1) The name of the command that ran */
strncpy(acct.ac_comm, p->p_comm, sizeof(acct.ac_comm));
/* (2) The amount of user and system time that was used */
mutex_enter(p->p_lock);
calcru(p, &ut, &st, NULL, NULL);
mutex_exit(p->p_lock);
acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec);
acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec);
/* (3) The elapsed time the command ran (and its starting time) */
acct.ac_btime = p->p_stats->p_start.tv_sec;
getmicrotime(&tmp);
timersub(&tmp, &p->p_stats->p_start, &tmp);
acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec);
/* (4) The average amount of memory used */
r = &p->p_stats->p_ru;
timeradd(&ut, &st, &tmp);
t = tmp.tv_sec * hz + tmp.tv_usec / tick;
if (t)
acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t;
else
acct.ac_mem = 0;
/* (5) The number of disk I/O operations done */
acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0);
/* (6) The UID and GID of the process */
acct.ac_uid = kauth_cred_getuid(l->l_cred);
acct.ac_gid = kauth_cred_getgid(l->l_cred);
/* (7) The terminal from which the process was started */
mutex_enter(&proc_lock);
if ((p->p_lflag & PL_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev;
else
acct.ac_tty = NODEV;
mutex_exit(&proc_lock);
/* (8) The boolean flags that tell how the process terminated, etc. */
acct.ac_flag = p->p_acflag;
/*
* Now, just write the accounting information to the file.
*/
error = vn_rdwr(UIO_WRITE, acct_vp, (void *)&acct,
sizeof(acct), (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT,
acct_cred, NULL, NULL);
if (error != 0)
log(LOG_ERR, "Accounting: write failed %d\n", error);
/* Restore limit - rather pointless since process is about to exit */
p->p_rlimit[RLIMIT_FSIZE] = orlim;
out:
rw_exit(&acct_lock);
return (error);
}
/* $NetBSD: userret.h,v 1.13 2018/07/26 09:29:08 maxv Exp $ */
/*
* XXXfvdl same as i386 counterpart, but should probably be independent.
*/
/*-
* Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
*/
#include <sys/userret.h>
static __inline void userret(struct lwp *);
/*
* Define the code needed before returning to user mode, for
* trap and syscall.
*/
static __inline void
userret(struct lwp *l)
{
/* Invoke MI userret code */
mi_userret(l);
}
/* $NetBSD: subr_once.c,v 1.7 2019/03/19 08:16:51 ryo Exp $ */
/*-
* Copyright (c)2005 YAMAMOTO Takashi,
* Copyright (c)2008 Antti Kantee,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_once.c,v 1.7 2019/03/19 08:16:51 ryo Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/once.h>
static kmutex_t oncemtx;
static kcondvar_t oncecv;
void
once_init(void)
{
mutex_init(&oncemtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&oncecv, "runonce");
}
int
_init_once(once_t *o, int (*fn)(void))
{
/* Fastpath handled by RUN_ONCE() */
int error;
mutex_enter(&oncemtx);
while (o->o_status == ONCE_RUNNING)
cv_wait(&oncecv, &oncemtx);
	if (o->o_refcnt++ == 0) {
		o->o_status = ONCE_RUNNING;
mutex_exit(&oncemtx);
o->o_error = fn();
mutex_enter(&oncemtx);
o->o_status = ONCE_DONE;
cv_broadcast(&oncecv);
}
	KASSERT(o->o_refcnt != 0);	/* detect overflow */

	while (o->o_status == ONCE_RUNNING)
cv_wait(&oncecv, &oncemtx);
error = o->o_error;
mutex_exit(&oncemtx);
return error;
}
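/*
 * Minimal usage sketch (illustrative): callers normally go through the
 * RUN_ONCE() macro from <sys/once.h> rather than calling _init_once()
 * directly.  "mysubsys_init" and "mysubsys_once" are hypothetical.
 *
 *	static int
 *	mysubsys_init(void)
 *	{
 *		... one-time initialization ...
 *		return 0;
 *	}
 *
 *	static ONCE_DECL(mysubsys_once);
 *
 *	error = RUN_ONCE(&mysubsys_once, mysubsys_init);
 *	if (error)
 *		return error;
 *	... initialization is guaranteed to have completed here ...
 */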
void
_fini_once(once_t *o, void (*fn)(void))
{
mutex_enter(&oncemtx);
while (o->o_status == ONCE_RUNNING)
cv_wait(&oncecv, &oncemtx);
KASSERT(o->o_refcnt != 0); /* we need to call _init_once() once */
if (--o->o_refcnt == 0) {
o->o_status = ONCE_RUNNING;
mutex_exit(&oncemtx);
fn();
mutex_enter(&oncemtx);
o->o_status = ONCE_VIRGIN;
cv_broadcast(&oncecv);
}
mutex_exit(&oncemtx);
}
/* $NetBSD: proc.h,v 1.373 2023/10/04 20:52:07 ad Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2020, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)proc.h 8.15 (Berkeley) 5/19/95
*/
#ifndef _SYS_PROC_H_
#define _SYS_PROC_H_
#include <sys/lwp.h>
#if defined(_KMEMUSER) || defined(_KERNEL)
#if defined(_KERNEL_OPT)
#include "opt_multiprocessor.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#endif
#include <machine/proc.h> /* Machine-dependent proc substruct */
#include <machine/pcb.h>
#include <sys/aio.h>
#include <sys/idtype.h>
#include <sys/rwlock.h>
#include <sys/mqueue.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/queue.h>
#include <sys/radixtree.h>
#include <sys/signalvar.h>
#include <sys/siginfo.h>
#include <sys/event.h>
#include <sys/specificdata.h>
#ifdef _KERNEL
#include <sys/resourcevar.h>
#else
#include <sys/time.h>
#include <sys/resource.h>
#endif
/*
* One structure allocated per session.
*/
struct session {
int s_count; /* Ref cnt; pgrps in session */
u_int s_flags;
#define S_LOGIN_SET 1 /* s_login set in this session */
struct proc *s_leader; /* Session leader */
struct vnode *s_ttyvp; /* Vnode of controlling terminal */
struct tty *s_ttyp; /* Controlling terminal */
char s_login[MAXLOGNAME]; /* Setlogin() name */
pid_t s_sid; /* Session ID (pid of leader) */
};
/*
* One structure allocated per process group.
*/
struct pgrp {
LIST_HEAD(, proc) pg_members; /* Pointer to pgrp members */
struct session *pg_session; /* Pointer to session */
pid_t pg_id; /* Pgrp id */
int pg_jobc; /*
* Number of processes qualifying
* pgrp for job control
*/
};
/*
* Autoloadable syscall definition
*/
struct sc_autoload {
u_int al_code;
const char *al_module;
};
/*
* One structure allocated per emulation.
*/
struct exec_package;
struct ras;
struct kauth_cred;
struct emul {
const char *e_name; /* Symbolic name */
const char *e_path; /* Extra emulation path (NULL if none)*/
#ifndef __HAVE_MINIMAL_EMUL
int e_flags; /* Miscellaneous flags, see above */
/* Syscall handling function */
const int *e_errno; /* Errno array */
int e_nosys; /* Offset of the nosys() syscall */
int e_nsysent; /* Number of system call entries */
#endif
struct sysent *e_sysent; /* System call array */
const uint32_t *e_nomodbits; /* sys_nosys/sys_nomodule flags
* for syscall_disestablish() */
const char * const *e_syscallnames; /* System call name array */
struct sc_autoload *e_sc_autoload; /* List of autoloadable syscalls */
/* Signal sending function */
void (*e_sendsig)(const struct ksiginfo *,
const sigset_t *);
void (*e_trapsignal)(struct lwp *, struct ksiginfo *);
char *e_sigcode; /* Start of sigcode */
char *e_esigcode; /* End of sigcode */
/* Set registers before execution */
struct uvm_object **e_sigobject;/* shared sigcode object */
void (*e_setregs)(struct lwp *, struct exec_package *,
vaddr_t);
/* Per-process hooks */
void (*e_proc_exec)(struct proc *, struct exec_package *);
void (*e_proc_fork)(struct proc *, struct lwp *, int);
void (*e_proc_exit)(struct proc *);
void (*e_lwp_fork)(struct lwp *, struct lwp *);
void (*e_lwp_exit)(struct lwp *);
#ifdef __HAVE_SYSCALL_INTERN
void (*e_syscall_intern)(struct proc *);
#else
void (*e_syscall)(void);
#endif
/* Emulation specific sysctl data */
struct sysctlnode *e_sysctlovly;
vaddr_t (*e_vm_default_addr)(struct proc *, vaddr_t, vsize_t,
int);
/* Emulation-specific hook for userspace page faults */
int (*e_usertrap)(struct lwp *, vaddr_t, void *);
size_t e_ucsize; /* size of ucontext_t */
void (*e_startlwp)(void *);
/* Dtrace syscall probe */
void (*e_dtrace_syscall)(uint32_t, register_t,
const struct sysent *, const void *,
const register_t *, int);
/* Emulation specific support for ktracing signal posts */
void (*e_ktrpsig)(int, sig_t, const sigset_t *,
const struct ksiginfo *);
};
/*
* Emulation miscellaneous flags
*/
#define EMUL_HAS_SYS___syscall 0x001 /* Has SYS___syscall */
/*
* Description of a process.
*
* This structure contains the information needed to manage a thread of
* control, known in UN*X as a process; it has references to substructures
* containing descriptions of things that the process uses, but may share
* with related processes. The process structure and the substructures
 * are always addressable except for those marked "(PROC ONLY)" below,
 * which might be addressable only on a processor on which the process
* is running.
*
* Field markings and the corresponding locks:
*
* a: p_auxlock
* k: ktrace_mutex
* l: proc_lock
* t: p_stmutex
* p: p_lock
* (: updated atomically
* :: unlocked, stable
*/
struct vmspace;
struct proc {
LIST_ENTRY(proc) p_list; /* l: List of all processes */
kmutex_t *p_lock; /* :: general mutex */
kcondvar_t p_waitcv; /* p: wait, stop CV on children */
kcondvar_t p_lwpcv; /* p: wait, stop CV on LWPs */
/* Substructures: */
struct kauth_cred *p_cred; /* p: Master copy of credentials */
struct filedesc *p_fd; /* :: Ptr to open files structure */
struct cwdinfo *p_cwdi; /* :: cdir/rdir/cmask info */
struct pstats *p_stats; /* :: Accounting/stats (PROC ONLY) */
struct plimit *p_limit; /* :: Process limits */
struct vmspace *p_vmspace; /* :: Address space */
struct sigacts *p_sigacts; /* :: Process sigactions */
struct aioproc *p_aio; /* p: Asynchronous I/O data */
u_int p_mqueue_cnt; /* (: Count of open message queues */
specificdata_reference
p_specdataref; /* subsystem proc-specific data */
int p_exitsig; /* l: signal to send to parent on exit */
int p_flag; /* p: PK_* flags */
int p_sflag; /* p: PS_* flags */
int p_stflag; /* t: PST_* flags */
short p_slflag; /* l, p: PSL_* flags */
char p_stat; /* l: S* process status. */
char p_lflag; /* l: PL_* flags */
char p_trace_enabled;/* p: cached by syscall_intern() */
char p_pad1[3]; /* unused */
pid_t p_pid; /* :: Process identifier. */
LIST_ENTRY(proc) p_pglist; /* l: List of processes in pgrp. */
struct proc *p_pptr; /* l: Pointer to parent process. */
LIST_ENTRY(proc) p_sibling; /* l: List of sibling processes. */
LIST_HEAD(, proc) p_children; /* l: List of children. */
LIST_HEAD(, lwp) p_lwps; /* p: List of LWPs. */
struct ras *p_raslist; /* a: List of RAS entries */
/* The following fields are all zeroed upon creation in fork. */
#define p_startzero p_nlwps
int p_nlwps; /* p: Number of LWPs */
int p_nzlwps; /* p: Number of zombie LWPs */
int p_nrlwps; /* p: Number running/sleeping LWPs */
int p_nlwpwait; /* p: Number of LWPs in lwp_wait1() */
int p_ndlwps; /* p: Number of detached LWPs */
u_int p_nstopchild; /* l: Count of stopped/dead children */
u_int p_waited; /* l: parent has waited on child */
struct lwp *p_zomblwp; /* p: detached LWP to be reaped */
struct lwp *p_vforklwp; /* p: parent LWP waiting at vfork() */
/* scheduling */
void *p_sched_info; /* p: Scheduler-specific structure */
fixpt_t p_estcpu; /* p: Time avg. value of p_cpticks */
fixpt_t p_estcpu_inherited; /* p: cpu inherited from children */
unsigned int p_forktime;
fixpt_t p_pctcpu; /* p: %cpu from dead LWPs */
struct proc *p_opptr; /* l: save parent during ptrace. */
struct ptimers *p_timers; /* Timers: real, virtual, profiling */
struct bintime p_rtime; /* p: real time */
u_quad_t p_uticks; /* t: Statclock hits in user mode */
u_quad_t p_sticks; /* t: Statclock hits in system mode */
u_quad_t p_iticks; /* t: Statclock hits processing intr */
uint64_t p_xutime; /* p: utime exposed to userspace */
uint64_t p_xstime; /* p: stime exposed to userspace */
int p_traceflag; /* k: Kernel trace points */
void *p_tracep; /* k: Trace private data */
struct vnode *p_textvp; /* :: Vnode of executable */
struct emul *p_emul; /* :: emulation information */
void *p_emuldata; /* :: per-proc emul data, or NULL */
const struct execsw *p_execsw; /* :: exec package information */
struct klist p_klist; /* p: knotes attached to proc */
LIST_HEAD(, lwp) p_sigwaiters; /* p: LWPs waiting for signals */
sigpend_t p_sigpend; /* p: pending signals */
struct lcproc *p_lwpctl; /* p, a: _lwp_ctl() information */
pid_t p_ppid; /* :: cached parent pid */
pid_t p_oppid; /* :: cached original parent pid */
char *p_path; /* :: full pathname of executable */
/*
* End area that is zeroed on creation
*/
#define p_endzero p_startcopy
/*
* The following fields are all copied upon creation in fork.
*/
#define p_startcopy p_sigctx
struct sigctx p_sigctx; /* p: Shared signal state */
u_char p_nice; /* p: Process "nice" value */
char p_comm[MAXCOMLEN+1];
/* p: basename of last exec file */
struct pgrp *p_pgrp; /* l: Pointer to process group */
vaddr_t p_psstrp; /* :: address of process's ps_strings */
u_int p_pax; /* :: PAX flags */
int p_xexit; /* p: exit code */
/*
* End area that is copied on creation
*/
#define p_endcopy p_xsig
u_short p_xsig; /* p: stop signal */
u_short p_acflag; /* p: Acc. flags; see struct lwp also */
struct mdproc p_md; /* p: Any machine-dependent fields */
vaddr_t p_stackbase; /* :: ASLR randomized stack base */
struct kdtrace_proc *p_dtrace; /* :: DTrace-specific data. */
/*
* Locks in their own cache line towards the end.
*/
kmutex_t p_auxlock /* :: secondary, longer term lock */
__aligned(COHERENCY_UNIT);
kmutex_t p_stmutex; /* :: mutex on profiling state */
krwlock_t p_reflock; /* :: lock for debugger, procfs */
};
#define p_rlimit p_limit->pl_rlimit
#define p_session p_pgrp->pg_session
#define p_pgid p_pgrp->pg_id
#endif /* _KMEMUSER || _KERNEL */
/*
* Status values.
*/
#define SIDL 1 /* Process being created by fork */
#define SACTIVE 2 /* Process is not stopped */
#define SDYING 3 /* About to die */
#define SSTOP 4 /* Process debugging or suspension */
#define SZOMB 5 /* Awaiting collection by parent */
#define SDEAD 6 /* Almost a zombie */
#define P_ZOMBIE(p) \
((p)->p_stat == SZOMB || (p)->p_stat == SDYING || (p)->p_stat == SDEAD)
/*
* These flags are kept in p_flag and are protected by p_lock. Access from
* process context only.
*/
#define PK_ADVLOCK 0x00000001 /* Process may hold a POSIX advisory lock */
#define PK_SYSTEM 0x00000002 /* System process (kthread) */
#define PK_SYSVSEM 0x00000004 /* Used SysV semaphores */
#define PK_SUGID 0x00000100 /* Had set id privileges since last exec */
#define PK_KMEM 0x00000200 /* Has kmem access */
#define PK_EXEC 0x00004000 /* Process called exec */
#define PK_NOCLDWAIT 0x00020000 /* No zombies if child dies */
#define PK_32 0x00040000 /* 32-bit process (used on 64-bit kernels) */
#define PK_CLDSIGIGN 0x00080000 /* Process is ignoring SIGCHLD */
#define PK_MARKER 0x80000000 /* Is a dummy marker process */
/*
* These flags are kept in p_sflag and are protected by p_lock. Access from
* process context only.
*/
#define PS_NOCLDSTOP 0x00000008 /* No SIGCHLD when children stop */
#define PS_RUMP_LWPEXIT 0x00000400 /* LWPs in RUMP kernel should exit for GC */
#define PS_WCORE 0x00001000 /* Process needs to dump core */
#define PS_WEXIT 0x00002000 /* Working on exiting */
#define PS_STOPFORK 0x00800000 /* Child will be stopped on fork(2) */
#define PS_STOPEXEC 0x01000000 /* Will be stopped on exec(2) */
#define PS_STOPEXIT 0x02000000 /* Will be stopped at process exit */
#define PS_COREDUMP 0x20000000 /* Process core-dumped */
#define PS_CONTINUED 0x40000000 /* Process is continued */
#define PS_STOPPING 0x80000000 /* Transitioning SACTIVE -> SSTOP */
/*
* These flags are kept in p_slflag and are protected by the proc_lock
* and p_lock. Access from process context only.
*/
#define PSL_TRACEFORK 0x00000001 /* traced process wants fork events */
#define PSL_TRACEVFORK 0x00000002 /* traced process wants vfork events */
#define PSL_TRACEVFORK_DONE \
0x00000004 /* traced process wants vfork done events */
#define PSL_TRACELWP_CREATE \
0x00000008 /* traced process wants LWP create events */
#define PSL_TRACELWP_EXIT \
0x00000010 /* traced process wants LWP exit events */
#define PSL_TRACEPOSIX_SPAWN \
0x00000020 /* traced process wants posix_spawn events */
#define PSL_TRACED 0x00000040 /* Debugged process being traced */
#define PSL_TRACEDCHILD 0x00000080 /* Report process birth */
#define PSL_CHTRACED 0x00000100 /* Child has been traced & reparented */
#define PSL_SYSCALL 0x00000200 /* process has PT_SYSCALL enabled */
#define PSL_SYSCALLEMU 0x00000400 /* cancel in-progress syscall */
/*
* Kept in p_stflag and protected by p_stmutex.
*/
#define PST_PROFIL 0x00000020 /* Has started profiling */
/*
* Kept in p_lflag and protected by the proc_lock. Access
* from process context only.
*/
#define PL_CONTROLT 0x00000001 /* Has a controlling terminal */
#define PL_PPWAIT 0x00000002 /* Parent is waiting for child exec/exit */
#define PL_SIGCOMPAT 0x00000004 /* Has used compat signal trampoline */
#define PL_ORPHANPG 0x00000008 /* Member of an orphaned pgrp */
#if defined(_KMEMUSER) || defined(_KERNEL)
/*
* Macro to compute the exit signal to be delivered.
*/
#define P_EXITSIG(p) \
	(((p)->p_slflag & PSL_TRACED) ? SIGCHLD : (p)->p_exitsig)
/*
* Compute a wait(2) 16 bit exit status code
*/
#define P_WAITSTATUS(p) W_EXITCODE((p)->p_xexit, ((p)->p_xsig | \
(((p)->p_sflag & PS_COREDUMP) ? WCOREFLAG : 0)))
LIST_HEAD(proclist, proc); /* A list of processes */
/*
* This structure associates a proclist with its lock.
*/
struct proclist_desc {
struct proclist *pd_list; /* The list */
/*
* XXX Add a pointer to the proclist's lock eventually.
*/
};
#ifdef _KERNEL
/*
* We use process IDs <= PID_MAX until there are > 16k processes.
* NO_PGID is used to represent "no process group" for a tty.
*/
#define PID_MAX 30000
#define NO_PGID ((pid_t)-1)
#define SESS_LEADER(p) ((p)->p_session->s_leader == (p))
/*
* Flags passed to fork1().
*/
#define FORK_PPWAIT 0x0001 /* Block parent until child exit */
#define FORK_SHAREVM 0x0002 /* Share vmspace with parent */
#define FORK_SHARECWD 0x0004 /* Share cdir/rdir/cmask */
#define FORK_SHAREFILES 0x0008 /* Share file descriptors */
#define FORK_SHARESIGS 0x0010 /* Share signal actions */
#define FORK_NOWAIT 0x0020 /* Make init the parent of the child */
#define FORK_CLEANFILES 0x0040 /* Start with a clean descriptor set */
#define FORK_SYSTEM 0x0080 /* Fork a kernel thread */
extern struct proc proc0; /* Process slot for swapper */
extern u_int nprocs; /* Current number of procs */
extern int maxproc; /* Max number of procs */
#define vmspace_kernel() (proc0.p_vmspace)
extern kmutex_t proc_lock;
extern struct proclist allproc; /* List of all processes */
extern struct proclist zombproc; /* List of zombie processes */
extern struct proc *initproc; /* Process slots for init, pager */
extern const struct proclist_desc proclists[];
int proc_find_locked(struct lwp *, struct proc **, pid_t);
proc_t * proc_find_raw(pid_t);
proc_t * proc_find(pid_t); /* Find process by ID */
proc_t * proc_find_lwpid(pid_t); /* Find process by LWP ID */
struct lwp * proc_find_lwp(proc_t *, pid_t); /* Find LWP in proc by ID */
struct lwp * proc_find_lwp_unlocked(proc_t *, pid_t);
/* Find LWP, acquire proc */
struct lwp * proc_find_lwp_acquire_proc(pid_t, proc_t **);
struct pgrp * pgrp_find(pid_t); /* Find process group by ID */
void procinit(void);
void procinit_sysctl(void);
int proc_enterpgrp(struct proc *, pid_t, pid_t, bool);
void proc_leavepgrp(struct proc *);
void proc_sesshold(struct session *);
void proc_sessrele(struct session *);
void fixjobc(struct proc *, struct pgrp *, int);
int tsleep(wchan_t, pri_t, const char *, int);
int mtsleep(wchan_t, pri_t, const char *, int, kmutex_t *);
void wakeup(wchan_t);
int kpause(const char *, bool, int, kmutex_t *);
void exit1(struct lwp *, int, int) __dead;
int kill1(struct lwp *l, pid_t pid, ksiginfo_t *ksi, register_t *retval);
int do_sys_wait(int *, int *, int, struct rusage *);
int do_sys_waitid(idtype_t, id_t, int *, int *, int, struct wrusage *,
siginfo_t *);
struct proc *proc_alloc(void);
void proc0_init(void);
pid_t proc_alloc_pid(struct proc *);
void proc_free_pid(pid_t);
pid_t proc_alloc_lwpid(struct proc *, struct lwp *);
void proc_free_lwpid(struct proc *, pid_t);
void proc_free_mem(struct proc *);
void exit_lwps(struct lwp *l);
int fork1(struct lwp *, int, int, void *, size_t,
void (*)(void *), void *, register_t *);
int pgid_in_session(struct proc *, pid_t);
void cpu_lwp_fork(struct lwp *, struct lwp *, void *, size_t,
void (*)(void *), void *);
void cpu_lwp_free(struct lwp *, int);
void cpu_lwp_free2(struct lwp *);
void cpu_spawn_return(struct lwp*);
#ifdef __HAVE_SYSCALL_INTERN
void syscall_intern(struct proc *);
#endif
void md_child_return(struct lwp *);
void child_return(void *);
int proc_isunder(struct proc *, struct lwp *);
int proc_uidmatch(kauth_cred_t, kauth_cred_t);
int proc_vmspace_getref(struct proc *, struct vmspace **);
void proc_crmod_leave(kauth_cred_t, kauth_cred_t, bool);
void proc_crmod_enter(void);
int proc_getauxv(struct proc *, void **, size_t *);
int proc_specific_key_create(specificdata_key_t *, specificdata_dtor_t);
void proc_specific_key_delete(specificdata_key_t);
void proc_initspecific(struct proc *);
void proc_finispecific(struct proc *);
void * proc_getspecific(struct proc *, specificdata_key_t);
void proc_setspecific(struct proc *, specificdata_key_t, void *);
int proc_compare(const struct proc *, const struct lwp *,
const struct proc *, const struct lwp *);
/*
* Special handlers for delivering EVFILT_PROC notifications. These
* exist to handle some of the special locking considerations around
* processes.
*/
void knote_proc_exec(struct proc *);
void knote_proc_fork(struct proc *, struct proc *);
void knote_proc_exit(struct proc *);
int proclist_foreach_call(struct proclist *,
int (*)(struct proc *, void *arg), void *);
static __inline struct proc *
_proclist_skipmarker(struct proc *p0)
{
struct proc *p = p0;
while (p != NULL && p->p_flag & PK_MARKER)
p = LIST_NEXT(p, p_list);
return p;
}
#define PROC_PTRSZ(p) (((p)->p_flag & PK_32) ? sizeof(int) : sizeof(void *))
#define PROC_REGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_reg32) : sizeof(struct reg))
#define PROC_FPREGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_fpreg32) : sizeof(struct fpreg))
#define PROC_DBREGSZ(p) (((p)->p_flag & PK_32) ? \
sizeof(process_dbreg32) : sizeof(struct dbreg))
#ifndef PROC_MACHINE_ARCH
#define PROC_MACHINE_ARCH(p) machine_arch
#endif
/*
* PROCLIST_FOREACH: iterate on the given proclist, skipping PK_MARKER ones.
*/
#define PROCLIST_FOREACH(var, head) \
for ((var) = LIST_FIRST(head); \
((var) = _proclist_skipmarker(var)) != NULL; \
(var) = LIST_NEXT(var, p_list))
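/*
 * Typical use (sketch): walk every process while holding proc_lock,
 * which protects the allproc list; the iterator skips PK_MARKER
 * placeholders automatically.
 *
 *	struct proc *p;
 *
 *	mutex_enter(&proc_lock);
 *	PROCLIST_FOREACH(p, &allproc) {
 *		... inspect p ...
 *	}
 *	mutex_exit(&proc_lock);
 */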
#ifdef KSTACK_CHECK_MAGIC
void kstack_setup_magic(const struct lwp *);
void kstack_check_magic(const struct lwp *);
#else
#define kstack_setup_magic(x)
#define kstack_check_magic(x)
#endif
extern struct emul emul_netbsd;
#endif /* _KERNEL */
/*
* Kernel stack parameters.
*
* KSTACK_LOWEST_ADDR: return the lowest address of the LWP's kernel stack,
* excluding red-zone.
*
* KSTACK_SIZE: the size of the kernel stack for an LWP, excluding red-zone.
*
* If <machine/proc.h> provides an MD definition, it will be used.
*/
#ifndef KSTACK_LOWEST_ADDR
#define KSTACK_LOWEST_ADDR(l) ((void *)ALIGN((struct pcb *)((l)->l_addr) + 1))
#endif
#ifndef KSTACK_SIZE
#define KSTACK_SIZE (USPACE - ALIGN(sizeof(struct pcb)))
#endif
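/*
 * Hedged sketch, not part of the original header: a consumer such as a
 * stack checker would typically combine the two macros above to walk an
 * LWP's kernel stack from its lowest address upward.
 */
#if 0	/* illustration only */
static void
kstack_walk_example(const struct lwp *l)
{
	const char *base = (const char *)KSTACK_LOWEST_ADDR(l);
	const char *end = base + KSTACK_SIZE;
	const char *cp;

	for (cp = base; cp < end; cp++) {
		/* inspect *cp, e.g. compare against a fill pattern */
	}
}
#endif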
#endif /* _KMEMUSER || _KERNEL */
#endif /* !_SYS_PROC_H_ */
/* $NetBSD: prop_dictionary_util.c,v 1.9 2022/08/03 21:13:46 riastradh Exp $ */
/*-
* Copyright (c) 2006, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Utility routines to make it more convenient to work with values
* stored in dictionaries.
*
* Note: There is no special magic going on here. We use the standard
* proplib(3) APIs to do all of this work. Any application could do
* exactly what we're doing here.
*/
#include "prop_object_impl.h" /* only to hide kernel vs. not-kernel */
#include <prop/proplib.h>
bool
prop_dictionary_get_dict(prop_dictionary_t dict, const char *key,
prop_dictionary_t *dp)
{
prop_object_t o;
o = prop_dictionary_get(dict, key);
if (prop_object_type(o) != PROP_TYPE_DICTIONARY)
return false;
*dp = o;
return true;
}
bool
prop_dictionary_get_bool(prop_dictionary_t dict, const char *key, bool *valp)
{
prop_bool_t b;
b = prop_dictionary_get(dict, key);
if (prop_object_type(b) != PROP_TYPE_BOOL)
return (false);
*valp = prop_bool_true(b);
return (true);
}
bool
prop_dictionary_set_bool(prop_dictionary_t dict, const char *key, bool val)
{
return prop_dictionary_set_and_rel(dict, key, prop_bool_create(val));
}
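/*
 * Hedged usage sketch, not part of the original file: the helpers above
 * let a caller round-trip a boolean without touching prop_bool_t
 * directly.  The "enabled" key is made up for illustration.
 */
#if 0	/* illustration only */
static bool
example_bool_roundtrip(void)
{
	prop_dictionary_t dict = prop_dictionary_create();
	bool val = false;

	if (dict == NULL)
		return false;
	if (!prop_dictionary_set_bool(dict, "enabled", true) ||
	    !prop_dictionary_get_bool(dict, "enabled", &val)) {
		prop_object_release(dict);
		return false;
	}
	prop_object_release(dict);
	return val;	/* true if the round trip succeeded */
}
#endif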
#define TEMPLATE(name, typ) \
bool \
prop_dictionary_get_ ## name (prop_dictionary_t dict, \
const char *key, \
typ *valp) \
{ \
return prop_number_ ## name ## _value( \
prop_dictionary_get(dict, key), valp); \
}
TEMPLATE(schar, signed char)
TEMPLATE(short, short)
TEMPLATE(int, int)
TEMPLATE(long, long)
TEMPLATE(longlong, long long)
TEMPLATE(intptr, intptr_t)
TEMPLATE(int8, int8_t)
TEMPLATE(int16, int16_t)
TEMPLATE(int32, int32_t)
TEMPLATE(int64, int64_t)
TEMPLATE(uchar, unsigned char)
TEMPLATE(ushort, unsigned short)
TEMPLATE(uint, unsigned int)
TEMPLATE(ulong, unsigned long)
TEMPLATE(ulonglong, unsigned long long)
TEMPLATE(uintptr, uintptr_t)
TEMPLATE(uint8, uint8_t)
TEMPLATE(uint16, uint16_t)
TEMPLATE(uint32, uint32_t)
TEMPLATE(uint64, uint64_t)
#undef TEMPLATE
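/*
 * For reference, TEMPLATE(uint32, uint32_t) above expands to roughly the
 * following (shown only as an illustration; the real definition is
 * already produced by the macro expansion):
 */
#if 0
bool
prop_dictionary_get_uint32(prop_dictionary_t dict, const char *key,
    uint32_t *valp)
{
	return prop_number_uint32_value(prop_dictionary_get(dict, key), valp);
}
#endif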
static bool
prop_dictionary_set_signed_number(prop_dictionary_t dict, const char *key,
intmax_t val)
{
return prop_dictionary_set_and_rel(dict, key,
prop_number_create_signed(val));
}
static bool
prop_dictionary_set_unsigned_number(prop_dictionary_t dict, const char *key,
uintmax_t val)
{
/*LINTED: for conversion from 'long long' to 'long'*/
return prop_dictionary_set_and_rel(dict, key,
prop_number_create_unsigned(val));
}
#define TEMPLATE(name, which, typ) \
bool \
prop_dictionary_set_ ## name (prop_dictionary_t dict, \
const char *key, \
typ val) \
{ \
/*LINTED: for conversion from long long to 'long'*/ \
return prop_dictionary_set_ ## which ## _number(dict, key, val);\
}
#define STEMPLATE(name, typ) TEMPLATE(name, signed, typ)
#define UTEMPLATE(name, typ) TEMPLATE(name, unsigned, typ)
STEMPLATE(schar, signed char)
STEMPLATE(short, short)
STEMPLATE(int, int)
STEMPLATE(long, long)
STEMPLATE(longlong, long long)
STEMPLATE(intptr, intptr_t)
STEMPLATE(int8, int8_t)
STEMPLATE(int16, int16_t)
STEMPLATE(int32, int32_t)
STEMPLATE(int64, int64_t)
UTEMPLATE(uchar, unsigned char)
UTEMPLATE(ushort, unsigned short)
UTEMPLATE(uint, unsigned int)
UTEMPLATE(ulong, unsigned long)
UTEMPLATE(ulonglong, unsigned long long)
UTEMPLATE(uintptr, uintptr_t)
UTEMPLATE(uint8, uint8_t)
UTEMPLATE(uint16, uint16_t)
UTEMPLATE(uint32, uint32_t)
UTEMPLATE(uint64, uint64_t)
#undef STEMPLATE
#undef UTEMPLATE
#undef TEMPLATE
bool
prop_dictionary_get_string(prop_dictionary_t dict, const char *key,
const char **cpp)
{
prop_string_t str;
const char *cp;
str = prop_dictionary_get(dict, key);
if (prop_object_type(str) != PROP_TYPE_STRING)
return (false);
cp = prop_string_value(str);
if (cp == NULL)
return (false);
*cpp = cp;
return (true);
}
bool
prop_dictionary_set_string(prop_dictionary_t dict, const char *key,
const char *cp)
{
return prop_dictionary_set_and_rel(dict, key,
prop_string_create_copy(cp));
}
bool
prop_dictionary_set_string_nocopy(prop_dictionary_t dict,
const char *key,
const char *cp)
{
return prop_dictionary_set_and_rel(dict, key,
prop_string_create_nocopy(cp));
}
bool
prop_dictionary_get_data(prop_dictionary_t dict, const char *key,
const void **vp, size_t *sizep)
{
prop_data_t data;
const void *v;
data = prop_dictionary_get(dict, key);
if (prop_object_type(data) != PROP_TYPE_DATA)
return (false);
v = prop_data_value(data);
if (v == NULL)
return (false);
*vp = v;
if (sizep != NULL)
*sizep = prop_data_size(data);
return (true);
}
bool
prop_dictionary_set_data(prop_dictionary_t dict, const char *key,
const void *v, size_t size)
{
return prop_dictionary_set_and_rel(dict, key,
prop_data_create_copy(v, size));
}
bool
prop_dictionary_set_data_nocopy(prop_dictionary_t dict, const char *key,
const void *v, size_t size)
{
return prop_dictionary_set_and_rel(dict, key,
prop_data_create_nocopy(v, size));
}
_PROP_DEPRECATED(prop_dictionary_get_cstring,
"this program uses prop_dictionary_get_cstring(), "
"which is deprecated; use prop_dictionary_get_string() and copy instead.")
bool
prop_dictionary_get_cstring(prop_dictionary_t dict,
const char *key,
char **cpp)
{
prop_string_t str;
char *cp;
size_t len;
bool rv;
str = prop_dictionary_get(dict, key);
if (prop_object_type(str) != PROP_TYPE_STRING)
return (false);
len = prop_string_size(str);
cp = _PROP_MALLOC(len + 1, M_TEMP);
if (cp == NULL)
return (false);
rv = prop_string_copy_value(str, cp, len + 1);
if (rv)
*cpp = cp;
else
_PROP_FREE(cp, M_TEMP);
return (rv);
}
_PROP_DEPRECATED(prop_string_get_cstring_nocopy,
"this program uses prop_string_get_cstring_nocopy(), "
"which is deprecated; use prop_dictionary_get_string() instead.")
bool
prop_dictionary_get_cstring_nocopy(prop_dictionary_t dict,
const char *key,
const char **cpp)
{
return prop_dictionary_get_string(dict, key, cpp);
}
_PROP_DEPRECATED(prop_dictionary_set_cstring,
"this program uses prop_dictionary_set_cstring(), "
"which is deprecated; use prop_dictionary_set_string() instead.")
bool
prop_dictionary_set_cstring(prop_dictionary_t dict,
const char *key,
const char *cp)
{
return prop_dictionary_set_string(dict, key, cp);
}
_PROP_DEPRECATED(prop_dictionary_set_cstring_nocopy,
"this program uses prop_dictionary_set_cstring_nocopy(), "
"which is deprecated; use prop_dictionary_set_string_nocopy() instead.")
bool
prop_dictionary_set_cstring_nocopy(prop_dictionary_t dict,
const char *key,
const char *cp)
{
return prop_dictionary_set_string_nocopy(dict, key, cp);
}
bool
prop_dictionary_set_and_rel(prop_dictionary_t dict, const char *key,
prop_object_t po)
{
bool rv;
if (po == NULL)
return false;
rv = prop_dictionary_set(dict, key, po);
prop_object_release(po);
return rv;
}
/* $NetBSD: ccd_60.c,v 1.11 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 2018 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ccd_60.c,v 1.11 2019/12/12 02:15:42 pgoyette Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/disk.h>
#include <sys/lwp.h>
#include <sys/compat_stub.h>
#include <dev/ccdvar.h>
#include <compat/sys/ccdvar.h>
/*
* Compat code must not be called if on a platform where
* sizeof (size_t) == sizeof (uint64_t) as CCDIOCSET will
* be the same as CCDIOCSET_60
*/
static int
compat_60_ccdioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l,
int (*f)(dev_t, u_long, void *, int, struct lwp *))
{
switch (cmd) {
#ifdef CCDIOCSET_60
case CCDIOCSET_60: {
if (data == NULL)
return 0;
struct ccd_ioctl ccio;
struct ccd_ioctl_60 *ccio60 = data;
ccio.ccio_disks = ccio60->ccio_disks;
ccio.ccio_ndisks = ccio60->ccio_ndisks;
ccio.ccio_ileave = ccio60->ccio_ileave;
ccio.ccio_flags = ccio60->ccio_flags;
ccio.ccio_unit = ccio60->ccio_unit;
int error = (*f)(dev, CCDIOCSET, &ccio, flag, l);
if (!error) {
/* Copy data back, adjust types if necessary */
ccio60->ccio_disks = ccio.ccio_disks;
ccio60->ccio_ndisks = ccio.ccio_ndisks;
ccio60->ccio_ileave = ccio.ccio_ileave;
ccio60->ccio_flags = ccio.ccio_flags;
ccio60->ccio_unit = ccio.ccio_unit;
ccio60->ccio_size = (size_t)ccio.ccio_size;
}
return error;
}
case CCDIOCCLR_60:
if (data == NULL)
return ENOSYS;
/*
* ccio_size member not used, so existing struct OK
* drop through to existing non-compat version
*/
return (*f)(dev, CCDIOCCLR, data, flag, l);
#endif
default:
return ENOSYS;
}
}
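/*
 * Hedged illustration, not part of the original file: the note above
 * relies on the two ioctl command codes colliding when
 * sizeof(size_t) == sizeof(uint64_t), since the struct size is encoded
 * in the ioctl number.  A compile-time restatement of that assumption
 * might look like this.
 */
#if 0	/* illustration only */
#ifdef CCDIOCSET_60
__CTASSERT(sizeof(size_t) != sizeof(uint64_t) ||
    CCDIOCSET == CCDIOCSET_60);
#endif
#endif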
void
ccd_60_init(void)
{
MODULE_HOOK_SET(ccd_ioctl_60_hook, compat_60_ccdioctl);
}
void
ccd_60_fini(void)
{
MODULE_HOOK_UNSET(ccd_ioctl_60_hook);
}
/* $NetBSD: if_sl.c,v 1.136 2022/10/26 23:42:42 riastradh Exp $ */
/*
* Copyright (c) 1987, 1989, 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_sl.c 8.9 (Berkeley) 1/9/95
*/
/*
* Serial Line interface
*
* Rick Adams
* Center for Seismic Studies
* 1300 N 17th Street, Suite 1450
* Arlington, Virginia 22209
* (703)276-7900
* rick@seismo.ARPA
* seismo!rick
*
* Pounded on heavily by Chris Torek (chris@mimsy.umd.edu, umcp-cs!chris).
* N.B.: this belongs in netinet, not net, the way it stands now.
* Should have a link-layer type designation, but wouldn't be
* backwards-compatible.
*
* Converted to 4.3BSD Beta by Chris Torek.
* Other changes made at Berkeley, based in part on code by Kirk Smith.
* W. Jolitz added slip abort.
*
* Hacked almost beyond recognition by Van Jacobson (van@helios.ee.lbl.gov).
* Added priority queuing for "interactive" traffic; hooks for TCP
* header compression; ICMP filtering (at 2400 baud, some cretin
* pinging you can use up all your bandwidth). Made low clist behavior
* more robust and slightly less likely to hang serial line.
* Sped up a bunch of things.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_sl.c,v 1.136 2022/10/26 23:42:42 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#endif
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/buf.h>
#include <sys/dkstat.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/tty.h>
#include <sys/kernel.h>
#include <sys/socketvar.h>
#if __NetBSD__
#include <sys/systm.h>
#include <sys/kauth.h>
#endif
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/device.h>
#include <sys/module.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#endif
#include <net/slcompress.h>
#include <net/if_slvar.h>
#include <net/slip.h>
#include <net/ppp_defs.h>
#include <net/if_ppp.h>
#include <sys/time.h>
#include <net/bpf.h>
#include "ioconf.h"
/*
* SLMAX is a hard limit on input packet size. To simplify the code
* and improve performance, we require that packets fit in an mbuf
* cluster, and if we get a compressed packet, there's enough extra
* room to expand the header into a max length tcp/ip header (128
* bytes). So, SLMAX can be at most
* MCLBYTES - 128
*
* SLMTU is a hard limit on output packet size. To ensure good
* interactive response, SLMTU wants to be the smallest size that
* amortizes the header cost. (Remember that even with
* type-of-service queuing, we have to wait for any in-progress
* packet to finish. I.e., we wait, on the average, 1/2 * mtu /
* cps, where cps is the line speed in characters per second.
* E.g., 533ms wait for a 1024 byte MTU on a 9600 baud line. The
* average compressed header size is 6-8 bytes so any MTU > 90
* bytes will give us 90% of the line bandwidth. A 100ms wait is
* tolerable (500ms is not), so want an MTU around 296. (Since TCP
* will send 256 byte segments (to allow for 40 byte headers), the
* typical packet size on the wire will be around 260 bytes). In
* 4.3tahoe+ systems, we can set an MTU in a route so we do that &
* leave the interface MTU relatively high (so we don't IP fragment
* when acting as a gateway to someone using a stupid MTU).
*
* Similar considerations apply to SLIP_HIWAT: It's the amount of
* data that will be queued 'downstream' of us (i.e., in clists
* waiting to be picked up by the tty output interrupt). If we
* queue a lot of data downstream, it's immune to our t.o.s. queuing.
* E.g., if SLIP_HIWAT is 1024, the interactive traffic in mixed
* telnet/ftp will see a 1 sec wait, independent of the mtu (the
* wait is dependent on the ftp window size but that's typically
* 1k - 4k). So, we want SLIP_HIWAT just big enough to amortize
* the cost (in idle time on the wire) of the tty driver running
* off the end of its clists & having to call back slstart for a
* new packet. For a tty interface with any buffering at all, this
* cost will be zero. Even with a totally brain dead interface (like
* the one on a typical workstation), the cost will be <= 1 character
* time. So, setting SLIP_HIWAT to ~100 guarantees that we'll lose
* at most 1% while maintaining good interactive response.
*/
#define BUFOFFSET (128+sizeof(struct ifnet **)+SLIP_HDRLEN)
#define SLMAX (MCLBYTES - BUFOFFSET)
#define SLBUFSIZE (SLMAX + BUFOFFSET)
#ifndef SLMTU
#define SLMTU 296
#endif
#if (SLMTU < 3)
#error SLMTU way too small.
#endif
#define SLIP_HIWAT roundup(50, TTROUND)
#ifndef __NetBSD__ /* XXX - cgd */
#define CLISTRESERVE 1024 /* Can't let clists get too low */
#endif /* !__NetBSD__ */
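/*
 * Worked example of the latency rule of thumb above, not part of the
 * original file (the 9600 baud / ~960 cps figure is taken from the
 * comment): the average wait for an in-progress packet is
 * 1/2 * MTU / cps.
 */
#if 0	/* illustration only */
static unsigned int
sl_avg_wait_ms(unsigned int mtu, unsigned int cps)
{
	/* sl_avg_wait_ms(296, 960) ~= 154 ms; sl_avg_wait_ms(1024, 960) ~= 533 ms */
	return (mtu * 1000) / (2 * cps);
}
#endif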
/*
* SLIP ABORT ESCAPE MECHANISM:
* (inspired by HAYES modem escape arrangement)
* 1sec escape 1sec escape 1sec escape { 1sec escape 1sec escape }
* within window time signals a "soft" exit from slip mode by remote end
* if the IFF_DEBUG flag is on.
*/
#define ABT_ESC '\033' /* can't be t_intr - distant host must know it*/
#define ABT_IDLE 1 /* in seconds - idle before an escape */
#define ABT_COUNT 3 /* count of escapes for abort */
#define ABT_WINDOW (ABT_COUNT*2+2) /* in seconds - time to count */
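/*
 * Illustration, not part of the original file: with ABT_COUNT == 3 and
 * ABT_IDLE == 1, ABT_WINDOW evaluates to 3*2+2 == 8 seconds, i.e. the
 * three escape characters (each preceded by at least one idle second)
 * must all arrive within 8 seconds for slinput() to close the line.
 */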
static int sl_clone_create(struct if_clone *, int);
static int sl_clone_destroy(struct ifnet *);
static LIST_HEAD(, sl_softc) sl_softc_list;
struct if_clone sl_cloner =
IF_CLONE_INITIALIZER("sl", sl_clone_create, sl_clone_destroy);
#define FRAME_END 0xc0 /* Frame End */
#define FRAME_ESCAPE 0xdb /* Frame Esc */
#define TRANS_FRAME_END 0xdc /* transposed frame end */
#define TRANS_FRAME_ESCAPE 0xdd /* transposed frame esc */
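/*
 * Byte-stuffing summary implied by the constants above (added as an
 * illustration, not original text): a payload byte equal to FRAME_END
 * (0xc0) is transmitted as FRAME_ESCAPE TRANS_FRAME_END (0xdb 0xdc), a
 * payload byte equal to FRAME_ESCAPE is transmitted as FRAME_ESCAPE
 * TRANS_FRAME_ESCAPE (0xdb 0xdd), and every packet is terminated by a
 * literal FRAME_END; slintr() does the stuffing and slinput() reverses
 * it.
 */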
static void slintr(void *);
static int slcreate(struct sl_softc *);
static struct mbuf *sl_btom(struct sl_softc *, int);
static int slclose(struct tty *, int);
static int slinput(int, struct tty *);
static int slioctl(struct ifnet *, u_long, void *);
static int slopen(dev_t, struct tty *);
static int sloutput(struct ifnet *, struct mbuf *, const struct sockaddr *,
const struct rtentry *);
static int slstart(struct tty *);
static int sltioctl(struct tty *, u_long, void *, int, struct lwp *);
static struct linesw slip_disc = {
.l_name = "slip",
.l_open = slopen,
.l_close = slclose,
.l_read = ttyerrio,
.l_write = ttyerrio,
.l_ioctl = sltioctl,
.l_rint = slinput,
.l_start = slstart,
.l_modem = nullmodem,
.l_poll = ttyerrpoll
};
void
slattach(int n __unused)
{
/*
* Nothing to do here, initialization is handled by the
* module initialization code in slinit() below.
*/
}
static void
slinit(void)
{
if (ttyldisc_attach(&slip_disc) != 0)
panic("%s", __func__);
LIST_INIT(&sl_softc_list);
if_clone_attach(&sl_cloner);
}
static int
sldetach(void)
{
int error = 0;
if (!LIST_EMPTY(&sl_softc_list))
error = EBUSY;
if (error == 0)
error = ttyldisc_detach(&slip_disc);
if (error == 0)
if_clone_detach(&sl_cloner);
return error;
}
static int
sl_clone_create(struct if_clone *ifc, int unit)
{
struct sl_softc *sc;
sc = malloc(sizeof(*sc), M_DEVBUF, M_WAIT|M_ZERO);
sc->sc_unit = unit;
if_initname(&sc->sc_if, ifc->ifc_name, unit);
sc->sc_if.if_softc = sc;
sc->sc_if.if_mtu = SLMTU;
sc->sc_if.if_flags = IFF_POINTOPOINT | SC_AUTOCOMP | IFF_MULTICAST;
sc->sc_if.if_type = IFT_SLIP;
sc->sc_if.if_ioctl = slioctl;
sc->sc_if.if_output = sloutput;
sc->sc_if.if_dlt = DLT_SLIP;
IFQ_SET_MAXLEN(&sc->sc_fastq, 32);
IFQ_LOCK_INIT(&sc->sc_fastq);
IFQ_SET_READY(&sc->sc_if.if_snd);
if_attach(&sc->sc_if);
if_alloc_sadl(&sc->sc_if);
bpf_attach(&sc->sc_if, DLT_SLIP, SLIP_HDRLEN);
LIST_INSERT_HEAD(&sl_softc_list, sc, sc_iflist);
return 0;
}
static int
sl_clone_destroy(struct ifnet *ifp)
{
struct sl_softc *sc = (struct sl_softc *)ifp->if_softc;
if (sc->sc_ttyp != NULL)
return EBUSY; /* Not removing it */
LIST_REMOVE(sc, sc_iflist);
bpf_detach(ifp);
if_detach(ifp);
IFQ_LOCK_DESTROY(&sc->sc_fastq);
free(sc, M_DEVBUF);
return 0;
}
static int
slcreate(struct sl_softc *sc)
{
if (sc->sc_mbuf == NULL) {
sc->sc_mbuf = m_gethdr(M_WAIT, MT_DATA);
m_clget(sc->sc_mbuf, M_WAIT);
}
sc->sc_ep = (u_char *)sc->sc_mbuf->m_ext.ext_buf +
sc->sc_mbuf->m_ext.ext_size;
sc->sc_mp = sc->sc_pktstart = (u_char *)sc->sc_mbuf->m_ext.ext_buf +
BUFOFFSET;
#ifdef INET
sl_compress_init(&sc->sc_comp);
#endif
return 1;
}
/*
* Line specific open routine.
* Attach the given tty to the first available sl unit.
*/
/* ARGSUSED */
static int
slopen(dev_t dev, struct tty *tp)
{
struct lwp *l = curlwp; /* XXX */
struct sl_softc *sc;
int error;
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_INTERFACE_SLIP,
KAUTH_REQ_NETWORK_INTERFACE_SLIP_ADD, NULL, NULL, NULL);
if (error)
return error;
if (tp->t_linesw == &slip_disc)
return 0;
LIST_FOREACH(sc, &sl_softc_list, sc_iflist)
if (sc->sc_ttyp == NULL) {
sc->sc_si = softint_establish(SOFTINT_NET,
slintr, sc);
if (sc->sc_si == NULL)
return ENOMEM;
if (slcreate(sc) == 0) {
softint_disestablish(sc->sc_si);
return ENOBUFS;
}
tp->t_sc = (void *)sc;
sc->sc_ttyp = tp;
sc->sc_if.if_baudrate = tp->t_ospeed;
ttylock(tp);
tp->t_state |= TS_ISOPEN | TS_XCLUDE;
ttyflush(tp, FREAD | FWRITE);
/*
* make sure tty output queue is large enough
* to hold a full-sized packet (including frame
* end, and a possible extra frame end). full-sized
* packet occupies a max of 2*SLMAX bytes (because
* of possible escapes), and add two on for frame
* ends.
*/
if (tp->t_outq.c_cn < 2 * SLMAX + 2) {
sc->sc_oldbufsize = tp->t_outq.c_cn;
sc->sc_oldbufquot = tp->t_outq.c_cq != 0;
clfree(&tp->t_outq);
ttyunlock(tp);
error = clalloc(&tp->t_outq, 2 * SLMAX + 2, 0);
if (error) {
softint_disestablish(sc->sc_si);
/*
* clalloc() might return -1 which
* is no good, so we need to return
* something else.
*/
return ENOMEM; /* XXX ?! */
}
} else {
sc->sc_oldbufsize = sc->sc_oldbufquot = 0;
ttyunlock(tp);
}
return 0;
}
return ENXIO;
}
/*
* Line specific close routine.
* Detach the tty from the sl unit.
*/
static int
slclose(struct tty *tp, int flag)
{
struct sl_softc *sc;
int s;
ttywflush(tp);
sc = tp->t_sc;
if (sc != NULL) {
softint_disestablish(sc->sc_si);
s = splnet();
if_down(&sc->sc_if);
IF_PURGE(&sc->sc_fastq);
splx(s);
s = spltty();
ttyldisc_release(tp->t_linesw);
tp->t_linesw = ttyldisc_default();
tp->t_state = 0;
sc->sc_ttyp = NULL;
tp->t_sc = NULL;
m_freem(sc->sc_mbuf);
sc->sc_mbuf = NULL;
sc->sc_ep = sc->sc_mp = sc->sc_pktstart = NULL;
IF_PURGE(&sc->sc_inq);
/*
* If necessary, install a new outq buffer of the
* appropriate size.
*/
if (sc->sc_oldbufsize != 0) {
clfree(&tp->t_outq);
clalloc(&tp->t_outq, sc->sc_oldbufsize,
sc->sc_oldbufquot);
}
splx(s);
}
return 0;
}
/*
* Line specific (tty) ioctl routine.
* Provide a way to get the sl unit number.
*/
/* ARGSUSED */
static int
sltioctl(struct tty *tp, u_long cmd, void *data, int flag,
struct lwp *l)
{
struct sl_softc *sc = (struct sl_softc *)tp->t_sc;
/*
* XXX
* This function can be called without KERNEL_LOCK when caller's
* struct cdevsw is set D_MPSAFE. Is KERNEL_LOCK required?
*/
switch (cmd) {
case SLIOCGUNIT:
*(int *)data = sc->sc_unit; /* XXX */
break;
default:
return EPASSTHROUGH;
}
return 0;
}
/*
* Queue a packet. Start transmission if not active.
* Compression happens in slintr(); if we do it here, IP TOS
* will cause us to not compress "background" packets, because
* ordering gets trashed. It can be done for all packets in slintr().
*/
static int
sloutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
const struct rtentry *rtp)
{
struct sl_softc *sc = ifp->if_softc;
struct ip *ip;
struct ifqueue *ifq = NULL;
int s, error;
IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family);
/*
* `Cannot happen' (see slioctl). Someday we will extend
* the line protocol to support other address families.
*/
if (dst->sa_family != AF_INET) {
printf("%s: af%d not supported\n", sc->sc_if.if_xname,
dst->sa_family);
m_freem(m);
if_statinc(&sc->sc_if, if_noproto);
return EAFNOSUPPORT;
}
if (sc->sc_ttyp == NULL) {
m_freem(m);
return ENETDOWN; /* sort of */
}
if ((sc->sc_ttyp->t_state & TS_CARR_ON) == 0 &&
(sc->sc_ttyp->t_cflag & CLOCAL) == 0) {
m_freem(m);
printf("%s: no carrier and not local\n", sc->sc_if.if_xname);
return EHOSTUNREACH;
}
ip = mtod(m, struct ip *);
#ifdef INET
if (sc->sc_if.if_flags & SC_NOICMP && ip->ip_p == IPPROTO_ICMP) {
m_freem(m);
return ENETRESET; /* XXX ? */
}
#endif
s = spltty();
if (sc->sc_oqlen && sc->sc_ttyp->t_outq.c_cc == sc->sc_oqlen) {
struct bintime bt;
/* if output has been stalled for too long, restart it */
getbinuptime(&bt);
bintime_sub(&bt, &sc->sc_lastpacket);
if (bt.sec > 0) {
sc->sc_otimeout++;
slstart(sc->sc_ttyp);
}
}
splx(s);
s = splnet();
#ifdef INET
if ((ip->ip_tos & IPTOS_LOWDELAY) != 0)
ifq = &sc->sc_fastq;
#endif
if ((error = ifq_enqueue2(ifp, ifq, m)) != 0) {
splx(s);
return error;
}
getbinuptime(&sc->sc_lastpacket);
splx(s);
s = spltty();
if ((sc->sc_oqlen = sc->sc_ttyp->t_outq.c_cc) == 0)
slstart(sc->sc_ttyp);
splx(s);
return 0;
}
/*
* Start output on interface. Get another datagram
* to send from the interface queue and map it to
* the interface before starting output.
*/
static int
slstart(struct tty *tp)
{
struct sl_softc *sc = tp->t_sc;
/*
* If there is more in the output queue, just send it now.
* We are being called in lieu of ttstart and must do what
* it would.
*/
if (tp->t_outq.c_cc != 0) {
(*tp->t_oproc)(tp);
if (tp->t_outq.c_cc > SLIP_HIWAT)
return 0;
}
/*
* This happens briefly when the line shuts down.
*/
if (sc == NULL)
return 0;
softint_schedule(sc->sc_si);
return 0;
}
/*
* Copy data buffer to mbuf chain; add ifnet pointer.
*/
static struct mbuf *
sl_btom(struct sl_softc *sc, int len)
{
struct mbuf *m;
/*
* Allocate a new input buffer and swap.
*/
m = sc->sc_mbuf;
MGETHDR(sc->sc_mbuf, M_DONTWAIT, MT_DATA);
if (sc->sc_mbuf == NULL) {
sc->sc_mbuf = m;
return NULL;
}
MCLGET(sc->sc_mbuf, M_DONTWAIT);
if ((sc->sc_mbuf->m_flags & M_EXT) == 0) {
m_freem(sc->sc_mbuf);
sc->sc_mbuf = m;
return NULL;
}
sc->sc_ep = (u_char *)sc->sc_mbuf->m_ext.ext_buf +
sc->sc_mbuf->m_ext.ext_size;
m->m_data = sc->sc_pktstart;
m->m_pkthdr.len = m->m_len = len;
m_set_rcvif(m, &sc->sc_if);
return m;
}
/*
* tty interface receiver interrupt.
*/
static int
slinput(int c, struct tty *tp)
{
struct sl_softc *sc;
struct mbuf *m;
int len;
tk_nin++;
sc = (struct sl_softc *)tp->t_sc;
if (sc == NULL)
return 0;
if ((c & TTY_ERRORMASK) || ((tp->t_state & TS_CARR_ON) == 0 &&
(tp->t_cflag & CLOCAL) == 0)) {
sc->sc_flags |= SC_ERROR;
return 0;
}
c &= TTY_CHARMASK;
if_statinc(&sc->sc_if, if_ibytes);
if (sc->sc_if.if_flags & IFF_DEBUG) {
if (c == ABT_ESC) {
/*
* If we have a previous abort, see whether
* this one is within the time limit.
*/
if (sc->sc_abortcount &&
time_second >= sc->sc_starttime + ABT_WINDOW)
sc->sc_abortcount = 0;
/*
* If we see an abort after "idle" time, count it;
* record when the first abort escape arrived.
*/
if (time_second >= sc->sc_lasttime + ABT_IDLE) {
if (++sc->sc_abortcount == 1)
sc->sc_starttime = time_second;
if (sc->sc_abortcount >= ABT_COUNT) {
slclose(tp, 0);
return 0;
}
}
} else
sc->sc_abortcount = 0;
sc->sc_lasttime = time_second;
}
switch (c) {
case TRANS_FRAME_ESCAPE:
if (sc->sc_escape)
c = FRAME_ESCAPE;
break;
case TRANS_FRAME_END:
if (sc->sc_escape)
c = FRAME_END;
break;
case FRAME_ESCAPE:
sc->sc_escape = 1;
return 0;
case FRAME_END:
if (sc->sc_flags & SC_ERROR) {
sc->sc_flags &= ~SC_ERROR;
goto newpack;
}
len = sc->sc_mp - sc->sc_pktstart;
if (len < 3)
/* less than min length packet - ignore */
goto newpack;
m = sl_btom(sc, len);
if (m == NULL)
goto error;
IF_ENQUEUE(&sc->sc_inq, m);
softint_schedule(sc->sc_si);
goto newpack;
}
if (sc->sc_mp < sc->sc_ep) {
*sc->sc_mp++ = c;
sc->sc_escape = 0;
return 0;
}
/* can't put lower; would miss an extra frame */
sc->sc_flags |= SC_ERROR;
error:
if_statinc(&sc->sc_if, if_ierrors);
newpack:
sc->sc_mp = sc->sc_pktstart = (u_char *)sc->sc_mbuf->m_ext.ext_buf +
BUFOFFSET;
sc->sc_escape = 0;
return 0;
}
static void
slintr(void *arg)
{
struct sl_softc *sc = arg;
struct tty *tp = sc->sc_ttyp;
struct mbuf *m, *n;
int s, len;
u_char *pktstart;
u_char chdr[CHDR_LEN];
KASSERT(tp != NULL);
/*
* Output processing loop.
*/
mutex_enter(softnet_lock);
for (;;) {
struct mbuf *m2;
struct mbuf *bpf_m;
/*
* Do not remove the packet from the queue if it
* doesn't look like it will fit into the current
* serial output queue. With a packet full of
* escapes, this could be as bad as MTU*2+2.
*/
s = spltty();
if (tp->t_outq.c_cn - tp->t_outq.c_cc <
2 * sc->sc_if.if_mtu + 2) {
splx(s);
break;
}
splx(s);
/*
* Get a packet and send it to the interface.
*/
s = splnet();
IF_DEQUEUE(&sc->sc_fastq, m);
if (m)
if_statinc(&sc->sc_if, if_omcasts); /* XXX */
else
IFQ_DEQUEUE(&sc->sc_if.if_snd, m);
splx(s);
if (m == NULL)
break;
/*
* We do the header compression here rather than in
* sloutput() because the packets will be out of order
* if we are using TOS queueing, and the connection
* ID compression will get munged when this happens.
*/
if (sc->sc_if.if_bpf) {
/*
* We need to save the TCP/IP header before
* it's compressed. To avoid complicated
* code, we just make a deep copy of the
* entire packet (since this is a serial
* line, packets should be short and/or the
* copy should be negligible cost compared
* to the packet transmission time).
*/
bpf_m = m_dup(m, 0, M_COPYALL, M_DONTWAIT);
} else
bpf_m = NULL;
#ifdef INET
struct ip *ip;
if ((ip = mtod(m, struct ip *))->ip_p == IPPROTO_TCP) {
if (sc->sc_if.if_flags & SC_COMPRESS)
*mtod(m, u_char *) |=
sl_compress_tcp(m, ip, &sc->sc_comp, 1);
}
#endif
if (bpf_m)
bpf_mtap_sl_out(&sc->sc_if, mtod(m, u_char *), bpf_m);
getbinuptime(&sc->sc_lastpacket);
s = spltty();
/*
* The extra FRAME_END will start up a new packet,
* and thus will flush any accumulated garbage. We
* do this whenever the line may have been idle for
* some time.
*/
if (tp->t_outq.c_cc == 0) {
if_statinc(&sc->sc_if, if_obytes);
(void)putc(FRAME_END, &tp->t_outq);
}
while (m) {
u_char *bp, *cp, *ep;
bp = cp = mtod(m, u_char *);
ep = cp + m->m_len;
while (cp < ep) {
/*
* Find out how many bytes in the
* string we can handle without
* doing something special.
*/
while (cp < ep) {
switch (*cp++) {
case FRAME_ESCAPE:
case FRAME_END:
cp--;
goto out;
}
}
out:
if (cp > bp) {
/*
* Put N characters at once
* into the tty output queue.
*/
if (b_to_q(bp, cp - bp, &tp->t_outq))
break;
if_statadd(&sc->sc_if, if_obytes,
cp - bp);
}
/*
* If there are characters left in
* the mbuf, the first one must be
* special.. Put it out in a different
* form.
*/
if (cp < ep) {
if (putc(FRAME_ESCAPE, &tp->t_outq))
break;
if (putc(*cp++ == FRAME_ESCAPE ?
TRANS_FRAME_ESCAPE :
TRANS_FRAME_END,
&tp->t_outq)) {
(void)unputc(&tp->t_outq);
break;
}
if_statadd(&sc->sc_if, if_obytes, 2);
}
bp = cp;
}
m = m2 = m_free(m);
}
if (putc(FRAME_END, &tp->t_outq)) {
/*
* Not enough room. Remove a char to make
* room and end the packet normally. If
* you get many collisions (more than one
* or two a day), you probably do not have
* enough clists and you should increase
* "nclist" in param.c
*/
(void)unputc(&tp->t_outq);
(void)putc(FRAME_END, &tp->t_outq);
if_statinc(&sc->sc_if, if_collisions);
} else {
if_statadd2(&sc->sc_if, if_obytes, 1, if_opackets, 1);
}
/*
* We now have characters in the output queue,
* kick the serial port.
*/
(*tp->t_oproc)(tp);
splx(s);
}
/*
* Input processing loop.
*/
for (;;) {
s = spltty();
IF_DEQUEUE(&sc->sc_inq, m);
splx(s);
if (m == NULL)
break;
pktstart = mtod(m, u_char *);
len = m->m_pkthdr.len;
if (sc->sc_if.if_bpf) {
/*
* Save the compressed header, so we
* can tack it on later. Note that we
* will end up copying garbage in some
* cases but this is okay. We remember
* where the buffer started so we can
* compute the new header length.
*/
memcpy(chdr, pktstart, CHDR_LEN);
}
#ifdef INET
u_char c;
if ((c = (*pktstart & 0xf0)) != (IPVERSION << 4)) {
if (c & 0x80)
c = TYPE_COMPRESSED_TCP;
else if (c == TYPE_UNCOMPRESSED_TCP)
*pktstart &= 0x4f; /* XXX */
/*
* We've got something that's not an IP
* packet. If compression is enabled,
* try to decompress it. Otherwise, if
* `auto-enable' compression is on and
* it's a reasonable packet, decompress
* it and then enable compression.
* Otherwise, drop it.
*/
if (sc->sc_if.if_flags & SC_COMPRESS) {
len = sl_uncompress_tcp(&pktstart, len,
(u_int)c, &sc->sc_comp);
if (len <= 0) {
m_freem(m);
continue;
}
} else if ((sc->sc_if.if_flags & SC_AUTOCOMP) &&
c == TYPE_UNCOMPRESSED_TCP && len >= 40) {
len = sl_uncompress_tcp(&pktstart, len,
(u_int)c, &sc->sc_comp);
if (len <= 0) {
m_freem(m);
continue;
}
sc->sc_if.if_flags |= SC_COMPRESS;
} else {
m_freem(m);
continue;
}
}
#endif
m->m_data = (void *) pktstart;
m->m_pkthdr.len = m->m_len = len;
if (sc->sc_if.if_bpf) {
bpf_mtap_sl_in(&sc->sc_if, chdr, &m);
if (m == NULL)
continue;
}
/*
* If the packet will fit into a single
* header mbuf, try to copy it into one,
* to save memory.
*/
if ((m->m_pkthdr.len < MHLEN) &&
(n = m_gethdr(M_DONTWAIT, MT_DATA))) {
int pktlen;
pktlen = m->m_pkthdr.len;
m_move_pkthdr(n, m);
memcpy(mtod(n, void *), mtod(m, void *), pktlen);
n->m_len = m->m_len;
m_freem(m);
m = n;
}
if_statinc(&sc->sc_if, if_ipackets);
getbinuptime(&sc->sc_lastpacket);
#ifdef INET
s = splnet();
if (__predict_false(!pktq_enqueue(ip_pktq, m, 0))) {
if_statadd2(&sc->sc_if, if_ierrors, 1, if_iqdrops, 1);
m_freem(m);
}
splx(s);
#endif
}
mutex_exit(softnet_lock);
}
/*
* Process an ioctl request.
*/
static int
slioctl(struct ifnet *ifp, u_long cmd, void *data)
{
struct ifaddr *ifa = (struct ifaddr *)data;
struct ifreq *ifr = (struct ifreq *)data;
int s = splnet(), error = 0;
struct sl_softc *sc = ifp->if_softc;
struct ppp_stats *psp;
struct ppp_comp_stats *pcp;
switch (cmd) {
case SIOCINITIFADDR:
if (ifa->ifa_addr->sa_family == AF_INET)
ifp->if_flags |= IFF_UP;
else
error = EAFNOSUPPORT;
break;
case SIOCSIFDSTADDR:
if (ifreq_getaddr(cmd, ifr)->sa_family != AF_INET)
error = EAFNOSUPPORT;
break;
case SIOCSIFMTU:
if ((ifr->ifr_mtu < 3) || (ifr->ifr_mtu > SLMAX)) {
error = EINVAL;
break;
}
/*FALLTHROUGH*/
case SIOCGIFMTU:
if ((error = ifioctl_common(&sc->sc_if, cmd, data)) == ENETRESET)
error = 0;
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (ifr == 0) {
error = EAFNOSUPPORT; /* XXX */
break;
}
switch (ifreq_getaddr(cmd, ifr)->sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
case SIOCGPPPSTATS: {
struct if_data ifi;
if_export_if_data(&sc->sc_if, &ifi, false);
psp = &((struct ifpppstatsreq *) data)->stats;
(void)memset(psp, 0, sizeof(*psp));
psp->p.ppp_ibytes = ifi.ifi_ibytes;
psp->p.ppp_ipackets = ifi.ifi_ipackets;
psp->p.ppp_ierrors = ifi.ifi_ierrors;
psp->p.ppp_obytes = ifi.ifi_obytes;
psp->p.ppp_opackets = ifi.ifi_opackets;
psp->p.ppp_oerrors = ifi.ifi_oerrors;
#ifdef INET
psp->vj.vjs_packets = sc->sc_comp.sls_packets;
psp->vj.vjs_compressed = sc->sc_comp.sls_compressed;
psp->vj.vjs_searches = sc->sc_comp.sls_searches;
psp->vj.vjs_misses = sc->sc_comp.sls_misses;
psp->vj.vjs_uncompressedin = sc->sc_comp.sls_uncompressedin;
psp->vj.vjs_compressedin = sc->sc_comp.sls_compressedin;
psp->vj.vjs_errorin = sc->sc_comp.sls_errorin;
psp->vj.vjs_tossed = sc->sc_comp.sls_tossed;
#endif
}
break;
case SIOCGPPPCSTATS:
pcp = &((struct ifpppcstatsreq *) data)->stats;
(void)memset(pcp, 0, sizeof(*pcp));
break;
default:
error = ifioctl_common(ifp, cmd, data);
break;
}
splx(s);
return error;
}
/*
* Module infrastructure
*/
#include "if_module.h"
IF_MODULE(MODULE_CLASS_DRIVER, sl, "slcompress");
/* $NetBSD: kern_proc.c,v 1.274 2023/10/05 19:41:07 ad Exp $ */
/*-
* Copyright (c) 1999, 2006, 2007, 2008, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_proc.c,v 1.274 2023/10/05 19:41:07 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_kstack.h"
#include "opt_maxuprc.h"
#include "opt_dtrace.h"
#include "opt_compat_netbsd32.h"
#include "opt_kaslr.h"
#endif
#if defined(__HAVE_COMPAT_NETBSD32) && !defined(COMPAT_NETBSD32) \
&& !defined(_RUMPKERNEL)
#define COMPAT_NETBSD32
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/buf.h>
#include <sys/acct.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <ufs/ufs/quota.h>
#include <sys/uio.h>
#include <sys/pool.h>
#include <sys/pset.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/ras.h>
#include <sys/filedesc.h>
#include <sys/syscall_stats.h>
#include <sys/kauth.h>
#include <sys/sleepq.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/dtrace_bsd.h>
#include <sys/sysctl.h>
#include <sys/exec.h>
#include <sys/cpu.h>
#include <sys/compat_stub.h>
#include <sys/futex.h>
#include <sys/pserialize.h>
#include <uvm/uvm_extern.h>
/*
* Process lists.
*/
struct proclist allproc __cacheline_aligned;
struct proclist zombproc __cacheline_aligned;
kmutex_t proc_lock __cacheline_aligned;
static pserialize_t proc_psz;
/*
* pid to lwp/proc lookup is done by indexing the pid_table array.
* Since pid numbers are only allocated when an empty slot
* has been found, there is no need to search any lists ever.
* (an orphaned pgrp will lock the slot, a session will lock
* the pgrp with the same number.)
* If the table is too small it is reallocated with twice the
* previous size and the entries 'unzipped' into the two halves.
* A linked list of free entries is passed through the pt_lwp
* field of 'free' items - set odd to be an invalid ptr. Two
* additional bits are also used to indicate if the slot is
* currently occupied by a proc or lwp, and if the PID is
* hidden from certain kinds of lookups. We thus require a
* minimum alignment for proc and lwp structures (LWPs are
* at least 32-byte aligned).
*/
struct pid_table {
uintptr_t pt_slot;
struct pgrp *pt_pgrp;
pid_t pt_pid;
};
#define PT_F_FREE ((uintptr_t)__BIT(0))
#define PT_F_LWP 0 /* pseudo-flag */
#define PT_F_PROC ((uintptr_t)__BIT(1))
#define PT_F_TYPEBITS (PT_F_FREE|PT_F_PROC)
#define PT_F_ALLBITS (PT_F_FREE|PT_F_PROC)
#define PT_VALID(s) (((s) & PT_F_FREE) == 0)
#define PT_RESERVED(s) ((s) == 0)
#define PT_NEXT(s) ((u_int)(s) >> 1)
#define PT_SET_FREE(pid) (((pid) << 1) | PT_F_FREE)
#define PT_SET_LWP(l) ((uintptr_t)(l))
#define PT_SET_PROC(p) (((uintptr_t)(p)) | PT_F_PROC)
#define PT_SET_RESERVED 0
#define PT_GET_LWP(s) ((struct lwp *)((s) & ~PT_F_ALLBITS))
#define PT_GET_PROC(s) ((struct proc *)((s) & ~PT_F_ALLBITS))
#define PT_GET_TYPE(s) ((s) & PT_F_TYPEBITS)
#define PT_IS_LWP(s) (PT_GET_TYPE(s) == PT_F_LWP && (s) != 0)
#define PT_IS_PROC(s) (PT_GET_TYPE(s) == PT_F_PROC)
#define MIN_PROC_ALIGNMENT (PT_F_ALLBITS + 1)
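/*
 * Hedged illustration, not part of the original file: how a pt_slot
 * value round-trips through the encoding macros above.  This is why
 * proc and lwp structures must be at least MIN_PROC_ALIGNMENT bytes
 * aligned -- the two low pointer bits carry the slot type.
 */
#if 0	/* illustration only */
static void
pt_slot_encoding_example(struct lwp *l, struct proc *p)
{
	uintptr_t slot;

	slot = PT_SET_LWP(l);
	KASSERT(PT_IS_LWP(slot) && PT_GET_LWP(slot) == l);

	slot = PT_SET_PROC(p);
	KASSERT(PT_IS_PROC(slot) && PT_GET_PROC(slot) == p);

	slot = PT_SET_FREE(123);	/* free slot; 123 is the next free entry */
	KASSERT(!PT_VALID(slot) && PT_NEXT(slot) == 123);
}
#endif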
/*
* Table of process IDs (PIDs).
*/
static struct pid_table *pid_table __read_mostly;
#define INITIAL_PID_TABLE_SIZE (1 << 5)
/* Table mask, threshold for growing and number of allocated PIDs. */
static u_int pid_tbl_mask __read_mostly;
static u_int pid_alloc_lim __read_mostly;
static u_int pid_alloc_cnt __cacheline_aligned;
/* Next free, last free and maximum PIDs. */
static u_int next_free_pt __cacheline_aligned;
static u_int last_free_pt __cacheline_aligned;
static pid_t pid_max __read_mostly;
/* Components of the first process -- never freed. */
struct session session0 = {
.s_count = 1,
.s_sid = 0,
};
struct pgrp pgrp0 = {
.pg_members = LIST_HEAD_INITIALIZER(&pgrp0.pg_members),
.pg_session = &session0,
};
filedesc_t filedesc0;
struct cwdinfo cwdi0 = {
.cwdi_cmask = CMASK,
.cwdi_refcnt = 1,
};
struct plimit limit0;
struct pstats pstat0;
struct vmspace vmspace0;
struct sigacts sigacts0;
struct proc proc0 = {
.p_lwps = LIST_HEAD_INITIALIZER(&proc0.p_lwps),
.p_sigwaiters = LIST_HEAD_INITIALIZER(&proc0.p_sigwaiters),
.p_nlwps = 1,
.p_nrlwps = 1,
.p_pgrp = &pgrp0,
.p_comm = "system",
/*
* Set P_NOCLDWAIT so that kernel threads are reparented to init(8)
* when they exit. init(8) can easily wait them out for us.
*/
.p_flag = PK_SYSTEM | PK_NOCLDWAIT,
.p_stat = SACTIVE,
.p_nice = NZERO,
.p_emul = &emul_netbsd,
.p_cwdi = &cwdi0,
.p_limit = &limit0,
.p_fd = &filedesc0,
.p_vmspace = &vmspace0,
.p_stats = &pstat0,
.p_sigacts = &sigacts0,
#ifdef PROC0_MD_INITIALIZERS
PROC0_MD_INITIALIZERS
#endif
};
kauth_cred_t cred0;
static const int nofile = NOFILE;
static const int maxuprc = MAXUPRC;
static int sysctl_doeproc(SYSCTLFN_PROTO);
static int sysctl_kern_proc_args(SYSCTLFN_PROTO);
static int sysctl_security_expose_address(SYSCTLFN_PROTO);
#ifdef KASLR
static int kern_expose_address = 0;
#else
static int kern_expose_address = 1;
#endif
/*
* The process list descriptors, used during pid allocation and
* by sysctl. No locking on this data structure is needed since
* it is completely static.
*/
const struct proclist_desc proclists[] = {
{ &allproc },
{ &zombproc },
{ NULL },
};
static struct pgrp * pg_remove(pid_t);
static void pg_delete(pid_t);
static void orphanpg(struct pgrp *);
static specificdata_domain_t proc_specificdata_domain;
static pool_cache_t proc_cache;
static kauth_listener_t proc_listener;
static void fill_proc(const struct proc *, struct proc *, bool);
static int fill_pathname(struct lwp *, pid_t, void *, size_t *);
static int fill_cwd(struct lwp *, pid_t, void *, size_t *);
static int
proc_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
switch (action) {
case KAUTH_PROCESS_CANSEE: {
enum kauth_process_req req;
req = (enum kauth_process_req)(uintptr_t)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_CANSEE_ARGS:
case KAUTH_REQ_PROCESS_CANSEE_ENTRY:
case KAUTH_REQ_PROCESS_CANSEE_OPENFILES:
case KAUTH_REQ_PROCESS_CANSEE_EPROC:
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_PROCESS_CANSEE_ENV:
if (kauth_cred_getuid(cred) !=
kauth_cred_getuid(p->p_cred) || kauth_cred_getuid(cred) !=
kauth_cred_getsvuid(p->p_cred))
break;
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_PROCESS_CANSEE_KPTR:
if (!kern_expose_address)
break;
if (kern_expose_address == 1 && !(p->p_flag & PK_KMEM))
break;
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_PROCESS_FORK: {
int lnprocs = (int)(unsigned long)arg2;
/*
* Don't allow a nonprivileged user to use the last few
* processes. The variable lnprocs is the current number of
* processes, maxproc is the limit.
*/
if (__predict_false((lnprocs >= maxproc - 5)))
break;
result = KAUTH_RESULT_ALLOW;
break;
}
case KAUTH_PROCESS_CORENAME:
case KAUTH_PROCESS_STOPFLAG:
if (proc_uidmatch(cred, p->p_cred) == 0)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return result;
}
static int
proc_ctor(void *arg __unused, void *obj, int flags __unused)
{
struct proc *p = obj;
memset(p, 0, sizeof(*p));
klist_init(&p->p_klist);
/*
* There is no need for a proc_dtor() to do a klist_fini(),
* since knote_proc_exit() ensures that p->p_klist is empty
* when a process exits.
*/
return 0;
}
static pid_t proc_alloc_pid_slot(struct proc *, uintptr_t);
/*
* Initialize global process hashing structures.
*/
void
procinit(void)
{
const struct proclist_desc *pd;
u_int i;
#define LINK_EMPTY ((PID_MAX + INITIAL_PID_TABLE_SIZE) & ~(INITIAL_PID_TABLE_SIZE - 1))
for (pd = proclists; pd->pd_list != NULL; pd++)
LIST_INIT(pd->pd_list);
mutex_init(&proc_lock, MUTEX_DEFAULT, IPL_NONE);
proc_psz = pserialize_create();
pid_table = kmem_alloc(INITIAL_PID_TABLE_SIZE
* sizeof(struct pid_table), KM_SLEEP);
pid_tbl_mask = INITIAL_PID_TABLE_SIZE - 1;
pid_max = PID_MAX;
/* Set free list running through table...
Preset 'use count' above PID_MAX so we allocate pid 1 next. */
for (i = 0; i <= pid_tbl_mask; i++) {
pid_table[i].pt_slot = PT_SET_FREE(LINK_EMPTY + i + 1);
pid_table[i].pt_pgrp = 0;
pid_table[i].pt_pid = 0;
}
/* slot 0 is just grabbed */
next_free_pt = 1;
/* Need to fix last entry. */
last_free_pt = pid_tbl_mask;
pid_table[last_free_pt].pt_slot = PT_SET_FREE(LINK_EMPTY);
/* point at which we grow table - to avoid reusing pids too often */
pid_alloc_lim = pid_tbl_mask - 1;
#undef LINK_EMPTY
/* Reserve PID 1 for init(8). */ /* XXX slightly gross */
mutex_enter(&proc_lock);
if (proc_alloc_pid_slot(&proc0, PT_SET_RESERVED) != 1)
panic("failed to reserve PID 1 for init(8)");
mutex_exit(&proc_lock);
proc_specificdata_domain = specificdata_domain_create();
KASSERT(proc_specificdata_domain != NULL);
size_t proc_alignment = coherency_unit;
if (proc_alignment < MIN_PROC_ALIGNMENT)
proc_alignment = MIN_PROC_ALIGNMENT;
proc_cache = pool_cache_init(sizeof(struct proc), proc_alignment, 0, 0,
"procpl", NULL, IPL_NONE, proc_ctor, NULL, NULL);
proc_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
proc_listener_cb, NULL);
}
void
procinit_sysctl(void)
{
static struct sysctllog *clog;
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "expose_address",
SYSCTL_DESCR("Enable exposing kernel addresses"),
sysctl_security_expose_address, 0,
&kern_expose_address, 0, CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc",
SYSCTL_DESCR("System-wide process information"),
sysctl_doeproc, 0, NULL, 0,
CTL_KERN, KERN_PROC, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc2",
SYSCTL_DESCR("Machine-independent process information"),
sysctl_doeproc, 0, NULL, 0,
CTL_KERN, KERN_PROC2, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "proc_args",
SYSCTL_DESCR("Process argument information"),
sysctl_kern_proc_args, 0, NULL, 0,
CTL_KERN, KERN_PROC_ARGS, CTL_EOL);
/*
"nodes" under these:
KERN_PROC_ALL
KERN_PROC_PID pid
KERN_PROC_PGRP pgrp
KERN_PROC_SESSION sess
KERN_PROC_TTY tty
KERN_PROC_UID uid
KERN_PROC_RUID uid
KERN_PROC_GID gid
KERN_PROC_RGID gid
all in all, probably not worth the effort...
*/
}
/*
* Initialize process 0.
*/
void
proc0_init(void)
{
struct proc *p;
struct pgrp *pg;
struct rlimit *rlim;
rlim_t lim;
int i;
p = &proc0;
pg = &pgrp0;
mutex_init(&p->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
p->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
rw_init(&p->p_reflock);
cv_init(&p->p_waitcv, "wait");
cv_init(&p->p_lwpcv, "lwpwait");
LIST_INSERT_HEAD(&p->p_lwps, &lwp0, l_sibling);
KASSERT(lwp0.l_lid == 0);
pid_table[lwp0.l_lid].pt_slot = PT_SET_LWP(&lwp0);
LIST_INSERT_HEAD(&allproc, p, p_list);
pid_table[lwp0.l_lid].pt_pgrp = pg;
LIST_INSERT_HEAD(&pg->pg_members, p, p_pglist);
#ifdef __HAVE_SYSCALL_INTERN
(*p->p_emul->e_syscall_intern)(p);
#endif
/* Create credentials. */
cred0 = kauth_cred_alloc();
p->p_cred = cred0;
/* Create the CWD info. */
rw_init(&cwdi0.cwdi_lock);
/* Create the limits structures. */
mutex_init(&limit0.pl_lock, MUTEX_DEFAULT, IPL_NONE);
rlim = limit0.pl_rlimit;
for (i = 0; i < __arraycount(limit0.pl_rlimit); i++) {
rlim[i].rlim_cur = RLIM_INFINITY;
rlim[i].rlim_max = RLIM_INFINITY;
}
rlim[RLIMIT_NOFILE].rlim_max = maxfiles;
rlim[RLIMIT_NOFILE].rlim_cur = maxfiles < nofile ? maxfiles : nofile;
rlim[RLIMIT_NPROC].rlim_max = maxproc;
rlim[RLIMIT_NPROC].rlim_cur = maxproc < maxuprc ? maxproc : maxuprc;
lim = MIN(VM_MAXUSER_ADDRESS, ctob((rlim_t)uvm_availmem(false)));
rlim[RLIMIT_RSS].rlim_max = lim;
rlim[RLIMIT_MEMLOCK].rlim_max = lim;
rlim[RLIMIT_MEMLOCK].rlim_cur = lim / 3;
rlim[RLIMIT_NTHR].rlim_max = maxlwp;
rlim[RLIMIT_NTHR].rlim_cur = maxlwp / 2;
/* Note that default core name has zero length. */
limit0.pl_corename = defcorename;
limit0.pl_cnlen = 0;
limit0.pl_refcnt = 1;
limit0.pl_writeable = false;
limit0.pl_sv_limit = NULL;
/* Configure virtual memory system, set vm rlimits. */
uvm_init_limits(p);
/* Initialize file descriptor table for proc0. */
fd_init(&filedesc0);
/*
* Initialize proc0's vmspace, which uses the kernel pmap.
* All kernel processes (which never have user space mappings)
* share proc0's vmspace, and thus, the kernel pmap.
*/
uvmspace_init(&vmspace0, pmap_kernel(), round_page(VM_MIN_ADDRESS),
trunc_page(VM_MAXUSER_ADDRESS),
#ifdef __USE_TOPDOWN_VM
true
#else
false
#endif
);
/* Initialize signal state for proc0. XXX IPL_SCHED */
mutex_init(&p->p_sigacts->sa_mutex, MUTEX_DEFAULT, IPL_SCHED);
siginit(p);
proc_initspecific(p);
kdtrace_proc_ctor(NULL, p);
}
/*
* Session reference counting.
*/
void
proc_sesshold(struct session *ss)
{
KASSERT(mutex_owned(&proc_lock));
ss->s_count++;
}
void
proc_sessrele(struct session *ss)
{
struct pgrp *pg;
KASSERT(mutex_owned(&proc_lock));
KASSERT(ss->s_count > 0);
/*
* We keep the pgrp with the same id as the session in order to
* prevent a new process from being given the same pid. Since the pgrp holds
* a reference to the session, it must be a 'zombie' pgrp by now.
*/
if (--ss->s_count == 0) {
pg = pg_remove(ss->s_sid);
} else {
pg = NULL;
ss = NULL;
}
mutex_exit(&proc_lock);
if (pg)
kmem_free(pg, sizeof(struct pgrp));
if (ss)
kmem_free(ss, sizeof(struct session));
}
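/*
 * Hedged usage sketch, not part of the original file: proc_sessrele()
 * consumes the caller's hold on proc_lock (note the mutex_exit() above),
 * so a typical hold/release pair looks like this.
 */
#if 0	/* illustration only */
static void
sessrele_example(struct session *ss)
{
	mutex_enter(&proc_lock);
	proc_sesshold(ss);
	/* ... use the session while proc_lock is held ... */
	proc_sessrele(ss);	/* drops proc_lock on return */
}
#endif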
/*
* Check that the specified process group is in the session of the
* specified process.
* Negative ids are treated as process ids.
* Used to validate TIOCSPGRP requests.
*/
int
pgid_in_session(struct proc *p, pid_t pg_id)
{
struct pgrp *pgrp;
struct session *session;
int error;
if (pg_id == INT_MIN)
return EINVAL;
mutex_enter(&proc_lock);
if (pg_id < 0) {
struct proc *p1 = proc_find(-pg_id);
if (p1 == NULL) {
error = EINVAL;
goto fail;
}
pgrp = p1->p_pgrp;
} else {
pgrp = pgrp_find(pg_id);
if (pgrp == NULL) {
error = EINVAL;
goto fail;
}
}
session = pgrp->pg_session;
error = (session != p->p_pgrp->pg_session) ? EPERM : 0;
fail:
mutex_exit(&proc_lock);
return error;
}
/*
* p_inferior: is p an inferior of q?
*/
static inline bool
p_inferior(struct proc *p, struct proc *q)
{
KASSERT(mutex_owned(&proc_lock));
for (; p != q; p = p->p_pptr)
if (p->p_pid == 0)
return false;
return true;
}
/*
* proc_find_lwp: locate an lwp in said proc by the ID.
*
* => Must be called with p::p_lock held.
* => LSIDL lwps are not returned because they are only partially
* constructed while occupying the slot.
* => Callers need to be careful about lwp::l_stat of the returned
* lwp.
*/
struct lwp *
proc_find_lwp(proc_t *p, pid_t pid)
{
struct pid_table *pt;
unsigned pt_mask;
struct lwp *l = NULL;
uintptr_t slot;
int s;
KASSERT(mutex_owned(p->p_lock));
/*
* Look in the pid_table. This is done unlocked inside a
* pserialize read section covering pid_table's memory
* allocation only, so take care to read things in the correct
* order:
*
* 1. First read the table mask -- this only ever increases, in
* expand_pid_table, so a stale value is safely
* conservative.
*
* 2. Next read the pid table -- this is always set _before_
* the mask increases, so if we see a new table and stale
* mask, the mask is still valid for the table.
*/
s = pserialize_read_enter();
pt_mask = atomic_load_acquire(&pid_tbl_mask);
pt = &atomic_load_consume(&pid_table)[pid & pt_mask];
slot = atomic_load_consume(&pt->pt_slot);
if (__predict_false(!PT_IS_LWP(slot))) {
pserialize_read_exit(s);
return NULL;
}
/*
* Check to see if the LWP is from the correct process. We won't
* see entries in pid_table from a prior process that also used "p",
* by virtue of the fact that allocating "p" means all prior updates
* to dependent data structures are visible to this thread.
*/
l = PT_GET_LWP(slot);
if (__predict_false(atomic_load_relaxed(&l->l_proc) != p)) {
pserialize_read_exit(s);
return NULL;
}
/*
* We now know that p->p_lock holds this LWP stable.
*
* If the status is not LSIDL, it means the LWP is intended to be
* findable by LID and l_lid cannot change behind us.
*
* No need to acquire the LWP's lock to check for LSIDL, as
* p->p_lock must be held to transition in and out of LSIDL.
* Any other observed state is of no particular interest.
*/
pserialize_read_exit(s);
return l->l_stat != LSIDL && l->l_lid == pid ? l : NULL;
}
/*
* proc_find_lwp_unlocked: locate an lwp in said proc by the ID.
*
* => Called in a pserialize read section with no locks held.
* => LSIDL lwps are not returned because they are only partially
* constructed while occupying the slot.
* => Callers need to be careful about lwp::l_stat of the returned
* lwp.
* => If an LWP is found, it's returned locked.
*/
struct lwp *
proc_find_lwp_unlocked(proc_t *p, pid_t pid)
{
struct pid_table *pt;
unsigned pt_mask;
struct lwp *l = NULL;
uintptr_t slot;
KASSERT(pserialize_in_read_section());
/*
* Look in the pid_table. This is done unlocked inside a
* pserialize read section covering pid_table's memory
* allocation only, so take care to read things in the correct
* order:
*
* 1. First read the table mask -- this only ever increases, in
* expand_pid_table, so a stale value is safely
* conservative.
*
* 2. Next read the pid table -- this is always set _before_
* the mask increases, so if we see a new table and stale
* mask, the mask is still valid for the table.
*/
pt_mask = atomic_load_acquire(&pid_tbl_mask);
pt = &atomic_load_consume(&pid_table)[pid & pt_mask];
slot = atomic_load_consume(&pt->pt_slot);
if (__predict_false(!PT_IS_LWP(slot))) {
return NULL;
}
/*
* Lock the LWP we found to get it stable. If it's embryonic or
* reaped (LSIDL) then none of the other fields can safely be
* checked.
*/
l = PT_GET_LWP(slot);
lwp_lock(l);
if (__predict_false(l->l_stat == LSIDL)) {
lwp_unlock(l);
return NULL;
}
/*
* l_proc and l_lid are now known stable because the LWP is not
* LSIDL, so check those fields too to make sure we found the
* right thing.
*/
if (__predict_false(l->l_proc != p || l->l_lid != pid)) {
lwp_unlock(l);
return NULL;
}
/* Everything checks out, return it locked. */
return l;
}
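/*
* Illustrative usage (not from this file): since the lookup itself
* takes no locks, the caller brackets it with a pserialize read
* section and must drop the LWP lock that this function returns
* held:
*
*   s = pserialize_read_enter();
*   l = proc_find_lwp_unlocked(p, lid);
*   if (l != NULL) {
*       (use l briefly, then lwp_unlock(l))
*   }
*   pserialize_read_exit(s);
*/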
/*
* proc_find_lwp_acquire_proc: locate an lwp and acquire a lock
* on its containing proc.
*
* => Similar to proc_find_lwp(), but does not require you to have
* the proc a priori.
* => Also returns proc * to caller, with p::p_lock held.
* => Same caveats apply.
*/
struct lwp *
proc_find_lwp_acquire_proc(pid_t pid, struct proc **pp)
{
struct pid_table *pt;
struct proc *p = NULL;
struct lwp *l = NULL;
uintptr_t slot;
KASSERT(pp != NULL);
mutex_enter(&proc_lock);
pt = &pid_table[pid & pid_tbl_mask];
slot = pt->pt_slot;
if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) {
l = PT_GET_LWP(slot);
p = l->l_proc;
mutex_enter(p->p_lock);
if (__predict_false(l->l_stat == LSIDL)) {
mutex_exit(p->p_lock);
l = NULL;
p = NULL;
}
}
mutex_exit(&proc_lock);
KASSERT(p == NULL || mutex_owned(p->p_lock));
*pp = p;
return l;
}
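/*
* Illustrative usage (not from this file): on success the containing
* proc is returned via *pp with p_lock held, so the caller releases
* it when done:
*
*   l = proc_find_lwp_acquire_proc(lid, &p);
*   if (l != NULL) {
*       (use l and p)
*       mutex_exit(p->p_lock);
*   }
*/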
/*
* proc_find_raw_pid_table_locked: locate a process by the ID.
*
* => Must be called with proc_lock held.
*/
static proc_t *
proc_find_raw_pid_table_locked(pid_t pid, bool any_lwpid)
{
struct pid_table *pt;
proc_t *p = NULL;
uintptr_t slot;
/* No - used by DDB. KASSERT(mutex_owned(&proc_lock)); */
pt = &pid_table[pid & pid_tbl_mask];
slot = pt->pt_slot;
if (__predict_true(PT_IS_LWP(slot) && pt->pt_pid == pid)) {
/*
* When looking up processes, require a direct match
* on the PID assigned to the proc, not just one of
* its LWPs.
*
* N.B. We require lwp::l_proc of LSIDL LWPs to be
* valid here.
*/
p = PT_GET_LWP(slot)->l_proc;
if (__predict_false(p->p_pid != pid && !any_lwpid))
p = NULL;
} else if (PT_IS_PROC(slot) && pt->pt_pid == pid) {
p = PT_GET_PROC(slot);
}
return p;
}
proc_t *
proc_find_raw(pid_t pid)
{
return proc_find_raw_pid_table_locked(pid, false);
}
static proc_t *
proc_find_internal(pid_t pid, bool any_lwpid)
{
proc_t *p;
KASSERT(mutex_owned(&proc_lock));
p = proc_find_raw_pid_table_locked(pid, any_lwpid);
if (__predict_false(p == NULL)) {
return NULL;
}
/*
* Only allow live processes to be found by PID.
* XXX: p_stat might change, since proc unlocked.
*/
if (__predict_true(p->p_stat == SACTIVE || p->p_stat == SSTOP)) {
return p;
}
return NULL;
}
proc_t *
proc_find(pid_t pid)
{
return proc_find_internal(pid, false);
}
proc_t *
proc_find_lwpid(pid_t pid)
{
return proc_find_internal(pid, true);
}
/*
* pgrp_find: locate a process group by the ID.
*
* => Must be called with proc_lock held.
*/
struct pgrp *
pgrp_find(pid_t pgid)
{
struct pgrp *pg;
KASSERT(mutex_owned(&proc_lock));
pg = pid_table[pgid & pid_tbl_mask].pt_pgrp;
/*
* Cannot look up a process group that only exists because the
* session has not died yet (traditional).
*/
if (pg == NULL || pg->pg_id != pgid || LIST_EMPTY(&pg->pg_members)) {
return NULL;
}
return pg;
}
static void
expand_pid_table(void)
{
size_t pt_size, tsz;
struct pid_table *n_pt, *new_pt;
uintptr_t slot;
struct pgrp *pgrp;
pid_t pid, rpid;
u_int i;
uint new_pt_mask;
KASSERT(mutex_owned(&proc_lock));
/* Unlock the pid_table briefly to allocate memory. */
pt_size = pid_tbl_mask + 1;
mutex_exit(&proc_lock);
tsz = pt_size * 2 * sizeof(struct pid_table);
new_pt = kmem_alloc(tsz, KM_SLEEP);
new_pt_mask = pt_size * 2 - 1;
/* XXX For now. The practical limit is much lower anyway. */
KASSERT(new_pt_mask <= FUTEX_TID_MASK);
mutex_enter(&proc_lock);
if (pt_size != pid_tbl_mask + 1) {
/* Another process beat us to it... */
mutex_exit(&proc_lock);
kmem_free(new_pt, tsz);
goto out;
}
/*
* Copy entries from the old table into the new one.
* If 'pid' is 'odd' it must be placed in the upper half;
* 'even' pids go to the lower half.
* Free items stay in the low half so we don't have to
* fix up the references to them.
* We stuff free items on the front of the freelist
* because we can't write to unmodified entries.
* Processing the table backwards maintains a semblance
* of issuing pid numbers that increase with time.
*/
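/*
* Illustrative example (hypothetical numbers): growing a 64-entry
* table to 128 entries, a live entry for pid 202 sits at old index
* 202 & 63 == 10.  Because 202 & 64 != 0 it is re-homed at new
* index 10 + 64 == 74 (== 202 & 127), and the now-vacant slot 10
* is chained onto the free list instead.
*/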
i = pt_size - 1;
n_pt = new_pt + i;
for (; ; i--, n_pt--) {
slot = pid_table[i].pt_slot;
pgrp = pid_table[i].pt_pgrp;
if (!PT_VALID(slot)) {
/* Up 'use count' so that link is valid */
pid = (PT_NEXT(slot) + pt_size) & ~pt_size;
rpid = 0;
slot = PT_SET_FREE(pid);
if (pgrp)
pid = pgrp->pg_id;
} else {
pid = pid_table[i].pt_pid;
rpid = pid;
}
/* Save entry in appropriate half of table */
n_pt[pid & pt_size].pt_slot = slot;
n_pt[pid & pt_size].pt_pgrp = pgrp;
n_pt[pid & pt_size].pt_pid = rpid;
/* Put other piece on start of free list */
pid = (pid ^ pt_size) & ~pid_tbl_mask;
n_pt[pid & pt_size].pt_slot =
PT_SET_FREE((pid & ~pt_size) | next_free_pt);
n_pt[pid & pt_size].pt_pgrp = 0;
n_pt[pid & pt_size].pt_pid = 0;
next_free_pt = i | (pid & pt_size);
if (i == 0)
break;
}
/* Save old table size and switch tables */
tsz = pt_size * sizeof(struct pid_table);
n_pt = pid_table;
atomic_store_release(&pid_table, new_pt);
KASSERT(new_pt_mask >= pid_tbl_mask);
atomic_store_release(&pid_tbl_mask, new_pt_mask);
/*
* pid_max starts as PID_MAX (= 30000), once we have 16384
* allocated pids we need it to be larger!
*/
if (pid_tbl_mask > PID_MAX) {
pid_max = pid_tbl_mask * 2 + 1;
pid_alloc_lim |= pid_alloc_lim << 1;
} else
pid_alloc_lim <<= 1; /* doubles number of free slots... */
mutex_exit(&proc_lock);
/*
* Make sure that unlocked access to the old pid_table is complete
* and then free it.
*/
pserialize_perform(proc_psz);
kmem_free(n_pt, tsz);
out: /* Return with proc_lock held again. */
mutex_enter(&proc_lock);
}
struct proc *
proc_alloc(void)
{
struct proc *p;
p = pool_cache_get(proc_cache, PR_WAITOK);
p->p_stat = SIDL; /* protect against others */
proc_initspecific(p);
kdtrace_proc_ctor(NULL, p);
/*
* Allocate a placeholder in the pid_table. When we create the
* first LWP for this process, it will take ownership of the
* slot.
*/
if (__predict_false(proc_alloc_pid(p) == -1)) {
/* Allocating the PID failed; unwind. */
proc_finispecific(p);
proc_free_mem(p);
p = NULL;
}
return p;
}
/*
* proc_alloc_pid_slot: allocate PID and record the occupant so that
* proc_find_raw() can find it by the PID.
*/
static pid_t __noinline
proc_alloc_pid_slot(struct proc *p, uintptr_t slot)
{
struct pid_table *pt;
pid_t pid;
int nxt;
KASSERT(mutex_owned(&proc_lock));
for (;;expand_pid_table()) {
if (__predict_false(pid_alloc_cnt >= pid_alloc_lim)) {
/* ensure pids cycle through 2000+ values */
continue;
}
/*
* The first user process *must* be given PID 1.
* It has already been reserved for us. This
* will be coming in from the proc_alloc() call
* above, and the entry will be usurped later when
* the first user LWP is created.
* XXX this is slightly gross.
*/
if (__predict_false(PT_RESERVED(pid_table[1].pt_slot) &&
p != &proc0)) {
KASSERT(PT_IS_PROC(slot));
pt = &pid_table[1];
pt->pt_slot = slot;
return 1;
}
pt = &pid_table[next_free_pt];
#ifdef DIAGNOSTIC
if (__predict_false(PT_VALID(pt->pt_slot) || pt->pt_pgrp))
panic("proc_alloc: slot busy");
#endif
nxt = PT_NEXT(pt->pt_slot);
if (nxt & pid_tbl_mask)
break;
/* Table full - expand (NB last entry not used....) */
}
/* pid is 'saved use count' + 'size' + entry */
pid = (nxt & ~pid_tbl_mask) + pid_tbl_mask + 1 + next_free_pt;
if ((uint)pid > (uint)pid_max)
pid &= pid_tbl_mask;
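/*
* Worked example (hypothetical numbers): with a 64-entry table
* (pid_tbl_mask == 63), next_free_pt == 10 and a saved use count of
* 128 in the free slot, the pid computed above is 128 + 64 + 10 ==
* 202.  Since 202 & 63 == 10, lookups hash the pid straight back to
* slot 10; each later reuse of the slot adds another 64 to the use
* count, so stale pids stop matching.
*/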
next_free_pt = nxt & pid_tbl_mask;
/* XXX For now. The practical limit is much lower anyway. */
KASSERT(pid <= FUTEX_TID_MASK);
/* Grab table slot */
pt->pt_slot = slot;
KASSERT(pt->pt_pid == 0);
pt->pt_pid = pid;
pid_alloc_cnt++;
return pid;
}
pid_t
proc_alloc_pid(struct proc *p)
{
pid_t pid;
KASSERT((((uintptr_t)p) & PT_F_ALLBITS) == 0);
KASSERT(p->p_stat == SIDL);
mutex_enter(&proc_lock);
pid = proc_alloc_pid_slot(p, PT_SET_PROC(p));
if (pid != -1)
p->p_pid = pid;
mutex_exit(&proc_lock);
return pid;
}
pid_t
proc_alloc_lwpid(struct proc *p, struct lwp *l)
{
struct pid_table *pt;
pid_t pid;
KASSERT((((uintptr_t)l) & PT_F_ALLBITS) == 0);
KASSERT(l->l_proc == p);
KASSERT(l->l_stat == LSIDL);
/*
* For unlocked lookup in proc_find_lwp(), make sure l->l_proc
* is globally visible before the LWP becomes visible via the
* pid_table.
*/
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_producer();
#endif
/*
* If the slot for p->p_pid currently points to the proc,
* then we should usurp this ID for the LWP. This happens
* at least once per process (for the first LWP), and can
* happen again if the first LWP for a process exits and
* before the process creates another.
*/
mutex_enter(&proc_lock);
pid = p->p_pid;
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(pt->pt_pid == pid);
if (PT_IS_PROC(pt->pt_slot)) {
KASSERT(PT_GET_PROC(pt->pt_slot) == p);
l->l_lid = pid;
pt->pt_slot = PT_SET_LWP(l);
} else {
/* Need to allocate a new slot. */
pid = proc_alloc_pid_slot(p, PT_SET_LWP(l));
if (pid != -1)
l->l_lid = pid;
}
mutex_exit(&proc_lock);
return pid;
}
static void __noinline
proc_free_pid_internal(pid_t pid, uintptr_t type __diagused)
{
struct pid_table *pt;
KASSERT(mutex_owned(&proc_lock));
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(PT_GET_TYPE(pt->pt_slot) == type);
KASSERT(pt->pt_pid == pid);
/* save pid use count in slot */
pt->pt_slot = PT_SET_FREE(pid & ~pid_tbl_mask);
pt->pt_pid = 0;
if (pt->pt_pgrp == NULL) {
/* link last freed entry onto ours */
pid &= pid_tbl_mask;
pt = &pid_table[last_free_pt];
pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pid);
pt->pt_pid = 0;
last_free_pt = pid;
pid_alloc_cnt--;
}
}
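/*
* Note: the freed slot retains only the pid's use-count bits, and
* (when no pgrp still hangs off it) is appended at the tail of the
* free list via last_free_pt, while allocations are taken from the
* head at next_free_pt.  Together with the growing use count this
* delays reuse of any particular pid value for as long as possible.
*/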
/*
* Free a process id - called from proc_free (in kern_exit.c)
*
* Called with the proc_lock held.
*/
void
proc_free_pid(pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
proc_free_pid_internal(pid, PT_F_PROC);
}
/*
* Free a process id used by an LWP. If this was the process's
* first LWP, we convert the slot to point to the process; the
* entry will get cleaned up later when the process finishes exiting.
*
* If not, then it's the same as proc_free_pid().
*/
void
proc_free_lwpid(struct proc *p, pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
if (__predict_true(p->p_pid == pid)) {
struct pid_table *pt;
pt = &pid_table[pid & pid_tbl_mask];
KASSERT(pt->pt_pid == pid);
KASSERT(PT_IS_LWP(pt->pt_slot));
KASSERT(PT_GET_LWP(pt->pt_slot)->l_proc == p);
pt->pt_slot = PT_SET_PROC(p);
return;
}
proc_free_pid_internal(pid, PT_F_LWP);
}
void
proc_free_mem(struct proc *p)
{
kdtrace_proc_dtor(NULL, p);
pool_cache_put(proc_cache, p);
}
/*
* proc_enterpgrp: move p to a new or existing process group (and session).
*
* If we are creating a new pgrp, the pgid should equal
* the calling process' pid.
* It is only valid to enter a process group that is in the session
* of the process.
* Also, mksess should only be set if we are creating a process group.
*
* Only called from sys_setsid, sys_setpgid and posix_spawn/spawn_return.
*/
int
proc_enterpgrp(struct proc *curp, pid_t pid, pid_t pgid, bool mksess)
{
struct pgrp *new_pgrp, *pgrp;
struct session *sess;
struct proc *p;
int rval;
pid_t pg_id = NO_PGID;
/* Allocate data areas we might need before doing any validity checks */
sess = mksess ? kmem_alloc(sizeof(*sess), KM_SLEEP) : NULL;
new_pgrp = kmem_alloc(sizeof(*new_pgrp), KM_SLEEP);
mutex_enter(&proc_lock);
rval = EPERM; /* most common error (to save typing) */
/* Check pgrp exists or can be created */
pgrp = pid_table[pgid & pid_tbl_mask].pt_pgrp;
if (pgrp != NULL && pgrp->pg_id != pgid)
goto done;
/* Can only set another process under restricted circumstances. */
if (pid != curp->p_pid) {
/* Must exist and be one of our children... */
p = proc_find_internal(pid, false);
if (p == NULL || !p_inferior(p, curp)) {
rval = ESRCH;
goto done;
}
/* ... in the same session... */
if (sess != NULL || p->p_session != curp->p_session)
goto done;
/* ... existing pgid must be in same session ... */
if (pgrp != NULL && pgrp->pg_session != p->p_session)
goto done;
/* ... and not done an exec. */
if (p->p_flag & PK_EXEC) {
rval = EACCES;
goto done;
}
} else {
/* ... setsid() cannot re-enter a pgrp */
if (mksess && (curp->p_pgid == curp->p_pid || pgrp_find(curp->p_pid)))
goto done;
p = curp;
}
/* Changing the process group/session of a session
leader is definitely off limits. */
if (SESS_LEADER(p)) {
if (sess == NULL && p->p_pgrp == pgrp)
/* unless it's a definite noop */
rval = 0;
goto done;
}
/* Can only create a process group with id of process */
if (pgrp == NULL && pgid != pid)
goto done;
/* Can only create a session if creating pgrp */
if (sess != NULL && pgrp != NULL)
goto done;
/* Check we allocated memory for a pgrp... */
if (pgrp == NULL && new_pgrp == NULL)
goto done;
/* Don't attach to 'zombie' pgrp */
if (pgrp != NULL && LIST_EMPTY(&pgrp->pg_members))
goto done;
/* Expect to succeed now */
rval = 0;
if (pgrp == p->p_pgrp)
/* nothing to do */
goto done;
/* Ok all setup, link up required structures */
if (pgrp == NULL) {
pgrp = new_pgrp;
new_pgrp = NULL;
if (sess != NULL) {
sess->s_sid = p->p_pid;
sess->s_leader = p;
sess->s_count = 1;
sess->s_ttyvp = NULL;
sess->s_ttyp = NULL;
sess->s_flags = p->p_session->s_flags & ~S_LOGIN_SET;
memcpy(sess->s_login, p->p_session->s_login,
sizeof(sess->s_login));
p->p_lflag &= ~PL_CONTROLT;
} else {
sess = p->p_pgrp->pg_session;
proc_sesshold(sess);
}
pgrp->pg_session = sess;
sess = NULL;
pgrp->pg_id = pgid;
LIST_INIT(&pgrp->pg_members);
#ifdef DIAGNOSTIC
if (__predict_false(pid_table[pgid & pid_tbl_mask].pt_pgrp))
panic("enterpgrp: pgrp table slot in use");
if (__predict_false(mksess && p != curp))
panic("enterpgrp: mksession and p != curproc");
#endif
pid_table[pgid & pid_tbl_mask].pt_pgrp = pgrp;
pgrp->pg_jobc = 0;
}
/*
* Adjust eligibility of affected pgrps to participate in job control.
* Increment eligibility counts before decrementing, otherwise we
* could reach 0 spuriously during the first call.
*/
fixjobc(p, pgrp, 1);
fixjobc(p, p->p_pgrp, 0);
/* Interlock with ttread(). */
mutex_spin_enter(&tty_lock);
/* Move process to requested group. */
LIST_REMOVE(p, p_pglist);
if (LIST_EMPTY(&p->p_pgrp->pg_members))
/* defer delete until we've dumped the lock */
pg_id = p->p_pgrp->pg_id;
p->p_pgrp = pgrp;
LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
/* Done with the swap; we can release the tty mutex. */
mutex_spin_exit(&tty_lock);
done:
if (pg_id != NO_PGID) {
/* Releases proc_lock. */
pg_delete(pg_id);
} else {
mutex_exit(&proc_lock);
}
if (sess != NULL)
kmem_free(sess, sizeof(*sess));
if (new_pgrp != NULL)
kmem_free(new_pgrp, sizeof(*new_pgrp));
#ifdef DEBUG_PGRP
if (__predict_false(rval))
printf("enterpgrp(%d,%d,%d), curproc %d, rval %d\n",
pid, pgid, mksess, curp->p_pid, rval);
#endif
return rval;
}
/*
* proc_leavepgrp: remove a process from its process group.
* => must be called with the proc_lock held, which will be released;
*/
void
proc_leavepgrp(struct proc *p)
{
struct pgrp *pgrp;
KASSERT(mutex_owned(&proc_lock));
/* Interlock with ttread() */
mutex_spin_enter(&tty_lock);
pgrp = p->p_pgrp;
LIST_REMOVE(p, p_pglist);
p->p_pgrp = NULL;
mutex_spin_exit(&tty_lock);
if (LIST_EMPTY(&pgrp->pg_members)) {
/* Releases proc_lock. */
pg_delete(pgrp->pg_id);
} else {
mutex_exit(&proc_lock);
}
}
/*
* pg_remove: remove a process group from the table.
* => must be called with the proc_lock held;
* => returns process group to free;
*/
static struct pgrp *
pg_remove(pid_t pg_id)
{
struct pgrp *pgrp;
struct pid_table *pt;
KASSERT(mutex_owned(&proc_lock));
pt = &pid_table[pg_id & pid_tbl_mask];
pgrp = pt->pt_pgrp;
KASSERT(pgrp != NULL);
KASSERT(pgrp->pg_id == pg_id);
KASSERT(LIST_EMPTY(&pgrp->pg_members));
pt->pt_pgrp = NULL;
if (!PT_VALID(pt->pt_slot)) {
/* Orphaned pgrp, put slot onto free list. */
KASSERT((PT_NEXT(pt->pt_slot) & pid_tbl_mask) == 0);
pg_id &= pid_tbl_mask;
pt = &pid_table[last_free_pt];
pt->pt_slot = PT_SET_FREE(PT_NEXT(pt->pt_slot) | pg_id);
KASSERT(pt->pt_pid == 0);
last_free_pt = pg_id;
pid_alloc_cnt--;
}
return pgrp;
}
/*
* pg_delete: delete and free a process group.
* => must be called with the proc_lock held, which will be released.
*/
static void
pg_delete(pid_t pg_id)
{
struct pgrp *pg;
struct tty *ttyp;
struct session *ss;
KASSERT(mutex_owned(&proc_lock));
pg = pid_table[pg_id & pid_tbl_mask].pt_pgrp;
if (pg == NULL || pg->pg_id != pg_id || !LIST_EMPTY(&pg->pg_members)) {
mutex_exit(&proc_lock);
return;
}
ss = pg->pg_session;
/* Remove reference (if any) from tty to this process group */
mutex_spin_enter(&tty_lock);
ttyp = ss->s_ttyp;
if (ttyp != NULL && ttyp->t_pgrp == pg) {
ttyp->t_pgrp = NULL;
KASSERT(ttyp->t_session == ss);
}
mutex_spin_exit(&tty_lock);
/*
* The leading process group in a session is freed by proc_sessrele(),
* if last reference. It will also release the locks.
*/
pg = (ss->s_sid != pg->pg_id) ? pg_remove(pg_id) : NULL;
proc_sessrele(ss);
if (pg != NULL) {
/* Free it, if was not done above. */
kmem_free(pg, sizeof(struct pgrp));
}
}
/*
* Adjust pgrp jobc counters when specified process changes process group.
* We count the number of processes in each process group that "qualify"
* the group for terminal job control (those with a parent in a different
* process group of the same session). If that count reaches zero, the
* process group becomes orphaned. Check both the specified process'
* process group and that of its children.
* entering == 0 => p is leaving specified group.
* entering == 1 => p is entering specified group.
*
* Call with proc_lock held.
*/
void
fixjobc(struct proc *p, struct pgrp *pgrp, int entering)
{
struct pgrp *hispgrp;
struct session *mysession = pgrp->pg_session;
struct proc *child;
KASSERT(mutex_owned(&proc_lock));
/*
* Check p's parent to see whether p qualifies its own process
* group; if so, adjust count for p's process group.
*/
hispgrp = p->p_pptr->p_pgrp;
if (hispgrp != pgrp && hispgrp->pg_session == mysession) {
if (entering) {
pgrp->pg_jobc++;
p->p_lflag &= ~PL_ORPHANPG;
} else {
/* KASSERT(pgrp->pg_jobc > 0); */
if (--pgrp->pg_jobc == 0)
orphanpg(pgrp);
}
}
/*
* Check this process' children to see whether they qualify
* their process groups; if so, adjust counts for children's
* process groups.
*/
LIST_FOREACH(child, &p->p_children, p_sibling) {
hispgrp = child->p_pgrp;
if (hispgrp != pgrp && hispgrp->pg_session == mysession && !P_ZOMBIE(child)) {
if (entering) {
child->p_lflag &= ~PL_ORPHANPG;
hispgrp->pg_jobc++;
} else {
KASSERT(hispgrp->pg_jobc > 0);
if (--hispgrp->pg_jobc == 0)
orphanpg(hispgrp);
}
}
}
}
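/*
* Illustrative example (not from this file): a shell in process
* group A starts a pipeline whose members are placed in process
* group B of the same session.  Each member of B whose parent sits
* in A counts toward B's pg_jobc.  As those parents exit or change
* groups the count drops; once it reaches zero, B is orphaned and
* orphanpg() below sends SIGHUP and SIGCONT to its stopped members
* so they are not left stopped forever.
*/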
/*
* A process group has become orphaned;
* if there are any stopped processes in the group,
* hang up all processes in that group.
*
* Call with proc_lock held.
*/
static void
orphanpg(struct pgrp *pg)
{
struct proc *p;
KASSERT(mutex_owned(&proc_lock));
LIST_FOREACH(p, &pg->pg_members, p_pglist) {
if (p->p_stat == SSTOP) {
p->p_lflag |= PL_ORPHANPG;
psignal(p, SIGHUP);
psignal(p, SIGCONT);
}
}
}
#ifdef DDB
#include <ddb/db_output.h>
void pidtbl_dump(void);
void
pidtbl_dump(void)
{
struct pid_table *pt;
struct proc *p;
struct pgrp *pgrp;
uintptr_t slot;
int id;
db_printf("pid table %p size %x, next %x, last %x\n",
pid_table, pid_tbl_mask+1,
next_free_pt, last_free_pt);
for (pt = pid_table, id = 0; id <= pid_tbl_mask; id++, pt++) {
slot = pt->pt_slot;
if (!PT_VALID(slot) && !pt->pt_pgrp)
continue;
if (PT_IS_LWP(slot)) {
p = PT_GET_LWP(slot)->l_proc;
} else if (PT_IS_PROC(slot)) {
p = PT_GET_PROC(slot);
} else {
p = NULL;
}
db_printf(" id %x: ", id);
if (p != NULL)
db_printf("slotpid %d proc %p id %d (0x%x) %s\n",
pt->pt_pid, p, p->p_pid, p->p_pid, p->p_comm);
else
db_printf("next %x use %x\n",
PT_NEXT(slot) & pid_tbl_mask,
PT_NEXT(slot) & ~pid_tbl_mask);
if ((pgrp = pt->pt_pgrp)) {
db_printf("\tsession %p, sid %d, count %d, login %s\n",
pgrp->pg_session, pgrp->pg_session->s_sid,
pgrp->pg_session->s_count,
pgrp->pg_session->s_login);
db_printf("\tpgrp %p, pg_id %d, pg_jobc %d, members %p\n",
pgrp, pgrp->pg_id, pgrp->pg_jobc,
LIST_FIRST(&pgrp->pg_members));
LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
db_printf("\t\tpid %d addr %p pgrp %p %s\n",
p->p_pid, p, p->p_pgrp, p->p_comm);
}
}
}
}
#endif /* DDB */
#ifdef KSTACK_CHECK_MAGIC
#define KSTACK_MAGIC 0xdeadbeaf
/* XXX should this be on a per-process basis? */
static int kstackleftmin = KSTACK_SIZE;
static int kstackleftthres = KSTACK_SIZE / 8;
void
kstack_setup_magic(const struct lwp *l)
{
uint32_t *ip;
uint32_t const *end;
KASSERT(l != NULL);
KASSERT(l != &lwp0);
/*
* Fill the whole stack with a magic number
* so that later modification of it can be detected.
*/
ip = (uint32_t *)KSTACK_LOWEST_ADDR(l);
end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
for (; ip < end; ip++) {
*ip = KSTACK_MAGIC;
}
}
void
kstack_check_magic(const struct lwp *l)
{
uint32_t const *ip, *end;
int stackleft;
KASSERT(l != NULL);
/* don't check proc0 */ /*XXX*/
if (l == &lwp0)
return;
#ifdef __MACHINE_STACK_GROWS_UP
/* stack grows upwards (eg. hppa) */
ip = (uint32_t *)((void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
end = (uint32_t *)KSTACK_LOWEST_ADDR(l);
for (ip--; ip >= end; ip--)
if (*ip != KSTACK_MAGIC)
break;
stackleft = (void *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE - (void *)ip;
#else /* __MACHINE_STACK_GROWS_UP */
/* stack grows downwards (eg. i386) */
ip = (uint32_t *)KSTACK_LOWEST_ADDR(l);
end = (uint32_t *)((char *)KSTACK_LOWEST_ADDR(l) + KSTACK_SIZE);
for (; ip < end; ip++)
if (*ip != KSTACK_MAGIC)
break;
stackleft = ((const char *)ip) - (const char *)KSTACK_LOWEST_ADDR(l);
#endif /* __MACHINE_STACK_GROWS_UP */
if (kstackleftmin > stackleft) {
kstackleftmin = stackleft;
if (stackleft < kstackleftthres)
printf("warning: kernel stack left %d bytes"
"(pid %u:lid %u)\n", stackleft,
(u_int)l->l_proc->p_pid, (u_int)l->l_lid);
}
if (stackleft <= 0) {
panic("magic on the top of kernel stack changed for "
"pid %u, lid %u: maybe kernel stack overflow",
(u_int)l->l_proc->p_pid, (u_int)l->l_lid);
}
}
#endif /* KSTACK_CHECK_MAGIC */
int
proclist_foreach_call(struct proclist *list,
int (*callback)(struct proc *, void *arg), void *arg)
{
struct proc marker;
struct proc *p;
int ret = 0;
marker.p_flag = PK_MARKER;
mutex_enter(&proc_lock);
for (p = LIST_FIRST(list); ret == 0 && p != NULL;) {
if (p->p_flag & PK_MARKER) {
p = LIST_NEXT(p, p_list);
continue;
}
LIST_INSERT_AFTER(p, &marker, p_list);
ret = (*callback)(p, arg);
KASSERT(mutex_owned(&proc_lock));
p = LIST_NEXT(&marker, p_list);
LIST_REMOVE(&marker, p_list);
}
mutex_exit(&proc_lock);
return ret;
}
int
proc_vmspace_getref(struct proc *p, struct vmspace **vm)
{
/* XXXCDC: how should locking work here? */
/* curproc exception is for coredump. */
if ((p != curproc && (p->p_sflag & PS_WEXIT) != 0) ||
(p->p_vmspace->vm_refcnt < 1)) {
return EFAULT;
}
uvmspace_addref(p->p_vmspace);
*vm = p->p_vmspace;
return 0;
}
/*
* Acquire a write lock on the process credential.
*/
void
proc_crmod_enter(void)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
kauth_cred_t oc;
/* Reset what needs to be reset in plimit. */
if (p->p_limit->pl_corename != defcorename) {
lim_setcorename(p, defcorename, 0);
}
mutex_enter(p->p_lock);
/* Ensure the LWP cached credentials are up to date. */
if ((oc = l->l_cred) != p->p_cred) {
l->l_cred = kauth_cred_hold(p->p_cred);
kauth_cred_free(oc);
}
}
/*
* Set in a new process credential, and drop the write lock. The credential
* must have a reference already. Optionally, free a no-longer required
* credential.
*/
void
proc_crmod_leave(kauth_cred_t scred, kauth_cred_t fcred, bool sugid)
{
struct lwp *l = curlwp, *l2;
struct proc *p = l->l_proc;
kauth_cred_t oc;
KASSERT(mutex_owned(p->p_lock));
/* Is there a new credential to set in? */
if (scred != NULL) {
p->p_cred = scred;
LIST_FOREACH(l2, &p->p_lwps, l_sibling) {
if (l2 != l) {
lwp_lock(l2);
l2->l_flag |= LW_CACHECRED;
lwp_need_userret(l2);
lwp_unlock(l2);
}
}
/* Ensure the LWP cached credentials are up to date. */
if ((oc = l->l_cred) != scred) {
l->l_cred = kauth_cred_hold(scred);
}
} else
oc = NULL; /* XXXgcc */
if (sugid) {
/*
* Mark process as having changed credentials, stops
* tracing etc.
*/
p->p_flag |= PK_SUGID;
}
mutex_exit(p->p_lock);
/* If there is a credential to be released, free it now. */
if (fcred != NULL) {
KASSERT(scred != NULL);
kauth_cred_free(fcred);
if (oc != scred)
kauth_cred_free(oc);
}
}
/*
* proc_specific_key_create --
* Create a key for subsystem proc-specific data.
*/
int
proc_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return (specificdata_key_create(proc_specificdata_domain, keyp, dtor));
}
/*
* proc_specific_key_delete --
* Delete a key for subsystem proc-specific data.
*/
void
proc_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(proc_specificdata_domain, key);
}
/*
* proc_initspecific --
* Initialize a proc's specificdata container.
*/
void
proc_initspecific(struct proc *p)
{
int error __diagused;
error = specificdata_init(proc_specificdata_domain, &p->p_specdataref);
KASSERT(error == 0);
}
/*
* proc_finispecific --
* Finalize a proc's specificdata container.
*/
void
proc_finispecific(struct proc *p)
{
specificdata_fini(proc_specificdata_domain, &p->p_specdataref);
}
/*
* proc_getspecific --
* Return proc-specific data corresponding to the specified key.
*/
void *
proc_getspecific(struct proc *p, specificdata_key_t key)
{
return (specificdata_getspecific(proc_specificdata_domain,
&p->p_specdataref, key));
}
/*
* proc_setspecific --
* Set proc-specific data corresponding to the specified key.
*/
void
proc_setspecific(struct proc *p, specificdata_key_t key, void *data)
{
specificdata_setspecific(proc_specificdata_domain,
&p->p_specdataref, key, data);
}
int
proc_uidmatch(kauth_cred_t cred, kauth_cred_t target)
{
int r = 0;
if (kauth_cred_getuid(cred) != kauth_cred_getuid(target) ||
kauth_cred_getuid(cred) != kauth_cred_getsvuid(target)) {
/*
* suid proc of ours or proc not ours
*/
r = EPERM;
} else if (kauth_cred_getgid(target) != kauth_cred_getsvgid(target)) {
/*
* sgid proc has sgid back to us temporarily
*/
r = EPERM;
} else {
/*
* our rgid must be in target's group list (ie,
* sub-processes started by a sgid process)
*/
int ismember = 0;
if (kauth_cred_ismember_gid(cred,
kauth_cred_getgid(target), &ismember) != 0 ||
!ismember)
r = EPERM;
}
return (r);
}
/*
* sysctl stuff
*/
#define KERN_PROCSLOP (5 * sizeof(struct kinfo_proc))
static const u_int sysctl_flagmap[] = {
PK_ADVLOCK, P_ADVLOCK,
PK_EXEC, P_EXEC,
PK_NOCLDWAIT, P_NOCLDWAIT,
PK_32, P_32,
PK_CLDSIGIGN, P_CLDSIGIGN,
PK_SUGID, P_SUGID,
0
};
static const u_int sysctl_sflagmap[] = {
PS_NOCLDSTOP, P_NOCLDSTOP,
PS_WEXIT, P_WEXIT,
PS_STOPFORK, P_STOPFORK,
PS_STOPEXEC, P_STOPEXEC,
PS_STOPEXIT, P_STOPEXIT,
0
};
static const u_int sysctl_slflagmap[] = {
PSL_TRACED, P_TRACED,
PSL_CHTRACED, P_CHTRACED,
PSL_SYSCALL, P_SYSCALL,
0
};
static const u_int sysctl_lflagmap[] = {
PL_CONTROLT, P_CONTROLT,
PL_PPWAIT, P_PPWAIT,
0
};
static const u_int sysctl_stflagmap[] = {
PST_PROFIL, P_PROFIL,
0
};
/* used by kern_lwp also */
const u_int sysctl_lwpflagmap[] = {
LW_SINTR, L_SINTR,
LW_SYSTEM, L_SYSTEM,
0
};
/*
* Find the most ``active'' lwp of a process and return it for ps display
* purposes
*/
static struct lwp *
proc_active_lwp(struct proc *p)
{
static const int ostat[] = {
0,
2, /* LSIDL */
6, /* LSRUN */
5, /* LSSLEEP */
4, /* LSSTOP */
0, /* LSZOMB */
1, /* LSDEAD */
7, /* LSONPROC */
3 /* LSSUSPENDED */
};
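/*
* The ranking encoded above, from most to least "active", is:
* LSONPROC > LSRUN > LSSLEEP > LSSTOP > LSSUSPENDED > LSIDL >
* LSDEAD > LSZOMB; ties are broken below by the larger l_cpticks.
*/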
struct lwp *l, *lp = NULL;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
KASSERT(l->l_stat >= 0);
KASSERT(l->l_stat < __arraycount(ostat));
if (lp == NULL ||
ostat[l->l_stat] > ostat[lp->l_stat] ||
(ostat[l->l_stat] == ostat[lp->l_stat] &&
l->l_cpticks > lp->l_cpticks)) {
lp = l;
continue;
}
}
return lp;
}
static int
sysctl_doeproc(SYSCTLFN_ARGS)
{
union {
struct kinfo_proc kproc;
struct kinfo_proc2 kproc2;
} *kbuf;
struct proc *p, *next, *marker;
char *where, *dp;
int type, op, arg, error;
u_int elem_size, kelem_size, elem_count;
size_t buflen, needed;
bool match, zombie, mmmbrains;
const bool allowaddr = get_expose_address(curproc);
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
dp = where = oldp;
buflen = where != NULL ? *oldlenp : 0;
error = 0;
needed = 0;
type = rnode->sysctl_num;
if (type == KERN_PROC) {
if (namelen == 0)
return EINVAL;
switch (op = name[0]) {
case KERN_PROC_ALL:
if (namelen != 1)
return EINVAL;
arg = 0;
break;
default:
if (namelen != 2)
return EINVAL;
arg = name[1];
break;
}
elem_count = 0; /* Hush little compiler, don't you cry */
kelem_size = elem_size = sizeof(kbuf->kproc);
} else {
if (namelen != 4)
return EINVAL;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
kelem_size = sizeof(kbuf->kproc2);
}
sysctl_unlock();
kbuf = kmem_zalloc(sizeof(*kbuf), KM_SLEEP);
marker = kmem_alloc(sizeof(*marker), KM_SLEEP);
marker->p_flag = PK_MARKER;
mutex_enter(&proc_lock);
/*
* Start with zombies to prevent reporting processes twice, in case they
* are dying and being moved from the list of alive processes to zombies.
*/
mmmbrains = true;
for (p = LIST_FIRST(&zombproc);; p = next) {
if (p == NULL) {
if (mmmbrains) {
p = LIST_FIRST(&allproc);
mmmbrains = false;
}
if (p == NULL)
break;
}
next = LIST_NEXT(p, p_list);
if ((p->p_flag & PK_MARKER) != 0)
continue;
/*
* Skip embryonic processes.
*/
if (p->p_stat == SIDL)
continue;
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_EPROC), NULL, NULL);
if (error != 0) {
mutex_exit(p->p_lock);
continue;
}
/*
* Handling all the operations in one switch, at the cost of some
* algorithmic complexity, is deliberate. Splitting this function
* into several near-identical copies would increase the maintenance
* burden and code size, while the performance gain would be
* negligible on practical systems.
*/
switch (op) {
case KERN_PROC_PID:
match = (p->p_pid == (pid_t)arg);
break;
case KERN_PROC_PGRP:
match = (p->p_pgrp->pg_id == (pid_t)arg);
break;
case KERN_PROC_SESSION:
match = (p->p_session->s_sid == (pid_t)arg);
break;
case KERN_PROC_TTY:
match = true;
if (arg == (int) KERN_PROC_TTY_REVOKE) {
if ((p->p_lflag & PL_CONTROLT) == 0 ||
p->p_session->s_ttyp == NULL ||
p->p_session->s_ttyvp != NULL) {
match = false;
}
} else if ((p->p_lflag & PL_CONTROLT) == 0 ||
p->p_session->s_ttyp == NULL) {
if ((dev_t)arg != KERN_PROC_TTY_NODEV) {
match = false;
}
} else if (p->p_session->s_ttyp->t_dev != (dev_t)arg) {
match = false;
}
break;
case KERN_PROC_UID:
match = (kauth_cred_geteuid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_RUID:
match = (kauth_cred_getuid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_GID:
match = (kauth_cred_getegid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_RGID:
match = (kauth_cred_getgid(p->p_cred) == (uid_t)arg);
break;
case KERN_PROC_ALL:
match = true;
/* allow everything */
break;
default:
error = EINVAL;
mutex_exit(p->p_lock);
goto cleanup;
}
if (!match) {
mutex_exit(p->p_lock);
continue;
}
/*
* Grab a hold on the process.
*/
if (mmmbrains) {
zombie = true;
} else {
zombie = !rw_tryenter(&p->p_reflock, RW_READER);
}
if (zombie) {
LIST_INSERT_AFTER(p, marker, p_list);
}
if (buflen >= elem_size &&
(type == KERN_PROC || elem_count > 0)) {
ruspace(p); /* Update process vm resource use */
if (type == KERN_PROC) {
fill_proc(p, &kbuf->kproc.kp_proc, allowaddr);
fill_eproc(p, &kbuf->kproc.kp_eproc, zombie,
allowaddr);
} else {
fill_kproc2(p, &kbuf->kproc2, zombie,
allowaddr);
elem_count--;
}
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
/*
* Copy out elem_size, but not larger than kelem_size
*/
error = sysctl_copyout(l, kbuf, dp,
uimin(kelem_size, elem_size));
mutex_enter(&proc_lock);
if (error) {
goto bah;
}
dp += elem_size;
buflen -= elem_size;
} else {
mutex_exit(p->p_lock);
}
needed += elem_size;
/*
* Release reference to process.
*/
if (zombie) {
next = LIST_NEXT(marker, p_list);
LIST_REMOVE(marker, p_list);
} else {
rw_exit(&p->p_reflock);
next = LIST_NEXT(p, p_list);
}
/*
* Short-circuit break quickly!
*/
if (op == KERN_PROC_PID)
break;
}
mutex_exit(&proc_lock);
if (where != NULL) {
*oldlenp = dp - where;
if (needed > *oldlenp) {
error = ENOMEM;
goto out;
}
} else {
needed += KERN_PROCSLOP;
*oldlenp = needed;
}
kmem_free(kbuf, sizeof(*kbuf));
kmem_free(marker, sizeof(*marker));
sysctl_relock();
return 0;
bah:
if (zombie)
LIST_REMOVE(marker, p_list);
else
rw_exit(&p->p_reflock);
cleanup:
mutex_exit(&proc_lock);
out:
kmem_free(kbuf, sizeof(*kbuf));
kmem_free(marker, sizeof(*marker));
sysctl_relock();
return error;
}
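/*
* Userland sketch (illustrative only, error handling omitted): the
* kern.proc2 node handled above is normally driven with the usual
* size-query-then-fetch pattern:
*
*   int mib[6] = { CTL_KERN, KERN_PROC2, KERN_PROC_ALL, 0,
*       sizeof(struct kinfo_proc2), 0 };
*   size_t len;
*   sysctl(mib, 6, NULL, &len, NULL, 0);	(size query)
*   struct kinfo_proc2 *kp = malloc(len);
*   mib[5] = len / sizeof(struct kinfo_proc2);	(elem_count)
*   sysctl(mib, 6, kp, &len, NULL, 0);
*
* The first call takes the "where == NULL" path above and reports
* the space needed plus KERN_PROCSLOP of slack.
*/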
int
copyin_psstrings(struct proc *p, struct ps_strings *arginfo)
{
#if !defined(_RUMPKERNEL)
int retval;
if (p->p_flag & PK_32) {
MODULE_HOOK_CALL(kern_proc32_copyin_hook, (p, arginfo),
enosys(), retval);
return retval;
}
#endif /* !defined(_RUMPKERNEL) */
return copyin_proc(p, (void *)p->p_psstrp, arginfo, sizeof(*arginfo));
}
static int
copy_procargs_sysctl_cb(void *cookie_, const void *src, size_t off, size_t len)
{
void **cookie = cookie_;
struct lwp *l = cookie[0];
char *dst = cookie[1];
return sysctl_copyout(l, src, dst + off, len);
}
/*
* sysctl helper routine for kern.proc_args pseudo-subtree.
*/
static int
sysctl_kern_proc_args(SYSCTLFN_ARGS)
{
struct ps_strings pss;
struct proc *p;
pid_t pid;
int type, error;
void *cookie[2];
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (newp != NULL || namelen != 2)
return (EINVAL);
pid = name[0];
type = name[1];
switch (type) {
case KERN_PROC_PATHNAME:
sysctl_unlock();
error = fill_pathname(l, pid, oldp, oldlenp);
sysctl_relock();
return error;
case KERN_PROC_CWD:
sysctl_unlock();
error = fill_cwd(l, pid, oldp, oldlenp);
sysctl_relock();
return error;
case KERN_PROC_ARGV:
case KERN_PROC_NARGV:
case KERN_PROC_ENV:
case KERN_PROC_NENV:
/* ok */
break;
default:
return (EINVAL);
}
sysctl_unlock();
/* check pid */
mutex_enter(&proc_lock);
if ((p = proc_find(pid)) == NULL) {
error = EINVAL;
goto out_locked;
}
mutex_enter(p->p_lock);
/* Check permission. */
if (type == KERN_PROC_ARGV || type == KERN_PROC_NARGV)
error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ARGS), NULL, NULL);
else if (type == KERN_PROC_ENV || type == KERN_PROC_NENV)
error = kauth_authorize_process(l->l_cred, KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENV), NULL, NULL);
else
error = EINVAL; /* XXXGCC */
if (error) {
mutex_exit(p->p_lock);
goto out_locked;
}
if (oldp == NULL) {
if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV)
*oldlenp = sizeof (int);
else
*oldlenp = ARG_MAX; /* XXX XXX XXX */
error = 0;
mutex_exit(p->p_lock);
goto out_locked;
}
/*
* Zombies don't have a stack, so we can't read their psstrings.
* System processes also don't have a user stack.
*/
if (P_ZOMBIE(p) || (p->p_flag & PK_SYSTEM) != 0) {
error = EINVAL;
mutex_exit(p->p_lock);
goto out_locked;
}
error = rw_tryenter(&p->p_reflock, RW_READER) ? 0 : EBUSY;
mutex_exit(p->p_lock);
if (error) {
goto out_locked;
}
mutex_exit(&proc_lock);
if (type == KERN_PROC_NARGV || type == KERN_PROC_NENV) {
int value;
if ((error = copyin_psstrings(p, &pss)) == 0) {
if (type == KERN_PROC_NARGV)
value = pss.ps_nargvstr;
else
value = pss.ps_nenvstr;
error = sysctl_copyout(l, &value, oldp, sizeof(value));
*oldlenp = sizeof(value);
}
} else {
cookie[0] = l;
cookie[1] = oldp;
error = copy_procargs(p, type, oldlenp,
copy_procargs_sysctl_cb, cookie);
}
rw_exit(&p->p_reflock);
sysctl_relock();
return error;
out_locked:
mutex_exit(&proc_lock);
sysctl_relock();
return error;
}
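/*
* Userland sketch (illustrative only, error handling omitted):
*
*   int mib[4] = { CTL_KERN, KERN_PROC_ARGS, pid, KERN_PROC_ARGV };
*   size_t len;
*   sysctl(mib, 4, NULL, &len, NULL, 0);	(len reported as ARG_MAX)
*   char *buf = malloc(len);
*   sysctl(mib, 4, buf, &len, NULL, 0);	(NUL-separated argv strings)
*/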
int
copy_procargs(struct proc *p, int oid, size_t *limit,
int (*cb)(void *, const void *, size_t, size_t), void *cookie)
{
struct ps_strings pss;
size_t len, i, loaded, entry_len;
struct uio auio;
struct iovec aiov;
int error, argvlen;
char *arg;
char **argv;
vaddr_t user_argv;
struct vmspace *vmspace;
/*
* Allocate a temporary buffer to hold the argument vector and
* the arguments themselves.
*/
arg = kmem_alloc(PAGE_SIZE, KM_SLEEP);
argv = kmem_alloc(PAGE_SIZE, KM_SLEEP);
/*
* Lock the process down in memory.
*/
vmspace = p->p_vmspace;
uvmspace_addref(vmspace);
/*
* Read in the ps_strings structure.
*/
if ((error = copyin_psstrings(p, &pss)) != 0)
goto done;
/*
* Now read the address of the argument vector.
*/
switch (oid) {
case KERN_PROC_ARGV:
user_argv = (uintptr_t)pss.ps_argvstr;
argvlen = pss.ps_nargvstr;
break;
case KERN_PROC_ENV:
user_argv = (uintptr_t)pss.ps_envstr;
argvlen = pss.ps_nenvstr;
break;
default:
error = EINVAL;
goto done;
}
if (argvlen < 0) {
error = EIO;
goto done;
}
/*
* Now copy each string.
*/
len = 0; /* bytes written to user buffer */
loaded = 0; /* bytes from argv already processed */
i = 0; /* To make compiler happy */
entry_len = PROC_PTRSZ(p);
for (; argvlen; --argvlen) {
int finished = 0;
vaddr_t base;
size_t xlen;
int j;
if (loaded == 0) {
size_t rem = entry_len * argvlen;
loaded = MIN(rem, PAGE_SIZE);
error = copyin_vmspace(vmspace,
(const void *)user_argv, argv, loaded);
if (error)
break;
user_argv += loaded;
i = 0;
}
#if !defined(_RUMPKERNEL)
if (p->p_flag & PK_32)
MODULE_HOOK_CALL(kern_proc32_base_hook,
(argv, i++), 0, base);
else
#endif /* !defined(_RUMPKERNEL) */
base = (vaddr_t)argv[i++];
loaded -= entry_len;
/*
* The program has messed around with its arguments,
* possibly deleting some, and replacing them with
* NULLs. Treat this as the last argument and not
* a failure.
*/
if (base == 0)
break;
while (!finished) {
xlen = PAGE_SIZE - (base & PAGE_MASK);
aiov.iov_base = arg;
aiov.iov_len = PAGE_SIZE;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = base;
auio.uio_resid = xlen;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
error = uvm_io(&vmspace->vm_map, &auio, 0);
if (error)
goto done;
/* Look for the end of the string */
for (j = 0; j < xlen; j++) {
if (arg[j] == '\0') {
xlen = j + 1;
finished = 1;
break;
}
}
/* Check for user buffer overflow */
if (len + xlen > *limit) {
finished = 1;
if (len > *limit)
xlen = 0;
else
xlen = *limit - len;
}
/* Copyout the page */
error = (*cb)(cookie, arg, len, xlen);
if (error)
goto done;
len += xlen;
base += xlen;
}
}
*limit = len;
done:
kmem_free(argv, PAGE_SIZE);
kmem_free(arg, PAGE_SIZE);
uvmspace_free(vmspace);
return error;
}
/*
* Fill in a proc structure for the specified process.
*/
static void
fill_proc(const struct proc *psrc, struct proc *p, bool allowaddr)
{
COND_SET_STRUCT(p->p_list, psrc->p_list, allowaddr);
memset(&p->p_auxlock, 0, sizeof(p->p_auxlock));
COND_SET_STRUCT(p->p_lock, psrc->p_lock, allowaddr);
memset(&p->p_stmutex, 0, sizeof(p->p_stmutex));
memset(&p->p_reflock, 0, sizeof(p->p_reflock));
COND_SET_STRUCT(p->p_waitcv, psrc->p_waitcv, allowaddr);
COND_SET_STRUCT(p->p_lwpcv, psrc->p_lwpcv, allowaddr);
COND_SET_PTR(p->p_cred, psrc->p_cred, allowaddr);
COND_SET_PTR(p->p_fd, psrc->p_fd, allowaddr);
COND_SET_PTR(p->p_cwdi, psrc->p_cwdi, allowaddr);
COND_SET_PTR(p->p_stats, psrc->p_stats, allowaddr);
COND_SET_PTR(p->p_limit, psrc->p_limit, allowaddr);
COND_SET_PTR(p->p_vmspace, psrc->p_vmspace, allowaddr);
COND_SET_PTR(p->p_sigacts, psrc->p_sigacts, allowaddr);
COND_SET_PTR(p->p_aio, psrc->p_aio, allowaddr);
p->p_mqueue_cnt = psrc->p_mqueue_cnt;
memset(&p->p_specdataref, 0, sizeof(p->p_specdataref));
p->p_exitsig = psrc->p_exitsig;
p->p_flag = psrc->p_flag;
p->p_sflag = psrc->p_sflag;
p->p_slflag = psrc->p_slflag;
p->p_lflag = psrc->p_lflag;
p->p_stflag = psrc->p_stflag;
p->p_stat = psrc->p_stat;
p->p_trace_enabled = psrc->p_trace_enabled;
p->p_pid = psrc->p_pid;
COND_SET_STRUCT(p->p_pglist, psrc->p_pglist, allowaddr);
COND_SET_PTR(p->p_pptr, psrc->p_pptr, allowaddr);
COND_SET_STRUCT(p->p_sibling, psrc->p_sibling, allowaddr);
COND_SET_STRUCT(p->p_children, psrc->p_children, allowaddr);
COND_SET_STRUCT(p->p_lwps, psrc->p_lwps, allowaddr);
COND_SET_PTR(p->p_raslist, psrc->p_raslist, allowaddr);
p->p_nlwps = psrc->p_nlwps;
p->p_nzlwps = psrc->p_nzlwps;
p->p_nrlwps = psrc->p_nrlwps;
p->p_nlwpwait = psrc->p_nlwpwait;
p->p_ndlwps = psrc->p_ndlwps;
p->p_nstopchild = psrc->p_nstopchild;
p->p_waited = psrc->p_waited;
COND_SET_PTR(p->p_zomblwp, psrc->p_zomblwp, allowaddr);
COND_SET_PTR(p->p_vforklwp, psrc->p_vforklwp, allowaddr);
COND_SET_PTR(p->p_sched_info, psrc->p_sched_info, allowaddr);
p->p_estcpu = psrc->p_estcpu;
p->p_estcpu_inherited = psrc->p_estcpu_inherited;
p->p_forktime = psrc->p_forktime;
p->p_pctcpu = psrc->p_pctcpu;
COND_SET_PTR(p->p_opptr, psrc->p_opptr, allowaddr);
COND_SET_PTR(p->p_timers, psrc->p_timers, allowaddr);
p->p_rtime = psrc->p_rtime;
p->p_uticks = psrc->p_uticks;
p->p_sticks = psrc->p_sticks;
p->p_iticks = psrc->p_iticks;
p->p_xutime = psrc->p_xutime;
p->p_xstime = psrc->p_xstime;
p->p_traceflag = psrc->p_traceflag;
COND_SET_PTR(p->p_tracep, psrc->p_tracep, allowaddr);
COND_SET_PTR(p->p_textvp, psrc->p_textvp, allowaddr);
COND_SET_PTR(p->p_emul, psrc->p_emul, allowaddr);
COND_SET_PTR(p->p_emuldata, psrc->p_emuldata, allowaddr);
COND_SET_CPTR(p->p_execsw, psrc->p_execsw, allowaddr);
COND_SET_STRUCT(p->p_klist, psrc->p_klist, allowaddr);
COND_SET_STRUCT(p->p_sigwaiters, psrc->p_sigwaiters, allowaddr);
COND_SET_STRUCT(p->p_sigpend.sp_info, psrc->p_sigpend.sp_info,
allowaddr);
p->p_sigpend.sp_set = psrc->p_sigpend.sp_set;
COND_SET_PTR(p->p_lwpctl, psrc->p_lwpctl, allowaddr);
p->p_ppid = psrc->p_ppid;
p->p_oppid = psrc->p_oppid;
COND_SET_PTR(p->p_path, psrc->p_path, allowaddr);
p->p_sigctx = psrc->p_sigctx;
p->p_nice = psrc->p_nice;
memcpy(p->p_comm, psrc->p_comm, sizeof(p->p_comm));
COND_SET_PTR(p->p_pgrp, psrc->p_pgrp, allowaddr);
COND_SET_VALUE(p->p_psstrp, psrc->p_psstrp, allowaddr);
p->p_pax = psrc->p_pax;
p->p_xexit = psrc->p_xexit;
p->p_xsig = psrc->p_xsig;
p->p_acflag = psrc->p_acflag;
COND_SET_STRUCT(p->p_md, psrc->p_md, allowaddr);
p->p_stackbase = psrc->p_stackbase;
COND_SET_PTR(p->p_dtrace, psrc->p_dtrace, allowaddr);
}
/*
* Fill in an eproc structure for the specified process.
*/
void
fill_eproc(struct proc *p, struct eproc *ep, bool zombie, bool allowaddr)
{
struct tty *tp;
struct lwp *l;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
COND_SET_PTR(ep->e_paddr, p, allowaddr);
COND_SET_PTR(ep->e_sess, p->p_session, allowaddr);
if (p->p_cred) {
kauth_cred_topcred(p->p_cred, &ep->e_pcred);
kauth_cred_toucred(p->p_cred, &ep->e_ucred);
}
if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) {
struct vmspace *vm = p->p_vmspace;
ep->e_vm.vm_rssize = vm_resident_count(vm);
ep->e_vm.vm_tsize = vm->vm_tsize;
ep->e_vm.vm_dsize = vm->vm_dsize;
ep->e_vm.vm_ssize = vm->vm_ssize;
ep->e_vm.vm_map.size = vm->vm_map.size;
/* Pick the primary (first) LWP */
l = proc_active_lwp(p);
KASSERT(l != NULL);
lwp_lock(l);
if (l->l_wchan)
strncpy(ep->e_wmesg, l->l_wmesg, WMESGLEN);
lwp_unlock(l);
}
ep->e_ppid = p->p_ppid;
if (p->p_pgrp && p->p_session) {
ep->e_pgid = p->p_pgrp->pg_id;
ep->e_jobc = p->p_pgrp->pg_jobc;
ep->e_sid = p->p_session->s_sid;
if ((p->p_lflag & PL_CONTROLT) &&
(tp = p->p_session->s_ttyp)) {
ep->e_tdev = tp->t_dev;
ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
COND_SET_PTR(ep->e_tsess, tp->t_session, allowaddr);
} else
ep->e_tdev = (uint32_t)NODEV;
ep->e_flag = p->p_session->s_ttyvp ? EPROC_CTTY : 0;
if (SESS_LEADER(p))
ep->e_flag |= EPROC_SLEADER;
strncpy(ep->e_login, p->p_session->s_login, MAXLOGNAME);
}
ep->e_xsize = ep->e_xrssize = 0;
ep->e_xccount = ep->e_xswrss = 0;
}
/*
* Fill in a kinfo_proc2 structure for the specified process.
*/
void
fill_kproc2(struct proc *p, struct kinfo_proc2 *ki, bool zombie, bool allowaddr)
{
struct tty *tp;
struct lwp *l;
struct timeval ut, st, rt;
sigset_t ss1, ss2;
struct rusage ru;
struct vmspace *vm;
KASSERT(mutex_owned(&proc_lock));
KASSERT(mutex_owned(p->p_lock));
sigemptyset(&ss1);
sigemptyset(&ss2);
COND_SET_VALUE(ki->p_paddr, PTRTOUINT64(p), allowaddr);
COND_SET_VALUE(ki->p_fd, PTRTOUINT64(p->p_fd), allowaddr);
COND_SET_VALUE(ki->p_cwdi, PTRTOUINT64(p->p_cwdi), allowaddr);
COND_SET_VALUE(ki->p_stats, PTRTOUINT64(p->p_stats), allowaddr);
COND_SET_VALUE(ki->p_limit, PTRTOUINT64(p->p_limit), allowaddr);
COND_SET_VALUE(ki->p_vmspace, PTRTOUINT64(p->p_vmspace), allowaddr);
COND_SET_VALUE(ki->p_sigacts, PTRTOUINT64(p->p_sigacts), allowaddr);
COND_SET_VALUE(ki->p_sess, PTRTOUINT64(p->p_session), allowaddr);
ki->p_tsess = 0; /* may be changed if controlling tty below */
COND_SET_VALUE(ki->p_ru, PTRTOUINT64(&p->p_stats->p_ru), allowaddr);
ki->p_eflag = 0;
ki->p_exitsig = p->p_exitsig;
ki->p_flag = L_INMEM; /* Process never swapped out */
ki->p_flag |= sysctl_map_flags(sysctl_flagmap, p->p_flag);
ki->p_flag |= sysctl_map_flags(sysctl_sflagmap, p->p_sflag);
ki->p_flag |= sysctl_map_flags(sysctl_slflagmap, p->p_slflag);
ki->p_flag |= sysctl_map_flags(sysctl_lflagmap, p->p_lflag);
ki->p_flag |= sysctl_map_flags(sysctl_stflagmap, p->p_stflag);
ki->p_pid = p->p_pid;
ki->p_ppid = p->p_ppid;
ki->p_uid = kauth_cred_geteuid(p->p_cred);
ki->p_ruid = kauth_cred_getuid(p->p_cred);
ki->p_gid = kauth_cred_getegid(p->p_cred);
ki->p_rgid = kauth_cred_getgid(p->p_cred);
ki->p_svuid = kauth_cred_getsvuid(p->p_cred);
ki->p_svgid = kauth_cred_getsvgid(p->p_cred);
ki->p_ngroups = kauth_cred_ngroups(p->p_cred);
kauth_cred_getgroups(p->p_cred, ki->p_groups,
uimin(ki->p_ngroups, sizeof(ki->p_groups) / sizeof(ki->p_groups[0])),
UIO_SYSSPACE);
ki->p_uticks = p->p_uticks;
ki->p_sticks = p->p_sticks;
ki->p_iticks = p->p_iticks;
ki->p_tpgid = NO_PGID; /* may be changed if controlling tty below */
COND_SET_VALUE(ki->p_tracep, PTRTOUINT64(p->p_tracep), allowaddr);
ki->p_traceflag = p->p_traceflag;
memcpy(&ki->p_sigignore, &p->p_sigctx.ps_sigignore,sizeof(ki_sigset_t));
memcpy(&ki->p_sigcatch, &p->p_sigctx.ps_sigcatch, sizeof(ki_sigset_t));
ki->p_cpticks = 0;
ki->p_pctcpu = p->p_pctcpu;
ki->p_estcpu = 0;
ki->p_stat = p->p_stat; /* Will likely be overridden by LWP status */
ki->p_realstat = p->p_stat;
ki->p_nice = p->p_nice;
ki->p_xstat = P_WAITSTATUS(p);
ki->p_acflag = p->p_acflag;
strncpy(ki->p_comm, p->p_comm,
uimin(sizeof(ki->p_comm), sizeof(p->p_comm)));
strncpy(ki->p_ename, p->p_emul->e_name, sizeof(ki->p_ename));
ki->p_nlwps = p->p_nlwps;
ki->p_realflag = ki->p_flag;
if (p->p_stat != SIDL && !P_ZOMBIE(p) && !zombie) {
vm = p->p_vmspace;
ki->p_vm_rssize = vm_resident_count(vm);
ki->p_vm_tsize = vm->vm_tsize;
ki->p_vm_dsize = vm->vm_dsize;
ki->p_vm_ssize = vm->vm_ssize;
ki->p_vm_vsize = atop(vm->vm_map.size);
/*
* Since the stack is initially mapped mostly with
* PROT_NONE and grown as needed, adjust the "mapped size"
* to skip the unused stack portion.
*/
ki->p_vm_msize =
atop(vm->vm_map.size) - vm->vm_issize + vm->vm_ssize;
/* Pick the primary (first) LWP */
l = proc_active_lwp(p);
KASSERT(l != NULL);
lwp_lock(l);
ki->p_nrlwps = p->p_nrlwps;
ki->p_forw = 0;
ki->p_back = 0;
COND_SET_VALUE(ki->p_addr, PTRTOUINT64(l->l_addr), allowaddr);
ki->p_stat = l->l_stat;
ki->p_flag |= sysctl_map_flags(sysctl_lwpflagmap, l->l_flag);
ki->p_swtime = l->l_swtime;
ki->p_slptime = l->l_slptime;
if (l->l_stat == LSONPROC)
ki->p_schedflags = l->l_cpu->ci_schedstate.spc_flags;
else
ki->p_schedflags = 0;
ki->p_priority = lwp_eprio(l);
ki->p_usrpri = l->l_priority;
if (l->l_wchan)
strncpy(ki->p_wmesg, l->l_wmesg, sizeof(ki->p_wmesg));
COND_SET_VALUE(ki->p_wchan, PTRTOUINT64(l->l_wchan), allowaddr);
ki->p_cpuid = cpu_index(l->l_cpu);
lwp_unlock(l);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
/* This is hardly correct, but... */
sigplusset(&l->l_sigpend.sp_set, &ss1);
sigplusset(&l->l_sigmask, &ss2);
ki->p_cpticks += l->l_cpticks;
ki->p_pctcpu += l->l_pctcpu;
ki->p_estcpu += l->l_estcpu;
}
}
sigplusset(&p->p_sigpend.sp_set, &ss1);
memcpy(&ki->p_siglist, &ss1, sizeof(ki_sigset_t));
memcpy(&ki->p_sigmask, &ss2, sizeof(ki_sigset_t));
if (p->p_session != NULL) {
ki->p_sid = p->p_session->s_sid;
ki->p__pgid = p->p_pgrp->pg_id;
if (p->p_session->s_ttyvp)
ki->p_eflag |= EPROC_CTTY;
if (SESS_LEADER(p))
ki->p_eflag |= EPROC_SLEADER;
strncpy(ki->p_login, p->p_session->s_login,
uimin(sizeof ki->p_login - 1, sizeof p->p_session->s_login));
ki->p_jobc = p->p_pgrp->pg_jobc;
if ((p->p_lflag & PL_CONTROLT) && (tp = p->p_session->s_ttyp)) {
ki->p_tdev = tp->t_dev;
ki->p_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
COND_SET_VALUE(ki->p_tsess, PTRTOUINT64(tp->t_session),
allowaddr);
} else {
ki->p_tdev = (int32_t)NODEV;
}
}
if (!P_ZOMBIE(p) && !zombie) {
ki->p_uvalid = 1;
ki->p_ustart_sec = p->p_stats->p_start.tv_sec;
ki->p_ustart_usec = p->p_stats->p_start.tv_usec;
calcru(p, &ut, &st, NULL, &rt);
ki->p_rtime_sec = rt.tv_sec;
ki->p_rtime_usec = rt.tv_usec;
ki->p_uutime_sec = ut.tv_sec;
ki->p_uutime_usec = ut.tv_usec;
ki->p_ustime_sec = st.tv_sec;
ki->p_ustime_usec = st.tv_usec;
memcpy(&ru, &p->p_stats->p_ru, sizeof(ru));
rulwps(p, &ru);
ki->p_uru_nvcsw = ru.ru_nvcsw;
ki->p_uru_nivcsw = ru.ru_nivcsw;
ki->p_uru_maxrss = ru.ru_maxrss;
ki->p_uru_ixrss = ru.ru_ixrss;
ki->p_uru_idrss = ru.ru_idrss;
ki->p_uru_isrss = ru.ru_isrss;
ki->p_uru_minflt = ru.ru_minflt;
ki->p_uru_majflt = ru.ru_majflt;
ki->p_uru_nswap = ru.ru_nswap;
ki->p_uru_inblock = ru.ru_inblock;
ki->p_uru_oublock = ru.ru_oublock;
ki->p_uru_msgsnd = ru.ru_msgsnd;
ki->p_uru_msgrcv = ru.ru_msgrcv;
ki->p_uru_nsignals = ru.ru_nsignals;
timeradd(&p->p_stats->p_cru.ru_utime,
&p->p_stats->p_cru.ru_stime, &ut);
ki->p_uctime_sec = ut.tv_sec;
ki->p_uctime_usec = ut.tv_usec;
}
}
int
proc_find_locked(struct lwp *l, struct proc **p, pid_t pid)
{
int error;
mutex_enter(&proc_lock);
if (pid == -1)
*p = l->l_proc;
else
*p = proc_find(pid);
if (*p == NULL) {
if (pid != -1)
mutex_exit(&proc_lock);
return ESRCH;
}
if (pid != -1)
mutex_enter((*p)->p_lock);
mutex_exit(&proc_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, *p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
if (error) {
if (pid != -1)
mutex_exit((*p)->p_lock);
}
return error;
}
static int
fill_pathname(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp)
{
int error;
struct proc *p;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
if (p->p_path == NULL) {
if (pid != -1)
mutex_exit(p->p_lock);
return ENOENT;
}
size_t len = strlen(p->p_path) + 1;
if (oldp != NULL) {
size_t copylen = uimin(len, *oldlenp);
error = sysctl_copyout(l, p->p_path, oldp, copylen);
if (error == 0 && *oldlenp < len)
error = ENOSPC;
}
*oldlenp = len;
if (pid != -1)
mutex_exit(p->p_lock);
return error;
}
static int
fill_cwd(struct lwp *l, pid_t pid, void *oldp, size_t *oldlenp)
{
int error;
struct proc *p;
char *path;
char *bp, *bend;
struct cwdinfo *cwdi;
struct vnode *vp;
size_t len, lenused;
if ((error = proc_find_locked(l, &p, pid)) != 0)
return error;
len = MAXPATHLEN * 4;
path = kmem_alloc(len, KM_SLEEP);
bp = &path[len];
bend = bp;
*(--bp) = '\0';
cwdi = p->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
vp = cwdi->cwdi_cdir;
error = getcwd_common(vp, NULL, &bp, path, len/2, 0, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto out;
lenused = bend - bp;
if (oldp != NULL) {
size_t copylen = uimin(lenused, *oldlenp);
error = sysctl_copyout(l, bp, oldp, copylen);
if (error == 0 && *oldlenp < lenused)
error = ENOSPC;
}
*oldlenp = lenused;
out:
if (pid != -1)
mutex_exit(p->p_lock);
kmem_free(path, len);
return error;
}
int
proc_getauxv(struct proc *p, void **buf, size_t *len)
{
struct ps_strings pss;
int error;
void *uauxv, *kauxv;
size_t size;
if ((error = copyin_psstrings(p, &pss)) != 0)
return error;
if (pss.ps_envstr == NULL)
return EIO;
size = p->p_execsw->es_arglen;
if (size == 0)
return EIO;
size_t ptrsz = PROC_PTRSZ(p);
uauxv = (void *)((char *)pss.ps_envstr + (pss.ps_nenvstr + 1) * ptrsz);
kauxv = kmem_alloc(size, KM_SLEEP);
error = copyin_proc(p, uauxv, kauxv, size);
if (error) {
kmem_free(kauxv, size);
return error;
}
*buf = kauxv;
*len = size;
return 0;
}
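/*
* The user address computed above relies on the exec-time stack
* layout: the auxiliary vector is placed immediately after the
* environment pointer array, i.e. past ps_nenvstr pointers plus the
* terminating NULL, each PROC_PTRSZ(p) bytes wide (which is how
* 32-bit processes are accommodated).
*/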
static int
sysctl_security_expose_address(SYSCTLFN_ARGS)
{
int expose_address, error;
struct sysctlnode node;
node = *rnode;
node.sysctl_data = &expose_address;
expose_address = *(int *)rnode->sysctl_data;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_KERNADDR,
0, NULL, NULL, NULL))
return EPERM;
switch (expose_address) {
case 0:
case 1:
case 2:
break;
default:
return EINVAL;
}
*(int *)rnode->sysctl_data = expose_address;
return 0;
}
bool
get_expose_address(struct proc *p)
{
/* allow only if sysctl variable is set or privileged */
return kauth_authorize_process(kauth_cred_get(), KAUTH_PROCESS_CANSEE,
p, KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_KPTR), NULL, NULL) == 0;
}
/* $NetBSD: rtsock_50.c,v 1.16 2020/01/29 05:47:12 thorpej Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1988, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)rtsock.c 8.7 (Berkeley) 10/12/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtsock_50.c,v 1.16 2020/01/29 05:47:12 thorpej Exp $");
#define COMPAT_RTSOCK /* Use the COMPATNAME/COMPATCALL macros and the
* various other compat definitions - see
* sys/net/rtsock_shared.c for details
*/
#include <net/rtsock_shared.c>
#include <compat/net/route_50.h>
static struct sysctllog *clog;
void
compat_50_rt_oifmsg(struct ifnet *ifp)
{
struct if_msghdr50 oifm;
struct if_data ifi;
struct mbuf *m;
struct rt_addrinfo info;
if (COMPATNAME(route_info).ri_cb.any_count == 0)
return;
(void)memset(&info, 0, sizeof(info));
(void)memset(&oifm, 0, sizeof(oifm));
if_export_if_data(ifp, &ifi, false);
oifm.ifm_index = ifp->if_index;
oifm.ifm_flags = ifp->if_flags;
oifm.ifm_data.ifi_type = ifi.ifi_type;
oifm.ifm_data.ifi_addrlen = ifi.ifi_addrlen;
oifm.ifm_data.ifi_hdrlen = ifi.ifi_hdrlen;
oifm.ifm_data.ifi_link_state = ifi.ifi_link_state;
oifm.ifm_data.ifi_mtu = ifi.ifi_mtu;
oifm.ifm_data.ifi_metric = ifi.ifi_metric;
oifm.ifm_data.ifi_baudrate = ifi.ifi_baudrate;
oifm.ifm_data.ifi_ipackets = ifi.ifi_ipackets;
oifm.ifm_data.ifi_ierrors = ifi.ifi_ierrors;
oifm.ifm_data.ifi_opackets = ifi.ifi_opackets;
oifm.ifm_data.ifi_oerrors = ifi.ifi_oerrors;
oifm.ifm_data.ifi_collisions = ifi.ifi_collisions;
oifm.ifm_data.ifi_ibytes = ifi.ifi_ibytes;
oifm.ifm_data.ifi_obytes = ifi.ifi_obytes;
oifm.ifm_data.ifi_imcasts = ifi.ifi_imcasts;
oifm.ifm_data.ifi_omcasts = ifi.ifi_omcasts;
oifm.ifm_data.ifi_iqdrops = ifi.ifi_iqdrops;
oifm.ifm_data.ifi_noproto = ifi.ifi_noproto;
TIMESPEC_TO_TIMEVAL(&oifm.ifm_data.ifi_lastchange,
&ifi.ifi_lastchange);
oifm.ifm_addrs = 0;
m = COMPATNAME(rt_msg1)(RTM_OIFINFO, &info, (void *)&oifm, sizeof(oifm));
if (m == NULL)
return;
COMPATNAME(route_enqueue)(m, 0);
}
int
compat_50_iflist(struct ifnet *ifp, struct rt_walkarg *w,
struct rt_addrinfo *info, size_t len)
{
struct if_msghdr50 *ifm;
struct if_data ifi;
int error;
ifm = (struct if_msghdr50 *)w->w_tmem;
if_export_if_data(ifp, &ifi, false);
ifm->ifm_index = ifp->if_index;
ifm->ifm_flags = ifp->if_flags;
ifm->ifm_data.ifi_type = ifi.ifi_type;
ifm->ifm_data.ifi_addrlen = ifi.ifi_addrlen;
ifm->ifm_data.ifi_hdrlen = ifi.ifi_hdrlen;
ifm->ifm_data.ifi_link_state = ifi.ifi_link_state;
ifm->ifm_data.ifi_mtu = ifi.ifi_mtu;
ifm->ifm_data.ifi_metric = ifi.ifi_metric;
ifm->ifm_data.ifi_baudrate = ifi.ifi_baudrate;
ifm->ifm_data.ifi_ipackets = ifi.ifi_ipackets;
ifm->ifm_data.ifi_ierrors = ifi.ifi_ierrors;
ifm->ifm_data.ifi_opackets = ifi.ifi_opackets;
ifm->ifm_data.ifi_oerrors = ifi.ifi_oerrors;
ifm->ifm_data.ifi_collisions = ifi.ifi_collisions;
ifm->ifm_data.ifi_ibytes = ifi.ifi_ibytes;
ifm->ifm_data.ifi_obytes = ifi.ifi_obytes;
ifm->ifm_data.ifi_imcasts = ifi.ifi_imcasts;
ifm->ifm_data.ifi_omcasts = ifi.ifi_omcasts;
ifm->ifm_data.ifi_iqdrops = ifi.ifi_iqdrops;
ifm->ifm_data.ifi_noproto = ifi.ifi_noproto;
TIMESPEC_TO_TIMEVAL(&ifm->ifm_data.ifi_lastchange,
&ifi.ifi_lastchange);
ifm->ifm_addrs = info->rti_addrs;
error = copyout(ifm, w->w_where, len);
if (error)
return error;
w->w_where = (char *)w->w_where + len;
return 0;
}
void
rtsock_50_init(void)
{
MODULE_HOOK_SET(rtsock_iflist_50_hook, compat_50_iflist);
MODULE_HOOK_SET(rtsock_oifmsg_50_hook, compat_50_rt_oifmsg);
MODULE_HOOK_SET(rtsock_rt_missmsg_50_hook, compat_50_rt_missmsg);
MODULE_HOOK_SET(rtsock_rt_ifmsg_50_hook, compat_50_rt_ifmsg);
MODULE_HOOK_SET(rtsock_rt_addrmsg_rt_50_hook, compat_50_rt_addrmsg_rt);
MODULE_HOOK_SET(rtsock_rt_addrmsg_src_50_hook,
compat_50_rt_addrmsg_src);
MODULE_HOOK_SET(rtsock_rt_addrmsg_50_hook, compat_50_rt_addrmsg);
MODULE_HOOK_SET(rtsock_rt_ifannouncemsg_50_hook,
compat_50_rt_ifannouncemsg);
MODULE_HOOK_SET(rtsock_rt_ieee80211msg_50_hook,
compat_50_rt_ieee80211msg);
sysctl_net_route_setup(&clog, PF_OROUTE, "ortable");
}
void
rtsock_50_fini(void)
{
sysctl_teardown(&clog);
MODULE_HOOK_UNSET(rtsock_iflist_50_hook);
MODULE_HOOK_UNSET(rtsock_oifmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_missmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ifmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_rt_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_src_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_addrmsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ifannouncemsg_50_hook);
MODULE_HOOK_UNSET(rtsock_rt_ieee80211msg_50_hook);
}
/* $NetBSD: clock.h,v 1.7 2023/10/27 14:34:58 jschauma Exp $ */
/*-
* Copyright (c) 1996 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Gordon W. Ross
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_CLOCK_H_
#define _SYS_CLOCK_H_
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <stdint.h>
#endif
/* Some handy constants. */
#define SECS_PER_MINUTE 60
#define SECS_PER_HOUR 3600
#define SECS_PER_DAY 86400
#define DAYS_PER_COMMON_YEAR 365
#define DAYS_PER_LEAP_YEAR 366
#define SECS_PER_COMMON_YEAR (SECS_PER_DAY * DAYS_PER_COMMON_YEAR)
#define SECS_PER_LEAP_YEAR (SECS_PER_DAY * DAYS_PER_LEAP_YEAR)
/* Traditional POSIX base year */
#define POSIX_BASE_YEAR 1970
/* Some handy functions */
static __inline int
days_in_month(int m)
{
switch (m) {
case 2:
return 28;
case 4: case 6: case 9: case 11:
return 30;
case 1: case 3: case 5: case 7: case 8: case 10: case 12:
return 31;
default:
return -1;
}
}
/*
* This inline avoids some unnecessary modulo operations
* as compared with the usual macro:
* ( ((year % 4) == 0 &&
* (year % 100) != 0) ||
* ((year % 400) == 0) )
* It is otherwise equivalent.
*/
static __inline int
is_leap_year(uint64_t year)
{
if ((year & 3) != 0)
return 0;
if (__predict_false((year % 100) != 0))
return 1;
return __predict_false((year % 400) == 0);
}
static __inline int
days_per_year(uint64_t year)
{
return is_leap_year(year) ? DAYS_PER_LEAP_YEAR : DAYS_PER_COMMON_YEAR;
}
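/*
 * Illustrative sketch, not part of the original header: combining the
 * helpers above to obtain the length of a month in a given year.  The
 * name example_days_in_month is hypothetical and only shows how
 * days_in_month() and is_leap_year() compose.
 */
static __inline int
example_days_in_month(uint64_t year, int month)
{
	int days = days_in_month(month);

	/* February gains its leap day in leap years. */
	if (month == 2 && is_leap_year(year))
		days++;
	return days;
}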
#endif /* _SYS_CLOCK_H_ */
/* $NetBSD: radix.c,v 1.49 2020/10/18 13:07:31 gson Exp $ */
/*
* Copyright (c) 1988, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)radix.c 8.6 (Berkeley) 10/17/95
*/
/*
* Routines to build and maintain radix trees for routing lookups.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: radix.c,v 1.49 2020/10/18 13:07:31 gson Exp $");
#ifndef _NET_RADIX_H_
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/kmem.h>
#ifdef _KERNEL
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#endif
#include <sys/systm.h>
#include <sys/malloc.h>
#define M_DONTWAIT M_NOWAIT
#include <sys/domain.h>
#else
#include <stdlib.h>
#endif
#include <sys/syslog.h>
#include <net/radix.h>
#endif
typedef void (*rn_printer_t)(void *, const char *fmt, ...);
int max_keylen;
struct radix_mask *rn_mkfreelist;
struct radix_node_head *mask_rnhead;
static char *addmask_key;
static const char normal_chars[] =
{0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, -1};
static char *rn_zeros, *rn_ones;
#define rn_masktop (mask_rnhead->rnh_treetop)
static int rn_satisfies_leaf(const char *, struct radix_node *, int);
static int rn_lexobetter(const void *, const void *);
static struct radix_mask *rn_new_radix_mask(struct radix_node *,
struct radix_mask *);
static struct radix_node *rn_walknext(struct radix_node *, rn_printer_t,
void *);
static struct radix_node *rn_walkfirst(struct radix_node *, rn_printer_t,
void *);
static void rn_nodeprint(struct radix_node *, rn_printer_t, void *,
const char *);
#define SUBTREE_OPEN "[ "
#define SUBTREE_CLOSE " ]"
#ifdef RN_DEBUG
static void rn_treeprint(struct radix_node_head *, rn_printer_t, void *);
#endif /* RN_DEBUG */
/*
* The data structure for the keys is a radix tree with one way
* branching removed. The index rn_b at an internal node n represents a bit
* position to be tested. The tree is arranged so that all descendants
* of a node n have keys whose bits all agree up to position rn_b - 1.
* (We say the index of n is rn_b.)
*
* There is at least one descendant which has a one bit at position rn_b,
* and at least one with a zero there.
*
* A route is determined by a pair of key and mask. We require that the
* bit-wise logical AND of the key and mask be equal to the key.
* We define the index of a route, determined by its mask, to be
* the first bit number in the mask where a 0 occurs (with bit number 0
* representing the highest order bit).
*
* We say a mask is normal if every bit past the index of the mask is 0.
* If a node n has a descendant (k, m) with index(m) == index(n) == rn_b,
* and m is a normal mask, then the route applies to every descendant of n.
* If index(m) < rn_b, this implies that the trailing bits of k before
* bit b are all 0 (and hence the same is true of every descendant
* of n), so the route applies to all descendants of the node as well.
*
* Similar logic shows that a non-normal mask m such that
* index(m) <= index(n) could potentially apply to many children of n.
* Thus, for each non-host route, we attach its mask to a list at an internal
* node as high in the tree as we can go.
*
* The present version of the code makes use of normal routes in short-
* circuiting an explicit mask and compare operation when testing whether
* a key satisfies a normal route, and also in remembering the unique leaf
* that governs a subtree.
*/
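/*
 * Illustrative sketch, not part of the original source: the "index" of
 * a mask described above is the number of the first 0 bit, counting
 * from bit 0 at the highest-order position, so for a contiguous
 * (normal) mask it is simply the prefix length.  The helper name
 * example_mask_index and its calling convention are hypothetical; the
 * real computation is done in rn_addmask() below.
 */
#if 0	/* example only */
static int
example_mask_index(const u_char *mask, int mlen)
{
	int bit = 0;
	int i, j;

	for (i = 0; i < mlen; i++)
		for (j = 0x80; j != 0; j >>= 1, bit++)
			if ((mask[i] & j) == 0)
				return bit;	/* first zero bit */
	return bit;				/* all-ones mask */
}
#endif	/* example only */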
struct radix_node *
rn_search(
const void *v_arg,
struct radix_node *head)
{
const u_char * const v = v_arg;
struct radix_node *x;
for (x = head; x->rn_b >= 0;) {
if (x->rn_bmask & v[x->rn_off])
x = x->rn_r;
else
x = x->rn_l;
}
return x;
}
struct radix_node *
rn_search_m(
const void *v_arg,
struct radix_node *head,
const void *m_arg)
{
struct radix_node *x;
const u_char * const v = v_arg;
const u_char * const m = m_arg;
for (x = head; x->rn_b >= 0;) {
if ((x->rn_bmask & m[x->rn_off]) &&
(x->rn_bmask & v[x->rn_off]))
x = x->rn_r;
else
x = x->rn_l;
}
return x;
}
int
rn_refines(
const void *m_arg,
const void *n_arg)
{
const char *m = m_arg;
const char *n = n_arg;
const char *lim = n + *(const u_char *)n;
const char *lim2 = lim;
int longer = (*(const u_char *)n++) - (int)(*(const u_char *)m++);
int masks_are_equal = 1;
if (longer > 0)
lim -= longer;
while (n < lim) {
if (*n & ~(*m))
return 0;
if (*n++ != *m++)
masks_are_equal = 0;
}
while (n < lim2)
if (*n++)
return 0;
if (masks_are_equal && (longer < 0))
for (lim2 = m - longer; m < lim2; )
if (*m++)
return 1;
return !masks_are_equal;
}
struct radix_node *
rn_lookup(
const void *v_arg,
const void *m_arg,
struct radix_node_head *head)
{
struct radix_node *x;
const char *netmask = NULL;
if (m_arg) {
if ((x = rn_addmask(m_arg, 1, head->rnh_treetop->rn_off)) == 0)
return NULL;
netmask = x->rn_key;
}
x = rn_match(v_arg, head);
if (x != NULL && netmask != NULL) {
while (x != NULL && x->rn_mask != netmask)
x = x->rn_dupedkey;
}
return x;
}
static int
rn_satisfies_leaf(
const char *trial,
struct radix_node *leaf,
int skip)
{
const char *cp = trial;
const char *cp2 = leaf->rn_key;
const char *cp3 = leaf->rn_mask;
const char *cplim;
int length = uimin(*(const u_char *)cp, *(const u_char *)cp2);
if (cp3 == 0)
cp3 = rn_ones;
else
length = uimin(length, *(const u_char *)cp3);
cplim = cp + length; cp3 += skip; cp2 += skip;
for (cp += skip; cp < cplim; cp++, cp2++, cp3++)
if ((*cp ^ *cp2) & *cp3)
return 0;
return 1;
}
struct radix_node *
rn_match(
const void *v_arg,
struct radix_node_head *head)
{
const char * const v = v_arg;
struct radix_node *t = head->rnh_treetop;
struct radix_node *top = t;
struct radix_node *x;
struct radix_node *saved_t;
const char *cp = v;
const char *cp2;
const char *cplim;
int off = t->rn_off;
int vlen = *(const u_char *)cp;
int matched_off;
int test, b, rn_b;
/*
* Open code rn_search(v, top) to avoid overhead of extra
* subroutine call.
*/
for (; t->rn_b >= 0; ) {
if (t->rn_bmask & cp[t->rn_off])
t = t->rn_r;
else
t = t->rn_l;
}
/*
* See if we match exactly as a host destination
* or at least learn how many bits match, for normal mask finesse.
*
* It doesn't hurt us to limit how many bytes to check
* to the length of the mask, since if it matches we had a genuine
* match and the leaf we have is the most specific one anyway;
* if it didn't match with a shorter length it would fail
* with a long one. This wins big for class B&C netmasks which
* are probably the most common case...
*/
if (t->rn_mask)
vlen = *(const u_char *)t->rn_mask;
cp += off; cp2 = t->rn_key + off; cplim = v + vlen;
for (; cp < cplim; cp++, cp2++)
if (*cp != *cp2)
goto on1;
/*
* This extra grot is in case we are explicitly asked
* to look up the default. Ugh!
*/
if ((t->rn_flags & RNF_ROOT) && t->rn_dupedkey)
t = t->rn_dupedkey;
return t;
on1:
test = (*cp ^ *cp2) & 0xff; /* find first bit that differs */
for (b = 7; (test >>= 1) > 0;)
b--;
matched_off = cp - v;
b += matched_off << 3;
rn_b = -1 - b;
/*
* If there is a host route in a duped-key chain, it will be first.
*/
if ((saved_t = t)->rn_mask == 0)
t = t->rn_dupedkey;
for (; t; t = t->rn_dupedkey)
/*
* Even if we don't match exactly as a host,
* we may match if the leaf we wound up at is
* a route to a net.
*/
if (t->rn_flags & RNF_NORMAL) {
if (rn_b <= t->rn_b)
return t;
} else if (rn_satisfies_leaf(v, t, matched_off))
return t;
t = saved_t;
/* start searching up the tree */
do {
struct radix_mask *m;
t = t->rn_p;
m = t->rn_mklist;
if (m) {
/*
* If non-contiguous masks ever become important
* we can restore the masking and open coding of
* the search and satisfaction test and put the
* calculation of "off" back before the "do".
*/
do {
if (m->rm_flags & RNF_NORMAL) {
if (rn_b <= m->rm_b)
return m->rm_leaf;
} else {
off = uimin(t->rn_off, matched_off);
x = rn_search_m(v, t, m->rm_mask);
while (x && x->rn_mask != m->rm_mask)
x = x->rn_dupedkey;
if (x && rn_satisfies_leaf(v, x, off))
return x;
}
m = m->rm_mklist;
} while (m);
}
} while (t != top);
return NULL;
}
static void
rn_nodeprint(struct radix_node *rn, rn_printer_t printer, void *arg,
const char *delim)
{
(*printer)(arg, "%s(%s%p: p<%p> l<%p> r<%p>)",
delim, ((void *)rn == arg) ? "*" : "", rn, rn->rn_p,
rn->rn_l, rn->rn_r);
}
#ifdef RN_DEBUG
int rn_debug = 1;
static void
rn_dbg_print(void *arg, const char *fmt, ...)
{
va_list ap;
va_start(ap, fmt);
vlog(LOG_DEBUG, fmt, ap);
va_end(ap);
}
static void
rn_treeprint(struct radix_node_head *h, rn_printer_t printer, void *arg)
{
struct radix_node *dup, *rn;
const char *delim;
if (printer == NULL)
return;
rn = rn_walkfirst(h->rnh_treetop, printer, arg);
for (;;) {
/* Process leaves */
delim = "";
for (dup = rn; dup != NULL; dup = dup->rn_dupedkey) {
if ((dup->rn_flags & RNF_ROOT) != 0)
continue;
rn_nodeprint(dup, printer, arg, delim);
delim = ", ";
}
rn = rn_walknext(rn, printer, arg);
if (rn->rn_flags & RNF_ROOT)
return;
}
/* NOTREACHED */
}
#define traverse(__head, __rn) rn_treeprint((__head), rn_dbg_print, (__rn))
#endif /* RN_DEBUG */
struct radix_node *
rn_newpair(
const void *v,
int b,
struct radix_node nodes[2])
{
struct radix_node *tt = nodes;
struct radix_node *t = tt + 1;
t->rn_b = b; t->rn_bmask = 0x80 >> (b & 7);
t->rn_l = tt; t->rn_off = b >> 3;
tt->rn_b = -1; tt->rn_key = v; tt->rn_p = t;
tt->rn_flags = t->rn_flags = RNF_ACTIVE;
return t;
}
struct radix_node *
rn_insert(
const void *v_arg,
struct radix_node_head *head,
int *dupentry,
struct radix_node nodes[2])
{
struct radix_node *top = head->rnh_treetop;
struct radix_node *t = rn_search(v_arg, top);
struct radix_node *tt;
const char *v = v_arg;
int head_off = top->rn_off;
int vlen = *((const u_char *)v);
const char *cp = v + head_off;
int b;
/*
* Find first bit at which v and t->rn_key differ
*/
{
const char *cp2 = t->rn_key + head_off;
const char *cplim = v + vlen;
int cmp_res;
while (cp < cplim)
if (*cp2++ != *cp++)
goto on1;
*dupentry = 1;
return t;
on1:
*dupentry = 0;
cmp_res = (cp[-1] ^ cp2[-1]) & 0xff;
for (b = (cp - v) << 3; cmp_res; b--)
cmp_res >>= 1;
}
{
struct radix_node *p, *x = top;
cp = v;
do {
p = x;
if (cp[x->rn_off] & x->rn_bmask)
x = x->rn_r;
else
x = x->rn_l;
} while (b > (unsigned) x->rn_b); /* x->rn_b < b && x->rn_b >= 0 */
#ifdef RN_DEBUG
if (rn_debug)
log(LOG_DEBUG, "%s: Going In:\n", __func__), traverse(head, p);
#endif
t = rn_newpair(v_arg, b, nodes); tt = t->rn_l;
if ((cp[p->rn_off] & p->rn_bmask) == 0)
p->rn_l = t;
else
p->rn_r = t;
x->rn_p = t; t->rn_p = p; /* frees x, p as temp vars below */
if ((cp[t->rn_off] & t->rn_bmask) == 0) {
t->rn_r = x;
} else {
t->rn_r = tt; t->rn_l = x;
}
#ifdef RN_DEBUG
if (rn_debug) {
log(LOG_DEBUG, "%s: Coming Out:\n", __func__),
traverse(head, p);
}
#endif /* RN_DEBUG */
}
return tt;
}
struct radix_node *
rn_addmask(
const void *n_arg,
int search,
int skip)
{
const char *netmask = n_arg;
const char *cp;
const char *cplim;
struct radix_node *x;
struct radix_node *saved_x;
int b = 0, mlen, j;
int maskduplicated, m0, isnormal;
static int last_zeroed = 0;
if ((mlen = *(const u_char *)netmask) > max_keylen)
mlen = max_keylen;
if (skip == 0)
skip = 1;
if (mlen <= skip)
return mask_rnhead->rnh_nodes;
if (skip > 1)
memmove(addmask_key + 1, rn_ones + 1, skip - 1);
if ((m0 = mlen) > skip)
memmove(addmask_key + skip, netmask + skip, mlen - skip);
/*
* Trim trailing zeroes.
*/
for (cp = addmask_key + mlen; (cp > addmask_key) && cp[-1] == 0;)
cp--;
mlen = cp - addmask_key;
if (mlen <= skip) {
if (m0 >= last_zeroed)
last_zeroed = mlen;
return mask_rnhead->rnh_nodes;
}
if (m0 < last_zeroed)
memset(addmask_key + m0, 0, last_zeroed - m0);
*addmask_key = last_zeroed = mlen;
x = rn_search(addmask_key, rn_masktop);
if (memcmp(addmask_key, x->rn_key, mlen) != 0)
x = 0;
if (x || search)
return x;
R_Malloc(x, struct radix_node *, max_keylen + 2 * sizeof (*x));
if ((saved_x = x) == NULL)
return NULL;
memset(x, 0, max_keylen + 2 * sizeof (*x));
cp = netmask = (void *)(x + 2);
memmove(x + 2, addmask_key, mlen);
x = rn_insert(cp, mask_rnhead, &maskduplicated, x);
if (maskduplicated) {
log(LOG_ERR, "rn_addmask: mask impossibly already in tree\n");
Free(saved_x);
return x;
}
/*
* Calculate index of mask, and check for normalcy.
*/
cplim = netmask + mlen; isnormal = 1;
for (cp = netmask + skip; (cp < cplim) && *(const u_char *)cp == 0xff;)
cp++;
if (cp != cplim) {
for (j = 0x80; (j & *cp) != 0; j >>= 1)
b++;
if (*cp != normal_chars[b] || cp != (cplim - 1))
isnormal = 0;
}
b += (cp - netmask) << 3;
x->rn_b = -1 - b;
if (isnormal)
x->rn_flags |= RNF_NORMAL;
return x;
}
static int /* XXX: arbitrary ordering for non-contiguous masks */
rn_lexobetter(
const void *m_arg,
const void *n_arg)
{
const u_char *mp = m_arg;
const u_char *np = n_arg;
const u_char *lim;
if (*mp > *np)
return 1; /* not really, but need to check longer one first */
if (*mp == *np)
for (lim = mp + *mp; mp < lim;)
if (*mp++ > *np++)
return 1;
return 0;
}
static struct radix_mask *
rn_new_radix_mask(
struct radix_node *tt,
struct radix_mask *next)
{
struct radix_mask *m;
MKGet(m);
if (m == NULL) {
log(LOG_ERR, "Mask for route not entered\n");
return NULL;
}
memset(m, 0, sizeof(*m));
m->rm_b = tt->rn_b;
m->rm_flags = tt->rn_flags;
if (tt->rn_flags & RNF_NORMAL)
m->rm_leaf = tt;
else
m->rm_mask = tt->rn_mask;
m->rm_mklist = next;
tt->rn_mklist = m;
return m;
}
struct radix_node *
rn_addroute(
const void *v_arg,
const void *n_arg,
struct radix_node_head *head,
struct radix_node treenodes[2])
{
const char *v = v_arg, *netmask = n_arg;
struct radix_node *t, *x = NULL, *tt;
struct radix_node *saved_tt, *top = head->rnh_treetop;
short b = 0, b_leaf = 0;
int keyduplicated;
const char *mmask;
struct radix_mask *m, **mp;
/*
* In dealing with non-contiguous masks, there may be
* many different routes which have the same mask.
* We will find it useful to have a unique pointer to
* the mask to speed avoiding duplicate references at
* nodes and possibly save time in calculating indices.
*/
if (netmask != NULL) {
if ((x = rn_addmask(netmask, 0, top->rn_off)) == NULL)
return NULL;
b_leaf = x->rn_b;
b = -1 - x->rn_b;
netmask = x->rn_key;
}
/*
* Deal with duplicated keys: attach node to previous instance
*/
saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
if (keyduplicated) {
for (t = tt; tt != NULL; t = tt, tt = tt->rn_dupedkey) {
if (tt->rn_mask == netmask)
return NULL;
if (netmask == NULL ||
(tt->rn_mask != NULL &&
(b_leaf < tt->rn_b || /* index(netmask) > node */
rn_refines(netmask, tt->rn_mask) ||
rn_lexobetter(netmask, tt->rn_mask))))
break;
}
/*
* If the mask is not duplicated, we wouldn't
* find it among possible duplicate key entries
* anyway, so the above test doesn't hurt.
*
* We sort the masks for a duplicated key the same way as
* in a masklist -- most specific to least specific.
* This may require the unfortunate nuisance of relocating
* the head of the list.
*
* We also reverse, or doubly link the list through the
* parent pointer.
*/
if (tt == saved_tt) {
struct radix_node *xx = x;
/* link in at head of list */
(tt = treenodes)->rn_dupedkey = t;
tt->rn_flags = t->rn_flags;
tt->rn_p = x = t->rn_p;
t->rn_p = tt;
if (x->rn_l == t)
x->rn_l = tt;
else
x->rn_r = tt;
saved_tt = tt;
x = xx;
} else {
(tt = treenodes)->rn_dupedkey = t->rn_dupedkey;
t->rn_dupedkey = tt;
tt->rn_p = t;
if (tt->rn_dupedkey)
tt->rn_dupedkey->rn_p = tt;
}
tt->rn_key = v;
tt->rn_b = -1;
tt->rn_flags = RNF_ACTIVE;
}
/*
* Put mask in tree.
*/
if (netmask != NULL) {
tt->rn_mask = netmask;
tt->rn_b = x->rn_b;
tt->rn_flags |= x->rn_flags & RNF_NORMAL;
}
t = saved_tt->rn_p;
if (keyduplicated)
goto on2;
b_leaf = -1 - t->rn_b;
if (t->rn_r == saved_tt)
x = t->rn_l;
else
x = t->rn_r;
/* Promote general routes from below */
if (x->rn_b < 0) {
for (mp = &t->rn_mklist; x != NULL; x = x->rn_dupedkey) {
if (x->rn_mask != NULL && x->rn_b >= b_leaf &&
x->rn_mklist == NULL) {
*mp = m = rn_new_radix_mask(x, NULL);
if (m != NULL)
mp = &m->rm_mklist;
}
}
} else if (x->rn_mklist != NULL) {
/*
* Skip over masks whose index is > that of new node
*/
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
if (m->rm_b >= b_leaf)
break;
t->rn_mklist = m;
*mp = NULL;
}
on2:
/* Add new route to highest possible ancestor's list */
if (netmask == NULL || b > t->rn_b)
return tt; /* can't lift at all */
b_leaf = tt->rn_b;
do {
x = t;
t = t->rn_p;
} while (b <= t->rn_b && x != top);
/*
* Search through routes associated with node to
* insert new route according to index.
* Need same criteria as when sorting dupedkeys to avoid
* double loop on deletion.
*/
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) {
if (m->rm_b < b_leaf)
continue;
if (m->rm_b > b_leaf)
break;
if (m->rm_flags & RNF_NORMAL) {
mmask = m->rm_leaf->rn_mask;
if (tt->rn_flags & RNF_NORMAL) {
log(LOG_ERR, "Non-unique normal route,"
" mask not entered\n");
return tt;
}
} else
mmask = m->rm_mask;
if (mmask == netmask) {
m->rm_refs++;
tt->rn_mklist = m;
return tt;
}
if (rn_refines(netmask, mmask) || rn_lexobetter(netmask, mmask))
break;
}
*mp = rn_new_radix_mask(tt, *mp);
return tt;
}
struct radix_node *
rn_delete1(
const void *v_arg,
const void *netmask_arg,
struct radix_node_head *head,
struct radix_node *rn)
{
struct radix_node *t, *p, *x, *tt;
struct radix_mask *m, *saved_m, **mp;
struct radix_node *dupedkey, *saved_tt, *top;
const char *v, *netmask;
int b, head_off, vlen;
v = v_arg;
netmask = netmask_arg;
x = head->rnh_treetop;
tt = rn_search(v, x);
head_off = x->rn_off;
vlen = *(const u_char *)v;
saved_tt = tt;
top = x;
if (tt == NULL ||
memcmp(v + head_off, tt->rn_key + head_off, vlen - head_off) != 0)
return NULL;
/*
* Delete our route from mask lists.
*/
if (netmask != NULL) {
if ((x = rn_addmask(netmask, 1, head_off)) == NULL)
return NULL;
netmask = x->rn_key;
while (tt->rn_mask != netmask)
if ((tt = tt->rn_dupedkey) == NULL)
return NULL;
}
if (tt->rn_mask == NULL || (saved_m = m = tt->rn_mklist) == NULL)
goto on1;
if (tt->rn_flags & RNF_NORMAL) {
if (m->rm_leaf != tt || m->rm_refs > 0) {
log(LOG_ERR, "rn_delete: inconsistent annotation\n");
return NULL; /* dangling ref could cause disaster */
}
} else {
if (m->rm_mask != tt->rn_mask) {
log(LOG_ERR, "rn_delete: inconsistent annotation\n");
goto on1;
}
if (--m->rm_refs >= 0)
goto on1;
}
b = -1 - tt->rn_b;
t = saved_tt->rn_p;
if (b > t->rn_b)
goto on1; /* Wasn't lifted at all */
do {
x = t;
t = t->rn_p;
} while (b <= t->rn_b && x != top);
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist) {
if (m == saved_m) {
*mp = m->rm_mklist;
MKFree(m);
break;
}
}
if (m == NULL) {
log(LOG_ERR, "rn_delete: couldn't find our annotation\n");
if (tt->rn_flags & RNF_NORMAL)
return NULL; /* Dangling ref to us */
}
on1:
/*
* Eliminate us from tree
*/
if (tt->rn_flags & RNF_ROOT)
return NULL;
#ifdef RN_DEBUG
if (rn_debug)
log(LOG_DEBUG, "%s: Going In:\n", __func__), traverse(head, tt);
#endif
t = tt->rn_p;
dupedkey = saved_tt->rn_dupedkey;
if (dupedkey != NULL) {
/*
* Here, tt is the deletion target, and
* saved_tt is the head of the dupedkey chain.
*/
if (tt == saved_tt) {
x = dupedkey;
x->rn_p = t;
if (t->rn_l == tt)
t->rn_l = x;
else
t->rn_r = x;
} else {
/* find node in front of tt on the chain */
for (x = p = saved_tt;
p != NULL && p->rn_dupedkey != tt;)
p = p->rn_dupedkey;
if (p != NULL) {
p->rn_dupedkey = tt->rn_dupedkey;
if (tt->rn_dupedkey != NULL)
tt->rn_dupedkey->rn_p = p;
} else
log(LOG_ERR, "rn_delete: couldn't find us\n");
}
t = tt + 1;
if (t->rn_flags & RNF_ACTIVE) {
*++x = *t;
p = t->rn_p;
if (p->rn_l == t)
p->rn_l = x;
else
p->rn_r = x;
x->rn_l->rn_p = x;
x->rn_r->rn_p = x;
}
goto out;
}
if (t->rn_l == tt)
x = t->rn_r;
else
x = t->rn_l;
p = t->rn_p;
if (p->rn_r == t)
p->rn_r = x;
else
p->rn_l = x;
x->rn_p = p;
/*
* Demote routes attached to us.
*/
if (t->rn_mklist == NULL)
;
else if (x->rn_b >= 0) {
for (mp = &x->rn_mklist; (m = *mp) != NULL; mp = &m->rm_mklist)
;
*mp = t->rn_mklist;
} else {
/* If there are any key,mask pairs in a sibling
duped-key chain, some subset will appear sorted
in the same order attached to our mklist */
for (m = t->rn_mklist;
m != NULL && x != NULL;
x = x->rn_dupedkey) {
if (m == x->rn_mklist) {
struct radix_mask *mm = m->rm_mklist;
x->rn_mklist = NULL;
if (--(m->rm_refs) < 0)
MKFree(m);
m = mm;
}
}
if (m != NULL) {
log(LOG_ERR, "rn_delete: Orphaned Mask %p at %p\n",
m, x);
}
}
/*
* We may be holding an active internal node in the tree.
*/
x = tt + 1;
if (t != x) {
*t = *x;
t->rn_l->rn_p = t;
t->rn_r->rn_p = t;
p = x->rn_p;
if (p->rn_l == x)
p->rn_l = t;
else
p->rn_r = t;
}
out:
#ifdef RN_DEBUG
if (rn_debug) {
log(LOG_DEBUG, "%s: Coming Out:\n", __func__),
traverse(head, tt);
}
#endif /* RN_DEBUG */
tt->rn_flags &= ~RNF_ACTIVE;
tt[1].rn_flags &= ~RNF_ACTIVE;
return tt;
}
struct radix_node *
rn_delete(
const void *v_arg,
const void *netmask_arg,
struct radix_node_head *head)
{
return rn_delete1(v_arg, netmask_arg, head, NULL);
}
static struct radix_node *
rn_walknext(struct radix_node *rn, rn_printer_t printer, void *arg)
{
/* If at right child go back up, otherwise, go right */
while (rn->rn_p->rn_r == rn && (rn->rn_flags & RNF_ROOT) == 0) {
if (printer != NULL)
(*printer)(arg, SUBTREE_CLOSE);
rn = rn->rn_p;
}
if (printer)
rn_nodeprint(rn->rn_p, printer, arg, "");
/* Find the next *leaf* since next node might vanish, too */
for (rn = rn->rn_p->rn_r; rn->rn_b >= 0;) {
if (printer != NULL)
(*printer)(arg, SUBTREE_OPEN);
rn = rn->rn_l;
}
return rn;
}
static struct radix_node *
rn_walkfirst(struct radix_node *rn, rn_printer_t printer, void *arg)
{
/* First time through node, go left */
while (rn->rn_b >= 0) {
if (printer != NULL)
(*printer)(arg, SUBTREE_OPEN);
rn = rn->rn_l;
}
return rn;
}
int
rn_walktree(
struct radix_node_head *h,
int (*f)(struct radix_node *, void *),
void *w)
{
int error;
struct radix_node *base, *next, *rn;
/*
* This gets complicated because we may delete the node
* while applying the function f to it, so we need to calculate
* the successor node in advance.
*/
rn = rn_walkfirst(h->rnh_treetop, NULL, NULL);
for (;;) {
base = rn;
next = rn_walknext(rn, NULL, NULL);
/* Process leaves */
while ((rn = base) != NULL) {
base = rn->rn_dupedkey;
if (!(rn->rn_flags & RNF_ROOT) && (error = (*f)(rn, w)))
return error;
}
rn = next;
if (rn->rn_flags & RNF_ROOT)
return 0;
}
/* NOTREACHED */
}
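/*
 * Illustrative sketch, not part of the original source: counting the
 * leaves of a tree with rn_walktree().  The callback may delete the
 * node handed to it, which is why the walker above computes the
 * successor before applying the function.  The names example_count_leaf
 * and example_count are hypothetical.
 */
#if 0	/* example only */
static int
example_count_leaf(struct radix_node *rn, void *arg)
{

	(*(int *)arg)++;
	return 0;		/* a non-zero return would abort the walk */
}

static int
example_count(struct radix_node_head *rnh)
{
	int n = 0;

	(void)rn_walktree(rnh, example_count_leaf, &n);
	return n;
}
#endif	/* example only */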
struct radix_node *
rn_search_matched(struct radix_node_head *h,
int (*matcher)(struct radix_node *, void *), void *w)
{
bool matched;
struct radix_node *base, *next, *rn;
/*
* This gets complicated because we may delete the node
* while applying the matcher to it, so we need to calculate
* the successor node in advance.
*/
rn = rn_walkfirst(h->rnh_treetop, NULL, NULL);
for (;;) {
base = rn;
next = rn_walknext(rn, NULL, NULL);
/* Process leaves */
while ((rn = base) != NULL) {
base = rn->rn_dupedkey;
if (!(rn->rn_flags & RNF_ROOT)) {
matched = (*matcher)(rn, w);
if (matched)
return rn;
}
}
rn = next;
if (rn->rn_flags & RNF_ROOT)
return NULL;
}
/* NOTREACHED */
}
struct delayinit {
void **head;
int off;
SLIST_ENTRY(delayinit) entries;
};
static SLIST_HEAD(, delayinit) delayinits = SLIST_HEAD_INITIALIZER(delayheads);
static int radix_initialized;
/*
* Initialize a radix tree once radix is initialized. Only for bootstrap.
* Assume that no concurrency protection is necessary at this stage.
*/
void
rn_delayedinit(void **head, int off)
{
struct delayinit *di;
if (radix_initialized)
return;
di = kmem_alloc(sizeof(*di), KM_SLEEP);
di->head = head;
di->off = off;
SLIST_INSERT_HEAD(&delayinits, di, entries);
}
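/*
 * Illustrative sketch, not part of the original source: an early
 * consumer can queue its head with rn_delayedinit() while bootstrapping
 * and rely on rn_init() to perform the real rn_inithead() later; once
 * radix is up it calls rn_inithead() directly.  The head pointer name
 * and the offset argument are hypothetical.
 */
#if 0	/* example only */
static void *example_rnh;

static void
example_attach(int off)
{

	if (!radix_initialized)
		rn_delayedinit(&example_rnh, off);	/* queued for rn_init() */
	else if (rn_inithead(&example_rnh, off) == 0)
		panic("example_attach: rn_inithead failed");
}
#endif	/* example only */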
int
rn_inithead(void **head, int off)
{
struct radix_node_head *rnh;
if (*head != NULL)
return 1;
R_Malloc(rnh, struct radix_node_head *, sizeof (*rnh));
if (rnh == NULL)
return 0;
*head = rnh;
return rn_inithead0(rnh, off);
}
int
rn_inithead0(struct radix_node_head *rnh, int off)
{
struct radix_node *t;
struct radix_node *tt;
struct radix_node *ttt;
memset(rnh, 0, sizeof(*rnh));
t = rn_newpair(rn_zeros, off, rnh->rnh_nodes);
ttt = rnh->rnh_nodes + 2;
t->rn_r = ttt;
t->rn_p = t;
tt = t->rn_l;
tt->rn_flags = t->rn_flags = RNF_ROOT | RNF_ACTIVE;
tt->rn_b = -1 - off;
*ttt = *tt;
ttt->rn_key = rn_ones;
rnh->rnh_addaddr = rn_addroute;
rnh->rnh_deladdr = rn_delete;
rnh->rnh_matchaddr = rn_match;
rnh->rnh_lookup = rn_lookup;
rnh->rnh_treetop = t;
return 1;
}
void
rn_init(void)
{
char *cp, *cplim;
struct delayinit *di;
#ifdef _KERNEL
struct domain *dp;
if (radix_initialized)
panic("radix already initialized");
radix_initialized = 1;
DOMAIN_FOREACH(dp) {
if (dp->dom_maxrtkey > max_keylen)
max_keylen = dp->dom_maxrtkey;
}
#endif
if (max_keylen == 0) {
#ifndef _KERNEL
log(LOG_ERR,
"rn_init: radix functions require max_keylen be set\n");
#endif
return;
}
R_Malloc(rn_zeros, char *, 3 * max_keylen);
if (rn_zeros == NULL)
panic("rn_init");
memset(rn_zeros, 0, 3 * max_keylen);
rn_ones = cp = rn_zeros + max_keylen;
addmask_key = cplim = rn_ones + max_keylen;
while (cp < cplim)
*cp++ = -1;
if (rn_inithead((void *)&mask_rnhead, 0) == 0)
panic("rn_init 2");
while ((di = SLIST_FIRST(&delayinits)) != NULL) {
if (!rn_inithead(di->head, di->off))
panic("delayed rn_inithead failed");
SLIST_REMOVE_HEAD(&delayinits, entries);
kmem_free(di, sizeof(*di));
}
}
/* $NetBSD: spl.h,v 1.10 2021/11/02 11:26:05 ryo Exp $ */
/*-
* Copyright (c)2005 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* this header is intended to be included by MD header.
*
* an assumption: makeiplcookie() is reasonably fast.
* if it isn't the case for your port, it's better to have MD optimized
* splxxx() functions, rather than using this header.
*/
#if !defined(_KERNEL) && !defined(_KMEMUSER)
#error not supposed to be exposed to userland.
#endif /* !defined(_KERNEL) && !defined(_KMEMUSER) */
#define _SPL_DECL(x, X) \
static __inline __always_inline int \
spl##x(void) \
{ return splraiseipl(makeiplcookie(IPL_##X)); }
#if defined(IPL_SOFTCLOCK)
_SPL_DECL(softclock, SOFTCLOCK)
#endif /* defined(IPL_SOFTCLOCK) */
#if defined(IPL_SOFTNET)
_SPL_DECL(softnet, SOFTNET)
#endif /* defined(IPL_SOFTNET) */
#if defined(IPL_SOFTSERIAL)
_SPL_DECL(softserial, SOFTSERIAL)
#endif /* defined(IPL_SOFTSERIAL) */
_SPL_DECL(vm, VM)
_SPL_DECL(sched, SCHED)
_SPL_DECL(high, HIGH)
#undef _SPL_DECL
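/*
 * Illustrative note, not part of the original header: on a port that
 * defines IPL_SOFTNET, the _SPL_DECL(softnet, SOFTNET) line above
 * expands to roughly
 *
 *	static __inline __always_inline int
 *	splsoftnet(void)
 *	{ return splraiseipl(makeiplcookie(IPL_SOFTNET)); }
 *
 * i.e. the generated splsoftnet() raises the interrupt priority level
 * to IPL_SOFTNET and returns the previous level, which the caller
 * later restores with splx().
 */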
/* $NetBSD: null_vfsops.c,v 1.101 2023/02/06 10:32:58 hannken Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Null file-system: VFS operations.
*
* See null_vnops.c for a description.
*/
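/*
 * Illustrative userland sketch, not part of the original source: the
 * mount(2) call that mount_null(8) effectively performs, layering a
 * lower directory onto a mount point through struct null_args.  The
 * paths and the function name are examples only.
 */
#if 0	/* example only, compiled in userland */
#include <sys/param.h>
#include <sys/mount.h>
#include <string.h>
#include <miscfs/nullfs/null.h>

static int
example_null_mount(void)
{
	static char lower[] = "/usr/src";	/* lower (real) layer */
	struct null_args args;

	memset(&args, 0, sizeof(args));
	args.la.target = lower;
	/* Mount a null layer of /usr/src on /mnt. */
	return mount(MOUNT_NULL, "/mnt", 0, &args, sizeof(args));
}
#endif	/* example only */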
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: null_vfsops.c,v 1.101 2023/02/06 10:32:58 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/module.h>
#include <miscfs/nullfs/null.h>
#include <miscfs/genfs/layer_extern.h>
MODULE(MODULE_CLASS_VFS, null, "layerfs");
VFS_PROTOS(nullfs);
int
nullfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct vnode *lowerrootvp, *vp;
struct null_args *args = data;
struct null_mount *nmp;
struct layer_mount *lmp;
struct pathbuf *pb;
struct nameidata nd;
int error;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof(*args))
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
lmp = MOUNTTOLAYERMOUNT(mp);
if (lmp == NULL)
return EIO;
args->la.target = NULL;
*data_len = sizeof(*args);
return 0;
}
/* Update is not supported. */
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/* Find the lower vnode and lock it. */
error = pathbuf_copyin(args->la.target, &pb);
if (error) {
return error;
}
NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
return error;
}
lowerrootvp = nd.ni_vp;
pathbuf_destroy(pb);
/* Create the mount point. */
nmp = kmem_zalloc(sizeof(struct null_mount), KM_SLEEP);
mp->mnt_data = nmp;
mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_MPSAFE;
mp->mnt_iflag |= lowerrootvp->v_mount->mnt_iflag & IMNT_SHRLOOKUP;
/*
* Make sure that the mount point is sufficiently initialized
* that the node create call will work.
*/
vfs_getnewfsid(mp);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(nmp, sizeof(struct null_mount));
return error;
}
nmp->nullm_size = sizeof(struct null_node);
nmp->nullm_tag = VT_NULL;
nmp->nullm_bypass = layer_bypass;
nmp->nullm_vnodeop_p = null_vnodeop_p;
/* Setup a null node for root vnode. */
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
if (error) {
vrele(lowerrootvp);
kmem_free(nmp, sizeof(struct null_mount));
return error;
}
/*
* Keep a held reference to the root vnode. It will be released on
* umount. Note: nullfs is MP-safe.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
nmp->nullm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->la.target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, curlwp);
if (error)
return error;
if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
mp->mnt_flag |= MNT_LOCAL;
return 0;
}
int
nullfs_unmount(struct mount *mp, int mntflags)
{
struct null_mount *nmp = MOUNTTONULLMOUNT(mp);
struct vnode *null_rootvp = nmp->nullm_rootvp;
int error, flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(null_rootvp) > 1 && (mntflags & MNT_FORCE) == 0)
return EBUSY;
if ((error = vflush(mp, null_rootvp, flags)) != 0)
return error;
/* Eliminate all activity and release the vnode. */
vgone(null_rootvp);
/* Finally, destroy the mount point structures. */
kmem_free(mp->mnt_data, sizeof(struct null_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc null_vnodeop_opv_desc;
const struct vnodeopv_desc * const nullfs_vnodeopv_descs[] = {
&null_vnodeop_opv_desc,
NULL,
};
struct vfsops nullfs_vfsops = {
.vfs_name = MOUNT_NULL,
.vfs_min_mount_data = sizeof (struct null_args),
.vfs_mount = nullfs_mount,
.vfs_start = layerfs_start,
.vfs_unmount = nullfs_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = nullfs_vnodeopv_descs
};
SYSCTL_SETUP(nullfs_sysctl_setup, "nullfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "null",
SYSCTL_DESCR("Loopback file system"),
NULL, 0, NULL, 0,
CTL_VFS, 9, CTL_EOL);
/*
* XXX the "9" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "9" is the order as taken from sys/mount.h
*/
}
static int
null_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&nullfs_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&nullfs_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return error;
}
/* $NetBSD: wsmux.c,v 1.66 2022/03/28 12:38:58 riastradh Exp $ */
/*
* Copyright (c) 1998, 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Author: Lennart Augustsson <lennart@augustsson.net>
* Carlstedt Research & Technology
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* wscons mux device.
*
* The mux device is a collection of real mice and keyboards and acts as
* a merge point for all the events from the different real devices.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: wsmux.c,v 1.66 2022/03/28 12:38:58 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_modular.h"
#endif
#include "wsdisplay.h"
#include "wsmux.h"
#include "wskbd.h"
#include "wsmouse.h"
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/device.h>
#include <sys/device_impl.h> /* XXX autoconf abuse */
#include "opt_wsdisplay_compat.h"
#include <dev/wscons/wsconsio.h>
#include <dev/wscons/wsksymdef.h>
#include <dev/wscons/wseventvar.h>
#include <dev/wscons/wscons_callbacks.h>
#include <dev/wscons/wsmuxvar.h>
#include "ioconf.h"
#ifdef WSMUX_DEBUG
#define DPRINTF(x) if (wsmuxdebug) printf x
#define DPRINTFN(n,x) if (wsmuxdebug > (n)) printf x
int wsmuxdebug = 0;
#else
#define DPRINTF(x)
#define DPRINTFN(n,x)
#endif
/*
* The wsmux pseudo device is used to multiplex events from several wsmouse,
* wskbd, and/or wsmux devices together.
* The devices connected together form a tree with muxes in the interior
* and real devices (mouse and kbd) at the leaves. The special case of
* a tree with one node (mux or other) is supported as well.
* Only the device at the root of the tree can be opened (if a non-root
* device is opened the subtree rooted at that point is severed from the
* containing tree). When the root is opened it allocates a wseventvar
* struct which all the nodes in the tree will send their events to.
* An ioctl() performed on the root is propagated to all the nodes.
* There are also ioctl() operations to add and remove nodes from a tree.
*/
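/*
 * Illustrative userland sketch, not part of the original source:
 * attaching a wskbd unit to a mux with the WSMUXIO_ADD_DEVICE ioctl
 * described above, through a write-only open of the mux device.  The
 * device path, unit number and function name are examples only.
 */
#if 0	/* example only, compiled in userland */
#include <sys/ioctl.h>
#include <dev/wscons/wsconsio.h>
#include <fcntl.h>

static int
example_add_kbd_to_mux(void)
{
	struct wsmux_device dev;
	int fd;

	if ((fd = open("/dev/wsmux0", O_WRONLY)) == -1)
		return -1;
	dev.type = WSMUX_KBD;	/* a keyboard leaf */
	dev.idx = 1;		/* wskbd1 */
	return ioctl(fd, WSMUXIO_ADD_DEVICE, &dev);
}
#endif	/* example only */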
static int wsmux_mux_open(struct wsevsrc *, struct wseventvar *);
static int wsmux_mux_close(struct wsevsrc *);
static void wsmux_do_open(struct wsmux_softc *, struct wseventvar *);
static void wsmux_do_close(struct wsmux_softc *);
#if NWSDISPLAY > 0
static int wsmux_evsrc_set_display(device_t, struct wsevsrc *);
#else
#define wsmux_evsrc_set_display NULL
#endif
static int wsmux_do_displayioctl(device_t dev, u_long cmd,
void *data, int flag, struct lwp *l);
static int wsmux_do_ioctl(device_t, u_long, void *,int,struct lwp *);
static int wsmux_add_mux(int, struct wsmux_softc *);
#define WSMUXDEV(n) ((n) & 0x7f)
#define WSMUXCTL(n) ((n) & 0x80)
dev_type_open(wsmuxopen);
dev_type_close(wsmuxclose);
dev_type_read(wsmuxread);
dev_type_ioctl(wsmuxioctl);
dev_type_poll(wsmuxpoll);
dev_type_kqfilter(wsmuxkqfilter);
const struct cdevsw wsmux_cdevsw = {
.d_open = wsmuxopen,
.d_close = wsmuxclose,
.d_read = wsmuxread,
.d_write = nowrite,
.d_ioctl = wsmuxioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = wsmuxpoll,
.d_mmap = nommap,
.d_kqfilter = wsmuxkqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER
};
struct wssrcops wsmux_srcops = {
WSMUX_MUX,
wsmux_mux_open, wsmux_mux_close, wsmux_do_ioctl, wsmux_do_displayioctl,
wsmux_evsrc_set_display
};
/* From upper level */
void
wsmuxattach(int n)
{
}
/* Keep track of all muxes that have been allocated */
static struct wsmux_softc **wsmuxdevs = NULL;
static int nwsmux = 0;
/* Return mux n, create if necessary */
struct wsmux_softc *
wsmux_getmux(int n)
{
struct wsmux_softc *sc;
n = WSMUXDEV(n); /* limit range */
/* Make sure there is room for mux n in the table */
if (n >= nwsmux) {
void *new;
new = realloc(wsmuxdevs, (n + 1) * sizeof(*wsmuxdevs),
M_DEVBUF, M_ZERO | M_WAITOK);
wsmuxdevs = new;
nwsmux = n + 1;
}
sc = wsmuxdevs[n];
if (sc == NULL) {
sc = wsmux_create("wsmux", n);
wsmuxdevs[n] = sc;
}
return (sc);
}
/*
* open() of the pseudo device from device table.
*/
int
wsmuxopen(dev_t dev, int flags, int mode, struct lwp *l)
{
struct wsmux_softc *sc;
struct wseventvar *evar;
int minr, unit;
minr = minor(dev);
unit = WSMUXDEV(minr);
sc = wsmux_getmux(unit);
if (sc == NULL)
return (ENXIO);
DPRINTF(("wsmuxopen: %s: sc=%p l=%p\n",
device_xname(sc->sc_base.me_dv), sc, l));
if (WSMUXCTL(minr)) {
/* This is the control device which does not allow reads. */
if (flags & FREAD)
return (EINVAL);
return (0);
}
if ((flags & (FREAD | FWRITE)) == FWRITE)
/* Allow write only open */
return (0);
if (sc->sc_base.me_parent != NULL) {
/* Grab the mux out of the greedy hands of the parent mux. */
DPRINTF(("wsmuxopen: detach\n"));
wsmux_detach_sc(&sc->sc_base);
}
if (sc->sc_base.me_evp != NULL)
/* Already open. */
return (EBUSY);
evar = &sc->sc_base.me_evar;
wsevent_init(evar, l->l_proc);
#ifdef WSDISPLAY_COMPAT_RAWKBD
sc->sc_rawkbd = 0;
#endif
wsmux_do_open(sc, evar);
return (0);
}
/*
* Open of a mux via the parent mux.
*/
int
wsmux_mux_open(struct wsevsrc *me, struct wseventvar *evar)
{
struct wsmux_softc *sc = (struct wsmux_softc *)me;
#ifdef DIAGNOSTIC
if (sc->sc_base.me_evp != NULL) {
printf("wsmux_mux_open: busy\n");
return (EBUSY);
}
if (sc->sc_base.me_parent == NULL) {
printf("wsmux_mux_open: no parent\n");
return (EINVAL);
}
#endif
wsmux_do_open(sc, evar);
return (0);
}
/* Common part of opening a mux. */
void
wsmux_do_open(struct wsmux_softc *sc, struct wseventvar *evar)
{
struct wsevsrc *me;
sc->sc_base.me_evp = evar; /* remember event variable, mark as open */
/* Open all children. */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmuxopen: %s: m=%p dev=%s\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv)));
#ifdef DIAGNOSTIC
if (me->me_evp != NULL) {
printf("wsmuxopen: dev already in use\n");
continue;
}
if (me->me_parent != sc) {
printf("wsmux_do_open: bad child=%p\n", me);
continue;
}
{
int error = wsevsrc_open(me, evar);
if (error) {
DPRINTF(("wsmuxopen: open failed %d\n", error));
}
}
#else
/* ignore errors, failing children will not be marked open */
(void)wsevsrc_open(me, evar);
#endif
}
}
/*
* close() of the pseudo device from device table.
*/
int
wsmuxclose(dev_t dev, int flags, int mode,
struct lwp *l)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
struct wseventvar *evar = sc->sc_base.me_evp;
if (WSMUXCTL(minr))
/* control device */
return (0);
if (evar == NULL)
/* Not open for read */
return (0);
wsmux_do_close(sc);
sc->sc_base.me_evp = NULL;
wsevent_fini(evar);
return (0);
}
/*
* Close of a mux via the parent mux.
*/
int
wsmux_mux_close(struct wsevsrc *me)
{
me->me_evp = NULL;
wsmux_do_close((struct wsmux_softc *)me);
return (0);
}
/* Common part of closing a mux. */
void
wsmux_do_close(struct wsmux_softc *sc)
{
struct wsevsrc *me;
DPRINTF(("wsmuxclose: %s: sc=%p\n",
device_xname(sc->sc_base.me_dv), sc));
/* Close all the children. */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmuxclose %s: m=%p dev=%s\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv)));
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmuxclose: bad child=%p\n", me);
continue;
}
#endif
(void)wsevsrc_close(me);
me->me_evp = NULL;
}
}
/*
* read() of the pseudo device from device table.
*/
int
wsmuxread(dev_t dev, struct uio *uio, int flags)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
struct wseventvar *evar;
int error;
if (WSMUXCTL(minr)) {
/* control device */
return (EINVAL);
}
evar = sc->sc_base.me_evp;
if (evar == NULL) {
#ifdef DIAGNOSTIC
/* XXX can we get here? */
printf("wsmuxread: not open\n");
#endif
return (EINVAL);
}
DPRINTFN(5,("wsmuxread: %s event read evar=%p\n",
device_xname(sc->sc_base.me_dv), evar));
error = wsevent_read(evar, uio, flags);
DPRINTFN(5,("wsmuxread: %s event read ==> error=%d\n",
device_xname(sc->sc_base.me_dv), error));
return (error);
}
/*
* ioctl of the pseudo device from device table.
*/
int
wsmuxioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
int u = WSMUXDEV(minor(dev));
return wsmux_do_ioctl(wsmuxdevs[u]->sc_base.me_dv, cmd, data, flag, l);
}
/*
* ioctl of a mux via the parent mux, continuation of wsmuxioctl().
*/
int
wsmux_do_ioctl(device_t dv, u_long cmd, void *data, int flag,
struct lwp *lwp)
{
struct wsmux_softc *sc = device_private(dv);
struct wsevsrc *me;
int error, ok;
int s, n;
struct wseventvar *evar;
struct wscons_event event;
struct wsmux_device_list *l;
DPRINTF(("wsmux_do_ioctl: %s: enter sc=%p, cmd=%08lx\n",
device_xname(sc->sc_base.me_dv), sc, cmd));
switch (cmd) {
#if defined(COMPAT_50) || defined(MODULAR)
case WSMUXIO_OINJECTEVENT:
#endif /* defined(COMPAT_50) || defined(MODULAR) */
case WSMUXIO_INJECTEVENT:
/* Inject an event, e.g., from moused. */
DPRINTF(("%s: inject\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL) {
/* No event sink, so ignore it. */
DPRINTF(("wsmux_do_ioctl: event ignored\n"));
return (0);
}
s = spltty();
event.type = ((struct wscons_event *)data)->type;
event.value = ((struct wscons_event *)data)->value;
error = wsevent_inject(evar, &event, 1);
splx(s);
return error;
case WSMUXIO_ADD_DEVICE:
#define d ((struct wsmux_device *)data)
DPRINTF(("%s: add type=%d, no=%d\n",
device_xname(sc->sc_base.me_dv), d->type, d->idx));
switch (d->type) {
#if NWSMOUSE > 0
case WSMUX_MOUSE:
return (wsmouse_add_mux(d->idx, sc));
#endif
#if NWSKBD > 0
case WSMUX_KBD:
return (wskbd_add_mux(d->idx, sc));
#endif
case WSMUX_MUX:
return (wsmux_add_mux(d->idx, sc));
case WSMUX_BELL:
return (wsbell_add_mux(d->idx, sc));
default:
return (EINVAL);
}
case WSMUXIO_REMOVE_DEVICE:
DPRINTF(("%s: rem type=%d, no=%d\n",
device_xname(sc->sc_base.me_dv), d->type, d->idx));
/* Locate the device */
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
if (me->me_ops->type == d->type &&
device_unit(me->me_dv) == d->idx) {
DPRINTF(("wsmux_do_ioctl: detach\n"));
wsmux_detach_sc(me);
return (0);
}
}
return (EINVAL);
#undef d
case WSMUXIO_LIST_DEVICES:
DPRINTF(("%s: list\n", device_xname(sc->sc_base.me_dv)));
l = (struct wsmux_device_list *)data;
n = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
if (n >= WSMUX_MAXDEV)
break;
l->devices[n].type = me->me_ops->type;
l->devices[n].idx = device_unit(me->me_dv);
n++;
}
l->ndevices = n;
return (0);
#ifdef WSDISPLAY_COMPAT_RAWKBD
case WSKBDIO_SETMODE:
sc->sc_rawkbd = *(int *)data;
DPRINTF(("wsmux_do_ioctl: save rawkbd = %d\n", sc->sc_rawkbd));
break;
#endif
case WSKBDIO_SETVERSION:
case WSMOUSEIO_SETVERSION:
case WSDISPLAYIO_SETVERSION:
DPRINTF(("%s: WSxxxIO_SETVERSION\n",
device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
return wsevent_setversion(evar, *(int *)data);
case FIONBIO:
DPRINTF(("%s: FIONBIO\n", device_xname(sc->sc_base.me_dv)));
return (0);
case FIOASYNC:
DPRINTF(("%s: FIOASYNC\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
evar->async = *(int *)data != 0;
return (0);
case FIOSETOWN:
DPRINTF(("%s: FIOSETOWN\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
if (-*(int *)data != evar->io->p_pgid && *(int *)data != evar->io->p_pid)
return (EPERM);
return (0);
case TIOCSPGRP:
DPRINTF(("%s: TIOCSPGRP\n", device_xname(sc->sc_base.me_dv)));
evar = sc->sc_base.me_evp;
if (evar == NULL)
return (EINVAL);
if (*(int *)data != evar->io->p_pgid)
return (EPERM);
return (0);
default:
DPRINTF(("%s: unknown\n", device_xname(sc->sc_base.me_dv)));
break;
}
if (sc->sc_base.me_evp == NULL
#if NWSDISPLAY > 0
&& sc->sc_base.me_dispdv == NULL
#endif
)
return (EACCES);
/* Return 0 if any of the ioctl() succeeds, otherwise the last error */
error = 0;
ok = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
#ifdef DIAGNOSTIC
/* XXX check evp? */
if (me->me_parent != sc) {
printf("wsmux_do_ioctl: bad child %p\n", me);
continue;
}
#endif
error = wsevsrc_ioctl(me, cmd, data, flag, lwp);
DPRINTF(("wsmux_do_ioctl: %s: me=%p dev=%s ==> %d\n",
device_xname(sc->sc_base.me_dv), me,
device_xname(me->me_dv), error));
if (!error)
ok = 1;
}
if (ok) {
error = 0;
if (cmd == WSKBDIO_SETENCODING) {
sc->sc_kbd_layout = *((kbd_t *)data);
}
}
return (error);
}
/*
* poll() of the pseudo device from device table.
*/
int
wsmuxpoll(dev_t dev, int events, struct lwp *l)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
if (WSMUXCTL(minr)) {
/* control device */
return (0);
}
if (sc->sc_base.me_evp == NULL) {
#ifdef DIAGNOSTIC
printf("wsmuxpoll: not open\n");
#endif
return (POLLHUP);
}
return (wsevent_poll(sc->sc_base.me_evp, events, l));
}
/*
* kqfilter() of the pseudo device from device table.
*/
int
wsmuxkqfilter(dev_t dev, struct knote *kn)
{
int minr = minor(dev);
struct wsmux_softc *sc = wsmuxdevs[WSMUXDEV(minr)];
if (WSMUXCTL(minr)) {
/* control device */
return (1);
}
if (sc->sc_base.me_evp == NULL) {
#ifdef DIAGNOSTIC
printf("wsmuxkqfilter: not open\n");
#endif
return (1);
}
return (wsevent_kqfilter(sc->sc_base.me_evp, kn));
}
/*
* Add mux unit as a child to muxsc.
*/
int
wsmux_add_mux(int unit, struct wsmux_softc *muxsc)
{
struct wsmux_softc *sc, *m;
sc = wsmux_getmux(unit);
if (sc == NULL)
return (ENXIO);
DPRINTF(("wsmux_add_mux: %s(%p) to %s(%p)\n",
device_xname(sc->sc_base.me_dv), sc,
device_xname(muxsc->sc_base.me_dv), muxsc));
if (sc->sc_base.me_parent != NULL || sc->sc_base.me_evp != NULL)
return (EBUSY);
/* The mux we are adding must not be an ancestor of itself. */
	for (m = muxsc; m != NULL; m = m->sc_base.me_parent)
		if (m == sc)
return (EINVAL);
return (wsmux_attach_sc(muxsc, &sc->sc_base));
}
/* Create a new mux softc. */
struct wsmux_softc *
wsmux_create(const char *name, int unit)
{
struct wsmux_softc *sc;
/* XXX This is wrong -- should use autoconfiguration framework */
DPRINTF(("wsmux_create: allocating\n"));
sc = malloc(sizeof *sc, M_DEVBUF, M_WAITOK|M_ZERO);
sc->sc_base.me_dv = malloc(sizeof(struct device), M_DEVBUF,
M_WAITOK|M_ZERO);
TAILQ_INIT(&sc->sc_cld);
snprintf(sc->sc_base.me_dv->dv_xname,
sizeof sc->sc_base.me_dv->dv_xname, "%s%d", name, unit);
sc->sc_base.me_dv->dv_private = sc;
sc->sc_base.me_dv->dv_unit = unit;
sc->sc_base.me_ops = &wsmux_srcops;
sc->sc_kbd_layout = KB_NONE;
return (sc);
}
/* Attach me as a child to sc. */
int
wsmux_attach_sc(struct wsmux_softc *sc, struct wsevsrc *me)
{
int error;
if (sc == NULL)
return (EINVAL);
DPRINTF(("wsmux_attach_sc: %s(%p): type=%d\n",
device_xname(sc->sc_base.me_dv), sc, me->me_ops->type));
#ifdef DIAGNOSTIC
if (me->me_parent != NULL) {
printf("wsmux_attach_sc: busy\n");
return (EBUSY);
}
#endif
me->me_parent = sc;
TAILQ_INSERT_TAIL(&sc->sc_cld, me, me_next);
error = 0;
#if NWSDISPLAY > 0
if (sc->sc_base.me_dispdv != NULL) {
/* This is a display mux, so attach the new device to it. */
DPRINTF(("wsmux_attach_sc: %s: set display %p\n",
device_xname(sc->sc_base.me_dv),
sc->sc_base.me_dispdv));
if (me->me_ops->dsetdisplay != NULL) {
error = wsevsrc_set_display(me, &sc->sc_base);
/* Ignore that the console already has a display. */
if (error == EBUSY)
error = 0;
if (!error) {
#ifdef WSDISPLAY_COMPAT_RAWKBD
DPRINTF(("wsmux_attach_sc: %s set rawkbd=%d\n",
device_xname(me->me_dv),
sc->sc_rawkbd));
(void)wsevsrc_ioctl(me, WSKBDIO_SETMODE,
&sc->sc_rawkbd, 0, 0);
#endif
if (sc->sc_kbd_layout != KB_NONE)
(void)wsevsrc_ioctl(me,
WSKBDIO_SETENCODING,
&sc->sc_kbd_layout, FWRITE, 0);
}
}
}
#endif
if (sc->sc_base.me_evp != NULL) {
/* Mux is open, so open the new subdevice */
DPRINTF(("wsmux_attach_sc: %s: calling open of %s\n",
device_xname(sc->sc_base.me_dv),
device_xname(me->me_dv)));
error = wsevsrc_open(me, sc->sc_base.me_evp);
} else {
DPRINTF(("wsmux_attach_sc: %s not open\n",
device_xname(sc->sc_base.me_dv)));
}
if (error) {
me->me_parent = NULL;
TAILQ_REMOVE(&sc->sc_cld, me, me_next);
}
DPRINTF(("wsmux_attach_sc: %s(%p) done, error=%d\n",
device_xname(sc->sc_base.me_dv), sc, error));
return (error);
}
/* Remove me from the parent. */
void
wsmux_detach_sc(struct wsevsrc *me)
{
struct wsmux_softc *sc = me->me_parent;
DPRINTF(("wsmux_detach_sc: %s(%p) parent=%p\n",
device_xname(me->me_dv), me, sc));
#ifdef DIAGNOSTIC
if (sc == NULL) {
printf("wsmux_detach_sc: %s has no parent\n",
device_xname(me->me_dv));
return;
}
#endif
#if NWSDISPLAY > 0
if (sc->sc_base.me_dispdv != NULL) {
if (me->me_ops->dsetdisplay != NULL)
/* ignore error, there's nothing we can do */
(void)wsevsrc_set_display(me, NULL);
} else
#endif
if (me->me_evp != NULL) {
DPRINTF(("wsmux_detach_sc: close\n"));
/* mux device is open, so close multiplexee */
(void)wsevsrc_close(me);
}
TAILQ_REMOVE(&sc->sc_cld, me, me_next);
me->me_parent = NULL;
DPRINTF(("wsmux_detach_sc: done sc=%p\n", sc));
}
/*
* Display ioctl() of a mux via the parent mux.
*/
int
wsmux_do_displayioctl(device_t dv, u_long cmd, void *data, int flag,
struct lwp *l)
{
struct wsmux_softc *sc = device_private(dv);
struct wsevsrc *me;
int error, ok;
DPRINTF(("wsmux_displayioctl: %s: sc=%p, cmd=%08lx\n",
device_xname(sc->sc_base.me_dv), sc, cmd));
#ifdef WSDISPLAY_COMPAT_RAWKBD
if (cmd == WSKBDIO_SETMODE) {
sc->sc_rawkbd = *(int *)data;
DPRINTF(("wsmux_displayioctl: rawkbd = %d\n", sc->sc_rawkbd));
}
#endif
/*
* Return 0 if any of the ioctl() succeeds, otherwise the last error.
* Return EPASSTHROUGH if no mux component accepts the ioctl.
*/
error = EPASSTHROUGH;
ok = 0;
TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
DPRINTF(("wsmux_displayioctl: me=%p\n", me));
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmux_displayioctl: bad child %p\n", me);
continue;
}
#endif
if (me->me_ops->ddispioctl != NULL) {
error = wsevsrc_display_ioctl(me, cmd, data, flag, l);
DPRINTF(("wsmux_displayioctl: me=%p dev=%s ==> %d\n",
me, device_xname(me->me_dv), error));
if (!error)
ok = 1;
}
}
if (ok)
error = 0;
return (error);
}
#if NWSDISPLAY > 0
/*
* Set display of a mux via the parent mux.
*/
int
wsmux_evsrc_set_display(device_t dv, struct wsevsrc *ame)
{
struct wsmux_softc *muxsc = (struct wsmux_softc *)ame;
struct wsmux_softc *sc = device_private(dv);
device_t displaydv = muxsc ? muxsc->sc_base.me_dispdv : NULL;
DPRINTF(("wsmux_set_display: %s: displaydv=%p\n",
device_xname(sc->sc_base.me_dv), displaydv));
if (displaydv != NULL) {
if (sc->sc_base.me_dispdv != NULL)
return (EBUSY);
} else {
if (sc->sc_base.me_dispdv == NULL)
return (ENXIO);
}
return wsmux_set_display(sc, displaydv);
}
int
wsmux_set_display(struct wsmux_softc *sc, device_t displaydv)
{
device_t odisplaydv;
struct wsevsrc *me;
struct wsmux_softc *nsc = displaydv ? sc : NULL;
int error, ok;
odisplaydv = sc->sc_base.me_dispdv;
sc->sc_base.me_dispdv = displaydv;
if (displaydv)
aprint_verbose_dev(sc->sc_base.me_dv, "connecting to %s\n",
device_xname(displaydv));
ok = 0;
error = 0;
	TAILQ_FOREACH(me, &sc->sc_cld, me_next) {
#ifdef DIAGNOSTIC
if (me->me_parent != sc) {
printf("wsmux_set_display: bad child parent %p\n", me);
continue;
}
#endif
if (me->me_ops->dsetdisplay != NULL) {
error = wsevsrc_set_display(me, &nsc->sc_base);
DPRINTF(("wsmux_set_display: m=%p dev=%s error=%d\n",
me, device_xname(me->me_dv), error));
if (!error) {
ok = 1;
#ifdef WSDISPLAY_COMPAT_RAWKBD
DPRINTF(("wsmux_set_display: %s set rawkbd=%d\n",
device_xname(me->me_dv), sc->sc_rawkbd));
(void)wsevsrc_ioctl(me, WSKBDIO_SETMODE,
&sc->sc_rawkbd, 0, 0);
#endif
}
}
}
if (ok)
error = 0;
if (displaydv == NULL)
aprint_verbose("%s: disconnecting from %s\n",
device_xname(sc->sc_base.me_dv),
device_xname(odisplaydv));
return (error);
}
#endif /* NWSDISPLAY > 0 */
/* $NetBSD: uvm_fault.c,v 1.237 2024/03/15 07:09:37 andvar Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_fault.c,v 1.1.2.23 1998/02/06 05:29:05 chs Exp
*/
/*
* uvm_fault.c: fault handler
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_fault.c,v 1.237 2024/03/15 07:09:37 andvar Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/mman.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_rndsource.h>
/*
*
* a word on page faults:
*
* types of page faults we handle:
*
* CASE 1: upper layer faults CASE 2: lower layer faults
*
* CASE 1A CASE 1B CASE 2A CASE 2B
* read/write1 write>1 read/write +-cow_write/zero
* | | | |
* +--|--+ +--|--+ +-----+ + | + | +-----+
* amap | V | | ---------> new | | | | ^ |
* +-----+ +-----+ +-----+ + | + | +--|--+
* | | |
* +-----+ +-----+ +--|--+ | +--|--+
* uobj | d/c | | d/c | | V | +----+ |
* +-----+ +-----+ +-----+ +-----+
*
* d/c = don't care
*
* case [0]: layerless fault
* no amap or uobj is present. this is an error.
*
* case [1]: upper layer fault [anon active]
* 1A: [read] or [write with anon->an_ref == 1]
* I/O takes place in upper level anon and uobj is not touched.
* 1B: [write with anon->an_ref > 1]
* new anon is alloc'd and data is copied off ["COW"]
*
* case [2]: lower layer fault [uobj]
* 2A: [read on non-NULL uobj] or [write to non-copy_on_write area]
* I/O takes place directly in object.
* 2B: [write to copy_on_write] or [read on NULL uobj]
* data is "promoted" from uobj to a new anon.
* if uobj is null, then we zero fill.
*
* we follow the standard UVM locking protocol ordering:
*
* MAPS => AMAP => UOBJ => ANON => PAGE QUEUES (PQ)
* we hold a PG_BUSY page if we unlock for I/O
*
*
* the code is structured as follows:
*
* - init the "IN" params in the ufi structure
* ReFault: (ERESTART returned to the loop in uvm_fault_internal)
* - do lookups [locks maps], check protection, handle needs_copy
* - check for case 0 fault (error)
* - establish "range" of fault
* - if we have an amap lock it and extract the anons
* - if sequential advice deactivate pages behind us
* - at the same time check pmap for unmapped areas and anon for pages
* that we could map in (and do map it if found)
* - check object for resident pages that we could map in
* - if (case 2) goto Case2
* - >>> handle case 1
* - ensure source anon is resident in RAM
* - if case 1B alloc new anon and copy from source
* - map the correct page in
* Case2:
* - >>> handle case 2
* - ensure source page is resident (if uobj)
* - if case 2B alloc new anon and copy from source (could be zero
* fill if uobj == NULL)
* - map the correct page in
* - done!
*
* note on paging:
* if we have to do I/O we place a PG_BUSY page in the correct object,
* unlock everything, and do the I/O. when I/O is done we must reverify
* the state of the world before assuming that our data structures are
* valid. [because mappings could change while the map is unlocked]
*
* alternative 1: unbusy the page in question and restart the page fault
* from the top (ReFault). this is easy but does not take advantage
* of the information that we already have from our previous lookup,
* although it is possible that the "hints" in the vm_map will help here.
*
* alternative 2: the system already keeps track of a "version" number of
* a map. [i.e. every time you write-lock a map (e.g. to change a
* mapping) you bump the version number up by one...] so, we can save
* the version number of the map before we release the lock and start I/O.
* then when I/O is done we can relock and check the version numbers
 * to see if anything changed. this might save us something over
 * alternative 1 because we don't have to unbusy the page and may need
 * fewer compares(?).
*
* alternative 3: put in backpointers or a way to "hold" part of a map
* in place while I/O is in progress. this could be complex to
* implement (especially with structures like amap that can be referenced
* by multiple map entries, and figuring out what should wait could be
* complex as well...).
*
* we use alternative 2. given that we are multi-threaded now we may want
* to reconsider the choice.
*/
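/*
 * a rough sketch of how alternative 2 plays out in practice (the field
 * and label names below are illustrative only; the real re-validation
 * is done by uvmfault_relock()/uvmfault_lookup() and the checks in
 * uvmfault_anonget() further down):
 *
 *	saved = map->timestamp;			save version, then unlock
 *	... drop locks, start I/O, I/O completes ...
 *	relock map;
 *	if (map->timestamp != saved)		somebody changed the map
 *		return ERESTART;		(ReFault from the top)
 *	... otherwise our earlier lookup results are still good ...
 */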
/*
* local data structures
*/
struct uvm_advice {
int advice;
int nback;
int nforw;
};
/*
* page range array:
* note: index in array must match "advice" value
* XXX: borrowed numbers from freebsd. do they work well for us?
*/
static const struct uvm_advice uvmadvice[] = {
{ UVM_ADV_NORMAL, 3, 4 },
{ UVM_ADV_RANDOM, 0, 0 },
{ UVM_ADV_SEQUENTIAL, 8, 7},
};
#define UVM_MAXRANGE 16 /* must be MAX() of nback+nforw+1 */
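/*
 * e.g. with the table above: UVM_ADV_NORMAL covers at most
 * 3 (nback) + 4 (nforw) + 1 (faulting page) = 8 pages, UVM_ADV_RANDOM
 * exactly 1, and UVM_ADV_SEQUENTIAL 8 + 7 + 1 = 16 pages, which is what
 * UVM_MAXRANGE must accommodate.
 */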
/*
* private prototypes
*/
/*
* inline functions
*/
/*
* uvmfault_anonflush: try and deactivate pages in specified anons
*
* => does not have to deactivate page if it is busy
*/
static inline void
uvmfault_anonflush(struct vm_anon **anons, int n)
{
int lcv;
struct vm_page *pg;
	for (lcv = 0; lcv < n; lcv++) {
		if (anons[lcv] == NULL)
continue;
KASSERT(rw_lock_held(anons[lcv]->an_lock));
pg = anons[lcv]->an_page;
		if (pg && (pg->flags & PG_BUSY) == 0) {
			uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
}
}
}
/*
* normal functions
*/
/*
* uvmfault_amapcopy: clear "needs_copy" in a map.
*
* => called with VM data structures unlocked (usually, see below)
* => we get a write lock on the maps and clear needs_copy for a VA
* => if we are out of RAM we sleep (waiting for more)
*/
static void
uvmfault_amapcopy(struct uvm_faultinfo *ufi)
{
for (;;) {
/*
* no mapping? give up.
*/
if (uvmfault_lookup(ufi, true) == false)
return;
/*
* copy if needed.
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry))
amap_copy(ufi->map, ufi->entry, AMAP_COPY_NOWAIT,
ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
/*
* didn't work? must be out of RAM. unlock and sleep.
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
uvmfault_unlockmaps(ufi, true);
uvm_wait("fltamapcopy");
continue;
}
/*
* got it! unlock and return.
*/
uvmfault_unlockmaps(ufi, true);
return;
}
/*NOTREACHED*/
}
/*
* uvmfault_anonget: get data in an anon into a non-busy, non-released
* page in that anon.
*
* => Map, amap and thus anon should be locked by caller.
* => If we fail, we unlock everything and error is returned.
* => If we are successful, return with everything still locked.
* => We do not move the page on the queues [gets moved later]. If we
* allocate a new page [we_own], it gets put on the queues. Either way,
* the result is that the page is on the queues at return time
* => For pages which are on loan from a uvm_object (and thus are not owned
* by the anon): if successful, return with the owning object locked.
* The caller must unlock this object when it unlocks everything else.
*/
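/*
 * the expected calling pattern (cf. the "retry:" loop in
 * uvm_fault_upper() below) looks roughly like this:
 *
 *	error = uvmfault_anonget(ufi, amap, anon);
 *	switch (error) {
 *	case 0:		break;		resident page, still locked
 *	case ERESTART:	return ERESTART;  everything was unlocked for us
 *	case ENOLCK:	upgrade the amap lock to a writer and retry
 *	default:	return error;	fault fails
 *	}
 */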
int
uvmfault_anonget(struct uvm_faultinfo *ufi, struct vm_amap *amap,
struct vm_anon *anon)
{
struct vm_page *pg;
krw_t lock_type;
int error __unused; /* used for VMSWAP */
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	KASSERT(rw_lock_held(anon->an_lock));
	KASSERT(anon->an_lock == amap->am_lock);
/* Increment the counters.*/
cpu_count(CPU_COUNT_FLTANGET, 1);
if (anon->an_page) {
curlwp->l_ru.ru_minflt++;
} else {
curlwp->l_ru.ru_majflt++;
}
error = 0;
/*
* Loop until we get the anon data, or fail.
*/
for (;;) {
bool we_own, locked;
/*
* Note: 'we_own' will become true if we set PG_BUSY on a page.
*/
we_own = false;
pg = anon->an_page;
/*
* If there is a resident page and it is loaned, then anon
* may not own it. Call out to uvm_anon_lockloanpg() to
* identify and lock the real owner of the page.
*/
if (pg && pg->loan_count)
pg = uvm_anon_lockloanpg(anon);
/*
* Is page resident? Make sure it is not busy/released.
*/
lock_type = rw_lock_op(anon->an_lock);
if (pg) {
/*
* at this point, if the page has a uobject [meaning
* we have it on loan], then that uobject is locked
* by us! if the page is busy, we drop all the
* locks (including uobject) and try again.
*/
if ((pg->flags & PG_BUSY) == 0) {
UVMHIST_LOG(maphist, "<- OK",0,0,0,0);
return 0;
}
cpu_count(CPU_COUNT_FLTPGWAIT, 1);
/*
* The last unlock must be an atomic unlock and wait
* on the owner of page.
*/
if (pg->uobject) {
/* Owner of page is UVM object. */
uvmfault_unlockall(ufi, amap, NULL);
UVMHIST_LOG(maphist, " unlock+wait on uobj",0,
0,0,0);
uvm_pagewait(pg, pg->uobject->vmobjlock, "anonget1");
} else {
/* Owner of page is anon. */
uvmfault_unlockall(ufi, NULL, NULL);
UVMHIST_LOG(maphist, " unlock+wait on anon",0,
0,0,0);
uvm_pagewait(pg, anon->an_lock, "anonget2");
}
} else {
#if defined(VMSWAP)
/*
* No page, therefore allocate one. A write lock is
* required for this. If the caller didn't supply
* one, fail now and have them retry.
*/
if (lock_type == RW_READER) {
return ENOLCK;
}
pg = uvm_pagealloc(NULL,
ufi != NULL ? ufi->orig_rvaddr : 0,
anon, ufi != NULL ? UVM_FLAG_COLORMATCH : 0);
if (pg == NULL) {
/* Out of memory. Wait a little. */
uvmfault_unlockall(ufi, amap, NULL);
cpu_count(CPU_COUNT_FLTNORAM, 1);
UVMHIST_LOG(maphist, " noram -- UVM_WAIT",0,
0,0,0);
if (!uvm_reclaimable()) {
return ENOMEM;
}
uvm_wait("flt_noram1");
} else {
/* PG_BUSY bit is set. */
we_own = true;
uvmfault_unlockall(ufi, amap, NULL);
/*
* Pass a PG_BUSY+PG_FAKE clean page into
* the uvm_swap_get() function with all data
* structures unlocked. Note that it is OK
* to read an_swslot here, because we hold
* PG_BUSY on the page.
*/
cpu_count(CPU_COUNT_PAGEINS, 1);
error = uvm_swap_get(pg, anon->an_swslot,
PGO_SYNCIO);
/*
* We clean up after the I/O below in the
* 'we_own' case.
*/
}
#else
panic("%s: no page", __func__);
#endif /* defined(VMSWAP) */
}
/*
* Re-lock the map and anon.
*/
		locked = uvmfault_relock(ufi);
		if (locked || we_own) {
			rw_enter(anon->an_lock, lock_type);
}
/*
* If we own the page (i.e. we set PG_BUSY), then we need
* to clean up after the I/O. There are three cases to
* consider:
*
* 1) Page was released during I/O: free anon and ReFault.
* 2) I/O not OK. Free the page and cause the fault to fail.
* 3) I/O OK! Activate the page and sync with the non-we_own
* case (i.e. drop anon lock if not locked).
*/
		if (we_own) {
			KASSERT(lock_type == RW_WRITER);
#if defined(VMSWAP)
if (error) {
/*
* Remove the swap slot from the anon and
* mark the anon as having no real slot.
* Do not free the swap slot, thus preventing
* it from being used again.
*/
				if (anon->an_swslot > 0) {
					uvm_swap_markbad(anon->an_swslot, 1);
}
anon->an_swslot = SWSLOT_BAD;
if ((pg->flags & PG_RELEASED) != 0) {
goto released;
}
/*
* Note: page was never !PG_BUSY, so it
* cannot be mapped and thus no need to
* pmap_page_protect() it.
*/
uvm_pagefree(pg);
				if (locked) {
					uvmfault_unlockall(ufi, NULL, NULL);
}
rw_exit(anon->an_lock);
UVMHIST_LOG(maphist, "<- ERROR", 0,0,0,0);
return error;
}
if ((pg->flags & PG_RELEASED) != 0) {
released:
KASSERT(anon->an_ref == 0);
/*
* Released while we had unlocked amap.
*/
				if (locked) {
					uvmfault_unlockall(ufi, NULL, NULL);
}
uvm_anon_release(anon);
if (error) {
UVMHIST_LOG(maphist,
"<- ERROR/RELEASED", 0,0,0,0);
return error;
}
UVMHIST_LOG(maphist, "<- RELEASED", 0,0,0,0);
return ERESTART;
}
/*
* We have successfully read the page, activate it.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
UVM_PAGE_OWN(pg, NULL);
#else
panic("%s: we_own", __func__);
#endif /* defined(VMSWAP) */
}
/*
* We were not able to re-lock the map - restart the fault.
*/
if (!locked) {
if (we_own) {
rw_exit(anon->an_lock);
}
UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0);
return ERESTART;
}
/*
* Verify that no one has touched the amap and moved
* the anon on us.
*/
if (ufi != NULL && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start) != anon) {
uvmfault_unlockall(ufi, amap, NULL);
UVMHIST_LOG(maphist, "<- REFAULT", 0,0,0,0);
return ERESTART;
}
/*
* Retry..
*/
cpu_count(CPU_COUNT_FLTANRETRY, 1);
continue;
}
/*NOTREACHED*/
}
/*
* uvmfault_promote: promote data to a new anon. used for 1B and 2B.
*
* 1. allocate an anon and a page.
* 2. fill its contents.
* 3. put it into amap.
*
* => if we fail (result != 0) we unlock everything.
* => on success, return a new locked anon via 'nanon'.
* (*nanon)->an_page will be a resident, locked, dirty page.
 * => it is the caller's responsibility to put the promoted nanon->an_page
 *    on the page queue.
*/
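/*
 * the (oanon, uobjpage) arguments select where the data comes from:
 *
 *	oanon != NULL				anon COW; copy oanon->an_page
 *	oanon == NULL, uobjpage != PGO_DONTCARE	object COW; copy uobjpage
 *	oanon == NULL, uobjpage == PGO_DONTCARE	ZFOD; zero-filled page
 */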
static int
uvmfault_promote(struct uvm_faultinfo *ufi,
struct vm_anon *oanon,
struct vm_page *uobjpage,
struct vm_anon **nanon, /* OUT: allocated anon */
struct vm_anon **spare)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj;
struct vm_anon *anon;
struct vm_page *pg;
struct vm_page *opg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (oanon) {
/* anon COW */
opg = oanon->an_page;
		KASSERT(opg != NULL);
		KASSERT(opg->uobject == NULL || opg->loan_count > 0);
	} else if (uobjpage != PGO_DONTCARE) {
/* object-backed COW */
opg = uobjpage;
KASSERT(rw_lock_held(opg->uobject->vmobjlock));
} else {
/* ZFOD */
opg = NULL;
}
if (opg != NULL) {
uobj = opg->uobject;
} else {
uobj = NULL;
}
	KASSERT(amap != NULL);
	KASSERT(uobjpage != NULL);
	KASSERT(rw_write_held(amap->am_lock));
	KASSERT(oanon == NULL || amap->am_lock == oanon->an_lock);
	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
if (*spare != NULL) {
anon = *spare;
*spare = NULL;
} else {
anon = uvm_analloc();
}
if (anon) {
/*
* The new anon is locked.
*
* if opg == NULL, we want a zero'd, dirty page,
* so have uvm_pagealloc() do that for us.
*/
KASSERT(anon->an_lock == NULL);
anon->an_lock = amap->am_lock;
pg = uvm_pagealloc(NULL, ufi->orig_rvaddr, anon,
UVM_FLAG_COLORMATCH | (opg == NULL ? UVM_PGA_ZERO : 0));
if (pg == NULL) {
anon->an_lock = NULL;
}
} else {
pg = NULL;
}
/*
* out of memory resources?
*/
if (pg == NULL) {
/* save anon for the next try. */
if (anon != NULL) {
*spare = anon;
}
/* unlock and fail ... */
uvmfault_unlockall(ufi, amap, uobj);
if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist, "out of VM", 0,0,0,0);
cpu_count(CPU_COUNT_FLTNOANON, 1);
error = ENOMEM;
goto done;
}
UVMHIST_LOG(maphist, "out of RAM, waiting for more", 0,0,0,0);
cpu_count(CPU_COUNT_FLTNORAM, 1);
uvm_wait("flt_noram5");
error = ERESTART;
goto done;
}
/*
* copy the page [pg now dirty]
*
* Remove the pmap entry now for the old page at this address
* so that no thread can modify the new page while any thread
* might still see the old page.
*/
	if (opg) {
		pmap_remove(vm_map_pmap(ufi->orig_map), ufi->orig_rvaddr,
ufi->orig_rvaddr + PAGE_SIZE);
pmap_update(vm_map_pmap(ufi->orig_map));
uvm_pagecopy(opg, pg);
}
KASSERT(uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_DIRTY);
amap_add(&ufi->entry->aref, ufi->orig_rvaddr - ufi->entry->start, anon,
oanon != NULL);
/*
* from this point on am_lock won't be dropped until the page is
* entered, so it's safe to unbusy the page up front.
*
* uvm_fault_{upper,lower}_done will activate or enqueue the page.
*/
pg = anon->an_page;
pg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(pg, NULL);
*nanon = anon;
error = 0;
done:
return error;
}
/*
* Update statistics after fault resolution.
* - maxrss
*/
void
uvmfault_update_stats(struct uvm_faultinfo *ufi)
{
struct vm_map *map;
struct vmspace *vm;
struct proc *p;
vsize_t res;
map = ufi->orig_map;
p = curproc;
KASSERT(p != NULL);
vm = p->p_vmspace;
if (&vm->vm_map != map)
return;
res = pmap_resident_count(map->pmap);
	if (vm->vm_rssmax < res)
		vm->vm_rssmax = res;
}
/*
* F A U L T - m a i n e n t r y p o i n t
*/
/*
* uvm_fault: page fault handler
*
* => called from MD code to resolve a page fault
* => VM data structures usually should be unlocked. however, it is
* possible to call here with the main map locked if the caller
* gets a write lock, sets it recursive, and then calls us (c.f.
* uvm_map_pageable). this should be avoided because it keeps
* the map locked off during I/O.
* => MUST NEVER BE CALLED IN INTERRUPT CONTEXT
*/
#define MASK(entry) (UVM_ET_ISCOPYONWRITE(entry) ? \
~VM_PROT_WRITE : VM_PROT_ALL)
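/*
 * e.g. when a page of a copy-on-write entry is entered with
 * "access_type & MASK(entry)" (see uvm_fault_lower_lookup() and
 * uvm_fault_lower_neighbor() below), VM_PROT_WRITE is masked off, so the
 * first write still faults and gets promoted to an anon instead of
 * scribbling on the backing object.
 */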
/* fault_flag values passed from uvm_fault_wire to uvm_fault_internal */
#define UVM_FAULT_WIRE (1 << 0)
#define UVM_FAULT_MAXPROT (1 << 1)
struct uvm_faultctx {
/*
* the following members are set up by uvm_fault_check() and
* read-only after that.
*
* note that narrow is used by uvm_fault_check() to change
* the behaviour after ERESTART.
*
 * most of them might change after ERESTART if the underlying
 * map entry has been changed behind us. an exception is
 * wire_paging, which never changes.
*/
vm_prot_t access_type;
vaddr_t startva;
int npages;
int centeridx;
bool narrow; /* work on a single requested page only */
bool wire_mapping; /* request a PMAP_WIRED mapping
(UVM_FAULT_WIRE or VM_MAPENT_ISWIRED) */
bool wire_paging; /* request uvm_pagewire
(true for UVM_FAULT_WIRE) */
bool cow_now; /* VM_PROT_WRITE is actually requested
(ie. should break COW and page loaning) */
/*
* enter_prot is set up by uvm_fault_check() and clamped
* (ie. drop the VM_PROT_WRITE bit) in various places in case
* of !cow_now.
*/
vm_prot_t enter_prot; /* prot at which we want to enter pages in */
/*
* the following member is for uvmfault_promote() and ERESTART.
*/
struct vm_anon *anon_spare;
/*
* the following is actually a uvm_fault_lower() internal.
* it's here merely for debugging.
* (or due to the mechanical separation of the function?)
*/
bool promote;
/*
* type of lock to acquire on objects in both layers.
*/
krw_t lower_lock_type;
krw_t upper_lock_type;
};
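/*
 * e.g. a UVM_FAULT_WIRE fault starts out with narrow, wire_mapping and
 * wire_paging all true and both lock types forced to RW_WRITER by
 * uvm_fault_check(), whereas an ordinary user write fault starts wide
 * (narrow == false) with cow_now true and reader locks that may be
 * upgraded later.
 */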
static inline int uvm_fault_check(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon ***, bool);
static int uvm_fault_upper(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon **);
static inline int uvm_fault_upper_lookup(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_anon **, struct vm_page **);
static inline void uvm_fault_upper_neighbor(
struct uvm_faultinfo *, const struct uvm_faultctx *,
vaddr_t, struct vm_page *, bool);
static inline int uvm_fault_upper_loan(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_anon *, struct uvm_object **);
static inline int uvm_fault_upper_promote(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *);
static inline int uvm_fault_upper_direct(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *);
static int uvm_fault_upper_enter(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *, struct vm_anon *,
struct vm_page *, struct vm_anon *);
static inline void uvm_fault_upper_done(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_anon *, struct vm_page *);
static int uvm_fault_lower(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct vm_page **);
static inline void uvm_fault_lower_lookup(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct vm_page **);
static inline void uvm_fault_lower_neighbor(
struct uvm_faultinfo *, const struct uvm_faultctx *,
vaddr_t, struct vm_page *);
static inline int uvm_fault_lower_io(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object **, struct vm_page **);
static inline int uvm_fault_lower_direct(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
static inline int uvm_fault_lower_direct_loan(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page **,
struct vm_page **);
static inline int uvm_fault_lower_promote(
struct uvm_faultinfo *, struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
static int uvm_fault_lower_enter(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *,
struct vm_anon *, struct vm_page *);
static inline void uvm_fault_lower_done(
struct uvm_faultinfo *, const struct uvm_faultctx *,
struct uvm_object *, struct vm_page *);
int
uvm_fault_internal(struct vm_map *orig_map, vaddr_t vaddr,
vm_prot_t access_type, int fault_flag)
{
struct uvm_faultinfo ufi;
struct uvm_faultctx flt = {
.access_type = access_type,
		/* don't look for neighborhood pages on "wire" fault */
.narrow = (fault_flag & UVM_FAULT_WIRE) != 0,
/* "wire" fault causes wiring of both mapping and paging */
.wire_mapping = (fault_flag & UVM_FAULT_WIRE) != 0,
.wire_paging = (fault_flag & UVM_FAULT_WIRE) != 0,
/*
* default lock type to acquire on upper & lower layer
* objects: reader. this can be upgraded at any point
* during the fault from read -> write and uvm_faultctx
* changed to match, but is never downgraded write -> read.
*/
#ifdef __HAVE_UNLOCKED_PMAP /* XXX temporary */
.upper_lock_type = RW_WRITER,
.lower_lock_type = RW_WRITER,
#else
.upper_lock_type = RW_READER,
.lower_lock_type = RW_READER,
#endif
};
const bool maxprot = (fault_flag & UVM_FAULT_MAXPROT) != 0;
struct vm_anon *anons_store[UVM_MAXRANGE], **anons;
struct vm_page *pages_store[UVM_MAXRANGE], **pages;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(map=%#jx, vaddr=%#jx, at=%jd, ff=%jd)",
(uintptr_t)orig_map, vaddr, access_type, fault_flag);
/* Don't count anything until user interaction is possible */
kpreempt_disable();
if (__predict_true(start_init_exec)) {
struct cpu_info *ci = curcpu();
CPU_COUNT(CPU_COUNT_NFAULT, 1);
/* Don't flood RNG subsystem with samples. */
		if (++(ci->ci_faultrng) == 503) {
			ci->ci_faultrng = 0;
rnd_add_uint32(&uvm_fault_rndsource,
sizeof(vaddr_t) == sizeof(uint32_t) ?
(uint32_t)vaddr : sizeof(vaddr_t) ==
sizeof(uint64_t) ?
(uint32_t)vaddr :
(uint32_t)ci->ci_counts[CPU_COUNT_NFAULT]);
}
}
kpreempt_enable();
/*
* init the IN parameters in the ufi
*/
ufi.orig_map = orig_map;
ufi.orig_rvaddr = trunc_page(vaddr);
ufi.orig_size = PAGE_SIZE; /* can't get any smaller than this */
error = ERESTART;
while (error == ERESTART) { /* ReFault: */
anons = anons_store;
pages = pages_store;
error = uvm_fault_check(&ufi, &flt, &anons, maxprot);
if (error != 0)
continue;
error = uvm_fault_upper_lookup(&ufi, &flt, anons, pages);
if (error != 0)
continue;
if (pages[flt.centeridx] == PGO_DONTCARE)
error = uvm_fault_upper(&ufi, &flt, anons);
else {
struct uvm_object * const uobj =
ufi.entry->object.uvm_obj;
if (uobj && uobj->pgops->pgo_fault != NULL) {
/*
* invoke "special" fault routine.
*/
rw_enter(uobj->vmobjlock, RW_WRITER);
/* locked: maps(read), amap(if there), uobj */
error = uobj->pgops->pgo_fault(&ufi,
flt.startva, pages, flt.npages,
flt.centeridx, flt.access_type,
PGO_LOCKED|PGO_SYNCIO);
/*
* locked: nothing, pgo_fault has unlocked
* everything
*/
/*
* object fault routine responsible for
* pmap_update().
*/
/*
* Wake up the pagedaemon if the fault method
* failed for lack of memory but some can be
* reclaimed.
*/
				if (error == ENOMEM && uvm_reclaimable()) {
					uvm_wait("pgo_fault");
error = ERESTART;
}
} else {
error = uvm_fault_lower(&ufi, &flt, pages);
}
}
}
if (flt.anon_spare != NULL) {
flt.anon_spare->an_ref--;
		KASSERT(flt.anon_spare->an_ref == 0);
		KASSERT(flt.anon_spare->an_lock == NULL);
uvm_anfree(flt.anon_spare);
}
return error;
}
/*
* uvm_fault_check: check prot, handle needs-copy, etc.
*
* 1. lookup entry.
* 2. check protection.
* 3. adjust fault condition (mainly for simulated fault).
* 4. handle needs-copy (lazy amap copy).
* 5. establish range of interest for neighbor fault (aka pre-fault).
* 6. look up anons (if amap exists).
* 7. flush pages (if MADV_SEQUENTIAL)
*
* => called with nothing locked.
* => if we fail (result != 0) we unlock everything.
* => initialize/adjust many members of flt.
*/
static int
uvm_fault_check(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon ***ranons, bool maxprot)
{
struct vm_amap *amap;
struct uvm_object *uobj;
vm_prot_t check_prot;
int nback, nforw;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* lookup and lock the maps
*/
if (uvmfault_lookup(ufi, false) == false) {
UVMHIST_LOG(maphist, "<- no mapping @ %#jx", ufi->orig_rvaddr,
0,0,0);
return EFAULT;
}
/* locked: maps(read) */
#ifdef DIAGNOSTIC
if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0) {
printf("Page fault on non-pageable map:\n");
printf("ufi->map = %p\n", ufi->map);
printf("ufi->orig_map = %p\n", ufi->orig_map);
printf("ufi->orig_rvaddr = %#lx\n", (u_long) ufi->orig_rvaddr);
panic("uvm_fault: (ufi->map->flags & VM_MAP_PAGEABLE) == 0");
}
#endif
/*
* check protection
*/
check_prot = maxprot ?
ufi->entry->max_protection : ufi->entry->protection;
if ((check_prot & flt->access_type) != flt->access_type) {
UVMHIST_LOG(maphist,
"<- protection failure (prot=%#jx, access=%#jx)",
ufi->entry->protection, flt->access_type, 0, 0);
uvmfault_unlockmaps(ufi, false);
return EFAULT;
}
/*
* "enter_prot" is the protection we want to enter the page in at.
* for certain pages (e.g. copy-on-write pages) this protection can
* be more strict than ufi->entry->protection. "wired" means either
* the entry is wired or we are fault-wiring the pg.
*/
flt->enter_prot = ufi->entry->protection;
if (VM_MAPENT_ISWIRED(ufi->entry)) {
flt->wire_mapping = true;
flt->wire_paging = true;
flt->narrow = true;
}
if (flt->wire_mapping) {
flt->access_type = flt->enter_prot; /* full access for wired */
flt->cow_now = (check_prot & VM_PROT_WRITE) != 0;
} else {
flt->cow_now = (flt->access_type & VM_PROT_WRITE) != 0;
}
if (flt->wire_paging) {
/* wiring pages requires a write lock. */
flt->upper_lock_type = RW_WRITER;
flt->lower_lock_type = RW_WRITER;
}
flt->promote = false;
/*
* handle "needs_copy" case. if we need to copy the amap we will
* have to drop our readlock and relock it with a write lock. (we
* need a write lock to change anything in a map entry [e.g.
* needs_copy]).
*/
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
		if (flt->cow_now || (ufi->entry->object.uvm_obj == NULL)) {
			KASSERT(!maxprot);
/* need to clear */
UVMHIST_LOG(maphist,
" need to clear needs_copy and refault",0,0,0,0);
			uvmfault_unlockmaps(ufi, false);
			uvmfault_amapcopy(ufi);
cpu_count(CPU_COUNT_FLTAMCOPY, 1);
return ERESTART;
} else {
/*
* ensure that we pmap_enter page R/O since
* needs_copy is still true
*/
flt->enter_prot &= ~VM_PROT_WRITE;
}
}
/*
* identify the players
*/
amap = ufi->entry->aref.ar_amap; /* upper layer */
uobj = ufi->entry->object.uvm_obj; /* lower layer */
/*
* check for a case 0 fault. if nothing backing the entry then
* error now.
*/
	if (amap == NULL && uobj == NULL) {
		uvmfault_unlockmaps(ufi, false);
UVMHIST_LOG(maphist,"<- no backing store, no overlay",0,0,0,0);
return EFAULT;
}
/*
* for a case 2B fault waste no time on adjacent pages because
* they are likely already entered.
*/
if (uobj != NULL && amap != NULL &&
(flt->access_type & VM_PROT_WRITE) != 0) {
/* wide fault (!narrow) */
flt->narrow = true;
}
/*
* establish range of interest based on advice from mapper
* and then clip to fit map entry. note that we only want
* to do this the first time through the fault. if we
* ReFault we will disable this by setting "narrow" to true.
*/
if (flt->narrow == false) {
/* wide fault (!narrow) */
KASSERT(uvmadvice[ufi->entry->advice].advice ==
ufi->entry->advice);
nback = MIN(uvmadvice[ufi->entry->advice].nback,
(ufi->orig_rvaddr - ufi->entry->start) >> PAGE_SHIFT);
flt->startva = ufi->orig_rvaddr - (nback << PAGE_SHIFT);
/*
* note: "-1" because we don't want to count the
* faulting page as forw
*/
nforw = MIN(uvmadvice[ufi->entry->advice].nforw,
((ufi->entry->end - ufi->orig_rvaddr) >>
PAGE_SHIFT) - 1);
flt->npages = nback + nforw + 1;
flt->centeridx = nback;
flt->narrow = true; /* ensure only once per-fault */
} else {
/* narrow fault! */
nback = nforw = 0;
flt->startva = ufi->orig_rvaddr;
flt->npages = 1;
flt->centeridx = 0;
}
/* offset from entry's start to pgs' start */
const voff_t eoff = flt->startva - ufi->entry->start;
/* locked: maps(read) */
UVMHIST_LOG(maphist, " narrow=%jd, back=%jd, forw=%jd, startva=%#jx",
flt->narrow, nback, nforw, flt->startva);
UVMHIST_LOG(maphist, " entry=%#jx, amap=%#jx, obj=%#jx",
(uintptr_t)ufi->entry, (uintptr_t)amap, (uintptr_t)uobj, 0);
/*
* guess at the most suitable lock types to acquire.
* if we've got an amap then lock it and extract current anons.
*/
if (amap) {
if ((amap_flags(amap) & AMAP_SHARED) == 0) {
/*
* the amap isn't shared. get a writer lock to
* avoid the cost of upgrading the lock later if
* needed.
*
* XXX nice for PostgreSQL, but consider threads.
*/
flt->upper_lock_type = RW_WRITER;
} else if ((flt->access_type & VM_PROT_WRITE) != 0) {
/*
* assume we're about to COW.
*/
flt->upper_lock_type = RW_WRITER;
}
amap_lock(amap, flt->upper_lock_type);
amap_lookups(&ufi->entry->aref, eoff, *ranons, flt->npages);
} else {
if ((flt->access_type & VM_PROT_WRITE) != 0) {
/*
* we are about to dirty the object and that
* requires a write lock.
*/
flt->lower_lock_type = RW_WRITER;
}
*ranons = NULL; /* to be safe */
}
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/*
* for MADV_SEQUENTIAL mappings we want to deactivate the back pages
* now and then forget about them (for the rest of the fault).
*/
if (ufi->entry->advice == MADV_SEQUENTIAL && nback != 0) {
UVMHIST_LOG(maphist, " MADV_SEQUENTIAL: flushing backpages",
0,0,0,0);
/* flush back-page anons? */
		if (amap)
			uvmfault_anonflush(*ranons, nback);
/*
* flush object? change lock type to RW_WRITER, to avoid
* excessive competition between read/write locks if many
* threads doing "sequential access".
*/
if (uobj) {
voff_t uoff;
flt->lower_lock_type = RW_WRITER;
uoff = ufi->entry->offset + eoff;
rw_enter(uobj->vmobjlock, RW_WRITER);
(void) (uobj->pgops->pgo_put)(uobj, uoff, uoff +
(nback << PAGE_SHIFT), PGO_DEACTIVATE);
}
/* now forget about the backpages */
if (amap)
*ranons += nback;
flt->startva += (nback << PAGE_SHIFT);
flt->npages -= nback;
flt->centeridx = 0;
}
/*
* => startva is fixed
* => npages is fixed
*/
	KASSERT(flt->startva <= ufi->orig_rvaddr);
	KASSERT(ufi->orig_rvaddr + ufi->orig_size <=
flt->startva + (flt->npages << PAGE_SHIFT));
return 0;
}
/*
* uvm_fault_upper_upgrade: upgrade upper lock, reader -> writer
*/
static inline int
uvm_fault_upper_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_amap *amap, struct uvm_object *uobj)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	KASSERT(amap != NULL);
	KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock));
/*
* fast path.
*/
if (__predict_true(flt->upper_lock_type == RW_WRITER)) {
return 0;
}
/*
* otherwise try for the upgrade. if we don't get it, unlock
* everything, restart the fault and next time around get a writer
* lock.
*/
flt->upper_lock_type = RW_WRITER;
if (__predict_false(!rw_tryupgrade(amap->am_lock))) {
uvmfault_unlockall(ufi, amap, uobj);
cpu_count(CPU_COUNT_FLTNOUP, 1);
UVMHIST_LOG(maphist, " !upgrade upper", 0, 0,0,0);
return ERESTART;
}
cpu_count(CPU_COUNT_FLTUP, 1);
KASSERT(flt->upper_lock_type == rw_lock_op(amap->am_lock));
return 0;
}
/*
* uvm_fault_upper_lookup: look up existing h/w mapping and amap.
*
* iterate range of interest:
* 1. check if h/w mapping exists. if yes, we don't care
* 2. check if anon exists. if not, page is lower.
* 3. if anon exists, enter h/w mapping for neighbors.
*
* => called with amap locked (if exists).
*/
static int
uvm_fault_upper_lookup(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_anon **anons, struct vm_page **pages)
{
struct vm_amap *amap = ufi->entry->aref.ar_amap;
int lcv;
vaddr_t currva;
bool shadowed __unused;
bool entered;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/*
* map in the backpages and frontpages we found in the amap in hopes
* of preventing future faults. we also init the pages[] array as
* we go.
*/
currva = flt->startva;
shadowed = false;
entered = false;
for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
/*
* unmapped or center page. check if any anon at this level.
*/
if (amap == NULL || anons[lcv] == NULL) {
pages[lcv] = NULL;
continue;
}
/*
* check for present page and map if possible.
*/
pages[lcv] = PGO_DONTCARE;
if (lcv == flt->centeridx) { /* save center for later! */
shadowed = true;
continue;
}
struct vm_anon *anon = anons[lcv];
struct vm_page *pg = anon->an_page;
KASSERT(anon->an_lock == amap->am_lock);
/*
* ignore loaned and busy pages.
* don't play with VAs that are already mapped.
*/
if (pg && pg->loan_count == 0 && (pg->flags & PG_BUSY) == 0 &&
!pmap_extract(ufi->orig_map->pmap, currva, NULL)) {
uvm_fault_upper_neighbor(ufi, flt, currva,
pg, anon->an_ref > 1);
entered = true;
}
}
	if (entered) {
		pmap_update(ufi->orig_map->pmap);
}
/* locked: maps(read), amap(if there) */
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
/* (shadowed == true) if there is an anon at the faulting address */
UVMHIST_LOG(maphist, " shadowed=%jd, will_get=%jd", shadowed,
(ufi->entry->object.uvm_obj && shadowed != false),0,0);
return 0;
}
/*
* uvm_fault_upper_neighbor: enter single upper neighbor page.
*
* => called with amap and anon locked.
*/
static void
uvm_fault_upper_neighbor(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
vaddr_t currva, struct vm_page *pg, bool readonly)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: amap, anon */
	KASSERT(pg->uobject == NULL);
	KASSERT(pg->uanon != NULL);
	KASSERT(rw_lock_op(pg->uanon->an_lock) == flt->upper_lock_type);
	KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN);
/*
* there wasn't a direct fault on the page, so avoid the cost of
* activating it.
*/
	if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) {
		uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
}
UVMHIST_LOG(maphist,
" MAPPING: n anon: pm=%#jx, va=%#jx, pg=%#jx",
(uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
cpu_count(CPU_COUNT_FLTNAMAP, 1);
/*
* Since this page isn't the page that's actually faulting,
* ignore pmap_enter() failures; it's not critical that we
* enter these right now.
*/
(void) pmap_enter(ufi->orig_map->pmap, currva,
VM_PAGE_TO_PHYS(pg),
readonly ? (flt->enter_prot & ~VM_PROT_WRITE) :
flt->enter_prot,
PMAP_CANFAIL | (flt->wire_mapping ? PMAP_WIRED : 0));
}
/*
* uvm_fault_upper: handle upper fault.
*
* 1. acquire anon lock.
* 2. get anon. let uvmfault_anonget do the dirty work.
* 3. handle loan.
* 4. dispatch direct or promote handlers.
*/
static int
uvm_fault_upper(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon **anons)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon * const anon = anons[flt->centeridx];
struct uvm_object *uobj;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap, anon */
	KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
	KASSERT(anon->an_lock == amap->am_lock);
/*
* handle case 1: fault on an anon in our amap
*/
UVMHIST_LOG(maphist, " case 1 fault: anon=%#jx",
(uintptr_t)anon, 0, 0, 0);
/*
* no matter if we have case 1A or case 1B we are going to need to
* have the anon's memory resident. ensure that now.
*/
/*
* let uvmfault_anonget do the dirty work.
* if it fails (!OK) it will unlock everything for us.
* if it succeeds, locks are still valid and locked.
* also, if it is OK, then the anon's page is on the queues.
* if the page is on loan from a uvm_object, then anonget will
* lock that object for us if it does not fail.
*/
retry:
error = uvmfault_anonget(ufi, amap, anon);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
case EAGAIN:
kpause("fltagain1", false, hz/2, NULL);
return ERESTART;
case ENOLCK:
/* it needs a write lock: retry */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
goto retry;
default:
return error;
}
/*
* uobj is non null if the page is on loan from an object (i.e. uobj)
*/
uobj = anon->an_page->uobject; /* locked by anonget if !NULL */
/* locked: maps(read), amap, anon, uobj(if one) */
	KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
	KASSERT(anon->an_lock == amap->am_lock);
	KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* special handling for loaned pages
*/
	if (anon->an_page->loan_count) {
		error = uvm_fault_upper_loan(ufi, flt, anon, &uobj);
if (error != 0)
return error;
}
/*
* if we are case 1B then we will need to allocate a new blank
* anon to transfer the data into. note that we have a lock
* on anon, so no one can busy or release the page until we are done.
* also note that the ref count can't drop to zero here because
* it is > 1 and we are only dropping one ref.
*
* in the (hopefully very rare) case that we are out of RAM we
* will unlock, wait for more RAM, and refault.
*
* if we are out of anon VM we kill the process (XXX: could wait?).
*/
if (flt->cow_now && anon->an_ref > 1) {
flt->promote = true;
error = uvm_fault_upper_promote(ufi, flt, uobj, anon);
} else {
error = uvm_fault_upper_direct(ufi, flt, uobj, anon);
}
return error;
}
/*
* uvm_fault_upper_loan: handle loaned upper page.
*
* 1. if not cow'ing now, simply adjust flt->enter_prot.
* 2. if cow'ing now, and if ref count is 1, break loan.
*/
static int
uvm_fault_upper_loan(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_anon *anon, struct uvm_object **ruobj)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
int error = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (!flt->cow_now) {
/*
* for read faults on loaned pages we just cap the
* protection at read-only.
*/
flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
} else {
/*
* note that we can't allow writes into a loaned page!
*
* if we have a write fault on a loaned page in an
* anon then we need to look at the anon's ref count.
* if it is greater than one then we are going to do
* a normal copy-on-write fault into a new anon (this
* is not a problem). however, if the reference count
* is one (a case where we would normally allow a
* write directly to the page) then we need to kill
* the loan before we continue.
*/
/* >1 case is already ok */
if (anon->an_ref == 1) {
/* breaking loan requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
error = uvm_loanbreak_anon(anon, *ruobj);
			if (error != 0) {
				uvmfault_unlockall(ufi, amap, *ruobj);
uvm_wait("flt_noram2");
return ERESTART;
}
/* if we were a loan receiver uobj is gone */
if (*ruobj)
*ruobj = NULL;
}
}
return error;
}
/*
* uvm_fault_upper_promote: promote upper page.
*
* 1. call uvmfault_promote.
* 2. enqueue page.
* 3. deref.
* 4. pass page to uvm_fault_upper_enter.
*/
static int
uvm_fault_upper_promote(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon * const oanon = anon;
struct vm_page *pg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist, " case 1B: COW fault",0,0,0,0);
/* promoting requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, NULL);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
cpu_count(CPU_COUNT_FLT_ACOW, 1);
error = uvmfault_promote(ufi, oanon, PGO_DONTCARE, &anon,
&flt->anon_spare);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
default:
return error;
}
pg = anon->an_page;
	KASSERT(anon->an_lock == oanon->an_lock);
	KASSERT((pg->flags & (PG_BUSY | PG_FAKE)) == 0);
/* deref: can not drop to zero here by defn! */
KASSERT(oanon->an_ref > 1);
oanon->an_ref--;
/*
* note: oanon is still locked, as is the new anon. we
* need to check for this later when we unlock oanon; if
* oanon != anon, we'll have to unlock anon, too.
*/
return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon);
}
/*
* uvm_fault_upper_direct: handle direct fault.
*/
static int
uvm_fault_upper_direct(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon)
{
struct vm_anon * const oanon = anon;
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
cpu_count(CPU_COUNT_FLT_ANON, 1);
pg = anon->an_page;
	if (anon->an_ref > 1)	/* disallow writes to ref > 1 anons */
		flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
return uvm_fault_upper_enter(ufi, flt, uobj, anon, pg, oanon);
}
/*
* uvm_fault_upper_enter: enter h/w mapping of upper page.
*/
static int
uvm_fault_upper_enter(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_anon *anon, struct vm_page *pg,
struct vm_anon *oanon)
{
struct pmap *pmap = ufi->orig_map->pmap;
vaddr_t va = ufi->orig_rvaddr;
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap, oanon, anon(if different from oanon) */
	KASSERT(rw_lock_op(amap->am_lock) == flt->upper_lock_type);
	KASSERT(anon->an_lock == amap->am_lock);
	KASSERT(oanon->an_lock == amap->am_lock);
	KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
KASSERT(uvm_pagegetdirty(pg) != UVM_PAGE_STATUS_CLEAN);
/*
* now map the page in.
*/
UVMHIST_LOG(maphist,
" MAPPING: anon: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd",
(uintptr_t)pmap, va, (uintptr_t)pg, flt->promote);
if (pmap_enter(pmap, va, VM_PAGE_TO_PHYS(pg),
flt->enter_prot, flt->access_type | PMAP_CANFAIL |
(flt->wire_mapping ? PMAP_WIRED : 0)) != 0) {
/*
* If pmap_enter() fails, it must not leave behind an existing
* pmap entry. In particular, a now-stale entry for a different
* page would leave the pmap inconsistent with the vm_map.
* This is not to imply that pmap_enter() should remove an
* existing mapping in such a situation (since that could create
* different problems, eg. if the existing mapping is wired),
* but rather that the pmap should be designed such that it
* never needs to fail when the new mapping is replacing an
* existing mapping and the new page has no existing mappings.
*
* XXX This can't be asserted safely any more because many
* LWPs and/or many processes could simultaneously fault on
* the same VA and some might succeed.
*/
/* KASSERT(!pmap_extract(pmap, va, NULL)); */
/*
* ensure that the page is queued in the case that
* we just promoted.
*/
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
*
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/
		uvmfault_unlockall(ufi, amap, uobj);
		if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
/* XXX instrumentation */
return ENOMEM;
}
/* XXX instrumentation */
uvm_wait("flt_pmfail1");
return ERESTART;
}
uvm_fault_upper_done(ufi, flt, anon, pg);
/*
* done case 1! finish up by unlocking everything and returning success
*/
pmap_update(pmap);
uvmfault_unlockall(ufi, amap, uobj);
return 0;
}
/*
* uvm_fault_upper_done: queue upper center page.
*/
static void
uvm_fault_upper_done(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_anon *anon, struct vm_page *pg)
{
const bool wire_paging = flt->wire_paging;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* ... update the page queues.
*/
if (wire_paging) {
uvm_pagelock(pg);
uvm_pagewire(pg);
uvm_pageunlock(pg);
/*
* since the now-wired page cannot be paged out,
* release its swap resources for others to use.
* and since an anon with no swap cannot be clean,
* mark it dirty now.
*/
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
uvm_anon_dropswap(anon);
} else if (uvmpdpol_pageactivate_p(pg)) {
/*
* avoid re-activating the page unless needed,
* to avoid false sharing on multiprocessor.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
/*
* uvm_fault_lower_upgrade: upgrade lower lock, reader -> writer
*/
static inline int
uvm_fault_lower_upgrade(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_amap *amap, struct uvm_object *uobj, struct vm_page *uobjpage)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	KASSERT(uobj != NULL);
	KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock));
/*
* fast path.
*/
if (__predict_true(flt->lower_lock_type == RW_WRITER)) {
return 0;
}
/*
* otherwise try for the upgrade. if we don't get it, unlock
* everything, restart the fault and next time around get a writer
* lock.
*/
flt->lower_lock_type = RW_WRITER;
if (__predict_false(!rw_tryupgrade(uobj->vmobjlock))) {
uvmfault_unlockall(ufi, amap, uobj);
cpu_count(CPU_COUNT_FLTNOUP, 1);
UVMHIST_LOG(maphist, " !upgrade lower", 0, 0,0,0);
return ERESTART;
}
cpu_count(CPU_COUNT_FLTUP, 1);
KASSERT(flt->lower_lock_type == rw_lock_op(uobj->vmobjlock));
return 0;
}
/*
* uvm_fault_lower: handle lower fault.
*
* 1. check uobj
* 1.1. if null, ZFOD.
* 1.2. if not null, look up unmapped neighbor pages.
* 2. for center page, check if promote.
* 2.1. ZFOD always needs promotion.
* 2.2. other uobjs, when entry is marked COW (usually MAP_PRIVATE vnode).
* 3. if uobj is not ZFOD and page is not found, do i/o.
* 4. dispatch either direct / promote fault.
*/
static int
uvm_fault_lower(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_page **pages)
{
struct vm_amap *amap __diagused = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
struct vm_page *uobjpage;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* now, if the desired page is not shadowed by the amap and we have
* a backing object that does not have a special fault routine, then
* we ask (with pgo_get) the object for resident pages that we care
* about and attempt to map them in. we do not let pgo_get block
* (PGO_LOCKED).
*/
if (uobj == NULL) {
		/* zero fill; we don't care about neighbor pages */
uobjpage = NULL;
} else {
uvm_fault_lower_lookup(ufi, flt, pages);
uobjpage = pages[flt->centeridx];
}
/*
* note that at this point we are done with any front or back pages.
* we are now going to focus on the center page (i.e. the one we've
* faulted on). if we have faulted on the upper (anon) layer
* [i.e. case 1], then the anon we want is anons[centeridx] (we have
* not touched it yet). if we have faulted on the bottom (uobj)
* layer [i.e. case 2] and the page was both present and available,
* then we've got a pointer to it as "uobjpage" and we've already
* made it BUSY.
*/
/*
* locked:
* maps(read), amap(if there), uobj(if !null), uobjpage(if !null)
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* note that uobjpage can not be PGO_DONTCARE at this point. we now
* set uobjpage to PGO_DONTCARE if we are doing a zero fill. if we
* have a backing object, check and see if we are going to promote
* the data up to an anon during the fault.
*/
if (uobj == NULL) {
uobjpage = PGO_DONTCARE;
flt->promote = true; /* always need anon here */
} else {
		KASSERT(uobjpage != PGO_DONTCARE);
		flt->promote = flt->cow_now && UVM_ET_ISCOPYONWRITE(ufi->entry);
}
UVMHIST_LOG(maphist, " case 2 fault: promote=%jd, zfill=%jd",
flt->promote, (uobj == NULL), 0,0);
/*
* if uobjpage is not null then we do not need to do I/O to get the
* uobjpage.
*
* if uobjpage is null, then we need to unlock and ask the pager to
* get the data for us. once we have the data, we need to reverify
 * the state of the world. we are currently not holding any resources.
*/
if (uobjpage) {
/* update rusage counters */
curlwp->l_ru.ru_minflt++;
} else {
error = uvm_fault_lower_io(ufi, flt, &uobj, &uobjpage);
if (error != 0)
return error;
}
/*
* locked:
* maps(read), amap(if !null), uobj(if !null), uobjpage(if uobj)
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* notes:
* - at this point uobjpage can not be NULL
* - at this point uobjpage can not be PG_RELEASED (since we checked
* for it above)
* - at this point uobjpage could be waited on (handle later)
* - uobjpage can be from a different object if tmpfs (vnode vs UAO)
*/
	KASSERT(uobjpage != NULL);
	KASSERT(uobj == NULL ||
uobjpage->uobject->vmobjlock == uobj->vmobjlock);
KASSERT(uobj == NULL || !UVM_OBJ_IS_CLEAN(uobjpage->uobject) ||
uvm_pagegetdirty(uobjpage) == UVM_PAGE_STATUS_CLEAN);
if (!flt->promote) {
error = uvm_fault_lower_direct(ufi, flt, uobj, uobjpage);
} else {
error = uvm_fault_lower_promote(ufi, flt, uobj, uobjpage);
}
return error;
}
/*
* uvm_fault_lower_lookup: look up on-memory uobj pages.
*
* 1. get on-memory pages.
* 2. if failed, give up (get only center page later).
* 3. if succeeded, enter h/w mapping of neighbor pages.
*/
static void
uvm_fault_lower_lookup(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct vm_page **pages)
{
struct uvm_object *uobj = ufi->entry->object.uvm_obj;
int lcv, gotpages;
vaddr_t currva;
bool entered;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
rw_enter(uobj->vmobjlock, flt->lower_lock_type);
/*
* Locked: maps(read), amap(if there), uobj
*/
cpu_count(CPU_COUNT_FLTLGET, 1);
gotpages = flt->npages;
(void) uobj->pgops->pgo_get(uobj,
ufi->entry->offset + flt->startva - ufi->entry->start,
pages, &gotpages, flt->centeridx,
flt->access_type & MASK(ufi->entry), ufi->entry->advice,
PGO_LOCKED);
KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* check for pages to map, if we got any
*/
if (gotpages == 0) {
pages[flt->centeridx] = NULL;
return;
}
entered = false;
currva = flt->startva;
for (lcv = 0; lcv < flt->npages; lcv++, currva += PAGE_SIZE) {
struct vm_page *curpg;
curpg = pages[lcv];
if (curpg == NULL || curpg == PGO_DONTCARE) {
continue;
}
/*
* in the case of tmpfs, the pages might be from a different
* uvm_object. just make sure that they have the same lock.
*/
KASSERT(curpg->uobject->vmobjlock == uobj->vmobjlock);
KASSERT((curpg->flags & PG_BUSY) == 0);
/*
* leave the centre page for later. don't screw with
* existing mappings (needless & expensive).
*/
if (lcv == flt->centeridx) {
UVMHIST_LOG(maphist, " got uobjpage (%#jx) "
"with locked get", (uintptr_t)curpg, 0, 0, 0);
} else if (!pmap_extract(ufi->orig_map->pmap, currva, NULL)) {
uvm_fault_lower_neighbor(ufi, flt, currva, curpg);
entered = true;
}
}
if (entered) {
pmap_update(ufi->orig_map->pmap);
}
}
/*
* uvm_fault_lower_neighbor: enter h/w mapping of lower neighbor page.
*/
static void
uvm_fault_lower_neighbor(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
vaddr_t currva, struct vm_page *pg)
{
const bool readonly = uvm_pagereadonly_p(pg) || pg->loan_count > 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* locked: maps(read), amap(if there), uobj */
/*
* calling pgo_get with PGO_LOCKED returns us pages which
* are neither busy nor released, so we don't need to check
* for this. we can just directly enter the pages.
*
* there wasn't a direct fault on the page, so avoid the cost of
* activating it.
*/
if (!uvmpdpol_pageisqueued_p(pg) && pg->wire_count == 0) {
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pageunlock(pg);
}
UVMHIST_LOG(maphist,
" MAPPING: n obj: pm=%#jx, va=%#jx, pg=%#jx",
(uintptr_t)ufi->orig_map->pmap, currva, (uintptr_t)pg, 0);
cpu_count(CPU_COUNT_FLTNOMAP, 1);
/*
* Since this page isn't the page that's actually faulting,
* ignore pmap_enter() failures; it's not critical that we
* enter these right now.
* NOTE: page can't be waited on or PG_RELEASED because we've
* held the lock the whole time we've had the handle.
*/
KASSERT((pg->flags & PG_PAGEOUT) == 0);
KASSERT((pg->flags & PG_RELEASED) == 0);
KASSERT(!UVM_OBJ_IS_CLEAN(pg->uobject) ||
uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN);
KASSERT((pg->flags & PG_BUSY) == 0);
KASSERT(rw_lock_op(pg->uobject->vmobjlock) == flt->lower_lock_type);
const vm_prot_t mapprot =
readonly ? (flt->enter_prot & ~VM_PROT_WRITE) : flt->enter_prot & MASK(ufi->entry);
const u_int mapflags =
PMAP_CANFAIL | (flt->wire_mapping ? (mapprot | PMAP_WIRED) : 0);
(void) pmap_enter(ufi->orig_map->pmap, currva,
VM_PAGE_TO_PHYS(pg), mapprot, mapflags);
}
/*
* uvm_fault_lower_io: get lower page from backing store.
*
* 1. unlock everything, because i/o will block.
* 2. call pgo_get.
* 3. if failed, recover.
* 4. if succeeded, relock everything and verify things.
*/
static int
uvm_fault_lower_io(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object **ruobj, struct vm_page **ruobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct uvm_object *uobj = *ruobj;
struct vm_page *pg;
bool locked;
int gotpages;
int error;
voff_t uoff;
vm_prot_t access_type;
int advice;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/* grab everything we need from the entry before we unlock */
uoff = (ufi->orig_rvaddr - ufi->entry->start) + ufi->entry->offset;
access_type = flt->access_type & MASK(ufi->entry);
advice = ufi->entry->advice;
/* Locked: maps(read), amap(if there), uobj */
KASSERT(rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/* Upgrade to a write lock if needed. */
error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, NULL);
if (error != 0) {
return error;
}
uvmfault_unlockall(ufi, amap, NULL);
/* update rusage counters */
curlwp->l_ru.ru_majflt++;
/* Locked: uobj(write) */
KASSERT(rw_write_held(uobj->vmobjlock));
cpu_count(CPU_COUNT_FLTGET, 1);
gotpages = 1;
pg = NULL;
error = uobj->pgops->pgo_get(uobj, uoff, &pg, &gotpages,
0, access_type, advice, PGO_SYNCIO);
/* locked: pg(if no error) */
/*
* recover from I/O
*/
if (error) {
if (error == EAGAIN) {
UVMHIST_LOG(maphist,
" pgo_get says TRY AGAIN!",0,0,0,0);
kpause("fltagain2", false, hz/2, NULL);
return ERESTART;
}
#if 0
KASSERT(error != ERESTART);
#else
/* XXXUEBS don't re-fault? */
if (error == ERESTART)
error = EIO;
#endif
UVMHIST_LOG(maphist, "<- pgo_get failed (code %jd)",
error, 0,0,0);
return error;
}
/*
* re-verify the state of the world by first trying to relock
* the maps. always relock the object.
*/
locked = uvmfault_relock(ufi);
if (locked && amap)
amap_lock(amap, flt->upper_lock_type);
/* might be changed */
uobj = pg->uobject;
rw_enter(uobj->vmobjlock, flt->lower_lock_type);
KASSERT((pg->flags & PG_BUSY) != 0);
KASSERT(flt->lower_lock_type == RW_WRITER);
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
/* locked(locked): maps(read), amap(if !null), uobj, pg */
/* locked(!locked): uobj, pg */
/*
* verify that the page has not been released and re-verify
* that amap slot is still free. if there is a problem,
* we unlock and clean up.
*/
if ((pg->flags & PG_RELEASED) != 0 ||
(locked && amap && amap_lookup(&ufi->entry->aref,
ufi->orig_rvaddr - ufi->entry->start))) {
if (locked)
uvmfault_unlockall(ufi, amap, NULL);
locked = false;
}
/*
* unbusy/release the page.
*/
if ((pg->flags & PG_RELEASED) == 0) {
pg->flags &= ~PG_BUSY;
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
UVM_PAGE_OWN(pg, NULL);
} else {
cpu_count(CPU_COUNT_FLTPGRELE, 1);
uvm_pagefree(pg);
}
/*
* didn't get the lock? retry.
*/
if (locked == false) {
UVMHIST_LOG(maphist,
" wasn't able to relock after fault: retry",
0,0,0,0);
rw_exit(uobj->vmobjlock);
return ERESTART;
}
/*
* we have the data in pg. we are holding object lock (so the page
* can't be released on us).
*/
/* locked: maps(read), amap(if !null), uobj */
*ruobj = uobj;
*ruobjpage = pg;
return 0;
}
/*
* uvm_fault_lower_direct: fault lower center page
*
* 1. adjust flt->enter_prot.
* 2. if page is loaned, resolve.
*/
int
uvm_fault_lower_direct(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *uobjpage)
{
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* we are not promoting. if the mapping is COW ensure that we
* don't give more access than we should (e.g. when doing a read
* fault on a COPYONWRITE mapping we want to map the COW page in
* R/O even though the entry protection could be R/W).
*
* set "pg" to the page we want to map in (uobjpage, usually)
*/
cpu_count(CPU_COUNT_FLT_OBJ, 1);
if (UVM_ET_ISCOPYONWRITE(ufi->entry) ||
UVM_OBJ_NEEDS_WRITEFAULT(uobjpage->uobject))
flt->enter_prot &= ~VM_PROT_WRITE;
pg = uobjpage; /* map in the actual object */
KASSERT(uobjpage != PGO_DONTCARE);
/*
* we are faulting directly on the page. be careful
* about writing to loaned pages...
*/
if (uobjpage->loan_count) {
uvm_fault_lower_direct_loan(ufi, flt, uobj, &pg, &uobjpage);
}
KASSERT(pg == uobjpage);
KASSERT((pg->flags & PG_BUSY) == 0);
return uvm_fault_lower_enter(ufi, flt, uobj, NULL, pg);
}
/*
* uvm_fault_lower_direct_loan: resolve loaned page.
*
* 1. if not cow'ing, adjust flt->enter_prot.
* 2. if cow'ing, break loan.
*/
static int
uvm_fault_lower_direct_loan(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page **rpg,
struct vm_page **ruobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_page *pg;
struct vm_page *uobjpage = *ruobjpage;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (!flt->cow_now) {
/* read fault: cap the protection at readonly */
/* cap! */
flt->enter_prot = flt->enter_prot & ~VM_PROT_WRITE;
} else {
/*
* write fault: must break the loan here. to do this
* we need a write lock on the object.
*/
error = uvm_fault_lower_upgrade(ufi, flt, amap, uobj, uobjpage);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(uobj->vmobjlock));
pg = uvm_loanbreak(uobjpage);
if (pg == NULL) {
uvmfault_unlockall(ufi, amap, uobj);
UVMHIST_LOG(maphist,
" out of RAM breaking loan, waiting",
0,0,0,0);
cpu_count(CPU_COUNT_FLTNORAM, 1);
uvm_wait("flt_noram4");
return ERESTART;
}
*rpg = pg;
*ruobjpage = pg;
/*
* drop ownership of page while still holding object lock,
* which won't be dropped until the page is entered.
*/
uvm_pagelock(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~PG_BUSY;
UVM_PAGE_OWN(pg, NULL);
}
return 0;
}
/*
* uvm_fault_lower_promote: promote lower page.
*
* 1. call uvmfault_promote.
* 2. fill in data.
* 3. if not ZFOD, dispose old page.
*/
int
uvm_fault_lower_promote(
struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *uobjpage)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
struct vm_anon *anon;
struct vm_page *pg;
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(amap != NULL);
/* promoting requires a write lock. */
error = uvm_fault_upper_upgrade(ufi, flt, amap, uobj);
if (error != 0) {
return error;
}
KASSERT(rw_write_held(amap->am_lock));
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
/*
* If we are going to promote the data to an anon we
* allocate a blank anon here and plug it into our amap.
*/
error = uvmfault_promote(ufi, NULL, uobjpage, &anon, &flt->anon_spare);
switch (error) {
case 0:
break;
case ERESTART:
return ERESTART;
default:
return error;
}
pg = anon->an_page;
/*
* Fill in the data.
*/
if (uobjpage != PGO_DONTCARE) {
cpu_count(CPU_COUNT_FLT_PRCOPY, 1);
/*
* promote to shared amap? make sure all sharing
* procs see it
*/
if ((amap_flags(amap) & AMAP_SHARED) != 0) {
pmap_page_protect(uobjpage, VM_PROT_NONE);
/*
* XXX: PAGE MIGHT BE WIRED!
*/
}
UVMHIST_LOG(maphist,
" promote uobjpage %#jx to anon/page %#jx/%#jx",
(uintptr_t)uobjpage, (uintptr_t)anon, (uintptr_t)pg, 0);
} else {
cpu_count(CPU_COUNT_FLT_PRZERO, 1);
/*
* Page is zero'd and marked dirty by
* uvmfault_promote().
*/
UVMHIST_LOG(maphist," zero fill anon/page %#jx/%#jx",
(uintptr_t)anon, (uintptr_t)pg, 0, 0);
}
return uvm_fault_lower_enter(ufi, flt, uobj, anon, pg);
}
/*
* uvm_fault_lower_enter: enter h/w mapping of lower page or anon page promoted
* from the lower page.
*/
int
uvm_fault_lower_enter(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj,
struct vm_anon *anon, struct vm_page *pg)
{
struct vm_amap * const amap = ufi->entry->aref.ar_amap;
const bool readonly = uvm_pagereadonly_p(pg);
int error;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* Locked:
*
* maps(read), amap(if !null), uobj(if !null),
* anon(if !null), pg(if anon), unlock_uobj(if !null)
*
* anon must be write locked (promotion). uobj can be either.
*
* Note: pg is either the uobjpage or the new page in the new anon.
*/
KASSERT(amap == NULL ||
rw_lock_op(amap->am_lock) == flt->upper_lock_type);
KASSERT(uobj == NULL ||
rw_lock_op(uobj->vmobjlock) == flt->lower_lock_type);
KASSERT(anon == NULL || anon->an_lock == amap->am_lock);
/*
* note that pg can't be PG_RELEASED or PG_BUSY since we did
* not drop the object lock since the last time we checked.
*/
KASSERT((pg->flags & PG_RELEASED) == 0);
KASSERT((pg->flags & PG_BUSY) == 0);
/*
* all resources are present. we can now map it in and free our
* resources.
*/
UVMHIST_LOG(maphist,
" MAPPING: case2: pm=%#jx, va=%#jx, pg=%#jx, promote=%jd",
(uintptr_t)ufi->orig_map->pmap, ufi->orig_rvaddr,
(uintptr_t)pg, flt->promote);
KASSERTMSG((flt->access_type & VM_PROT_WRITE) == 0 || !readonly,
"promote=%u cow_now=%u access_type=%x enter_prot=%x cow=%u "
"entry=%p map=%p orig_rvaddr=%p pg=%p",
flt->promote, flt->cow_now, flt->access_type, flt->enter_prot,
UVM_ET_ISCOPYONWRITE(ufi->entry), ufi->entry, ufi->orig_map,
(void *)ufi->orig_rvaddr, pg);
KASSERT((flt->access_type & VM_PROT_WRITE) == 0 || !readonly);
if (pmap_enter(ufi->orig_map->pmap, ufi->orig_rvaddr,
VM_PAGE_TO_PHYS(pg),
readonly ? flt->enter_prot & ~VM_PROT_WRITE : flt->enter_prot,
flt->access_type | PMAP_CANFAIL |
(flt->wire_mapping ? PMAP_WIRED : 0)) != 0) {
/*
* No need to undo what we did; we can simply think of
* this as the pmap throwing away the mapping information.
*
* We do, however, have to go through the ReFault path,
* as the map may change while we're asleep.
*/
/*
* ensure that the page is queued in the case that
* we just promoted the page.
*/
if (anon != NULL) {
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
}
uvmfault_unlockall(ufi, amap, uobj);
if (!uvm_reclaimable()) {
UVMHIST_LOG(maphist,
"<- failed. out of VM",0,0,0,0);
/* XXX instrumentation */
error = ENOMEM;
return error;
}
/* XXX instrumentation */
uvm_wait("flt_pmfail2");
return ERESTART;
}
uvm_fault_lower_done(ufi, flt, uobj, pg);
pmap_update(ufi->orig_map->pmap);
uvmfault_unlockall(ufi, amap, uobj);
UVMHIST_LOG(maphist, "<- done (SUCCESS!)",0,0,0,0);
return 0;
}
/*
* uvm_fault_lower_done: queue lower center page.
*/
void
uvm_fault_lower_done(
struct uvm_faultinfo *ufi, const struct uvm_faultctx *flt,
struct uvm_object *uobj, struct vm_page *pg)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
if (flt->wire_paging) {
uvm_pagelock(pg);
uvm_pagewire(pg);
uvm_pageunlock(pg);
if (pg->flags & PG_AOBJ) {
/*
* since the now-wired page cannot be paged out,
* release its swap resources for others to use.
* since an aobj page with no swap cannot be clean,
* mark it dirty now.
*
* use pg->uobject here. if the page is from a
* tmpfs vnode, the pages are backed by its UAO and
* not the vnode.
*/
KASSERT(uobj != NULL);
KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
uao_dropswap(pg->uobject, pg->offset >> PAGE_SHIFT);
}
} else if (uvmpdpol_pageactivate_p(pg)) {
/*
* avoid re-activating the page unless needed,
* to avoid false sharing on multiprocessor.
*/
uvm_pagelock(pg);
uvm_pageactivate(pg);
uvm_pageunlock(pg);
}
}
/*
* uvm_fault_wire: wire down a range of virtual addresses in a map.
*
* => map may be read-locked by caller, but MUST NOT be write-locked.
* => if map is read-locked, any operations which may cause map to
* be write-locked in uvm_fault() must be taken care of by
* the caller. See uvm_map_pageable().
*/
int
uvm_fault_wire(struct vm_map *map, vaddr_t start, vaddr_t end,
vm_prot_t access_type, int maxprot)
{
vaddr_t va;
int error;
/*
* now fault it in a page at a time. if the fault fails then we have
* to undo what we have done. note that in uvm_fault VM_PROT_NONE
* is replaced with the max protection if fault_type is VM_FAULT_WIRE.
*/
/*
* XXX work around overflowing a vaddr_t. this prevents us from
* wiring the last page in the address space, though.
*/
if (start > end) {
return EFAULT;
}
for (va = start; va < end; va += PAGE_SIZE) {
error = uvm_fault_internal(map, va, access_type,
(maxprot ? UVM_FAULT_MAXPROT : 0) | UVM_FAULT_WIRE);
if (error) {
if (va != start) {
uvm_fault_unwire(map, start, va);
}
return error;
}
}
return 0;
}
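/*
 * Illustrative sketch, not part of the original source: a hypothetical
 * caller that wires a page-aligned range with uvm_fault_wire() above,
 * uses it, and then unwires it with uvm_fault_unwire() below.
 */
#if 0
static int
example_wire_range(struct vm_map *map, vaddr_t start, vaddr_t end)
{
	int error;

	/* fault in and wire each page; on failure the rest is already unwired */
	error = uvm_fault_wire(map, start, end, VM_PROT_READ, false);
	if (error != 0)
		return error;

	/* ... access the wired range ... */

	uvm_fault_unwire(map, start, end);
	return 0;
}
#endif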
/*
* uvm_fault_unwire(): unwire range of virtual space.
*/
void
uvm_fault_unwire(struct vm_map *map, vaddr_t start, vaddr_t end)
{
vm_map_lock_read(map);
uvm_fault_unwire_locked(map, start, end);
vm_map_unlock_read(map);
}
/*
* uvm_fault_unwire_locked(): the guts of uvm_fault_unwire().
*
* => map must be at least read-locked.
*/
void
uvm_fault_unwire_locked(struct vm_map *map, vaddr_t start, vaddr_t end)
{
struct vm_map_entry *entry, *oentry;
pmap_t pmap = vm_map_pmap(map);
vaddr_t va;
paddr_t pa;
struct vm_page *pg;
/*
* we assume that the area we are unwiring has actually been wired
* in the first place. this means that we should be able to extract
* the PAs from the pmap. we also lock out the page daemon so that
* we can call uvm_pageunwire.
*/
/*
* find the beginning map entry for the region.
*/
KASSERT(start >= vm_map_min(map));
KASSERT(end <= vm_map_max(map));
if (uvm_map_lookup_entry(map, start, &entry) == false)
panic("uvm_fault_unwire_locked: address not in map");
oentry = NULL;
for (va = start; va < end; va += PAGE_SIZE) {
/*
* find the map entry for the current address.
*/
KASSERT(va >= entry->start);
while (va >= entry->end) {
KASSERT(entry->next != &map->header);
KASSERT(entry->next->start <= entry->end);
entry = entry->next;
}
/*
* lock it.
*/
if (entry != oentry) {
if (oentry != NULL) {
uvm_map_unlock_entry(oentry);
}
uvm_map_lock_entry(entry, RW_WRITER);
oentry = entry;
}
/*
* if the entry is no longer wired, tell the pmap.
*/
if (!pmap_extract(pmap, va, &pa))
continue;
if (VM_MAPENT_ISWIRED(entry) == 0)
pmap_unwire(pmap, va);
pg = PHYS_TO_VM_PAGE(pa);
if (pg) {
uvm_pagelock(pg);
uvm_pageunwire(pg);
uvm_pageunlock(pg);
}
}
if (oentry != NULL) {
uvm_map_unlock_entry(entry);
}
}
/* $NetBSD: igmp.c,v 1.70 2020/05/15 06:34:34 maxv Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Internet Group Management Protocol (IGMP) routines.
*
* Written by Steve Deering, Stanford, May 1988.
* Modified by Rosen Sharma, Stanford, Aug 1994.
* Modified by Bill Fenner, Xerox PARC, Feb 1995.
*
* MULTICAST Revision: 1.3
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: igmp.c,v 1.70 2020/05/15 06:34:34 maxv Exp $");
#ifdef _KERNEL_OPT
#include "opt_mrouting.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/cprng.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/net_stats.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/igmp.h>
#include <netinet/igmp_var.h>
/*
* Per-interface router version information.
*/
typedef struct router_info {
LIST_ENTRY(router_info) rti_link;
ifnet_t * rti_ifp;
int rti_type; /* type of router on this interface */
int rti_age; /* time since last v1 query */
} router_info_t;
/*
* The router-info list and the timer flag are protected by in_multilock.
*
* Lock order:
*
* softnet_lock ->
* in_multilock
*/
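/*
 * Illustrative sketch, not part of the original source: taking the locks
 * in the order documented above, as igmp_fasttimo() below does.
 */
#if 0
static void
igmp_lock_order_example(void)
{

	SOFTNET_LOCK_UNLESS_NET_MPSAFE();	/* softnet_lock first ... */
	in_multi_lock(RW_WRITER);		/* ... then in_multilock */
	/* ... walk rti_head, update igmp_timers_on ... */
	in_multi_unlock();
	SOFTNET_UNLOCK_UNLESS_NET_MPSAFE();
}
#endif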
static struct pool igmp_rti_pool __cacheline_aligned;
static LIST_HEAD(, router_info) rti_head __cacheline_aligned;
static int igmp_timers_on __cacheline_aligned;
static percpu_t * igmpstat_percpu __read_mostly;
#define IGMP_STATINC(x) _NET_STATINC(igmpstat_percpu, x)
static void igmp_sendpkt(struct in_multi *, int);
static int rti_fill(struct in_multi *);
static router_info_t * rti_find(struct ifnet *);
static void rti_delete(struct ifnet *);
static void sysctl_net_inet_igmp_setup(struct sysctllog **);
/*
* rti_fill: associate router information with the given multicast group;
* if there is no router information for the interface, then create it.
*/
static int
rti_fill(struct in_multi *inm)
{
router_info_t *rti;
KASSERT(in_multi_lock_held());
LIST_FOREACH(rti, &rti_head, rti_link) {
if (rti->rti_ifp == inm->inm_ifp) {
inm->inm_rti = rti;
return rti->rti_type == IGMP_v1_ROUTER ?
IGMP_v1_HOST_MEMBERSHIP_REPORT :
IGMP_v2_HOST_MEMBERSHIP_REPORT;
}
}
rti = pool_get(&igmp_rti_pool, PR_NOWAIT);
if (rti == NULL) {
return 0;
}
rti->rti_ifp = inm->inm_ifp;
rti->rti_type = IGMP_v2_ROUTER;
LIST_INSERT_HEAD(&rti_head, rti, rti_link);
inm->inm_rti = rti;
return IGMP_v2_HOST_MEMBERSHIP_REPORT;
}
/*
* rti_find: lookup or create router information for the given interface.
*/
static router_info_t *
rti_find(ifnet_t *ifp)
{
router_info_t *rti;
KASSERT(in_multi_lock_held());
LIST_FOREACH(rti, &rti_head, rti_link) {
if (rti->rti_ifp == ifp)
return rti;
}
rti = pool_get(&igmp_rti_pool, PR_NOWAIT);
if (rti == NULL) {
return NULL;
}
rti->rti_ifp = ifp;
rti->rti_type = IGMP_v2_ROUTER;
LIST_INSERT_HEAD(&rti_head, rti, rti_link);
return rti;
}
/*
* rti_delete: remove and free the router information entry for the
* given interface.
*/
static void
rti_delete(ifnet_t *ifp)
{
router_info_t *rti;
KASSERT(in_multi_lock_held());
LIST_FOREACH(rti, &rti_head, rti_link) {
if (rti->rti_ifp == ifp) {
LIST_REMOVE(rti, rti_link);
pool_put(&igmp_rti_pool, rti);
break;
}
}
}
void
igmp_init(void)
{
pool_init(&igmp_rti_pool, sizeof(router_info_t), 0, 0, 0,
"igmppl", NULL, IPL_SOFTNET);
igmpstat_percpu = percpu_alloc(sizeof(uint64_t) * IGMP_NSTATS);
sysctl_net_inet_igmp_setup(NULL);
LIST_INIT(&rti_head);
}
void
igmp_input(struct mbuf *m, int off, int proto)
{
ifnet_t *ifp;
struct ip *ip = mtod(m, struct ip *);
struct igmp *igmp;
u_int minlen, timer;
struct in_multi *inm;
struct in_ifaddr *ia;
int ip_len, iphlen;
struct psref psref;
iphlen = off;
IGMP_STATINC(IGMP_STAT_RCV_TOTAL);
/*
* Validate lengths
*/
minlen = iphlen + IGMP_MINLEN;
ip_len = ntohs(ip->ip_len);
if (ip_len < minlen) {
IGMP_STATINC(IGMP_STAT_RCV_TOOSHORT);
m_freem(m);
return;
}
if (((m->m_flags & M_EXT) && (ip->ip_src.s_addr & IN_CLASSA_NET) == 0)
|| m->m_len < minlen) {
if ((m = m_pullup(m, minlen)) == NULL) {
IGMP_STATINC(IGMP_STAT_RCV_TOOSHORT);
return;
}
ip = mtod(m, struct ip *);
}
/*
* Validate checksum
*/
m->m_data += iphlen;
m->m_len -= iphlen;
igmp = mtod(m, struct igmp *);
/* No need to assert alignment here. */
if (in_cksum(m, ip_len - iphlen)) {
IGMP_STATINC(IGMP_STAT_RCV_BADSUM);
m_freem(m);
return;
}
m->m_data -= iphlen;
m->m_len += iphlen;
ifp = m_get_rcvif_psref(m, &psref);
if (__predict_false(ifp == NULL))
goto drop;
switch (igmp->igmp_type) {
case IGMP_HOST_MEMBERSHIP_QUERY:
IGMP_STATINC(IGMP_STAT_RCV_QUERIES);
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (igmp->igmp_code == 0) {
struct in_multistep step;
router_info_t *rti;
if (ip->ip_dst.s_addr != INADDR_ALLHOSTS_GROUP) {
IGMP_STATINC(IGMP_STAT_RCV_BADQUERIES);
goto drop;
}
in_multi_lock(RW_WRITER);
rti = rti_find(ifp);
if (rti == NULL) {
in_multi_unlock();
break;
}
rti->rti_type = IGMP_v1_ROUTER;
rti->rti_age = 0;
/*
* Start the timers in all of our membership records
* for the interface on which the query arrived,
* except those that are already running and those
* that belong to a "local" group (224.0.0.X).
*/
inm = in_first_multi(&step);
while (inm != NULL) {
if (inm->inm_ifp == ifp &&
inm->inm_timer == 0 &&
!IN_LOCAL_GROUP(inm->inm_addr.s_addr)) {
inm->inm_state = IGMP_DELAYING_MEMBER;
inm->inm_timer = IGMP_RANDOM_DELAY(
IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ);
igmp_timers_on = true;
}
inm = in_next_multi(&step);
}
in_multi_unlock();
} else {
struct in_multistep step;
if (!IN_MULTICAST(ip->ip_dst.s_addr)) {
IGMP_STATINC(IGMP_STAT_RCV_BADQUERIES);
goto drop;
}
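/*
 * Convert the maximum response time carried in the query into
 * fast-timeout ticks, with a minimum of one tick.
 */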
timer = igmp->igmp_code * PR_FASTHZ / IGMP_TIMER_SCALE;
if (timer == 0)
timer = 1;
/*
* Start the timers in all of our membership records
* for the interface on which the query arrived,
* except those that are already running and those
* that belong to a "local" group (224.0.0.X). For
* timers already running, check if they need to be
* reset.
*/
in_multi_lock(RW_WRITER);
inm = in_first_multi(&step);
while (inm != NULL) {
if (inm->inm_ifp == ifp &&
!IN_LOCAL_GROUP(inm->inm_addr.s_addr) &&
(ip->ip_dst.s_addr == INADDR_ALLHOSTS_GROUP ||
in_hosteq(ip->ip_dst, inm->inm_addr))) {
switch (inm->inm_state) {
case IGMP_DELAYING_MEMBER:
if (inm->inm_timer <= timer)
break;
/* FALLTHROUGH */
case IGMP_IDLE_MEMBER:
case IGMP_LAZY_MEMBER:
case IGMP_AWAKENING_MEMBER:
inm->inm_state =
IGMP_DELAYING_MEMBER;
inm->inm_timer =
IGMP_RANDOM_DELAY(timer);
igmp_timers_on = true;
break;
case IGMP_SLEEPING_MEMBER:
inm->inm_state =
IGMP_AWAKENING_MEMBER;
break;
}
}
inm = in_next_multi(&step);
}
in_multi_unlock();
}
break;
case IGMP_v1_HOST_MEMBERSHIP_REPORT:
IGMP_STATINC(IGMP_STAT_RCV_REPORTS);
if (ifp->if_flags & IFF_LOOPBACK)
break;
if (!IN_MULTICAST(igmp->igmp_group.s_addr) ||
!in_hosteq(igmp->igmp_group, ip->ip_dst)) {
IGMP_STATINC(IGMP_STAT_RCV_BADREPORTS);
goto drop;
}
/*
* KLUDGE: if the IP source address of the report has an
* unspecified (i.e., zero) subnet number, as is allowed for
* a booting host, replace it with the correct subnet number
* so that a process-level multicast routing daemon can
* determine which subnet it arrived from. This is necessary
* to compensate for the lack of any way for a process to
* determine the arrival interface of an incoming packet.
*/
if ((ip->ip_src.s_addr & IN_CLASSA_NET) == 0) {
int s = pserialize_read_enter();
ia = in_get_ia_from_ifp(ifp); /* XXX */
if (ia)
ip->ip_src.s_addr = ia->ia_subnet;
pserialize_read_exit(s);
}
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
in_multi_lock(RW_WRITER);
inm = in_lookup_multi(igmp->igmp_group, ifp);
if (inm != NULL) {
inm->inm_timer = 0;
IGMP_STATINC(IGMP_STAT_RCV_OURREPORTS);
switch (inm->inm_state) {
case IGMP_IDLE_MEMBER:
case IGMP_LAZY_MEMBER:
case IGMP_AWAKENING_MEMBER:
case IGMP_SLEEPING_MEMBER:
inm->inm_state = IGMP_SLEEPING_MEMBER;
break;
case IGMP_DELAYING_MEMBER:
if (inm->inm_rti->rti_type == IGMP_v1_ROUTER)
inm->inm_state = IGMP_LAZY_MEMBER;
else
inm->inm_state = IGMP_SLEEPING_MEMBER;
break;
}
}
in_multi_unlock();
break;
case IGMP_v2_HOST_MEMBERSHIP_REPORT: {
int s = pserialize_read_enter();
#ifdef MROUTING
/*
* Make sure we don't hear our own membership report. Fast
* leave requires knowing that we are the only member of a
* group.
*/
ia = in_get_ia_from_ifp(ifp); /* XXX */
if (ia && in_hosteq(ip->ip_src, ia->ia_addr.sin_addr)) {
pserialize_read_exit(s);
break;
}
#endif
IGMP_STATINC(IGMP_STAT_RCV_REPORTS);
if (ifp->if_flags & IFF_LOOPBACK) {
pserialize_read_exit(s);
break;
}
if (!IN_MULTICAST(igmp->igmp_group.s_addr) ||
!in_hosteq(igmp->igmp_group, ip->ip_dst)) {
IGMP_STATINC(IGMP_STAT_RCV_BADREPORTS);
pserialize_read_exit(s);
goto drop;
}
/*
* KLUDGE: if the IP source address of the report has an
* unspecified (i.e., zero) subnet number, as is allowed for
* a booting host, replace it with the correct subnet number
* so that a process-level multicast routing daemon can
* determine which subnet it arrived from. This is necessary
* to compensate for the lack of any way for a process to
* determine the arrival interface of an incoming packet.
*/
if ((ip->ip_src.s_addr & IN_CLASSA_NET) == 0) {
#ifndef MROUTING
ia = in_get_ia_from_ifp(ifp); /* XXX */
#endif
if (ia)
ip->ip_src.s_addr = ia->ia_subnet;
}
pserialize_read_exit(s);
/*
* If we belong to the group being reported, stop
* our timer for that group.
*/
in_multi_lock(RW_WRITER);
inm = in_lookup_multi(igmp->igmp_group, ifp);
if (inm != NULL) {
inm->inm_timer = 0;
IGMP_STATINC(IGMP_STAT_RCV_OURREPORTS);
switch (inm->inm_state) {
case IGMP_DELAYING_MEMBER:
case IGMP_IDLE_MEMBER:
case IGMP_AWAKENING_MEMBER:
inm->inm_state = IGMP_LAZY_MEMBER;
break;
case IGMP_LAZY_MEMBER:
case IGMP_SLEEPING_MEMBER:
break;
}
}
in_multi_unlock();
break;
}
}
m_put_rcvif_psref(ifp, &psref);
/*
* Pass all valid IGMP packets up to any process(es) listening
* on a raw IGMP socket.
*/
/*
* Currently, igmp_input() is always called holding softnet_lock
* by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE).
*/
KASSERT(mutex_owned(softnet_lock));
rip_input(m, iphlen, proto);
return;
drop:
m_put_rcvif_psref(ifp, &psref);
m_freem(m);
return;
}
int
igmp_joingroup(struct in_multi *inm)
{
KASSERT(in_multi_lock_held());
inm->inm_state = IGMP_IDLE_MEMBER;
if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) &&
(inm->inm_ifp->if_flags & IFF_LOOPBACK) == 0) {
int report_type;
report_type = rti_fill(inm);
if (report_type == 0) {
return ENOMEM;
}
igmp_sendpkt(inm, report_type);
inm->inm_state = IGMP_DELAYING_MEMBER;
inm->inm_timer = IGMP_RANDOM_DELAY(
IGMP_MAX_HOST_REPORT_DELAY * PR_FASTHZ);
igmp_timers_on = true;
} else
inm->inm_timer = 0;
return 0;
}
void
igmp_leavegroup(struct in_multi *inm)
{
KASSERT(in_multi_lock_held());
switch (inm->inm_state) {
case IGMP_DELAYING_MEMBER:
case IGMP_IDLE_MEMBER:
if (!IN_LOCAL_GROUP(inm->inm_addr.s_addr) &&
(inm->inm_ifp->if_flags & IFF_LOOPBACK) == 0)
if (inm->inm_rti->rti_type != IGMP_v1_ROUTER)
igmp_sendpkt(inm, IGMP_HOST_LEAVE_MESSAGE);
break;
case IGMP_LAZY_MEMBER:
case IGMP_AWAKENING_MEMBER:
case IGMP_SLEEPING_MEMBER:
break;
}
}
void
igmp_fasttimo(void)
{
struct in_multi *inm;
struct in_multistep step;
/*
* Quick check to see if any work needs to be done, in order
* to minimize the overhead of fasttimo processing.
*/
if (!igmp_timers_on) {
return;
}
/* XXX: Needed for ip_output(). */
SOFTNET_LOCK_UNLESS_NET_MPSAFE();
in_multi_lock(RW_WRITER);
igmp_timers_on = false;
inm = in_first_multi(&step);
while (inm != NULL) {
if (inm->inm_timer == 0) {
/* do nothing */
} else if (--inm->inm_timer == 0) {
if (inm->inm_state == IGMP_DELAYING_MEMBER) {
if (inm->inm_rti->rti_type == IGMP_v1_ROUTER)
igmp_sendpkt(inm,
IGMP_v1_HOST_MEMBERSHIP_REPORT);
else
igmp_sendpkt(inm,
IGMP_v2_HOST_MEMBERSHIP_REPORT);
inm->inm_state = IGMP_IDLE_MEMBER;
}
} else {
igmp_timers_on = true;
}
inm = in_next_multi(&step);
}
in_multi_unlock();
SOFTNET_UNLOCK_UNLESS_NET_MPSAFE();
}
void
igmp_slowtimo(void)
{
router_info_t *rti;
in_multi_lock(RW_WRITER);
LIST_FOREACH(rti, &rti_head, rti_link) {
if (rti->rti_type == IGMP_v1_ROUTER &&
++rti->rti_age >= IGMP_AGE_THRESHOLD) {
rti->rti_type = IGMP_v2_ROUTER;
}
}
in_multi_unlock();
}
/*
* igmp_sendpkt: construct an IGMP packet, given the multicast structure
* and the type, and send the datagram.
*/
static void
igmp_sendpkt(struct in_multi *inm, int type)
{
struct mbuf *m;
struct igmp *igmp;
struct ip *ip;
struct ip_moptions imo;
KASSERT(in_multi_lock_held());
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m == NULL)
return;
KASSERT(max_linkhdr + sizeof(struct ip) + IGMP_MINLEN <= MHLEN);
m->m_data += max_linkhdr;
m->m_len = sizeof(struct ip) + IGMP_MINLEN;
m->m_pkthdr.len = sizeof(struct ip) + IGMP_MINLEN;
ip = mtod(m, struct ip *);
ip->ip_tos = 0;
ip->ip_len = htons(sizeof(struct ip) + IGMP_MINLEN);
ip->ip_off = htons(0);
ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
ip->ip_p = IPPROTO_IGMP;
ip->ip_src = zeroin_addr;
ip->ip_dst = inm->inm_addr;
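/*
 * Temporarily advance past the IP header so that the in_cksum() below
 * covers only the IGMP portion; the mbuf offsets are restored afterwards.
 */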
m->m_data += sizeof(struct ip);
m->m_len -= sizeof(struct ip);
igmp = mtod(m, struct igmp *);
igmp->igmp_type = type;
igmp->igmp_code = 0;
igmp->igmp_group = inm->inm_addr;
igmp->igmp_cksum = 0;
igmp->igmp_cksum = in_cksum(m, IGMP_MINLEN);
m->m_data -= sizeof(struct ip);
m->m_len += sizeof(struct ip);
imo.imo_multicast_if_index = if_get_index(inm->inm_ifp);
imo.imo_multicast_ttl = 1;
/*
* Request loopback of the report if we are acting as a multicast
* router, so that the process-level routing daemon can hear it.
*/
#ifdef MROUTING
extern struct socket *ip_mrouter;
imo.imo_multicast_loop = (ip_mrouter != NULL);
#else
imo.imo_multicast_loop = 0;
#endif
/*
* Note: IP_IGMP_MCAST indicates that in_multilock is held.
* The caller must still acquire softnet_lock for ip_output().
*/
#ifndef NET_MPSAFE
KASSERT(mutex_owned(softnet_lock));
#endif
ip_output(m, NULL, NULL, IP_IGMP_MCAST, &imo, NULL);
IGMP_STATINC(IGMP_STAT_SND_REPORTS);
}
void
igmp_purgeif(ifnet_t *ifp)
{
in_multi_lock(RW_WRITER);
rti_delete(ifp);
in_multi_unlock();
}
static int
sysctl_net_inet_igmp_stats(SYSCTLFN_ARGS)
{
return NETSTAT_SYSCTL(igmpstat_percpu, IGMP_NSTATS);
}
static void
sysctl_net_inet_igmp_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "igmp",
SYSCTL_DESCR("Internet Group Management Protocol"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET, IPPROTO_IGMP, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("IGMP statistics"),
sysctl_net_inet_igmp_stats, 0, NULL, 0,
CTL_NET, PF_INET, IPPROTO_IGMP, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: subr_pool.c,v 1.290 2023/04/09 12:21:59 riastradh Exp $ */
/*
* Copyright (c) 1997, 1999, 2000, 2002, 2007, 2008, 2010, 2014, 2015, 2018,
* 2020, 2021 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Paul Kranenburg; by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center; by Andrew Doran, and by
* Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pool.c,v 1.290 2023/04/09 12:21:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include "opt_pool.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/bitops.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vmem.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/debug.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/xcall.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/asan.h>
#include <sys/msan.h>
#include <sys/fault.h>
#include <uvm/uvm_extern.h>
/*
* Pool resource management utility.
*
* Memory is allocated in pages which are split into pieces according to
* the pool item size. Each page is kept on one of three lists in the
* pool structure: `pr_emptypages', `pr_fullpages' and `pr_partpages',
* for empty, full and partially-full pages respectively. The individual
* pool items are on a linked list headed by `ph_itemlist' in each page
* header. The memory for building the page list is either taken from
* the allocated pages themselves (for small pool items) or taken from
* an internal pool of page headers (`phpool').
*/
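/*
 * Illustrative sketch, not part of the original source: typical use of the
 * pool API for a hypothetical "struct foo".  pool_get() with PR_WAITOK may
 * sleep; with PR_NOWAIT it may instead return NULL.
 */
#if 0
static struct pool foo_pool;

static void
foo_pool_example(void)
{
	struct foo *f;

	pool_init(&foo_pool, sizeof(struct foo), 0, 0, 0,
	    "foopl", NULL, IPL_NONE);
	f = pool_get(&foo_pool, PR_WAITOK);
	/* ... use f ... */
	pool_put(&foo_pool, f);
	pool_destroy(&foo_pool);
}
#endif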
/* List of all pools. Non static as needed by 'vmstat -m' */
TAILQ_HEAD(, pool) pool_head = TAILQ_HEAD_INITIALIZER(pool_head);
/* Private pool for page header structures */
#define PHPOOL_MAX 8
static struct pool phpool[PHPOOL_MAX];
#define PHPOOL_FREELIST_NELEM(idx) \
(((idx) == 0) ? BITMAP_MIN_SIZE : BITMAP_SIZE * (1 << (idx)))
#if !defined(KMSAN) && (defined(DIAGNOSTIC) || defined(KASAN))
#define POOL_REDZONE
#endif
#if defined(POOL_QUARANTINE)
#define POOL_NOCACHE
#endif
#ifdef POOL_REDZONE
# ifdef KASAN
# define POOL_REDZONE_SIZE 8
# else
# define POOL_REDZONE_SIZE 2
# endif
static void pool_redzone_init(struct pool *, size_t);
static void pool_redzone_fill(struct pool *, void *);
static void pool_redzone_check(struct pool *, void *);
static void pool_cache_redzone_check(pool_cache_t, void *);
#else
# define pool_redzone_init(pp, sz) __nothing
# define pool_redzone_fill(pp, ptr) __nothing
# define pool_redzone_check(pp, ptr) __nothing
# define pool_cache_redzone_check(pc, ptr) __nothing
#endif
#ifdef KMSAN
static inline void pool_get_kmsan(struct pool *, void *);
static inline void pool_put_kmsan(struct pool *, void *);
static inline void pool_cache_get_kmsan(pool_cache_t, void *);
static inline void pool_cache_put_kmsan(pool_cache_t, void *);
#else
#define pool_get_kmsan(pp, ptr) __nothing
#define pool_put_kmsan(pp, ptr) __nothing
#define pool_cache_get_kmsan(pc, ptr) __nothing
#define pool_cache_put_kmsan(pc, ptr) __nothing
#endif
#ifdef POOL_QUARANTINE
static void pool_quarantine_init(struct pool *);
static void pool_quarantine_flush(struct pool *);
static bool pool_put_quarantine(struct pool *, void *,
struct pool_pagelist *);
#else
#define pool_quarantine_init(a) __nothing
#define pool_quarantine_flush(a) __nothing
#define pool_put_quarantine(a, b, c) false
#endif
#ifdef POOL_NOCACHE
static bool pool_cache_put_nocache(pool_cache_t, void *);
#else
#define pool_cache_put_nocache(a, b) false
#endif
#define NO_CTOR __FPTRCAST(int (*)(void *, void *, int), nullop)
#define NO_DTOR __FPTRCAST(void (*)(void *, void *), nullop)
#define pc_has_pser(pc) (((pc)->pc_roflags & PR_PSERIALIZE) != 0)
#define pc_has_ctor(pc) ((pc)->pc_ctor != NO_CTOR)
#define pc_has_dtor(pc) ((pc)->pc_dtor != NO_DTOR)
#define pp_has_pser(pp) (((pp)->pr_roflags & PR_PSERIALIZE) != 0)
#define pool_barrier() xc_barrier(0)
/*
* Pool backend allocators.
*
* Each pool has a backend allocator that handles allocation, deallocation,
* and any additional draining that might be needed.
*
* We provide two standard allocators:
*
* pool_allocator_kmem - the default when no allocator is specified
*
* pool_allocator_nointr - used for pools that will not be accessed
* in interrupt context.
*/
void *pool_page_alloc(struct pool *, int);
void pool_page_free(struct pool *, void *);
static void *pool_page_alloc_meta(struct pool *, int);
static void pool_page_free_meta(struct pool *, void *);
struct pool_allocator pool_allocator_kmem = {
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 0
};
struct pool_allocator pool_allocator_nointr = {
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 0
};
struct pool_allocator pool_allocator_meta = {
.pa_alloc = pool_page_alloc_meta,
.pa_free = pool_page_free_meta,
.pa_pagesz = 0
};
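/*
 * Backend allocators for oversized pool pages: powers of two from
 * 2^POOL_ALLOCATOR_BIG_BASE (8 KB) up to 2^(POOL_ALLOCATOR_BIG_BASE + 11)
 * (16 MB), selected via pool_bigidx().
 */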
#define POOL_ALLOCATOR_BIG_BASE 13
static struct pool_allocator pool_allocator_big[] = {
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 0),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 1),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 2),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 3),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 4),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 5),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 6),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 7),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 8),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 9),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 10),
},
{
.pa_alloc = pool_page_alloc,
.pa_free = pool_page_free,
.pa_pagesz = 1 << (POOL_ALLOCATOR_BIG_BASE + 11),
}
};
static int pool_bigidx(size_t);
/* # of seconds to retain page after last use */
int pool_inactive_time = 10;
/* Next candidate for drainage (see pool_drain()) */
static struct pool *drainpp;
/* This lock protects both pool_head and drainpp. */
static kmutex_t pool_head_lock;
static kcondvar_t pool_busy;
/* This lock protects initialization of a potentially shared pool allocator */
static kmutex_t pool_allocator_lock;
static unsigned int poolid_counter = 0;
typedef uint32_t pool_item_bitmap_t;
#define BITMAP_SIZE (CHAR_BIT * sizeof(pool_item_bitmap_t))
#define BITMAP_MASK (BITMAP_SIZE - 1)
#define BITMAP_MIN_SIZE (CHAR_BIT * sizeof(((struct pool_item_header *)NULL)->ph_u2))
struct pool_item_header {
/* Page headers */
LIST_ENTRY(pool_item_header)
ph_pagelist; /* pool page list */
union {
/* !PR_PHINPAGE */
struct {
SPLAY_ENTRY(pool_item_header)
phu_node; /* off-page page headers */
} phu_offpage;
/* PR_PHINPAGE */
struct {
unsigned int phu_poolid;
} phu_onpage;
} ph_u1;
void * ph_page; /* this page's address */
uint32_t ph_time; /* last referenced */
uint16_t ph_nmissing; /* # of chunks in use */
uint16_t ph_off; /* start offset in page */
union {
/* !PR_USEBMAP */
struct {
LIST_HEAD(, pool_item)
phu_itemlist; /* chunk list for this page */
} phu_normal;
/* PR_USEBMAP */
struct {
pool_item_bitmap_t phu_bitmap[1];
} phu_notouch;
} ph_u2;
};
#define ph_node ph_u1.phu_offpage.phu_node
#define ph_poolid ph_u1.phu_onpage.phu_poolid
#define ph_itemlist ph_u2.phu_normal.phu_itemlist
#define ph_bitmap ph_u2.phu_notouch.phu_bitmap
#define PHSIZE ALIGN(sizeof(struct pool_item_header))
CTASSERT(offsetof(struct pool_item_header, ph_u2) +
BITMAP_MIN_SIZE / CHAR_BIT == sizeof(struct pool_item_header));
#if defined(DIAGNOSTIC) && !defined(KASAN)
#define POOL_CHECK_MAGIC
#endif
struct pool_item {
#ifdef POOL_CHECK_MAGIC
u_int pi_magic;
#endif
#define PI_MAGIC 0xdeaddeadU
/* Other entries use only this list entry */
LIST_ENTRY(pool_item) pi_list;
};
#define POOL_NEEDS_CATCHUP(pp) \
((pp)->pr_nitems < (pp)->pr_minitems || \
(pp)->pr_npages < (pp)->pr_minpages)
#define POOL_OBJ_TO_PAGE(pp, v) \
(void *)((uintptr_t)v & pp->pr_alloc->pa_pagemask)
/*
* Pool cache management.
*
* Pool caches provide a way for constructed objects to be cached by the
* pool subsystem. This can lead to performance improvements by avoiding
* needless object construction/destruction; it is deferred until absolutely
* necessary.
*
* Caches are grouped into cache groups. Each cache group references up
* to PCG_NUMOBJECTS constructed objects. When a cache allocates an
* object from the pool, it calls the object's constructor and places it
* into a cache group. When a cache group frees an object back to the
* pool, it first calls the object's destructor. This allows the object
* to persist in constructed form while freed to the cache.
*
* The pool references each cache, so that when a pool is drained by the
* pagedaemon, it can drain each individual cache as well. Each time a
* cache is drained, the most idle cache group is freed to the pool in
* its entirety.
*
* Pool caches are laid on top of pools. By layering them, we can avoid
* the complexity of cache management for pools which would not benefit
* from it.
*/
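/*
 * Illustrative sketch, not part of the original source: a pool cache for a
 * hypothetical "struct foo", with hypothetical foo_ctor()/foo_dtor() routines
 * matching the ctor/dtor signatures used in this file.  Objects come back
 * from pool_cache_get() constructed; the destructor only runs when a cache
 * group is released back to the underlying pool.
 */
#if 0
static pool_cache_t foo_cache;

static void
foo_cache_example(void)
{
	struct foo *f;

	foo_cache = pool_cache_init(sizeof(struct foo), coherency_unit, 0, 0,
	    "foocache", NULL, IPL_NONE, foo_ctor, foo_dtor, NULL);
	f = pool_cache_get(foo_cache, PR_WAITOK);
	/* ... use the constructed object ... */
	pool_cache_put(foo_cache, f);
	pool_cache_destroy(foo_cache);
}
#endif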
static struct pool pcg_normal_pool;
static struct pool pcg_large_pool;
static struct pool cache_pool;
static struct pool cache_cpu_pool;
static pcg_t *volatile pcg_large_cache __cacheline_aligned;
static pcg_t *volatile pcg_normal_cache __cacheline_aligned;
/* List of all caches. */
TAILQ_HEAD(,pool_cache) pool_cache_head =
TAILQ_HEAD_INITIALIZER(pool_cache_head);
int pool_cache_disable; /* global disable for caching */
static const pcg_t pcg_dummy; /* zero sized: always empty, yet always full */
static bool pool_cache_put_slow(pool_cache_t, pool_cache_cpu_t *, int,
void *);
static bool pool_cache_get_slow(pool_cache_t, pool_cache_cpu_t *, int,
void **, paddr_t *, int);
static void pool_cache_cpu_init1(struct cpu_info *, pool_cache_t);
static int pool_cache_invalidate_groups(pool_cache_t, pcg_t *);
static void pool_cache_invalidate_cpu(pool_cache_t, u_int);
static void pool_cache_transfer(pool_cache_t);
static int pool_pcg_get(pcg_t *volatile *, pcg_t **);
static int pool_pcg_put(pcg_t *volatile *, pcg_t *);
static pcg_t * pool_pcg_trunc(pcg_t *volatile *);
static int pool_catchup(struct pool *);
static void pool_prime_page(struct pool *, void *,
struct pool_item_header *);
static void pool_update_curpage(struct pool *);
static int pool_grow(struct pool *, int);
static void *pool_allocator_alloc(struct pool *, int);
static void pool_allocator_free(struct pool *, void *);
static void pool_print_pagelist(struct pool *, struct pool_pagelist *,
void (*)(const char *, ...) __printflike(1, 2));
static void pool_print1(struct pool *, const char *,
void (*)(const char *, ...) __printflike(1, 2));
static int pool_chk_page(struct pool *, const char *,
struct pool_item_header *);
/* -------------------------------------------------------------------------- */
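/*
 * Free-item accounting, bitmap variant (PR_USEBMAP): each page header
 * carries a bitmap with one bit per item, where a set bit marks a free item.
 */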
static inline unsigned int
pr_item_bitmap_index(const struct pool *pp, const struct pool_item_header *ph,
const void *v)
{
const char *cp = v;
unsigned int idx;
KASSERT(pp->pr_roflags & PR_USEBMAP);
idx = (cp - (char *)ph->ph_page - ph->ph_off) / pp->pr_size;
if (__predict_false(idx >= pp->pr_itemsperpage)) {
panic("%s: [%s] %u >= %u", __func__, pp->pr_wchan, idx,
pp->pr_itemsperpage);
}
return idx;
}
static inline void
pr_item_bitmap_put(const struct pool *pp, struct pool_item_header *ph,
void *obj)
{
unsigned int idx = pr_item_bitmap_index(pp, ph, obj);
pool_item_bitmap_t *bitmap = ph->ph_bitmap + (idx / BITMAP_SIZE);
pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);
if (__predict_false((*bitmap & mask) != 0)) {
panic("%s: [%s] %p already freed", __func__, pp->pr_wchan, obj);
}
*bitmap |= mask;
}
static inline void *
pr_item_bitmap_get(const struct pool *pp, struct pool_item_header *ph)
{
pool_item_bitmap_t *bitmap = ph->ph_bitmap;
unsigned int idx;
int i;
for (i = 0; ; i++) {
int bit;
KASSERT((i * BITMAP_SIZE) < pp->pr_itemsperpage);
bit = ffs32(bitmap[i]);
if (bit) {
pool_item_bitmap_t mask;
bit--;
idx = (i * BITMAP_SIZE) + bit;
mask = 1U << bit;
KASSERT((bitmap[i] & mask) != 0);
bitmap[i] &= ~mask;
break;
}
}
KASSERT(idx < pp->pr_itemsperpage);
return (char *)ph->ph_page + ph->ph_off + idx * pp->pr_size;
}
static inline void
pr_item_bitmap_init(const struct pool *pp, struct pool_item_header *ph)
{
pool_item_bitmap_t *bitmap = ph->ph_bitmap;
const int n = howmany(pp->pr_itemsperpage, BITMAP_SIZE);
int i;
for (i = 0; i < n; i++) {
bitmap[i] = (pool_item_bitmap_t)-1;
}
}
/* -------------------------------------------------------------------------- */
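/*
 * Free-item accounting, linked-list variant (!PR_USEBMAP): free items are
 * linked onto the page header's ph_itemlist via a struct pool_item overlaid
 * on the freed item itself.
 */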
static inline void
pr_item_linkedlist_put(const struct pool *pp, struct pool_item_header *ph,
void *obj)
{
struct pool_item *pi = obj;
KASSERT(!pp_has_pser(pp));
#ifdef POOL_CHECK_MAGIC
pi->pi_magic = PI_MAGIC;
#endif
if (pp->pr_redzone) {
/*
* Mark the pool_item as valid. The rest is already
* invalid.
*/
kasan_mark(pi, sizeof(*pi), sizeof(*pi), 0);
}
LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
}
static inline void *
pr_item_linkedlist_get(struct pool *pp, struct pool_item_header *ph)
{
struct pool_item *pi;
void *v;
v = pi = LIST_FIRST(&ph->ph_itemlist);
if (__predict_false(v == NULL)) {
mutex_exit(&pp->pr_lock);
panic("%s: [%s] page empty", __func__, pp->pr_wchan);
}
KASSERTMSG((pp->pr_nitems > 0),
"%s: [%s] nitems %u inconsistent on itemlist",
__func__, pp->pr_wchan, pp->pr_nitems);
#ifdef POOL_CHECK_MAGIC
KASSERTMSG((pi->pi_magic == PI_MAGIC),
"%s: [%s] free list modified: "
"magic=%x; page %p; item addr %p", __func__,
pp->pr_wchan, pi->pi_magic, ph->ph_page, pi);
#endif
/*
* Remove from item list.
*/
LIST_REMOVE(pi, pi_list);
return v;
}
/* -------------------------------------------------------------------------- */
static inline void
pr_phinpage_check(struct pool *pp, struct pool_item_header *ph, void *page,
void *object)
{
if (__predict_false((void *)ph->ph_page != page)) {
panic("%s: [%s] item %p not part of pool", __func__,
pp->pr_wchan, object);
}
if (__predict_false((char *)object < (char *)page + ph->ph_off)) {
panic("%s: [%s] item %p below item space", __func__,
pp->pr_wchan, object);
}
if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
panic("%s: [%s] item %p poolid %u != %u", __func__,
pp->pr_wchan, object, ph->ph_poolid, pp->pr_poolid);
}
}
static inline void
pc_phinpage_check(pool_cache_t pc, void *object)
{
struct pool_item_header *ph;
struct pool *pp;
void *page;
pp = &pc->pc_pool;
page = POOL_OBJ_TO_PAGE(pp, object);
ph = (struct pool_item_header *)page;
pr_phinpage_check(pp, ph, page, object);
}
/* -------------------------------------------------------------------------- */
static inline int
phtree_compare(struct pool_item_header *a, struct pool_item_header *b)
{
/*
* We consider a pool_item_header with a smaller ph_page to be bigger. This
* unnatural ordering is for the benefit of pr_find_pagehead.
*/
if (a->ph_page < b->ph_page)
return 1;
else if (a->ph_page > b->ph_page)
return -1;
else
return 0;
}
SPLAY_PROTOTYPE(phtree, pool_item_header, ph_node, phtree_compare);
SPLAY_GENERATE(phtree, pool_item_header, ph_node, phtree_compare);
static inline struct pool_item_header *
pr_find_pagehead_noalign(struct pool *pp, void *v)
{
struct pool_item_header *ph, tmp;
tmp.ph_page = (void *)(uintptr_t)v;
ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
if (ph == NULL) {
ph = SPLAY_ROOT(&pp->pr_phtree);
if (ph != NULL && phtree_compare(&tmp, ph) >= 0) {
ph = SPLAY_NEXT(phtree, &pp->pr_phtree, ph);
}
KASSERT(ph == NULL || phtree_compare(&tmp, ph) < 0);
}
return ph;
}
/*
* Return the pool page header based on item address.
*/
static inline struct pool_item_header *
pr_find_pagehead(struct pool *pp, void *v)
{
struct pool_item_header *ph, tmp;
if ((pp->pr_roflags & PR_NOALIGN) != 0) {
ph = pr_find_pagehead_noalign(pp, v);
} else {
void *page = POOL_OBJ_TO_PAGE(pp, v);
if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
ph = (struct pool_item_header *)page;
pr_phinpage_check(pp, ph, page, v);
} else {
tmp.ph_page = page;
ph = SPLAY_FIND(phtree, &pp->pr_phtree, &tmp);
}
}
KASSERT(ph == NULL || ((pp->pr_roflags & PR_PHINPAGE) != 0) ||
((char *)ph->ph_page <= (char *)v &&
(char *)v < (char *)ph->ph_page + pp->pr_alloc->pa_pagesz));
return ph;
}
static void
pr_pagelist_free(struct pool *pp, struct pool_pagelist *pq)
{
struct pool_item_header *ph;
while ((ph = LIST_FIRST(pq)) != NULL) {
LIST_REMOVE(ph, ph_pagelist);
pool_allocator_free(pp, ph->ph_page);
if ((pp->pr_roflags & PR_PHINPAGE) == 0)
pool_put(pp->pr_phpool, ph);
}
}
/*
* Remove a page from the pool.
*/
static inline void
pr_rmpage(struct pool *pp, struct pool_item_header *ph,
struct pool_pagelist *pq)
{
KASSERT(mutex_owned(&pp->pr_lock));
/*
* If the page was idle, decrement the idle page count.
*/
if (ph->ph_nmissing == 0) {
KASSERT(pp->pr_nidle != 0);
KASSERTMSG((pp->pr_nitems >= pp->pr_itemsperpage),
"%s: [%s] nitems=%u < itemsperpage=%u", __func__,
pp->pr_wchan, pp->pr_nitems, pp->pr_itemsperpage);
pp->pr_nidle--;
}
pp->pr_nitems -= pp->pr_itemsperpage;
/*
* Unlink the page from the pool and queue it for release.
*/
LIST_REMOVE(ph, ph_pagelist);
if (pp->pr_roflags & PR_PHINPAGE) {
if (__predict_false(ph->ph_poolid != pp->pr_poolid)) {
panic("%s: [%s] ph %p poolid %u != %u",
__func__, pp->pr_wchan, ph, ph->ph_poolid,
pp->pr_poolid);
}
} else {
SPLAY_REMOVE(phtree, &pp->pr_phtree, ph);
}
LIST_INSERT_HEAD(pq, ph, ph_pagelist);
pp->pr_npages--;
pp->pr_npagefree++;
pool_update_curpage(pp);
}
/*
* Initialize all the pools listed in the "pools" link set.
*/
void
pool_subsystem_init(void)
{
size_t size;
int idx;
mutex_init(&pool_head_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&pool_allocator_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&pool_busy, "poolbusy");
/*
* Initialize private page header pool and cache magazine pool if we
* haven't done so yet.
*/
for (idx = 0; idx < PHPOOL_MAX; idx++) {
static char phpool_names[PHPOOL_MAX][6+1+6+1];
int nelem;
size_t sz;
nelem = PHPOOL_FREELIST_NELEM(idx);
KASSERT(nelem != 0);
snprintf(phpool_names[idx], sizeof(phpool_names[idx]),
"phpool-%d", nelem);
sz = offsetof(struct pool_item_header,
ph_bitmap[howmany(nelem, BITMAP_SIZE)]);
pool_init(&phpool[idx], sz, 0, 0, 0,
phpool_names[idx], &pool_allocator_meta, IPL_VM);
}
size = sizeof(pcg_t) +
(PCG_NOBJECTS_NORMAL - 1) * sizeof(pcgpair_t);
pool_init(&pcg_normal_pool, size, coherency_unit, 0, 0,
"pcgnormal", &pool_allocator_meta, IPL_VM);
size = sizeof(pcg_t) +
(PCG_NOBJECTS_LARGE - 1) * sizeof(pcgpair_t);
pool_init(&pcg_large_pool, size, coherency_unit, 0, 0,
"pcglarge", &pool_allocator_meta, IPL_VM);
pool_init(&cache_pool, sizeof(struct pool_cache), coherency_unit,
0, 0, "pcache", &pool_allocator_meta, IPL_NONE);
pool_init(&cache_cpu_pool, sizeof(pool_cache_cpu_t), coherency_unit,
0, 0, "pcachecpu", &pool_allocator_meta, IPL_NONE);
}
static inline bool
pool_init_is_phinpage(const struct pool *pp)
{
size_t pagesize;
if (pp->pr_roflags & PR_PHINPAGE) {
return true;
}
if (pp->pr_roflags & (PR_NOTOUCH | PR_NOALIGN)) {
return false;
}
pagesize = pp->pr_alloc->pa_pagesz;
/*
* Threshold: the item size is below 1/16 of a page size, and below
* 8 times the page header size. The latter ensures we go off-page
* if the page header would make us waste a rather big item.
*/
if (pp->pr_size < MIN(pagesize / 16, PHSIZE * 8)) {
return true;
}
/* Put the header into the page if it doesn't waste any items. */
if (pagesize / pp->pr_size == (pagesize - PHSIZE) / pp->pr_size) {
return true;
}
return false;
}
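/*
 * Worked example of the threshold above (illustrative, assuming a 4 KB
 * page): items smaller than MIN(4096 / 16, 8 * PHSIZE) = MIN(256, 8 * PHSIZE)
 * bytes keep their page header on-page.
 */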
static inline bool
pool_init_is_usebmap(const struct pool *pp)
{
size_t bmapsize;
if (pp->pr_roflags & PR_NOTOUCH) {
return true;
}
/*
* If we're off-page, go with a bitmap.
*/
if (!(pp->pr_roflags & PR_PHINPAGE)) {
return true;
}
/*
* If we're on-page, and the page header can already contain a bitmap
* big enough to cover all the items of the page, go with a bitmap.
*/
bmapsize = roundup(PHSIZE, pp->pr_align) -
offsetof(struct pool_item_header, ph_bitmap[0]);
KASSERT(bmapsize % sizeof(pool_item_bitmap_t) == 0);
if (pp->pr_itemsperpage <= bmapsize * CHAR_BIT) {
return true;
}
return false;
}
/*
* Initialize the given pool resource structure.
*
* We export this routine to allow other kernel parts to declare
* static pools that must be initialized before kmem(9) is available.
*/
void
pool_init(struct pool *pp, size_t size, u_int align, u_int ioff, int flags,
const char *wchan, struct pool_allocator *palloc, int ipl)
{
struct pool *pp1;
size_t prsize;
int itemspace, slack;
/* XXX ioff will be removed. */
KASSERT(ioff == 0);
#ifdef DEBUG
if (__predict_true(!cold))
mutex_enter(&pool_head_lock);
/*
* Check that the pool hasn't already been initialised and
* added to the list of all pools.
*/
TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
if (pp == pp1)
panic("%s: [%s] already initialised", __func__,
wchan);
}
if (__predict_true(!cold))
mutex_exit(&pool_head_lock);
#endif
if (palloc == NULL)
palloc = &pool_allocator_kmem;
if (!cold)
mutex_enter(&pool_allocator_lock);
if (palloc->pa_refcnt++ == 0) {
if (palloc->pa_pagesz == 0)
palloc->pa_pagesz = PAGE_SIZE;
TAILQ_INIT(&palloc->pa_list);
mutex_init(&palloc->pa_lock, MUTEX_DEFAULT, IPL_VM);
palloc->pa_pagemask = ~(palloc->pa_pagesz - 1);
palloc->pa_pageshift = ffs(palloc->pa_pagesz) - 1;
}
if (!cold)
mutex_exit(&pool_allocator_lock);
/*
* PR_PSERIALIZE implies PR_NOTOUCH; freed objects must remain
* valid until the backing page is returned to the system.
*/
if (flags & PR_PSERIALIZE) {
flags |= PR_NOTOUCH;
}
if (align == 0)
align = ALIGN(1);
prsize = size;
if ((flags & PR_NOTOUCH) == 0 && prsize < sizeof(struct pool_item))
prsize = sizeof(struct pool_item);
prsize = roundup(prsize, align);
KASSERTMSG((prsize <= palloc->pa_pagesz),
"%s: [%s] pool item size (%zu) larger than page size (%u)",
__func__, wchan, prsize, palloc->pa_pagesz);
/*
* Initialize the pool structure.
*/
LIST_INIT(&pp->pr_emptypages);
LIST_INIT(&pp->pr_fullpages);
LIST_INIT(&pp->pr_partpages);
pp->pr_cache = NULL;
pp->pr_curpage = NULL;
pp->pr_npages = 0;
pp->pr_minitems = 0;
pp->pr_minpages = 0;
pp->pr_maxpages = UINT_MAX;
pp->pr_roflags = flags;
pp->pr_flags = 0;
pp->pr_size = prsize;
pp->pr_reqsize = size;
pp->pr_align = align;
pp->pr_wchan = wchan;
pp->pr_alloc = palloc;
pp->pr_poolid = atomic_inc_uint_nv(&poolid_counter);
pp->pr_nitems = 0;
pp->pr_nout = 0;
pp->pr_hardlimit = UINT_MAX;
pp->pr_hardlimit_warning = NULL;
pp->pr_hardlimit_ratecap.tv_sec = 0;
pp->pr_hardlimit_ratecap.tv_usec = 0;
pp->pr_hardlimit_warning_last.tv_sec = 0;
pp->pr_hardlimit_warning_last.tv_usec = 0;
pp->pr_drain_hook = NULL;
pp->pr_drain_hook_arg = NULL;
pp->pr_freecheck = NULL;
pp->pr_redzone = false;
pool_redzone_init(pp, size);
pool_quarantine_init(pp);
/*
* Decide whether to put the page header off-page to avoid wasting too
* large a part of the page or too big an item. Off-page page headers
* go on a hash table, so we can match a returned item with its header
* based on the page address.
*/
if (pool_init_is_phinpage(pp)) {
/* Use the beginning of the page for the page header */
itemspace = palloc->pa_pagesz - roundup(PHSIZE, align);
pp->pr_itemoffset = roundup(PHSIZE, align);
pp->pr_roflags |= PR_PHINPAGE;
} else {
/* The page header will be taken from our page header pool */
itemspace = palloc->pa_pagesz;
pp->pr_itemoffset = 0;
SPLAY_INIT(&pp->pr_phtree);
}
pp->pr_itemsperpage = itemspace / pp->pr_size;
KASSERT(pp->pr_itemsperpage != 0);
/*
* Decide whether to use a bitmap or a linked list to manage freed
* items.
*/
if (pool_init_is_usebmap(pp)) {
pp->pr_roflags |= PR_USEBMAP;
}
/*
* If we're off-page, then we're using a bitmap; choose the appropriate
* pool to allocate page headers, whose size varies depending on the
* bitmap. If we're on-page, nothing to do.
*/
if (!(pp->pr_roflags & PR_PHINPAGE)) {
int idx;
KASSERT(pp->pr_roflags & PR_USEBMAP);
for (idx = 0; pp->pr_itemsperpage > PHPOOL_FREELIST_NELEM(idx);
idx++) {
/* nothing */
}
if (idx >= PHPOOL_MAX) {
/*
			 * If you see this panic, consider tweaking
* PHPOOL_MAX and PHPOOL_FREELIST_NELEM.
*/
panic("%s: [%s] too large itemsperpage(%d) for "
"PR_USEBMAP", __func__,
pp->pr_wchan, pp->pr_itemsperpage);
}
pp->pr_phpool = &phpool[idx];
} else {
pp->pr_phpool = NULL;
}
/*
* Use the slack between the chunks and the page header
* for "cache coloring".
*/
slack = itemspace - pp->pr_itemsperpage * pp->pr_size;
pp->pr_maxcolor = rounddown(slack, align);
pp->pr_curcolor = 0;
pp->pr_nget = 0;
pp->pr_nfail = 0;
pp->pr_nput = 0;
pp->pr_npagealloc = 0;
pp->pr_npagefree = 0;
pp->pr_hiwat = 0;
pp->pr_nidle = 0;
pp->pr_refcnt = 0;
mutex_init(&pp->pr_lock, MUTEX_DEFAULT, ipl);
cv_init(&pp->pr_cv, wchan);
pp->pr_ipl = ipl;
/* Insert into the list of all pools. */
if (!cold)
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pp1, &pool_head, pr_poollist) {
if (strcmp(pp1->pr_wchan, pp->pr_wchan) > 0)
break;
}
if (pp1 == NULL)
TAILQ_INSERT_TAIL(&pool_head, pp, pr_poollist);
else
TAILQ_INSERT_BEFORE(pp1, pp, pr_poollist);
if (!cold)
mutex_exit(&pool_head_lock);
/* Insert this into the list of pools using this allocator. */
if (!cold)
mutex_enter(&palloc->pa_lock);
TAILQ_INSERT_TAIL(&palloc->pa_list, pp, pr_alloc_list);
if (!cold)
mutex_exit(&palloc->pa_lock);
}
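/*
 * Usage sketch (illustrative, not part of this file): a subsystem that
 * needs fixed-size allocations before kmem(9) is up would typically
 * declare a static pool and initialize it once at attach time.  The
 * names "frob" and "frob_pool" below are hypothetical.
 *
 *	static struct pool frob_pool;
 *
 *	void
 *	frob_init(void)
 *	{
 *
 *		pool_init(&frob_pool, sizeof(struct frob), 0, 0, 0,
 *		    "frobpl", NULL, IPL_NONE);
 *	}
 */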
/*
* De-commission a pool resource.
*/
void
pool_destroy(struct pool *pp)
{
struct pool_pagelist pq;
struct pool_item_header *ph;
pool_quarantine_flush(pp);
/* Remove from global pool list */
mutex_enter(&pool_head_lock);
while (pp->pr_refcnt != 0)
cv_wait(&pool_busy, &pool_head_lock);
TAILQ_REMOVE(&pool_head, pp, pr_poollist);
if (drainpp == pp)
drainpp = NULL;
mutex_exit(&pool_head_lock);
/* Remove this pool from its allocator's list of pools. */
mutex_enter(&pp->pr_alloc->pa_lock);
TAILQ_REMOVE(&pp->pr_alloc->pa_list, pp, pr_alloc_list);
mutex_exit(&pp->pr_alloc->pa_lock);
mutex_enter(&pool_allocator_lock);
if (--pp->pr_alloc->pa_refcnt == 0)
mutex_destroy(&pp->pr_alloc->pa_lock);
mutex_exit(&pool_allocator_lock);
mutex_enter(&pp->pr_lock);
KASSERT(pp->pr_cache == NULL);
KASSERTMSG((pp->pr_nout == 0),
"%s: [%s] pool busy: still out: %u", __func__, pp->pr_wchan,
pp->pr_nout);
KASSERT(LIST_EMPTY(&pp->pr_fullpages));
KASSERT(LIST_EMPTY(&pp->pr_partpages));
/* Remove all pages */
LIST_INIT(&pq);
while ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
pr_rmpage(pp, ph, &pq);
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
cv_destroy(&pp->pr_cv);
mutex_destroy(&pp->pr_lock);
}
void
pool_set_drain_hook(struct pool *pp, void (*fn)(void *, int), void *arg)
{
/* XXX no locking -- must be used just after pool_init() */
KASSERTMSG((pp->pr_drain_hook == NULL),
"%s: [%s] already set", __func__, pp->pr_wchan);
pp->pr_drain_hook = fn;
pp->pr_drain_hook_arg = arg;
}
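/*
 * Usage sketch (illustrative, hypothetical names): a drain hook is
 * registered immediately after pool_init() and is expected to give
 * objects back with pool_put() when memory runs short.
 *
 *	static void
 *	frob_drain(void *arg, int flags)
 *	{
 *		... release cached struct frob objects via pool_put() ...
 *	}
 *
 *	pool_set_drain_hook(&frob_pool, frob_drain, NULL);
 */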
static struct pool_item_header *
pool_alloc_item_header(struct pool *pp, void *storage, int flags)
{
struct pool_item_header *ph;
if ((pp->pr_roflags & PR_PHINPAGE) != 0)
ph = storage;
else
ph = pool_get(pp->pr_phpool, flags);
return ph;
}
/*
* Grab an item from the pool.
*/
void *
pool_get(struct pool *pp, int flags)
{
struct pool_item_header *ph;
void *v;
	KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));

	KASSERTMSG((pp->pr_itemsperpage != 0),
"%s: [%s] pr_itemsperpage is zero, "
"pool not initialized?", __func__, pp->pr_wchan);
KASSERTMSG((!(cpu_intr_p() || cpu_softintr_p())
|| pp->pr_ipl != IPL_NONE || cold || panicstr != NULL),
"%s: [%s] is IPL_NONE, but called from interrupt context",
__func__, pp->pr_wchan);
	if (flags & PR_WAITOK) {
		ASSERT_SLEEPABLE();
}
	if (flags & PR_NOWAIT) {
		if (fault_inject())
			return NULL;
	}
mutex_enter(&pp->pr_lock);
startover:
/*
* Check to see if we've reached the hard limit. If we have,
* and we can wait, then wait until an item has been returned to
* the pool.
*/
KASSERTMSG((pp->pr_nout <= pp->pr_hardlimit),
"%s: %s: crossed hard limit", __func__, pp->pr_wchan);
if (__predict_false(pp->pr_nout == pp->pr_hardlimit)) {
if (pp->pr_drain_hook != NULL) {
/*
* Since the drain hook is going to free things
* back to the pool, unlock, call the hook, re-lock,
* and check the hardlimit condition again.
*/
mutex_exit(&pp->pr_lock);
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
mutex_enter(&pp->pr_lock);
if (pp->pr_nout < pp->pr_hardlimit)
goto startover;
}
if ((flags & PR_WAITOK) && !(flags & PR_LIMITFAIL)) {
/*
* XXX: A warning isn't logged in this case. Should
* it be?
*/
pp->pr_flags |= PR_WANTED;
do {
cv_wait(&pp->pr_cv, &pp->pr_lock);
} while (pp->pr_flags & PR_WANTED);
goto startover;
}
/*
* Log a message that the hard limit has been hit.
*/
if (pp->pr_hardlimit_warning != NULL &&
ratecheck(&pp->pr_hardlimit_warning_last,
&pp->pr_hardlimit_ratecap))
log(LOG_ERR, "%s\n", pp->pr_hardlimit_warning);
pp->pr_nfail++;
mutex_exit(&pp->pr_lock);
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return NULL;
}
/*
* The convention we use is that if `curpage' is not NULL, then
* it points at a non-empty bucket. In particular, `curpage'
* never points at a page header which has PR_PHINPAGE set and
* has no items in its bucket.
*/
if ((ph = pp->pr_curpage) == NULL) {
int error;
KASSERTMSG((pp->pr_nitems == 0),
"%s: [%s] curpage NULL, inconsistent nitems %u",
__func__, pp->pr_wchan, pp->pr_nitems);
/*
* Call the back-end page allocator for more memory.
* Release the pool lock, as the back-end page allocator
* may block.
*/
error = pool_grow(pp, flags);
if (error != 0) {
/*
* pool_grow aborts when another thread
* is allocating a new page. Retry if it
* waited for it.
*/
if (error == ERESTART)
goto startover;
/*
* We were unable to allocate a page or item
* header, but we released the lock during
* allocation, so perhaps items were freed
* back to the pool. Check for this case.
*/
if (pp->pr_curpage != NULL)
goto startover;
pp->pr_nfail++;
mutex_exit(&pp->pr_lock);
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return NULL;
}
/* Start the allocation process over. */
goto startover;
}
if (pp->pr_roflags & PR_USEBMAP) {
KASSERTMSG((ph->ph_nmissing < pp->pr_itemsperpage),
"%s: [%s] pool page empty", __func__, pp->pr_wchan);
v = pr_item_bitmap_get(pp, ph);
} else {
v = pr_item_linkedlist_get(pp, ph);
}
pp->pr_nitems--;
pp->pr_nout++;
	if (ph->ph_nmissing == 0) {
		KASSERT(pp->pr_nidle > 0);
pp->pr_nidle--;
/*
* This page was previously empty. Move it to the list of
* partially-full pages. This page is already curpage.
*/
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
}
ph->ph_nmissing++;
	if (ph->ph_nmissing == pp->pr_itemsperpage) {
		KASSERTMSG(((pp->pr_roflags & PR_USEBMAP) ||
LIST_EMPTY(&ph->ph_itemlist)),
"%s: [%s] nmissing (%u) inconsistent", __func__,
pp->pr_wchan, ph->ph_nmissing);
/*
* This page is now full. Move it to the full list
* and select a new current page.
*/
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_fullpages, ph, ph_pagelist);
		pool_update_curpage(pp);
}
pp->pr_nget++;
/*
* If we have a low water mark and we are now below that low
* water mark, add more items to the pool.
*/
if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
/*
* XXX: Should we log a warning? Should we set up a timeout
* to try again in a second or so? The latter could break
* a caller's assumptions about interrupt protection, etc.
*/
}
mutex_exit(&pp->pr_lock);
KASSERT((((vaddr_t)v) & (pp->pr_align - 1)) == 0);
FREECHECK_OUT(&pp->pr_freecheck, v);
pool_redzone_fill(pp, v);
pool_get_kmsan(pp, v);
	if (flags & PR_ZERO)
		memset(v, 0, pp->pr_reqsize);
return v;
}
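/*
 * Usage sketch (illustrative): exactly one of PR_WAITOK or PR_NOWAIT
 * must be passed, per the KASSERT above.  A sleepable caller of the
 * hypothetical frob_pool would do:
 *
 *	struct frob *f = pool_get(&frob_pool, PR_WAITOK | PR_ZERO);
 *	...
 *	pool_put(&frob_pool, f);
 *
 * Interrupt-time callers use PR_NOWAIT and must be prepared for NULL.
 */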
/*
* Internal version of pool_put(). Pool is already locked/entered.
*/
static void
pool_do_put(struct pool *pp, void *v, struct pool_pagelist *pq)
{
struct pool_item_header *ph;
KASSERT(mutex_owned(&pp->pr_lock));
pool_redzone_check(pp, v);
pool_put_kmsan(pp, v);
FREECHECK_IN(&pp->pr_freecheck, v);
LOCKDEBUG_MEM_CHECK(v, pp->pr_size);
KASSERTMSG((pp->pr_nout > 0),
"%s: [%s] putting with none out", __func__, pp->pr_wchan);
if (__predict_false((ph = pr_find_pagehead(pp, v)) == NULL)) {
panic("%s: [%s] page header missing", __func__, pp->pr_wchan);
}
/*
* Return to item list.
*/
if (pp->pr_roflags & PR_USEBMAP) {
pr_item_bitmap_put(pp, ph, v);
} else {
pr_item_linkedlist_put(pp, ph, v);
}
KDASSERT(ph->ph_nmissing != 0);
ph->ph_nmissing--;
pp->pr_nput++;
pp->pr_nitems++;
pp->pr_nout--;
/* Cancel "pool empty" condition if it exists */
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (pp->pr_flags & PR_WANTED) {
		pp->pr_flags &= ~PR_WANTED;
cv_broadcast(&pp->pr_cv);
}
/*
* If this page is now empty, do one of two things:
*
* (1) If we have more pages than the page high water mark,
* free the page back to the system. ONLY CONSIDER
* FREEING BACK A PAGE IF WE HAVE MORE THAN OUR MINIMUM PAGE
* CLAIM.
*
* (2) Otherwise, move the page to the empty page list.
*
* Either way, select a new current page (so we use a partially-full
* page if one is available).
*/
if (ph->ph_nmissing == 0) {
pp->pr_nidle++;
		if (pp->pr_nitems - pp->pr_itemsperpage >= pp->pr_minitems &&
		    pp->pr_npages > pp->pr_minpages &&
pp->pr_npages > pp->pr_maxpages) {
pr_rmpage(pp, ph, pq);
} else {
			LIST_REMOVE(ph, ph_pagelist);
			LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
/*
* Update the timestamp on the page. A page must
* be idle for some period of time before it can
* be reclaimed by the pagedaemon. This minimizes
* ping-pong'ing for memory.
*
* note for 64-bit time_t: truncating to 32-bit is not
* a problem for our usage.
*/
ph->ph_time = time_uptime;
}
pool_update_curpage(pp);
}
/*
* If the page was previously completely full, move it to the
* partially-full list and make it the current page. The next
* allocation will get the item from this page, instead of
* further fragmenting the pool.
*/
	else if (ph->ph_nmissing == (pp->pr_itemsperpage - 1)) {
		LIST_REMOVE(ph, ph_pagelist);
		LIST_INSERT_HEAD(&pp->pr_partpages, ph, ph_pagelist);
pp->pr_curpage = ph;
}
}
void
pool_put(struct pool *pp, void *v)
{
struct pool_pagelist pq;
LIST_INIT(&pq);
mutex_enter(&pp->pr_lock);
if (!pool_put_quarantine(pp, v, &pq)) {
pool_do_put(pp, v, &pq);
}
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
}
/*
* pool_grow: grow a pool by a page.
*
* => called with pool locked.
* => unlock and relock the pool.
* => return with pool locked.
*/
static int
pool_grow(struct pool *pp, int flags)
{
struct pool_item_header *ph;
char *storage;
/*
* If there's a pool_grow in progress, wait for it to complete
* and try again from the top.
*/
if (pp->pr_flags & PR_GROWING) {
if (flags & PR_WAITOK) {
do {
cv_wait(&pp->pr_cv, &pp->pr_lock);
} while (pp->pr_flags & PR_GROWING);
return ERESTART;
} else {
if (pp->pr_flags & PR_GROWINGNOWAIT) {
/*
* This needs an unlock/relock dance so
* that the other caller has a chance to
* run and actually do the thing. Note
* that this is effectively a busy-wait.
*/
mutex_exit(&pp->pr_lock);
mutex_enter(&pp->pr_lock);
return ERESTART;
}
return EWOULDBLOCK;
}
}
pp->pr_flags |= PR_GROWING;
if (flags & PR_WAITOK)
mutex_exit(&pp->pr_lock);
else
		pp->pr_flags |= PR_GROWINGNOWAIT;

	storage = pool_allocator_alloc(pp, flags);
	if (__predict_false(storage == NULL))
goto out;
	ph = pool_alloc_item_header(pp, storage, flags);
	if (__predict_false(ph == NULL)) {
		pool_allocator_free(pp, storage);
goto out;
}
	if (flags & PR_WAITOK)
		mutex_enter(&pp->pr_lock);
	pool_prime_page(pp, storage, ph);
pp->pr_npagealloc++;
KASSERT(pp->pr_flags & PR_GROWING);
pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
/*
* If anyone was waiting for pool_grow, notify them that we
* may have just done it.
*/
cv_broadcast(&pp->pr_cv);
return 0;
out:
if (flags & PR_WAITOK)
mutex_enter(&pp->pr_lock);
KASSERT(pp->pr_flags & PR_GROWING);
pp->pr_flags &= ~(PR_GROWING|PR_GROWINGNOWAIT);
return ENOMEM;
}
void
pool_prime(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_minpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
if (pp->pr_maxpages <= pp->pr_minpages)
pp->pr_maxpages = pp->pr_minpages + 1; /* XXX */
while (pp->pr_npages < pp->pr_minpages)
(void) pool_grow(pp, PR_WAITOK);
mutex_exit(&pp->pr_lock);
}
/*
* Add a page worth of items to the pool.
*
* Note, we must be called with the pool descriptor LOCKED.
*/
static void
pool_prime_page(struct pool *pp, void *storage, struct pool_item_header *ph)
{
const unsigned int align = pp->pr_align;
struct pool_item *pi;
void *cp = storage;
int n;
	KASSERT(mutex_owned(&pp->pr_lock));
	KASSERTMSG(((pp->pr_roflags & PR_NOALIGN) ||
(((uintptr_t)cp & (pp->pr_alloc->pa_pagesz - 1)) == 0)),
"%s: [%s] unaligned page: %p", __func__, pp->pr_wchan, cp);
/*
* Insert page header.
*/
LIST_INSERT_HEAD(&pp->pr_emptypages, ph, ph_pagelist);
LIST_INIT(&ph->ph_itemlist);
ph->ph_page = storage;
ph->ph_nmissing = 0;
ph->ph_time = time_uptime;
if (pp->pr_roflags & PR_PHINPAGE)
ph->ph_poolid = pp->pr_poolid;
else
SPLAY_INSERT(phtree, &pp->pr_phtree, ph);
pp->pr_nidle++;
/*
* The item space starts after the on-page header, if any.
*/
ph->ph_off = pp->pr_itemoffset;
/*
* Color this page.
*/
ph->ph_off += pp->pr_curcolor;
cp = (char *)cp + ph->ph_off;
if ((pp->pr_curcolor += align) > pp->pr_maxcolor)
pp->pr_curcolor = 0;
KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
/*
* Insert remaining chunks on the bucket list.
*/
n = pp->pr_itemsperpage;
pp->pr_nitems += n;
if (pp->pr_roflags & PR_USEBMAP) {
pr_item_bitmap_init(pp, ph);
} else {
while (n--) {
pi = (struct pool_item *)cp;
KASSERT((((vaddr_t)pi) & (align - 1)) == 0);
/* Insert on page list */
LIST_INSERT_HEAD(&ph->ph_itemlist, pi, pi_list);
#ifdef POOL_CHECK_MAGIC
pi->pi_magic = PI_MAGIC;
#endif
cp = (char *)cp + pp->pr_size;
KASSERT((((vaddr_t)cp) & (align - 1)) == 0);
}
}
/*
* If the pool was depleted, point at the new page.
*/
	if (pp->pr_curpage == NULL)
		pp->pr_curpage = ph;

	if (++pp->pr_npages > pp->pr_hiwat)
		pp->pr_hiwat = pp->pr_npages;
}
/*
* Used by pool_get() when nitems drops below the low water mark. This
* is used to catch up pr_nitems with the low water mark.
*
* Note 1, we never wait for memory here, we let the caller decide what to do.
*
* Note 2, we must be called with the pool already locked, and we return
* with it locked.
*/
static int
pool_catchup(struct pool *pp)
{
int error = 0;
while (POOL_NEEDS_CATCHUP(pp)) {
error = pool_grow(pp, PR_NOWAIT);
if (error) {
if (error == ERESTART)
continue;
break;
}
}
return error;
}
static void
pool_update_curpage(struct pool *pp)
{
pp->pr_curpage = LIST_FIRST(&pp->pr_partpages);
if (pp->pr_curpage == NULL) {
pp->pr_curpage = LIST_FIRST(&pp->pr_emptypages);
}
KASSERTMSG((pp->pr_curpage == NULL) == (pp->pr_nitems == 0),
"pp=%p curpage=%p nitems=%u", pp, pp->pr_curpage, pp->pr_nitems);
}
void
pool_setlowat(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_minitems = n;
/* Make sure we're caught up with the newly-set low water mark. */
if (POOL_NEEDS_CATCHUP(pp) && pool_catchup(pp) != 0) {
/*
* XXX: Should we log a warning? Should we set up a timeout
* to try again in a second or so? The latter could break
* a caller's assumptions about interrupt protection, etc.
*/
}
mutex_exit(&pp->pr_lock);
}
void
pool_sethiwat(struct pool *pp, int n)
{
mutex_enter(&pp->pr_lock);
pp->pr_maxitems = n;
mutex_exit(&pp->pr_lock);
}
void
pool_sethardlimit(struct pool *pp, int n, const char *warnmess, int ratecap)
{
mutex_enter(&pp->pr_lock);
pp->pr_hardlimit = n;
pp->pr_hardlimit_warning = warnmess;
pp->pr_hardlimit_ratecap.tv_sec = ratecap;
pp->pr_hardlimit_warning_last.tv_sec = 0;
pp->pr_hardlimit_warning_last.tv_usec = 0;
pp->pr_maxpages = roundup(n, pp->pr_itemsperpage) / pp->pr_itemsperpage;
mutex_exit(&pp->pr_lock);
}
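/*
 * Tuning sketch (illustrative, hypothetical values): keep at least 32
 * items ready, and refuse to hand out more than 1024 at once, logging
 * at most one warning per 60 seconds when the hard limit is hit.
 *
 *	pool_setlowat(&frob_pool, 32);
 *	pool_sethardlimit(&frob_pool, 1024, "frob_pool limit reached", 60);
 */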
unsigned int
pool_nget(struct pool *pp)
{
return pp->pr_nget;
}
unsigned int
pool_nput(struct pool *pp)
{
return pp->pr_nput;
}
/*
* Release all complete pages that have not been used recently.
*
* Must not be called from interrupt context.
*/
int
pool_reclaim(struct pool *pp)
{
struct pool_item_header *ph, *phnext;
struct pool_pagelist pq;
struct pool_cache *pc;
uint32_t curtime;
bool klock;
int rv;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (pp->pr_drain_hook != NULL) {
/*
* The drain hook must be called with the pool unlocked.
*/
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, PR_NOWAIT);
}
/*
* XXXSMP Because we do not want to cause non-MPSAFE code
* to block.
*/
if (pp->pr_ipl == IPL_SOFTNET || pp->pr_ipl == IPL_SOFTCLOCK ||
pp->pr_ipl == IPL_SOFTSERIAL) {
KERNEL_LOCK(1, NULL);
klock = true;
} else
klock = false;
/* Reclaim items from the pool's cache (if any). */
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL)
pool_cache_invalidate(pc);
if (mutex_tryenter(&pp->pr_lock) == 0) {
if (klock) {
KERNEL_UNLOCK_ONE(NULL);
}
return 0;
}
LIST_INIT(&pq);
curtime = time_uptime;
for (ph = LIST_FIRST(&pp->pr_emptypages); ph != NULL; ph = phnext) {
phnext = LIST_NEXT(ph, ph_pagelist);
/* Check our minimum page claim */
if (pp->pr_npages <= pp->pr_minpages)
break;
KASSERT(ph->ph_nmissing == 0);
if (curtime - ph->ph_time < pool_inactive_time)
continue;
/*
* If freeing this page would put us below the minimum free items
* or the minimum pages, stop now.
*/
if (pp->pr_nitems - pp->pr_itemsperpage < pp->pr_minitems ||
pp->pr_npages - 1 < pp->pr_minpages)
break;
pr_rmpage(pp, ph, &pq);
}
mutex_exit(&pp->pr_lock);
if (LIST_EMPTY(&pq))
rv = 0;
else {
pr_pagelist_free(pp, &pq);
rv = 1;
}
if (klock) {
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
/*
* Drain pools, one at a time. The drained pool is returned within ppp.
*
* Note, must never be called from interrupt context.
*/
bool
pool_drain(struct pool **ppp)
{
bool reclaimed;
struct pool *pp;
KASSERT(!TAILQ_EMPTY(&pool_head));
pp = NULL;
/* Find next pool to drain, and add a reference. */
mutex_enter(&pool_head_lock);
do {
if (drainpp == NULL) {
drainpp = TAILQ_FIRST(&pool_head);
}
if (drainpp != NULL) {
pp = drainpp;
drainpp = TAILQ_NEXT(pp, pr_poollist);
}
/*
* Skip completely idle pools. We depend on at least
* one pool in the system being active.
*/
} while (pp == NULL || pp->pr_npages == 0);
pp->pr_refcnt++;
mutex_exit(&pool_head_lock);
	/* Drain the cache (if any) and the pool. */
reclaimed = pool_reclaim(pp);
/* Finally, unlock the pool. */
mutex_enter(&pool_head_lock);
pp->pr_refcnt--;
cv_broadcast(&pool_busy);
mutex_exit(&pool_head_lock);
if (ppp != NULL)
*ppp = pp;
return reclaimed;
}
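/*
 * Usage sketch (illustrative): a memory-pressure loop such as the page
 * daemon calls this repeatedly; each call drains one pool and reports
 * which pool was touched and whether any pages were freed.
 *
 *	struct pool *pp = NULL;
 *	bool reclaimed;
 *
 *	reclaimed = pool_drain(&pp);
 */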
/*
* Calculate the total number of pages consumed by pools.
*/
int
pool_totalpages(void)
{
mutex_enter(&pool_head_lock);
int pages = pool_totalpages_locked();
mutex_exit(&pool_head_lock);
return pages;
}
int
pool_totalpages_locked(void)
{
struct pool *pp;
uint64_t total = 0;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
uint64_t bytes =
(uint64_t)pp->pr_npages * pp->pr_alloc->pa_pagesz;
if ((pp->pr_roflags & PR_RECURSIVE) != 0)
bytes -= ((uint64_t)pp->pr_nout * pp->pr_size);
total += bytes;
}
return atop(total);
}
/*
* Diagnostic helpers.
*/
void
pool_printall(const char *modif, void (*pr)(const char *, ...))
{
struct pool *pp;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
pool_printit(pp, modif, pr);
}
}
void
pool_printit(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
{
if (pp == NULL) {
(*pr)("Must specify a pool to print.\n");
return;
}
pool_print1(pp, modif, pr);
}
static void
pool_print_pagelist(struct pool *pp, struct pool_pagelist *pl,
void (*pr)(const char *, ...))
{
struct pool_item_header *ph;
LIST_FOREACH(ph, pl, ph_pagelist) {
(*pr)("\t\tpage %p, nmissing %d, time %" PRIu32 "\n",
ph->ph_page, ph->ph_nmissing, ph->ph_time);
#ifdef POOL_CHECK_MAGIC
struct pool_item *pi;
if (!(pp->pr_roflags & PR_USEBMAP)) {
LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
if (pi->pi_magic != PI_MAGIC) {
(*pr)("\t\t\titem %p, magic 0x%x\n",
pi, pi->pi_magic);
}
}
}
#endif
}
}
static void
pool_print1(struct pool *pp, const char *modif, void (*pr)(const char *, ...))
{
struct pool_item_header *ph;
pool_cache_t pc;
pcg_t *pcg;
pool_cache_cpu_t *cc;
uint64_t cpuhit, cpumiss, pchit, pcmiss;
uint32_t nfull;
int i;
bool print_log = false, print_pagelist = false, print_cache = false;
bool print_short = false, skip_empty = false;
char c;
while ((c = *modif++) != '\0') {
if (c == 'l')
print_log = true;
if (c == 'p')
print_pagelist = true;
if (c == 'c')
print_cache = true;
if (c == 's')
print_short = true;
if (c == 'S')
skip_empty = true;
}
if (skip_empty && pp->pr_nget == 0)
return;
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
(*pr)("POOLCACHE");
} else {
(*pr)("POOL");
}
/* Single line output. */
if (print_short) {
(*pr)(" %s:%p:%u:%u:%u:%u:%u:%u:%u:%u:%u:%u\n",
pp->pr_wchan, pp, pp->pr_size, pp->pr_align, pp->pr_npages,
pp->pr_nitems, pp->pr_nout, pp->pr_nget, pp->pr_nput,
pp->pr_npagealloc, pp->pr_npagefree, pp->pr_nidle);
return;
}
(*pr)(" %s: size %u, align %u, ioff %u, roflags 0x%08x\n",
pp->pr_wchan, pp->pr_size, pp->pr_align, pp->pr_itemoffset,
pp->pr_roflags);
(*pr)("\tpool %p, alloc %p\n", pp, pp->pr_alloc);
(*pr)("\tminitems %u, minpages %u, maxpages %u, npages %u\n",
pp->pr_minitems, pp->pr_minpages, pp->pr_maxpages, pp->pr_npages);
(*pr)("\titemsperpage %u, nitems %u, nout %u, hardlimit %u\n",
pp->pr_itemsperpage, pp->pr_nitems, pp->pr_nout, pp->pr_hardlimit);
(*pr)("\tnget %lu, nfail %lu, nput %lu\n",
pp->pr_nget, pp->pr_nfail, pp->pr_nput);
(*pr)("\tnpagealloc %lu, npagefree %lu, hiwat %u, nidle %lu\n",
pp->pr_npagealloc, pp->pr_npagefree, pp->pr_hiwat, pp->pr_nidle);
if (!print_pagelist)
goto skip_pagelist;
if ((ph = LIST_FIRST(&pp->pr_emptypages)) != NULL)
(*pr)("\n\tempty page list:\n");
pool_print_pagelist(pp, &pp->pr_emptypages, pr);
if ((ph = LIST_FIRST(&pp->pr_fullpages)) != NULL)
(*pr)("\n\tfull page list:\n");
pool_print_pagelist(pp, &pp->pr_fullpages, pr);
if ((ph = LIST_FIRST(&pp->pr_partpages)) != NULL)
(*pr)("\n\tpartial-page list:\n");
pool_print_pagelist(pp, &pp->pr_partpages, pr);
if (pp->pr_curpage == NULL)
(*pr)("\tno current page\n");
else
(*pr)("\tcurpage %p\n", pp->pr_curpage->ph_page);
skip_pagelist:
if (print_log)
goto skip_log;
(*pr)("\n");
skip_log:
#define PR_GROUPLIST(pcg) \
(*pr)("\t\tgroup %p: avail %d\n", pcg, pcg->pcg_avail); \
for (i = 0; i < pcg->pcg_size; i++) { \
if (pcg->pcg_objects[i].pcgo_pa != \
POOL_PADDR_INVALID) { \
(*pr)("\t\t\t%p, 0x%llx\n", \
pcg->pcg_objects[i].pcgo_va, \
(unsigned long long) \
pcg->pcg_objects[i].pcgo_pa); \
} else { \
(*pr)("\t\t\t%p\n", \
pcg->pcg_objects[i].pcgo_va); \
} \
}
if (pc != NULL) {
cpuhit = 0;
cpumiss = 0;
pcmiss = 0;
nfull = 0;
for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
if ((cc = pc->pc_cpus[i]) == NULL)
continue;
cpuhit += cc->cc_hits;
cpumiss += cc->cc_misses;
pcmiss += cc->cc_pcmisses;
nfull += cc->cc_nfull;
}
pchit = cpumiss - pcmiss;
(*pr)("\tcpu layer hits %llu misses %llu\n", cpuhit, cpumiss);
(*pr)("\tcache layer hits %llu misses %llu\n", pchit, pcmiss);
(*pr)("\tcache layer full groups %u\n", nfull);
if (print_cache) {
(*pr)("\tfull cache groups:\n");
for (pcg = pc->pc_fullgroups; pcg != NULL;
pcg = pcg->pcg_next) {
PR_GROUPLIST(pcg);
}
}
}
#undef PR_GROUPLIST
}
static int
pool_chk_page(struct pool *pp, const char *label, struct pool_item_header *ph)
{
struct pool_item *pi;
void *page;
int n;
if ((pp->pr_roflags & PR_NOALIGN) == 0) {
page = POOL_OBJ_TO_PAGE(pp, ph);
if (page != ph->ph_page &&
(pp->pr_roflags & PR_PHINPAGE) != 0) {
if (label != NULL)
printf("%s: ", label);
printf("pool(%p:%s): page inconsistency: page %p;"
" at page head addr %p (p %p)\n", pp,
pp->pr_wchan, ph->ph_page,
ph, page);
return 1;
}
}
if ((pp->pr_roflags & PR_USEBMAP) != 0)
return 0;
for (pi = LIST_FIRST(&ph->ph_itemlist), n = 0;
pi != NULL;
pi = LIST_NEXT(pi,pi_list), n++) {
#ifdef POOL_CHECK_MAGIC
if (pi->pi_magic != PI_MAGIC) {
if (label != NULL)
printf("%s: ", label);
printf("pool(%s): free list modified: magic=%x;"
" page %p; item ordinal %d; addr %p\n",
pp->pr_wchan, pi->pi_magic, ph->ph_page,
n, pi);
panic("pool");
}
#endif
if ((pp->pr_roflags & PR_NOALIGN) != 0) {
continue;
}
page = POOL_OBJ_TO_PAGE(pp, pi);
if (page == ph->ph_page)
continue;
if (label != NULL)
printf("%s: ", label);
printf("pool(%p:%s): page inconsistency: page %p;"
" item ordinal %d; addr %p (p %p)\n", pp,
pp->pr_wchan, ph->ph_page,
n, pi, page);
return 1;
}
return 0;
}
int
pool_chk(struct pool *pp, const char *label)
{
struct pool_item_header *ph;
int r = 0;
mutex_enter(&pp->pr_lock);
LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
r = pool_chk_page(pp, label, ph);
if (r) {
goto out;
}
}
out:
mutex_exit(&pp->pr_lock);
return r;
}
/*
* pool_cache_init:
*
* Initialize a pool cache.
*/
pool_cache_t
pool_cache_init(size_t size, u_int align, u_int align_offset, u_int flags,
const char *wchan, struct pool_allocator *palloc, int ipl,
int (*ctor)(void *, void *, int), void (*dtor)(void *, void *), void *arg)
{
pool_cache_t pc;
pc = pool_get(&cache_pool, PR_WAITOK);
if (pc == NULL)
return NULL;
pool_cache_bootstrap(pc, size, align, align_offset, flags, wchan,
palloc, ipl, ctor, dtor, arg);
return pc;
}
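/*
 * Usage sketch (illustrative, hypothetical names): a cache of
 * constructed objects with a constructor and destructor.  The ctor
 * returns 0 on success and may fail when called with PR_NOWAIT.
 *
 *	static pool_cache_t frob_cache;
 *
 *	static int
 *	frob_ctor(void *arg, void *obj, int flags)
 *	{
 *		struct frob *f = obj;
 *
 *		mutex_init(&f->f_lock, MUTEX_DEFAULT, IPL_NONE);
 *		return 0;
 *	}
 *
 *	static void
 *	frob_dtor(void *arg, void *obj)
 *	{
 *		struct frob *f = obj;
 *
 *		mutex_destroy(&f->f_lock);
 *	}
 *
 *	frob_cache = pool_cache_init(sizeof(struct frob), coherency_unit,
 *	    0, 0, "frobcache", NULL, IPL_NONE, frob_ctor, frob_dtor, NULL);
 */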
/*
* pool_cache_bootstrap:
*
* Kernel-private version of pool_cache_init(). The caller
* provides initial storage.
*/
void
pool_cache_bootstrap(pool_cache_t pc, size_t size, u_int align,
u_int align_offset, u_int flags, const char *wchan,
struct pool_allocator *palloc, int ipl,
int (*ctor)(void *, void *, int), void (*dtor)(void *, void *),
void *arg)
{
CPU_INFO_ITERATOR cii;
pool_cache_t pc1;
struct cpu_info *ci;
struct pool *pp;
unsigned int ppflags;
pp = &pc->pc_pool;
if (palloc == NULL && ipl == IPL_NONE) {
if (size > PAGE_SIZE) {
int bigidx = pool_bigidx(size);
palloc = &pool_allocator_big[bigidx];
flags |= PR_NOALIGN;
} else
palloc = &pool_allocator_nointr;
}
ppflags = flags;
if (ctor == NULL) {
ctor = NO_CTOR;
}
if (dtor == NULL) {
dtor = NO_DTOR;
} else {
/*
* If we have a destructor, then the pool layer does not
* need to worry about PR_PSERIALIZE.
*/
ppflags &= ~PR_PSERIALIZE;
}
pool_init(pp, size, align, align_offset, ppflags, wchan, palloc, ipl);
pc->pc_fullgroups = NULL;
pc->pc_partgroups = NULL;
pc->pc_ctor = ctor;
pc->pc_dtor = dtor;
pc->pc_arg = arg;
pc->pc_refcnt = 0;
pc->pc_roflags = flags;
pc->pc_freecheck = NULL;
if ((flags & PR_LARGECACHE) != 0) {
pc->pc_pcgsize = PCG_NOBJECTS_LARGE;
pc->pc_pcgpool = &pcg_large_pool;
pc->pc_pcgcache = &pcg_large_cache;
} else {
pc->pc_pcgsize = PCG_NOBJECTS_NORMAL;
pc->pc_pcgpool = &pcg_normal_pool;
pc->pc_pcgcache = &pcg_normal_cache;
}
/* Allocate per-CPU caches. */
memset(pc->pc_cpus, 0, sizeof(pc->pc_cpus));
pc->pc_ncpu = 0;
if (ncpu < 2) {
/* XXX For sparc: boot CPU is not attached yet. */
pool_cache_cpu_init1(curcpu(), pc);
} else {
for (CPU_INFO_FOREACH(cii, ci)) {
pool_cache_cpu_init1(ci, pc);
}
}
/* Add to list of all pools. */
if (__predict_true(!cold))
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pc1, &pool_cache_head, pc_cachelist) {
if (strcmp(pc1->pc_pool.pr_wchan, pc->pc_pool.pr_wchan) > 0)
break;
}
if (pc1 == NULL)
TAILQ_INSERT_TAIL(&pool_cache_head, pc, pc_cachelist);
else
TAILQ_INSERT_BEFORE(pc1, pc, pc_cachelist);
if (__predict_true(!cold))
mutex_exit(&pool_head_lock);
atomic_store_release(&pp->pr_cache, pc);
}
/*
* pool_cache_destroy:
*
* Destroy a pool cache.
*/
void
pool_cache_destroy(pool_cache_t pc)
{
pool_cache_bootstrap_destroy(pc);
pool_put(&cache_pool, pc);
}
/*
* pool_cache_bootstrap_destroy:
*
* Destroy a pool cache.
*/
void
pool_cache_bootstrap_destroy(pool_cache_t pc)
{
struct pool *pp = &pc->pc_pool;
u_int i;
/* Remove it from the global list. */
mutex_enter(&pool_head_lock);
while (pc->pc_refcnt != 0)
cv_wait(&pool_busy, &pool_head_lock);
TAILQ_REMOVE(&pool_cache_head, pc, pc_cachelist);
mutex_exit(&pool_head_lock);
/* First, invalidate the entire cache. */
pool_cache_invalidate(pc);
/* Disassociate it from the pool. */
mutex_enter(&pp->pr_lock);
atomic_store_relaxed(&pp->pr_cache, NULL);
mutex_exit(&pp->pr_lock);
/* Destroy per-CPU data */
for (i = 0; i < __arraycount(pc->pc_cpus); i++)
pool_cache_invalidate_cpu(pc, i);
/* Finally, destroy it. */
pool_destroy(pp);
}
/*
* pool_cache_cpu_init1:
*
* Called for each pool_cache whenever a new CPU is attached.
*/
static void
pool_cache_cpu_init1(struct cpu_info *ci, pool_cache_t pc)
{
pool_cache_cpu_t *cc;
int index;
index = ci->ci_index;
KASSERT(index < __arraycount(pc->pc_cpus));
if ((cc = pc->pc_cpus[index]) != NULL) {
return;
}
/*
* The first CPU is 'free'. This needs to be the case for
* bootstrap - we may not be able to allocate yet.
*/
if (pc->pc_ncpu == 0) {
cc = &pc->pc_cpu0;
pc->pc_ncpu = 1;
} else {
pc->pc_ncpu++;
cc = pool_get(&cache_cpu_pool, PR_WAITOK);
}
cc->cc_current = __UNCONST(&pcg_dummy);
cc->cc_previous = __UNCONST(&pcg_dummy);
cc->cc_pcgcache = pc->pc_pcgcache;
cc->cc_hits = 0;
cc->cc_misses = 0;
cc->cc_pcmisses = 0;
cc->cc_contended = 0;
cc->cc_nfull = 0;
cc->cc_npart = 0;
pc->pc_cpus[index] = cc;
}
/*
* pool_cache_cpu_init:
*
* Called whenever a new CPU is attached.
*/
void
pool_cache_cpu_init(struct cpu_info *ci)
{
pool_cache_t pc;
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pc, &pool_cache_head, pc_cachelist) {
pc->pc_refcnt++;
mutex_exit(&pool_head_lock);
pool_cache_cpu_init1(ci, pc);
mutex_enter(&pool_head_lock);
pc->pc_refcnt--;
cv_broadcast(&pool_busy);
}
mutex_exit(&pool_head_lock);
}
/*
* pool_cache_reclaim:
*
* Reclaim memory from a pool cache.
*/
bool
pool_cache_reclaim(pool_cache_t pc)
{
return pool_reclaim(&pc->pc_pool);
}
static inline void
pool_cache_pre_destruct(pool_cache_t pc)
{
/*
* Perform a passive serialization barrier before destructing
* a batch of one or more objects.
*/
	if (__predict_false(pc_has_pser(pc))) {
		pool_barrier();
}
}
static void
pool_cache_destruct_object1(pool_cache_t pc, void *object)
{
(*pc->pc_dtor)(pc->pc_arg, object);
pool_put(&pc->pc_pool, object);
}
/*
* pool_cache_destruct_object:
*
* Force destruction of an object and its release back into
* the pool.
*/
void
pool_cache_destruct_object(pool_cache_t pc, void *object)
{
FREECHECK_IN(&pc->pc_freecheck, object);
pool_cache_pre_destruct(pc);
pool_cache_destruct_object1(pc, object);
}
/*
* pool_cache_invalidate_groups:
*
* Invalidate a chain of groups and destruct all objects. Return the
* number of groups that were invalidated.
*/
static int
pool_cache_invalidate_groups(pool_cache_t pc, pcg_t *pcg)
{
void *object;
pcg_t *next;
int i, n;
if (pcg == NULL) {
return 0;
}
pool_cache_pre_destruct(pc);
for (n = 0; pcg != NULL; pcg = next, n++) {
next = pcg->pcg_next;
for (i = 0; i < pcg->pcg_avail; i++) {
object = pcg->pcg_objects[i].pcgo_va;
pool_cache_destruct_object1(pc, object);
}
if (pcg->pcg_size == PCG_NOBJECTS_LARGE) {
pool_put(&pcg_large_pool, pcg);
} else {
KASSERT(pcg->pcg_size == PCG_NOBJECTS_NORMAL);
pool_put(&pcg_normal_pool, pcg);
}
}
return n;
}
/*
* pool_cache_invalidate:
*
* Invalidate a pool cache (destruct and release all of the
* cached objects). Does not reclaim objects from the pool.
*
* Note: For pool caches that provide constructed objects, there
* is an assumption that another level of synchronization is occurring
* between the input to the constructor and the cache invalidation.
*
* Invalidation is a costly process and should not be called from
* interrupt context.
*/
void
pool_cache_invalidate(pool_cache_t pc)
{
uint64_t where;
pcg_t *pcg;
int n, s;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (ncpu < 2 || !mp_online) {
/*
* We might be called early enough in the boot process
* for the CPU data structures to not be fully initialized.
* In this case, transfer the content of the local CPU's
* cache back into global cache as only this CPU is currently
* running.
*/
pool_cache_transfer(pc);
} else {
/*
* Signal all CPUs that they must transfer their local
* cache back to the global pool then wait for the xcall to
* complete.
*/
where = xc_broadcast(0,
__FPTRCAST(xcfunc_t, pool_cache_transfer), pc, NULL);
xc_wait(where);
}
/* Now dequeue and invalidate everything. */
pcg = pool_pcg_trunc(&pcg_normal_cache);
(void)pool_cache_invalidate_groups(pc, pcg);
pcg = pool_pcg_trunc(&pcg_large_cache);
(void)pool_cache_invalidate_groups(pc, pcg);
pcg = pool_pcg_trunc(&pc->pc_fullgroups);
n = pool_cache_invalidate_groups(pc, pcg);
s = splvm();
((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_nfull -= n;
splx(s);
pcg = pool_pcg_trunc(&pc->pc_partgroups);
n = pool_cache_invalidate_groups(pc, pcg);
s = splvm();
((pool_cache_cpu_t *)pc->pc_cpus[curcpu()->ci_index])->cc_npart -= n;
splx(s);
}
/*
* pool_cache_invalidate_cpu:
*
* Invalidate all CPU-bound cached objects in pool cache, the CPU being
* identified by its associated index.
 *	It is the caller's responsibility to ensure that no operation is
 *	taking place on this pool cache while doing this invalidation.
 *	WARNING: as no inter-CPU locking is enforced, trying to invalidate
 *	pool cached objects from a CPU different from the one currently
 *	running may result in undefined behaviour.
*/
static void
pool_cache_invalidate_cpu(pool_cache_t pc, u_int index)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
if ((cc = pc->pc_cpus[index]) == NULL)
return;
if ((pcg = cc->cc_current) != &pcg_dummy) {
pcg->pcg_next = NULL;
pool_cache_invalidate_groups(pc, pcg);
}
if ((pcg = cc->cc_previous) != &pcg_dummy) {
pcg->pcg_next = NULL;
pool_cache_invalidate_groups(pc, pcg);
}
if (cc != &pc->pc_cpu0)
pool_put(&cache_cpu_pool, cc);
}
void
pool_cache_set_drain_hook(pool_cache_t pc, void (*fn)(void *, int), void *arg)
{
pool_set_drain_hook(&pc->pc_pool, fn, arg);
}
void
pool_cache_setlowat(pool_cache_t pc, int n)
{
pool_setlowat(&pc->pc_pool, n);
}
void
pool_cache_sethiwat(pool_cache_t pc, int n)
{
pool_sethiwat(&pc->pc_pool, n);
}
void
pool_cache_sethardlimit(pool_cache_t pc, int n, const char *warnmess, int ratecap)
{
pool_sethardlimit(&pc->pc_pool, n, warnmess, ratecap);
}
void
pool_cache_prime(pool_cache_t pc, int n)
{
pool_prime(&pc->pc_pool, n);
}
unsigned int
pool_cache_nget(pool_cache_t pc)
{
return pool_nget(&pc->pc_pool);
}
unsigned int
pool_cache_nput(pool_cache_t pc)
{
return pool_nput(&pc->pc_pool);
}
/*
* pool_pcg_get:
*
* Get a cache group from the specified list. Return true if
* contention was encountered. Must be called at IPL_VM because
* of spin wait vs. kernel_lock.
*/
static int
pool_pcg_get(pcg_t *volatile *head, pcg_t **pcgp)
{
int count = SPINLOCK_BACKOFF_MIN;
pcg_t *o, *n;
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
			SPINLOCK_BACKOFF(count);
			n = atomic_load_relaxed(head);
continue;
}
if (__predict_false(o == NULL)) {
break;
}
/* Lock out concurrent get/put. */
n = atomic_cas_ptr(head, o, __UNCONST(&pcg_dummy));
if (o == n) {
/* Fetch pointer to next item and then unlock. */
membar_datadep_consumer(); /* alpha */
			n = atomic_load_relaxed(&o->pcg_next);
			atomic_store_release(head, n);
break;
}
}
*pcgp = o;
return count != SPINLOCK_BACKOFF_MIN;
}
/*
* pool_pcg_trunc:
*
* Chop out entire list of pool cache groups.
*/
static pcg_t *
pool_pcg_trunc(pcg_t *volatile *head)
{
int count = SPINLOCK_BACKOFF_MIN, s;
pcg_t *o, *n;
s = splvm();
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
n = atomic_cas_ptr(head, o, NULL);
if (o == n) {
splx(s);
membar_datadep_consumer(); /* alpha */
return o;
}
}
}
/*
* pool_pcg_put:
*
* Put a pool cache group to the specified list. Return true if
* contention was encountered. Must be called at IPL_VM because of
* spin wait vs. kernel_lock.
*/
static int
pool_pcg_put(pcg_t *volatile *head, pcg_t *pcg)
{
int count = SPINLOCK_BACKOFF_MIN;
pcg_t *o, *n;
for (o = atomic_load_relaxed(head);; o = n) {
if (__predict_false(o == &pcg_dummy)) {
/* Wait for concurrent get to complete. */
SPINLOCK_BACKOFF(count);
n = atomic_load_relaxed(head);
continue;
}
pcg->pcg_next = o;
membar_release();
n = atomic_cas_ptr(head, o, pcg);
if (o == n) {
return count != SPINLOCK_BACKOFF_MIN;
}
}
}
static bool __noinline
pool_cache_get_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s,
void **objectp, paddr_t *pap, int flags)
{
pcg_t *pcg, *cur;
void *object;
	KASSERT(cc->cc_current->pcg_avail == 0);
	KASSERT(cc->cc_previous->pcg_avail == 0);
cc->cc_misses++;
/*
* If there's a full group, release our empty group back to the
* cache. Install the full group as cc_current and return.
*/
cc->cc_contended += pool_pcg_get(&pc->pc_fullgroups, &pcg);
if (__predict_true(pcg != NULL)) {
		KASSERT(pcg->pcg_avail == pcg->pcg_size);
		if (__predict_true((cur = cc->cc_current) != &pcg_dummy)) {
			KASSERT(cur->pcg_avail == 0);
(void)pool_pcg_put(cc->cc_pcgcache, cur);
}
cc->cc_nfull--;
cc->cc_current = pcg;
return true;
}
/*
* Nothing available locally or in cache. Take the slow
* path: fetch a new object from the pool and construct
* it.
*/
cc->cc_pcmisses++;
splx(s);
object = pool_get(&pc->pc_pool, flags);
*objectp = object;
if (__predict_false(object == NULL)) {
KASSERT((flags & (PR_NOWAIT|PR_LIMITFAIL)) != 0);
return false;
}
if (__predict_false((*pc->pc_ctor)(pc->pc_arg, object, flags) != 0)) {
pool_put(&pc->pc_pool, object);
*objectp = NULL;
return false;
}
	KASSERT((((vaddr_t)object) & (pc->pc_pool.pr_align - 1)) == 0);

	if (pap != NULL) {
#ifdef POOL_VTOPHYS
*pap = POOL_VTOPHYS(object);
#else
*pap = POOL_PADDR_INVALID;
#endif
}
FREECHECK_OUT(&pc->pc_freecheck, object);
return false;
}
/*
* pool_cache_get{,_paddr}:
*
* Get an object from a pool cache (optionally returning
* the physical address of the object).
*/
void *
pool_cache_get_paddr(pool_cache_t pc, int flags, paddr_t *pap)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
void *object;
int s;
	KASSERT(!(flags & PR_NOWAIT) != !(flags & PR_WAITOK));

	if (pc->pc_pool.pr_ipl == IPL_NONE &&
__predict_true(!cold) &&
__predict_true(panicstr == NULL)) {
KASSERTMSG(!cpu_intr_p(),
"%s: [%s] is IPL_NONE, but called from interrupt context",
__func__, pc->pc_pool.pr_wchan);
KASSERTMSG(!cpu_softintr_p(),
"%s: [%s] is IPL_NONE,"
" but called from soft interrupt context",
__func__, pc->pc_pool.pr_wchan);
}
	if (flags & PR_WAITOK) {
		ASSERT_SLEEPABLE();
}
	if (flags & PR_NOWAIT) {
		if (fault_inject())
			return NULL;
	}
/* Lock out interrupts and disable preemption. */
s = splvm();
while (/* CONSTCOND */ true) {
/* Try and allocate an object from the current group. */
cc = pc->pc_cpus[curcpu()->ci_index];
pcg = cc->cc_current;
if (__predict_true(pcg->pcg_avail > 0)) {
object = pcg->pcg_objects[--pcg->pcg_avail].pcgo_va;
			if (__predict_false(pap != NULL))
				*pap = pcg->pcg_objects[pcg->pcg_avail].pcgo_pa;
#if defined(DIAGNOSTIC)
pcg->pcg_objects[pcg->pcg_avail].pcgo_va = NULL;
			KASSERT(pcg->pcg_avail < pcg->pcg_size);
			KASSERT(object != NULL);
#endif
cc->cc_hits++;
splx(s);
FREECHECK_OUT(&pc->pc_freecheck, object);
pool_redzone_fill(&pc->pc_pool, object);
pool_cache_get_kmsan(pc, object);
return object;
}
/*
* That failed. If the previous group isn't empty, swap
* it with the current group and allocate from there.
*/
pcg = cc->cc_previous;
if (__predict_true(pcg->pcg_avail > 0)) {
cc->cc_previous = cc->cc_current;
cc->cc_current = pcg;
continue;
}
/*
* Can't allocate from either group: try the slow path.
* If get_slow() allocated an object for us, or if
* no more objects are available, it will return false.
* Otherwise, we need to retry.
*/
		if (!pool_cache_get_slow(pc, cc, s, &object, pap, flags)) {
			if (object != NULL) {
				kmsan_orig(object, pc->pc_pool.pr_size,
KMSAN_TYPE_POOL, __RET_ADDR);
}
break;
}
}
/*
* We would like to KASSERT(object || (flags & PR_NOWAIT)), but
* pool_cache_get can fail even in the PR_WAITOK case, if the
* constructor fails.
*/
return object;
}
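/*
 * Usage sketch (illustrative): most callers go through the
 * pool_cache_get()/pool_cache_put() wrappers, which pass a NULL paddr.
 * For the hypothetical frob_cache above:
 *
 *	struct frob *f = pool_cache_get(frob_cache, PR_WAITOK);
 *	...
 *	pool_cache_put(frob_cache, f);
 *
 * As noted above, even a PR_WAITOK get may return NULL if the
 * constructor fails, so callers with a failing ctor must check.
 */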
static bool __noinline
pool_cache_put_slow(pool_cache_t pc, pool_cache_cpu_t *cc, int s, void *object)
{
pcg_t *pcg, *cur;
KASSERT(cc->cc_current->pcg_avail == cc->cc_current->pcg_size);
KASSERT(cc->cc_previous->pcg_avail == cc->cc_previous->pcg_size);
cc->cc_misses++;
/*
* Try to get an empty group from the cache. If there are no empty
* groups in the cache then allocate one.
*/
(void)pool_pcg_get(cc->cc_pcgcache, &pcg);
if (__predict_false(pcg == NULL)) {
if (__predict_true(!pool_cache_disable)) {
pcg = pool_get(pc->pc_pcgpool, PR_NOWAIT);
}
if (__predict_true(pcg != NULL)) {
pcg->pcg_avail = 0;
pcg->pcg_size = pc->pc_pcgsize;
}
}
/*
	 * If there's an empty group, release our full group back to the
* cache. Install the empty group to the local CPU and return.
*/
if (pcg != NULL) {
KASSERT(pcg->pcg_avail == 0);
if (__predict_false(cc->cc_previous == &pcg_dummy)) {
cc->cc_previous = pcg;
} else {
cur = cc->cc_current;
if (__predict_true(cur != &pcg_dummy)) {
KASSERT(cur->pcg_avail == cur->pcg_size);
cc->cc_contended +=
pool_pcg_put(&pc->pc_fullgroups, cur);
cc->cc_nfull++;
}
cc->cc_current = pcg;
}
return true;
}
/*
* Nothing available locally or in cache, and we didn't
* allocate an empty group. Take the slow path and destroy
* the object here and now.
*/
cc->cc_pcmisses++;
splx(s);
pool_cache_destruct_object(pc, object);
return false;
}
/*
* pool_cache_put{,_paddr}:
*
* Put an object back to the pool cache (optionally caching the
* physical address of the object).
*/
void
pool_cache_put_paddr(pool_cache_t pc, void *object, paddr_t pa)
{
pool_cache_cpu_t *cc;
pcg_t *pcg;
int s;
KASSERT(object != NULL);
pool_cache_put_kmsan(pc, object);
pool_cache_redzone_check(pc, object);
FREECHECK_IN(&pc->pc_freecheck, object);
	if (pc->pc_pool.pr_roflags & PR_PHINPAGE) {
		pc_phinpage_check(pc, object);
}
if (pool_cache_put_nocache(pc, object)) {
return;
}
/* Lock out interrupts and disable preemption. */
s = splvm();
while (/* CONSTCOND */ true) {
/* If the current group isn't full, release it there. */
cc = pc->pc_cpus[curcpu()->ci_index];
pcg = cc->cc_current;
if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
pcg->pcg_objects[pcg->pcg_avail].pcgo_va = object;
pcg->pcg_objects[pcg->pcg_avail].pcgo_pa = pa;
pcg->pcg_avail++;
cc->cc_hits++;
splx(s);
return;
}
/*
* That failed. If the previous group isn't full, swap
* it with the current group and try again.
*/
pcg = cc->cc_previous;
if (__predict_true(pcg->pcg_avail < pcg->pcg_size)) {
cc->cc_previous = cc->cc_current;
cc->cc_current = pcg;
continue;
}
/*
* Can't free to either group: try the slow path.
* If put_slow() releases the object for us, it
* will return false. Otherwise we need to retry.
*/
if (!pool_cache_put_slow(pc, cc, s, object))
break;
}
}
/*
* pool_cache_transfer:
*
* Transfer objects from the per-CPU cache to the global cache.
* Run within a cross-call thread.
*/
static void
pool_cache_transfer(pool_cache_t pc)
{
pool_cache_cpu_t *cc;
pcg_t *prev, *cur;
int s;
s = splvm();
cc = pc->pc_cpus[curcpu()->ci_index];
cur = cc->cc_current;
cc->cc_current = __UNCONST(&pcg_dummy);
prev = cc->cc_previous;
cc->cc_previous = __UNCONST(&pcg_dummy);
if (cur != &pcg_dummy) {
if (cur->pcg_avail == cur->pcg_size) {
(void)pool_pcg_put(&pc->pc_fullgroups, cur);
cc->cc_nfull++;
} else if (cur->pcg_avail == 0) {
(void)pool_pcg_put(pc->pc_pcgcache, cur);
} else {
(void)pool_pcg_put(&pc->pc_partgroups, cur);
cc->cc_npart++;
}
}
if (prev != &pcg_dummy) {
if (prev->pcg_avail == prev->pcg_size) {
(void)pool_pcg_put(&pc->pc_fullgroups, prev);
cc->cc_nfull++;
} else if (prev->pcg_avail == 0) {
(void)pool_pcg_put(pc->pc_pcgcache, prev);
} else {
(void)pool_pcg_put(&pc->pc_partgroups, prev);
cc->cc_npart++;
}
}
splx(s);
}
static int
pool_bigidx(size_t size)
{
int i;
for (i = 0; i < __arraycount(pool_allocator_big); i++) {
if (1 << (i + POOL_ALLOCATOR_BIG_BASE) >= size)
return i;
}
panic("pool item size %zu too large, use a custom allocator", size);
}
static void *
pool_allocator_alloc(struct pool *pp, int flags)
{
struct pool_allocator *pa = pp->pr_alloc;
void *res;
res = (*pa->pa_alloc)(pp, flags);
if (res == NULL && (flags & PR_WAITOK) == 0) {
/*
* We only run the drain hook here if PR_NOWAIT.
* In other cases, the hook will be run in
* pool_reclaim().
*/
if (pp->pr_drain_hook != NULL) {
(*pp->pr_drain_hook)(pp->pr_drain_hook_arg, flags);
res = (*pa->pa_alloc)(pp, flags);
}
}
return res;
}
static void
pool_allocator_free(struct pool *pp, void *v)
{
struct pool_allocator *pa = pp->pr_alloc;
if (pp->pr_redzone) {
KASSERT(!pp_has_pser(pp));
kasan_mark(v, pa->pa_pagesz, pa->pa_pagesz, 0);
} else if (__predict_false(pp_has_pser(pp))) {
/*
* Perform a passive serialization barrier before freeing
* the pool page back to the system.
*/
pool_barrier();
}
(*pa->pa_free)(pp, v);
}
void *
pool_page_alloc(struct pool *pp, int flags)
{
const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
vmem_addr_t va;
int ret;
ret = uvm_km_kmem_alloc(kmem_va_arena, pp->pr_alloc->pa_pagesz,
vflags | VM_INSTANTFIT, &va);
return ret ? NULL : (void *)va;
}
void
pool_page_free(struct pool *pp, void *v)
{
uvm_km_kmem_free(kmem_va_arena, (vaddr_t)v, pp->pr_alloc->pa_pagesz);
}
static void *
pool_page_alloc_meta(struct pool *pp, int flags)
{
const vm_flag_t vflags = (flags & PR_WAITOK) ? VM_SLEEP: VM_NOSLEEP;
vmem_addr_t va;
int ret;
ret = vmem_alloc(kmem_meta_arena, pp->pr_alloc->pa_pagesz,
vflags | VM_INSTANTFIT, &va);
return ret ? NULL : (void *)va;
}
static void
pool_page_free_meta(struct pool *pp, void *v)
{
vmem_free(kmem_meta_arena, (vmem_addr_t)v, pp->pr_alloc->pa_pagesz);
}
#ifdef KMSAN
static inline void
pool_get_kmsan(struct pool *pp, void *p)
{
kmsan_orig(p, pp->pr_size, KMSAN_TYPE_POOL, __RET_ADDR);
kmsan_mark(p, pp->pr_size, KMSAN_STATE_UNINIT);
}
static inline void
pool_put_kmsan(struct pool *pp, void *p)
{
kmsan_mark(p, pp->pr_size, KMSAN_STATE_INITED);
}
static inline void
pool_cache_get_kmsan(pool_cache_t pc, void *p)
{
if (__predict_false(pc_has_ctor(pc))) {
return;
}
pool_get_kmsan(&pc->pc_pool, p);
}
static inline void
pool_cache_put_kmsan(pool_cache_t pc, void *p)
{
pool_put_kmsan(&pc->pc_pool, p);
}
#endif
#ifdef POOL_QUARANTINE
static void
pool_quarantine_init(struct pool *pp)
{
	pp->pr_quar.rotor = 0;
	memset(&pp->pr_quar.list, 0, sizeof(pp->pr_quar.list));
}
static void
pool_quarantine_flush(struct pool *pp)
{
pool_quar_t *quar = &pp->pr_quar;
struct pool_pagelist pq;
size_t i;
LIST_INIT(&pq);
mutex_enter(&pp->pr_lock);
for (i = 0; i < POOL_QUARANTINE_DEPTH; i++) {
if (quar->list[i] == 0)
continue;
pool_do_put(pp, (void *)quar->list[i], &pq);
}
mutex_exit(&pp->pr_lock);
pr_pagelist_free(pp, &pq);
}
static bool
pool_put_quarantine(struct pool *pp, void *v, struct pool_pagelist *pq)
{
pool_quar_t *quar = &pp->pr_quar;
uintptr_t old;
if (pp->pr_roflags & PR_NOTOUCH) {
return false;
}
pool_redzone_check(pp, v);
old = quar->list[quar->rotor];
quar->list[quar->rotor] = (uintptr_t)v;
quar->rotor = (quar->rotor + 1) % POOL_QUARANTINE_DEPTH;
if (old != 0) {
pool_do_put(pp, (void *)old, pq);
}
return true;
}
#endif
#ifdef POOL_NOCACHE
static bool
pool_cache_put_nocache(pool_cache_t pc, void *p)
{
pool_cache_destruct_object(pc, p);
return true;
}
#endif
#ifdef POOL_REDZONE
#if defined(_LP64)
# define PRIME 0x9e37fffffffc0000UL
#else /* defined(_LP64) */
# define PRIME 0x9e3779b1
#endif /* defined(_LP64) */
#define STATIC_BYTE 0xFE
CTASSERT(POOL_REDZONE_SIZE > 1);
#ifndef KASAN
static inline uint8_t
pool_pattern_generate(const void *p)
{
return (uint8_t)(((uintptr_t)p) * PRIME
>> ((sizeof(uintptr_t) - sizeof(uint8_t))) * CHAR_BIT);
}
#endif
static void
pool_redzone_init(struct pool *pp, size_t requested_size)
{
size_t redzsz;
size_t nsz;
#ifdef KASAN
redzsz = requested_size;
kasan_add_redzone(&redzsz);
redzsz -= requested_size;
#else
redzsz = POOL_REDZONE_SIZE;
#endif
if (pp->pr_roflags & PR_NOTOUCH) {
pp->pr_redzone = false;
return;
}
/*
* We may have extended the requested size earlier; check if
* there's naturally space in the padding for a red zone.
*/
if (pp->pr_size - requested_size >= redzsz) {
pp->pr_reqsize_with_redzone = requested_size + redzsz;
pp->pr_redzone = true;
return;
}
/*
* No space in the natural padding; check if we can extend a
* bit the size of the pool.
*
* Avoid using redzone for allocations half of a page or larger.
* For pagesize items, we'd waste a whole new page (could be
* unmapped?), and for half pagesize items, approximately half
	 * the space is lost (e.g., with 4K pages you get one 2K allocation).
*/
nsz = roundup(pp->pr_size + redzsz, pp->pr_align);
if (nsz <= (pp->pr_alloc->pa_pagesz / 2)) {
/* Ok, we can */
pp->pr_size = nsz;
pp->pr_reqsize_with_redzone = requested_size + redzsz;
pp->pr_redzone = true;
} else {
/* No space for a red zone... snif :'( */
pp->pr_redzone = false;
aprint_debug("pool redzone disabled for '%s'\n", pp->pr_wchan);
}
}
static void
pool_redzone_fill(struct pool *pp, void *p)
{
if (!pp->pr_redzone)
return;
KASSERT(!pp_has_pser(pp));
#ifdef KASAN
kasan_mark(p, pp->pr_reqsize, pp->pr_reqsize_with_redzone,
KASAN_POOL_REDZONE);
#else
uint8_t *cp, pat;
const uint8_t *ep;
cp = (uint8_t *)p + pp->pr_reqsize;
ep = cp + POOL_REDZONE_SIZE;
/*
* We really don't want the first byte of the red zone to be '\0';
* an off-by-one in a string may not be properly detected.
*/
pat = pool_pattern_generate(cp);
*cp = (pat == '\0') ? STATIC_BYTE: pat;
cp++;
while (cp < ep) {
*cp = pool_pattern_generate(cp);
cp++;
}
#endif
}
static void
pool_redzone_check(struct pool *pp, void *p)
{
if (!pp->pr_redzone)
return;
KASSERT(!pp_has_pser(pp));
#ifdef KASAN
kasan_mark(p, 0, pp->pr_reqsize_with_redzone, KASAN_POOL_FREED);
#else
uint8_t *cp, pat, expected;
const uint8_t *ep;
cp = (uint8_t *)p + pp->pr_reqsize;
ep = cp + POOL_REDZONE_SIZE;
pat = pool_pattern_generate(cp);
expected = (pat == '\0') ? STATIC_BYTE: pat;
if (__predict_false(*cp != expected)) {
panic("%s: [%s] 0x%02x != 0x%02x", __func__,
pp->pr_wchan, *cp, expected);
}
cp++;
while (cp < ep) {
expected = pool_pattern_generate(cp);
if (__predict_false(*cp != expected)) {
panic("%s: [%s] 0x%02x != 0x%02x", __func__,
pp->pr_wchan, *cp, expected);
}
cp++;
}
#endif
}
static void
pool_cache_redzone_check(pool_cache_t pc, void *p)
{
#ifdef KASAN
/*
* If there is a ctor/dtor, or if the cache objects use
* passive serialization, leave the data as valid.
*/
if (__predict_false(pc_has_ctor(pc) || pc_has_dtor(pc) ||
pc_has_pser(pc))) {
return;
}
#endif
pool_redzone_check(&pc->pc_pool, p);
}
#endif /* POOL_REDZONE */
#if defined(DDB)
static bool
pool_in_page(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
{
return (uintptr_t)ph->ph_page <= addr &&
addr < (uintptr_t)ph->ph_page + pp->pr_alloc->pa_pagesz;
}
static bool
pool_in_item(struct pool *pp, void *item, uintptr_t addr)
{
return (uintptr_t)item <= addr && addr < (uintptr_t)item + pp->pr_size;
}
static bool
pool_in_cg(struct pool *pp, struct pool_cache_group *pcg, uintptr_t addr)
{
int i;
if (pcg == NULL) {
return false;
}
for (i = 0; i < pcg->pcg_avail; i++) {
if (pool_in_item(pp, pcg->pcg_objects[i].pcgo_va, addr)) {
return true;
}
}
return false;
}
static bool
pool_allocated(struct pool *pp, struct pool_item_header *ph, uintptr_t addr)
{
if ((pp->pr_roflags & PR_USEBMAP) != 0) {
unsigned int idx = pr_item_bitmap_index(pp, ph, (void *)addr);
pool_item_bitmap_t *bitmap =
ph->ph_bitmap + (idx / BITMAP_SIZE);
pool_item_bitmap_t mask = 1U << (idx & BITMAP_MASK);
return (*bitmap & mask) == 0;
} else {
struct pool_item *pi;
LIST_FOREACH(pi, &ph->ph_itemlist, pi_list) {
if (pool_in_item(pp, pi, addr)) {
return false;
}
}
return true;
}
}
void
pool_whatis(uintptr_t addr, void (*pr)(const char *, ...))
{
struct pool *pp;
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
struct pool_item_header *ph;
struct pool_cache *pc;
uintptr_t item;
bool allocated = true;
bool incache = false;
bool incpucache = false;
char cpucachestr[32];
if ((pp->pr_roflags & PR_PHINPAGE) != 0) {
LIST_FOREACH(ph, &pp->pr_fullpages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
goto found;
}
}
LIST_FOREACH(ph, &pp->pr_partpages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
allocated =
pool_allocated(pp, ph, addr);
goto found;
}
}
LIST_FOREACH(ph, &pp->pr_emptypages, ph_pagelist) {
if (pool_in_page(pp, ph, addr)) {
allocated = false;
goto found;
}
}
continue;
} else {
ph = pr_find_pagehead_noalign(pp, (void *)addr);
if (ph == NULL || !pool_in_page(pp, ph, addr)) {
continue;
}
allocated = pool_allocated(pp, ph, addr);
}
found:
if (allocated &&
(pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
struct pool_cache_group *pcg;
int i;
for (pcg = pc->pc_fullgroups; pcg != NULL;
pcg = pcg->pcg_next) {
if (pool_in_cg(pp, pcg, addr)) {
incache = true;
goto print;
}
}
for (i = 0; i < __arraycount(pc->pc_cpus); i++) {
pool_cache_cpu_t *cc;
if ((cc = pc->pc_cpus[i]) == NULL) {
continue;
}
if (pool_in_cg(pp, cc->cc_current, addr) ||
pool_in_cg(pp, cc->cc_previous, addr)) {
struct cpu_info *ci =
cpu_lookup(i);
incpucache = true;
snprintf(cpucachestr,
sizeof(cpucachestr),
"cached by CPU %u",
ci->ci_index);
goto print;
}
}
}
print:
item = (uintptr_t)ph->ph_page + ph->ph_off;
item = item + rounddown(addr - item, pp->pr_size);
(*pr)("%p is %p+%zu in POOL '%s' (%s)\n",
(void *)addr, item, (size_t)(addr - item),
pp->pr_wchan,
incpucache ? cpucachestr :
incache ? "cached" : allocated ? "allocated" : "free");
}
}
#endif /* defined(DDB) */
static int
pool_sysctl(SYSCTLFN_ARGS)
{
struct pool_sysctl data;
struct pool *pp;
struct pool_cache *pc;
pool_cache_cpu_t *cc;
int error;
size_t i, written;
if (oldp == NULL) {
*oldlenp = 0;
TAILQ_FOREACH(pp, &pool_head, pr_poollist)
*oldlenp += sizeof(data);
return 0;
}
memset(&data, 0, sizeof(data));
error = 0;
written = 0;
mutex_enter(&pool_head_lock);
TAILQ_FOREACH(pp, &pool_head, pr_poollist) {
if (written + sizeof(data) > *oldlenp)
break;
pp->pr_refcnt++;
strlcpy(data.pr_wchan, pp->pr_wchan, sizeof(data.pr_wchan));
data.pr_pagesize = pp->pr_alloc->pa_pagesz;
data.pr_flags = pp->pr_roflags | pp->pr_flags;
#define COPY(field) data.field = pp->field
COPY(pr_size);
COPY(pr_itemsperpage);
COPY(pr_nitems);
COPY(pr_nout);
COPY(pr_hardlimit);
COPY(pr_npages);
COPY(pr_minpages);
COPY(pr_maxpages);
COPY(pr_nget);
COPY(pr_nfail);
COPY(pr_nput);
COPY(pr_npagealloc);
COPY(pr_npagefree);
COPY(pr_hiwat);
COPY(pr_nidle);
#undef COPY
data.pr_cache_nmiss_pcpu = 0;
data.pr_cache_nhit_pcpu = 0;
data.pr_cache_nmiss_global = 0;
data.pr_cache_nempty = 0;
data.pr_cache_ncontended = 0;
data.pr_cache_npartial = 0;
if ((pc = atomic_load_consume(&pp->pr_cache)) != NULL) {
uint32_t nfull = 0;
data.pr_cache_meta_size = pc->pc_pcgsize;
for (i = 0; i < pc->pc_ncpu; ++i) {
cc = pc->pc_cpus[i];
if (cc == NULL)
continue;
data.pr_cache_ncontended += cc->cc_contended;
data.pr_cache_nmiss_pcpu += cc->cc_misses;
data.pr_cache_nhit_pcpu += cc->cc_hits;
data.pr_cache_nmiss_global += cc->cc_pcmisses;
nfull += cc->cc_nfull; /* 32-bit rollover! */
data.pr_cache_npartial += cc->cc_npart;
}
data.pr_cache_nfull = nfull;
} else {
data.pr_cache_meta_size = 0;
data.pr_cache_nfull = 0;
}
data.pr_cache_nhit_global = data.pr_cache_nmiss_pcpu -
data.pr_cache_nmiss_global;
if (pp->pr_refcnt == UINT_MAX) /* XXX possible? */
continue;
mutex_exit(&pool_head_lock);
error = sysctl_copyout(l, &data, oldp, sizeof(data));
mutex_enter(&pool_head_lock);
if (--pp->pr_refcnt == 0)
cv_broadcast(&pool_busy);
if (error)
break;
written += sizeof(data);
oldp = (char *)oldp + sizeof(data);
}
mutex_exit(&pool_head_lock);
*oldlenp = written;
return error;
}
SYSCTL_SETUP(sysctl_pool_setup, "sysctl kern.pool setup")
{
const struct sysctlnode *rnode = NULL;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pool",
SYSCTL_DESCR("Get pool statistics"),
pool_sysctl, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}
/* $NetBSD: subr_pserialize.c,v 1.24 2023/10/04 20:28:06 ad Exp $ */
/*-
* Copyright (c) 2010, 2011, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Passive serialization.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_pserialize.c,v 1.24 2023/10/04 20:28:06 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/evcnt.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <sys/xcall.h>
struct pserialize {
char psz_dummy;
};
static kmutex_t psz_lock __cacheline_aligned;
static struct evcnt psz_ev_excl __cacheline_aligned =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "pserialize", "exclusive access");
EVCNT_ATTACH_STATIC(psz_ev_excl);
/*
* pserialize_init:
*
* Initialize passive serialization structures.
*/
void
pserialize_init(void)
{
mutex_init(&psz_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* pserialize_create:
*
* Create and initialize a passive serialization object.
*/
pserialize_t
pserialize_create(void)
{
pserialize_t psz;
psz = kmem_zalloc(sizeof(*psz), KM_SLEEP);
return psz;
}
/*
* pserialize_destroy:
*
* Destroy a passive serialization object.
*/
void
pserialize_destroy(pserialize_t psz)
{
kmem_free(psz, sizeof(*psz));
}
/*
* pserialize_perform:
*
* Perform the write side of passive serialization.
*/
void
pserialize_perform(pserialize_t psz)
{
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
if (__predict_false(panicstr != NULL)) {
return;
}
if (__predict_false(mp_online == false)) {
psz_ev_excl.ev_count++;
return;
}
/*
* Broadcast a NOP to all CPUs and wait until all of them complete.
*/
xc_barrier(XC_HIGHPRI);
mutex_enter(&psz_lock);
psz_ev_excl.ev_count++;
mutex_exit(&psz_lock);
}
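/*
 * Illustrative update-side sketch (the names are assumptions, not from
 * this file): unlink the object under the writer lock so new readers
 * cannot find it, wait for existing readers to drain, then free it.
 *
 *	mutex_enter(&writer_lock);
 *	unpublish obj from the reader-visible data structure
 *	pserialize_perform(psz);
 *	mutex_exit(&writer_lock);
 *	kmem_free(obj, sizeof(*obj));
 */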
int
pserialize_read_enter(void)
{
int s;
s = splsoftserial();
curcpu()->ci_psz_read_depth++;
__insn_barrier();
return s;
}
void
pserialize_read_exit(int s)
{
KASSERT(__predict_false(cold) || kpreempt_disabled());
__insn_barrier();
if (__predict_false(curcpu()->ci_psz_read_depth-- == 0))
panic("mismatching pserialize_read_exit()"); splx(s);
}
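/*
 * Illustrative read-side sketch (the names are assumptions): readers
 * may not sleep between enter and exit, and should load published
 * pointers with atomic_load_consume().
 *
 *	s = pserialize_read_enter();
 *	obj = atomic_load_consume(&published);
 *	if (obj != NULL)
 *		use obj without sleeping
 *	pserialize_read_exit(s);
 */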
/*
* pserialize_in_read_section:
*
* True if the caller is in a pserialize read section. To be used
* only for diagnostic assertions where we want to guarantee the
* condition like:
*
* KASSERT(pserialize_in_read_section());
*/
bool
pserialize_in_read_section(void)
{
return kpreempt_disabled() && curcpu()->ci_psz_read_depth > 0;
}
/*
* pserialize_not_in_read_section:
*
* True if the caller is not in a pserialize read section. To be
* used only for diagnostic assertions where we want to guarantee
* the condition like:
*
* KASSERT(pserialize_not_in_read_section());
*/
bool
pserialize_not_in_read_section(void)
{
bool notin;
long pctr;
pctr = lwp_pctr();
notin = __predict_true(curcpu()->ci_psz_read_depth == 0);
/*
* If we had a context switch, we're definitely not in a
* pserialize read section because pserialize read sections
* block preemption.
*/
if (__predict_false(pctr != lwp_pctr()))
notin = true;
return notin;
}
/* $NetBSD: wsmouse.c,v 1.73 2023/07/30 10:45:11 riastradh Exp $ */
/*-
* Copyright (c) 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996, 1997 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ms.c 8.1 (Berkeley) 6/11/93
*/
/*
* Mouse driver.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: wsmouse.c,v 1.73 2023/07/30 10:45:11 riastradh Exp $");
#include "wsmouse.h"
#include "wsdisplay.h"
#include "wsmux.h"
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/tty.h>
#include <sys/signalvar.h>
#include <sys/device.h>
#include <sys/vnode.h>
#include <sys/callout.h>
#include <dev/wscons/wsconsio.h>
#include <dev/wscons/wsmousevar.h>
#include <dev/wscons/wseventvar.h>
#include <dev/wscons/wsmuxvar.h>
#include "ioconf.h"
#if defined(WSMUX_DEBUG) && NWSMUX > 0
#define DPRINTF(x) if (wsmuxdebug) printf x
#define DPRINTFN(n,x) if (wsmuxdebug > (n)) printf x
extern int wsmuxdebug;
#else
#define DPRINTF(x)
#define DPRINTFN(n,x)
#endif
#define INVALID_X INT_MAX
#define INVALID_Y INT_MAX
#define INVALID_Z INT_MAX
#define INVALID_W INT_MAX
struct wsmouse_softc {
struct wsevsrc sc_base;
const struct wsmouse_accessops *sc_accessops;
void *sc_accesscookie;
u_int sc_mb; /* mouse button state */
u_int sc_ub; /* user button state */
int sc_dx; /* delta-x */
int sc_dy; /* delta-y */
int sc_dz; /* delta-z */
int sc_dw; /* delta-w */
int sc_x; /* absolute-x */
int sc_y; /* absolute-y */
int sc_z; /* absolute-z */
int sc_w; /* absolute-w */
int sc_refcnt;
u_char sc_dying; /* device is being detached */
struct wsmouse_repeat sc_repeat;
int sc_repeat_button;
callout_t sc_repeat_callout;
unsigned int sc_repeat_delay;
int sc_reverse_scroll;
int sc_horiz_scroll_dist;
int sc_vert_scroll_dist;
};
static int wsmouse_match(device_t, cfdata_t, void *);
static void wsmouse_attach(device_t, device_t, void *);
static int wsmouse_detach(device_t, int);
static int wsmouse_activate(device_t, enum devact);
static int wsmouse_set_params(struct wsmouse_softc *,
struct wsmouse_param *, size_t);
static int wsmouse_get_params(struct wsmouse_softc *,
struct wsmouse_param *, size_t);
static int wsmouse_handle_params(struct wsmouse_softc *,
struct wsmouse_parameters *, bool);
static int wsmouse_do_ioctl(struct wsmouse_softc *, u_long, void *,
int, struct lwp *);
#if NWSMUX > 0
static int wsmouse_mux_open(struct wsevsrc *, struct wseventvar *);
static int wsmouse_mux_close(struct wsevsrc *);
#endif
static int wsmousedoioctl(device_t, u_long, void *, int, struct lwp *);
static int wsmousedoopen(struct wsmouse_softc *, struct wseventvar *);
CFATTACH_DECL_NEW(wsmouse, sizeof (struct wsmouse_softc),
wsmouse_match, wsmouse_attach, wsmouse_detach, wsmouse_activate);
static void wsmouse_repeat(void *v);
dev_type_open(wsmouseopen);
dev_type_close(wsmouseclose);
dev_type_read(wsmouseread);
dev_type_ioctl(wsmouseioctl);
dev_type_poll(wsmousepoll);
dev_type_kqfilter(wsmousekqfilter);
const struct cdevsw wsmouse_cdevsw = {
.d_open = wsmouseopen,
.d_close = wsmouseclose,
.d_read = wsmouseread,
.d_write = nowrite,
.d_ioctl = wsmouseioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = wsmousepoll,
.d_mmap = nommap,
.d_kqfilter = wsmousekqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER
};
#if NWSMUX > 0
struct wssrcops wsmouse_srcops = {
WSMUX_MOUSE,
wsmouse_mux_open, wsmouse_mux_close, wsmousedoioctl, NULL, NULL
};
#endif
/*
* Print function (for parent devices).
*/
int
wsmousedevprint(void *aux, const char *pnp)
{
if (pnp)
aprint_normal("wsmouse at %s", pnp);
return (UNCONF);
}
int
wsmouse_match(device_t parent, cfdata_t match, void *aux)
{
return (1);
}
void
wsmouse_attach(device_t parent, device_t self, void *aux)
{
struct wsmouse_softc *sc = device_private(self);
struct wsmousedev_attach_args *ap = aux;
#if NWSMUX > 0
int mux, error;
#endif
sc->sc_base.me_dv = self;
sc->sc_accessops = ap->accessops;
sc->sc_accesscookie = ap->accesscookie;
/* Initialize button repeating. */
memset(&sc->sc_repeat, 0, sizeof(sc->sc_repeat));
sc->sc_repeat_button = -1;
sc->sc_repeat_delay = 0;
sc->sc_reverse_scroll = 0;
sc->sc_horiz_scroll_dist = WSMOUSE_DEFAULT_SCROLL_DIST;
sc->sc_vert_scroll_dist = WSMOUSE_DEFAULT_SCROLL_DIST;
callout_init(&sc->sc_repeat_callout, 0);
callout_setfunc(&sc->sc_repeat_callout, wsmouse_repeat, sc);
#if NWSMUX > 0
sc->sc_base.me_ops = &wsmouse_srcops;
mux = device_cfdata(self)->wsmousedevcf_mux;
if (mux >= 0) {
error = wsmux_attach_sc(wsmux_getmux(mux), &sc->sc_base);
if (error)
aprint_error(" attach error=%d", error);
else
aprint_normal(" mux %d", mux);
}
#else
if (device_cfdata(self)->wsmousedevcf_mux >= 0)
aprint_normal(" (mux ignored)");
#endif
aprint_naive("\n");
aprint_normal("\n");
if (!pmf_device_register(self, NULL, NULL))
aprint_error_dev(self, "couldn't establish power handler\n");
}
int
wsmouse_activate(device_t self, enum devact act)
{
struct wsmouse_softc *sc = device_private(self);
if (act == DVACT_DEACTIVATE)
sc->sc_dying = 1;
return (0);
}
/*
* Detach a mouse. To keep track of users of the softc we keep
* a reference count that's incremented while inside, e.g., read.
* If the mouse is active and the reference count is > 0 (0 is the
* normal state) we post an event and then wait for the process
* that had the reference to wake us up again. Then we blow away the
* vnode and return (which will deallocate the softc).
*/
int
wsmouse_detach(device_t self, int flags)
{
struct wsmouse_softc *sc = device_private(self);
struct wseventvar *evar;
int maj, mn;
int s;
#if NWSMUX > 0
/* Tell parent mux we're leaving. */
if (sc->sc_base.me_parent != NULL) {
DPRINTF(("wsmouse_detach:\n"));
wsmux_detach_sc(&sc->sc_base);
}
#endif
/* If we're open ... */
evar = sc->sc_base.me_evp;
if (evar != NULL && evar->io != NULL) {
s = spltty();
if (--sc->sc_refcnt >= 0) {
struct wscons_event event;
/* Wake everyone by generating a dummy event. */
event.type = 0;
event.value = 0;
if (wsevent_inject(evar, &event, 1) != 0)
wsevent_wakeup(evar);
/* Wait for processes to go away. */
if (tsleep(sc, PZERO, "wsmdet", hz * 60))
printf("wsmouse_detach: %s didn't detach\n",
device_xname(self));
}
splx(s);
}
/* locate the major number */
maj = cdevsw_lookup_major(&wsmouse_cdevsw);
/* Nuke the vnodes for any open instances (calls close). */
mn = device_unit(self);
vdevgone(maj, mn, mn, VCHR);
return (0);
}
void
wsmouse_input(device_t wsmousedev, u_int btns /* 0 is up */,
int x, int y, int z, int w, u_int flags)
{
struct wsmouse_softc *sc = device_private(wsmousedev);
struct wseventvar *evar;
int mb, ub, d, nevents;
/* one for each dimension (4) + a bit for each button */
struct wscons_event events[4 + sizeof(d) * 8];
KERNEL_LOCK(1, NULL);
/*
* Discard input if not open.
*/
evar = sc->sc_base.me_evp;
if (evar == NULL)
goto out;
#ifdef DIAGNOSTIC
if (evar->q == NULL) {
printf("wsmouse_input: evar->q=NULL\n");
goto out;
}
#endif
#if NWSMUX > 0
DPRINTFN(5,("wsmouse_input: %s mux=%p, evar=%p\n",
device_xname(sc->sc_base.me_dv),
sc->sc_base.me_parent, evar));
#endif
sc->sc_mb = btns;
if (!(flags & WSMOUSE_INPUT_ABSOLUTE_X))
sc->sc_dx += x;
if (!(flags & WSMOUSE_INPUT_ABSOLUTE_Y))
sc->sc_dy += y;
if (!(flags & WSMOUSE_INPUT_ABSOLUTE_Z))
sc->sc_dz += z;
if (!(flags & WSMOUSE_INPUT_ABSOLUTE_W))
sc->sc_dw += w;
/*
* We have at least one event (mouse button, delta-X, or
* delta-Y; possibly all three, and possibly three separate
* button events). Deliver these events until we are out
* of changes or out of room. As events get delivered,
* mark them `unchanged'.
*/
ub = sc->sc_ub;
nevents = 0;
if (flags & WSMOUSE_INPUT_ABSOLUTE_X) {
if (sc->sc_x != x) {
events[nevents].type = WSCONS_EVENT_MOUSE_ABSOLUTE_X;
events[nevents].value = x;
nevents++;
}
} else {
if (sc->sc_dx) {
events[nevents].type = WSCONS_EVENT_MOUSE_DELTA_X;
events[nevents].value = sc->sc_dx;
nevents++;
}
}
if (flags & WSMOUSE_INPUT_ABSOLUTE_Y) {
if (sc->sc_y != y) {
events[nevents].type = WSCONS_EVENT_MOUSE_ABSOLUTE_Y;
events[nevents].value = y;
nevents++;
}
} else {
if (sc->sc_dy) {
events[nevents].type = WSCONS_EVENT_MOUSE_DELTA_Y;
events[nevents].value = sc->sc_dy;
nevents++;
}
}
if (flags & WSMOUSE_INPUT_ABSOLUTE_Z) {
if (sc->sc_z != z) {
events[nevents].type = WSCONS_EVENT_MOUSE_ABSOLUTE_Z;
events[nevents].value = z;
nevents++;
}
} else {
if (sc->sc_dz) {
events[nevents].type = WSCONS_EVENT_MOUSE_DELTA_Z;
events[nevents].value = sc->sc_dz;
nevents++;
}
}
if (flags & WSMOUSE_INPUT_ABSOLUTE_W) {
if (sc->sc_w != w) {
events[nevents].type = WSCONS_EVENT_MOUSE_ABSOLUTE_W;
events[nevents].value = w;
nevents++;
}
} else {
if (sc->sc_dw) {
events[nevents].type = WSCONS_EVENT_MOUSE_DELTA_W;
events[nevents].value = sc->sc_dw;
nevents++;
}
}
mb = sc->sc_mb;
while ((d = mb ^ ub) != 0) {
int btnno;
/*
* Cancel button repeating if button status changed.
*/
if (sc->sc_repeat_button != -1) {
KASSERT(sc->sc_repeat_button >= 0);
KASSERT(sc->sc_repeat.wr_buttons &
(1 << sc->sc_repeat_button));
ub &= ~(1 << sc->sc_repeat_button);
sc->sc_repeat_button = -1;
callout_stop(&sc->sc_repeat_callout);
}
/*
* Mouse button change. Find the first change and drop
* it into the event queue.
*/
btnno = ffs(d) - 1;
KASSERT(btnno >= 0);
if (nevents >= __arraycount(events)) {
aprint_error_dev(sc->sc_base.me_dv,
"Event queue full (button status mb=0x%x"
" ub=0x%x)\n", mb, ub);
break;
}
events[nevents].type =
(mb & d) ? WSCONS_EVENT_MOUSE_DOWN : WSCONS_EVENT_MOUSE_UP;
events[nevents].value = btnno;
nevents++;
ub ^= (1 << btnno);
/*
* Program button repeating if configured for this button.
*/
if ((mb & d) && (sc->sc_repeat.wr_buttons & (1 << btnno)) &&
sc->sc_repeat.wr_delay_first > 0) {
sc->sc_repeat_button = btnno;
sc->sc_repeat_delay = sc->sc_repeat.wr_delay_first;
callout_schedule(&sc->sc_repeat_callout,
mstohz(sc->sc_repeat_delay));
}
}
if (nevents == 0 || wsevent_inject(evar, events, nevents) == 0) {
/* All events were correctly injected into the queue.
* Synchronize the mouse's status with what the user
* has received. */
sc->sc_x = x; sc->sc_dx = 0;
sc->sc_y = y; sc->sc_dy = 0;
sc->sc_z = z; sc->sc_dz = 0;
sc->sc_w = w; sc->sc_dw = 0;
sc->sc_ub = ub;
#if NWSMUX > 0
DPRINTFN(5,("wsmouse_input: %s wakeup evar=%p\n",
device_xname(sc->sc_base.me_dv), evar));
#endif
}
out: KERNEL_UNLOCK_ONE(NULL);
}
void
wsmouse_precision_scroll(device_t wsmousedev, int x, int y)
{
struct wsmouse_softc *sc = device_private(wsmousedev);
struct wseventvar *evar;
struct wscons_event events[2];
int nevents = 0;
evar = sc->sc_base.me_evp;
if (evar == NULL)
return;
if (sc->sc_reverse_scroll) {
x = -x;
y = -y;
}
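/*
 * Scale the deltas so that one configured scroll distance becomes
 * 4096, presumably the precision-scroll unit expected by consumers of
 * WSCONS_EVENT_{H,V}SCROLL.
 */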
x = (x * 4096) / sc->sc_horiz_scroll_dist;
y = (y * 4096) / sc->sc_vert_scroll_dist;
if (x != 0) {
events[nevents].type = WSCONS_EVENT_HSCROLL;
events[nevents].value = x;
nevents++;
}
if (y != 0) {
events[nevents].type = WSCONS_EVENT_VSCROLL;
events[nevents].value = y;
nevents++;
}
(void)wsevent_inject(evar, events, nevents);
}
static void
wsmouse_repeat(void *v)
{
int oldspl;
unsigned int newdelay;
struct wsmouse_softc *sc;
struct wscons_event events[2];
oldspl = spltty();
sc = (struct wsmouse_softc *)v;
if (sc->sc_repeat_button == -1) {
/* Race condition: a "button up" event came in when
* this function was already called but did not do
* spltty() yet. */
splx(oldspl);
return;
}
KASSERT(sc->sc_repeat_button >= 0);
KASSERT(sc->sc_repeat.wr_buttons & (1 << sc->sc_repeat_button));
newdelay = sc->sc_repeat_delay;
events[0].type = WSCONS_EVENT_MOUSE_UP;
events[0].value = sc->sc_repeat_button;
events[1].type = WSCONS_EVENT_MOUSE_DOWN;
events[1].value = sc->sc_repeat_button;
if (wsevent_inject(sc->sc_base.me_evp, events, 2) == 0) {
sc->sc_ub = 1 << sc->sc_repeat_button;
if (newdelay - sc->sc_repeat.wr_delay_decrement <
sc->sc_repeat.wr_delay_minimum)
newdelay = sc->sc_repeat.wr_delay_minimum;
else if (newdelay > sc->sc_repeat.wr_delay_minimum)
newdelay -= sc->sc_repeat.wr_delay_decrement;
KASSERT(newdelay >= sc->sc_repeat.wr_delay_minimum);
KASSERT(newdelay <= sc->sc_repeat.wr_delay_first);
}
/*
* Reprogram the repeating event.
*/
sc->sc_repeat_delay = newdelay;
callout_schedule(&sc->sc_repeat_callout, mstohz(newdelay));
splx(oldspl);
}
static int
wsmouse_set_params(struct wsmouse_softc *sc,
struct wsmouse_param *buf, size_t nparams)
{
size_t i = 0;
for (i = 0; i < nparams; ++i) {
switch (buf[i].key) {
case WSMOUSECFG_REVERSE_SCROLLING:
sc->sc_reverse_scroll = (buf[i].value != 0);
break;
case WSMOUSECFG_HORIZSCROLLDIST:
sc->sc_horiz_scroll_dist = buf[i].value;
break;
case WSMOUSECFG_VERTSCROLLDIST:
sc->sc_vert_scroll_dist = buf[i].value;
break;
}
}
return 0;
}
static int
wsmouse_get_params(struct wsmouse_softc *sc,
struct wsmouse_param *buf, size_t nparams)
{
size_t i = 0;
for (i = 0; i < nparams; ++i) {
switch (buf[i].key) {
case WSMOUSECFG_REVERSE_SCROLLING:
buf[i].value = sc->sc_reverse_scroll;
break;
case WSMOUSECFG_HORIZSCROLLDIST:
buf[i].value = sc->sc_horiz_scroll_dist;
break;
case WSMOUSECFG_VERTSCROLLDIST:
buf[i].value = sc->sc_vert_scroll_dist;
break;
}
}
return 0;
}
static int
wsmouse_handle_params(struct wsmouse_softc *sc, struct wsmouse_parameters *upl,
bool set)
{
size_t len;
struct wsmouse_param *buf;
int error = 0;
if (upl->params == NULL || upl->nparams > WSMOUSECFG_MAX)
return EINVAL;
if (upl->nparams == 0)
return 0;
len = upl->nparams * sizeof(struct wsmouse_param);
buf = kmem_alloc(len, KM_SLEEP);
if (buf == NULL)
return ENOMEM;
if ((error = copyin(upl->params, buf, len)) != 0)
goto error;
if (set) {
error = wsmouse_set_params(sc, buf, upl->nparams);
if (error != 0)
goto error;
} else {
error = wsmouse_get_params(sc, buf, upl->nparams);
if (error != 0)
goto error;
if ((error = copyout(buf, upl->params, len)) != 0)
goto error;
}
error:
kmem_free(buf, len);
return error;
}
int
wsmouseopen(dev_t dev, int flags, int mode, struct lwp *l)
{
struct wsmouse_softc *sc;
struct wseventvar *evar;
int error;
sc = device_lookup_private(&wsmouse_cd, minor(dev));
if (sc == NULL)
return ENXIO;
#if NWSMUX > 0
DPRINTF(("wsmouseopen: %s mux=%p p=%p\n", device_xname(sc->sc_base.me_dv),
sc->sc_base.me_parent, l));
#endif
if (sc->sc_dying)
return (EIO);
if ((flags & (FREAD | FWRITE)) == FWRITE)
return (0); /* always allow open for write
so ioctl() is possible. */
if (sc->sc_base.me_evp != NULL)
return (EBUSY);
evar = &sc->sc_base.me_evar;
wsevent_init(evar, l->l_proc);
sc->sc_base.me_evp = evar;
error = wsmousedoopen(sc, evar);
if (error) {
DPRINTF(("wsmouseopen: %s open failed\n",
device_xname(sc->sc_base.me_dv)));
sc->sc_base.me_evp = NULL;
wsevent_fini(evar);
}
return (error);
}
int
wsmouseclose(dev_t dev, int flags, int mode,
struct lwp *l)
{
struct wsmouse_softc *sc =
device_lookup_private(&wsmouse_cd, minor(dev));
struct wseventvar *evar = sc->sc_base.me_evp;
if (evar == NULL)
/* not open for read */
return (0);
sc->sc_base.me_evp = NULL;
(*sc->sc_accessops->disable)(sc->sc_accesscookie);
wsevent_fini(evar);
return (0);
}
int
wsmousedoopen(struct wsmouse_softc *sc, struct wseventvar *evp)
{
sc->sc_base.me_evp = evp;
sc->sc_x = INVALID_X;
sc->sc_y = INVALID_Y;
sc->sc_z = INVALID_Z;
sc->sc_w = INVALID_W;
/* Stop button repeating when messing with the device. */
if (sc->sc_repeat_button != -1) {
KASSERT(sc->sc_repeat_button >= 0);
sc->sc_repeat_button = -1;
callout_stop(&sc->sc_repeat_callout);
}
/* enable the device, and punt if that's not possible */
return (*sc->sc_accessops->enable)(sc->sc_accesscookie);
}
int
wsmouseread(dev_t dev, struct uio *uio, int flags)
{
struct wsmouse_softc *sc =
device_lookup_private(&wsmouse_cd, minor(dev));
int error;
if (sc->sc_dying)
return (EIO);
#ifdef DIAGNOSTIC
if (sc->sc_base.me_evp == NULL) {
printf("wsmouseread: evp == NULL\n");
return (EINVAL);
}
#endif
sc->sc_refcnt++;
error = wsevent_read(sc->sc_base.me_evp, uio, flags);
if (--sc->sc_refcnt < 0) {
wakeup(sc);
error = EIO;
}
return (error);
}
int
wsmouseioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
return (wsmousedoioctl(device_lookup(&wsmouse_cd, minor(dev)),
cmd, data, flag, l));
}
/* A wrapper around the ioctl() workhorse to make reference counting easy. */
int
wsmousedoioctl(device_t dv, u_long cmd, void *data, int flag,
struct lwp *l)
{
struct wsmouse_softc *sc = device_private(dv);
int error;
sc->sc_refcnt++;
error = wsmouse_do_ioctl(sc, cmd, data, flag, l);
if (--sc->sc_refcnt < 0)
wakeup(sc);
return (error);
}
int
wsmouse_do_ioctl(struct wsmouse_softc *sc, u_long cmd, void *data,
int flag, struct lwp *l)
{
int error;
struct wsmouse_repeat *wr;
if (sc->sc_dying)
return (EIO);
/*
* Try the generic ioctls that the wsmouse interface supports.
*/
switch (cmd) {
case FIONBIO: /* we will remove this someday (soon???) */
return (0);
case FIOASYNC:
if (sc->sc_base.me_evp == NULL)
return (EINVAL);
sc->sc_base.me_evp->async = *(int *)data != 0;
return (0);
case FIOSETOWN:
if (sc->sc_base.me_evp == NULL)
return (EINVAL);
if (-*(int *)data != sc->sc_base.me_evp->io->p_pgid &&
*(int *)data != sc->sc_base.me_evp->io->p_pid)
return (EPERM);
return (0);
case TIOCSPGRP:
if (sc->sc_base.me_evp == NULL)
return (EINVAL);
if (*(int *)data != sc->sc_base.me_evp->io->p_pgid)
return (EPERM);
return (0);
}
/*
* Try the wsmouse specific ioctls.
*/
switch (cmd) {
case WSMOUSEIO_GETREPEAT:
wr = (struct wsmouse_repeat *)data;
memcpy(wr, &sc->sc_repeat, sizeof(sc->sc_repeat));
return 0;
case WSMOUSEIO_SETREPEAT:
if ((flag & FWRITE) == 0)
return EACCES;
/* Validate input data. */
wr = (struct wsmouse_repeat *)data;
if (wr->wr_delay_first != 0 &&
(wr->wr_delay_first < wr->wr_delay_decrement ||
wr->wr_delay_first < wr->wr_delay_minimum ||
wr->wr_delay_first < wr->wr_delay_minimum +
wr->wr_delay_decrement))
return EINVAL;
/* Stop current repeating and set new data. */
sc->sc_repeat_button = -1;
callout_stop(&sc->sc_repeat_callout);
memcpy(&sc->sc_repeat, wr, sizeof(sc->sc_repeat));
return 0;
case WSMOUSEIO_SETVERSION:
return wsevent_setversion(sc->sc_base.me_evp, *(int *)data);
case WSMOUSEIO_GETPARAMS:
return wsmouse_handle_params(sc,
(struct wsmouse_parameters *)data, false);
case WSMOUSEIO_SETPARAMS:
if ((flag & FWRITE) == 0)
return EACCES;
return wsmouse_handle_params(sc,
(struct wsmouse_parameters *)data, true);
}
/*
* Try the mouse driver for WSMOUSEIO ioctls. It returns -1
* if it didn't recognize the request.
*/
error = (*sc->sc_accessops->ioctl)(sc->sc_accesscookie, cmd,
data, flag, l);
return (error); /* may be EPASSTHROUGH */
}
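/*
 * Illustrative userland sketch (not part of this file) of driving the
 * repeat ioctls handled above; the device allows a write-only open
 * precisely so that ioctl() is possible:
 *
 *	struct wsmouse_repeat wr;
 *	int fd = open("/dev/wsmouse0", O_WRONLY);
 *
 *	if (fd == -1 || ioctl(fd, WSMOUSEIO_GETREPEAT, &wr) == -1)
 *		err(1, "WSMOUSEIO_GETREPEAT");
 *	wr.wr_buttons = 1;		(repeat button 0 only)
 *	wr.wr_delay_first = 400;	(milliseconds)
 *	wr.wr_delay_decrement = 25;
 *	wr.wr_delay_minimum = 50;
 *	if (ioctl(fd, WSMOUSEIO_SETREPEAT, &wr) == -1)
 *		err(1, "WSMOUSEIO_SETREPEAT");
 */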
int
wsmousepoll(dev_t dev, int events, struct lwp *l)
{
struct wsmouse_softc *sc =
device_lookup_private(&wsmouse_cd, minor(dev));
if (sc->sc_base.me_evp == NULL)
return (POLLERR);
return (wsevent_poll(sc->sc_base.me_evp, events, l));
}
int
wsmousekqfilter(dev_t dev, struct knote *kn)
{
struct wsmouse_softc *sc =
device_lookup_private(&wsmouse_cd, minor(dev));
if (sc->sc_base.me_evp == NULL)
return (1);
return (wsevent_kqfilter(sc->sc_base.me_evp, kn));
}
#if NWSMUX > 0
int
wsmouse_mux_open(struct wsevsrc *me, struct wseventvar *evp)
{
struct wsmouse_softc *sc = (struct wsmouse_softc *)me;
if (sc->sc_base.me_evp != NULL)
return (EBUSY);
return wsmousedoopen(sc, evp);
}
int
wsmouse_mux_close(struct wsevsrc *me)
{
struct wsmouse_softc *sc = (struct wsmouse_softc *)me;
sc->sc_base.me_evp = NULL;
(*sc->sc_accessops->disable)(sc->sc_accesscookie);
return (0);
}
int
wsmouse_add_mux(int unit, struct wsmux_softc *muxsc)
{
struct wsmouse_softc *sc;
sc = device_lookup_private(&wsmouse_cd, unit);
if (sc == NULL)
return ENXIO;
if (sc->sc_base.me_parent != NULL || sc->sc_base.me_evp != NULL)
return (EBUSY);
return (wsmux_attach_sc(muxsc, &sc->sc_base));
}
#endif /* NWSMUX > 0 */
/* $NetBSD: random.c,v 1.10 2021/12/28 13:22:43 riastradh Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* /dev/random, /dev/urandom -- stateless version
*
* For short reads from /dev/urandom, up to 256 bytes, read from a
* per-CPU NIST Hash_DRBG instance that is reseeded as soon as the
* system has enough entropy.
*
* For all other reads, instantiate a fresh NIST Hash_DRBG from
* the global entropy pool, and draw from it.
*
* Each read is independent; there is no per-open state.
* Concurrent reads from the same open run in parallel.
*
* Reading from /dev/random may block until entropy is available.
* Either device may return short reads if interrupted.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: random.c,v 1.10 2021/12/28 13:22:43 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/conf.h>
#include <sys/cprng.h>
#include <sys/entropy.h>
#include <sys/errno.h>
#include <sys/event.h>
#include <sys/fcntl.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/poll.h>
#include <sys/random.h>
#include <sys/rnd.h>
#include <sys/rndsource.h>
#include <sys/signalvar.h>
#include <sys/systm.h>
#include <sys/vnode.h> /* IO_NDELAY */
#include "ioconf.h"
static dev_type_open(random_open);
static dev_type_close(random_close);
static dev_type_ioctl(random_ioctl);
static dev_type_poll(random_poll);
static dev_type_kqfilter(random_kqfilter);
static dev_type_read(random_read);
static dev_type_write(random_write);
const struct cdevsw rnd_cdevsw = {
.d_open = random_open,
.d_close = random_close,
.d_read = random_read,
.d_write = random_write,
.d_ioctl = random_ioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = random_poll,
.d_mmap = nommap,
.d_kqfilter = random_kqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER|D_MPSAFE,
};
#define RANDOM_BUFSIZE 512 /* XXX pulled from arse */
/* Entropy source for writes to /dev/random and /dev/urandom */
static krndsource_t user_rndsource;
void
rndattach(int num)
{
rnd_attach_source(&user_rndsource, "/dev/random", RND_TYPE_UNKNOWN,
RND_FLAG_COLLECT_VALUE);
}
static int
random_open(dev_t dev, int flags, int fmt, struct lwp *l)
{
/* Validate minor. */
switch (minor(dev)) {
case RND_DEV_RANDOM:
case RND_DEV_URANDOM:
break;
default:
return ENXIO;
}
return 0;
}
static int
random_close(dev_t dev, int flags, int fmt, struct lwp *l)
{
/* Success! */
return 0;
}
static int
random_ioctl(dev_t dev, unsigned long cmd, void *data, int flag, struct lwp *l)
{
/*
* No non-blocking/async options; otherwise defer to
* entropy_ioctl.
*/
switch (cmd) {
case FIONBIO:
case FIOASYNC:
return 0;
default:
return entropy_ioctl(cmd, data);
}
}
static int
random_poll(dev_t dev, int events, struct lwp *l)
{
/* /dev/random may block; /dev/urandom is always ready. */
switch (minor(dev)) {
case RND_DEV_RANDOM:
return entropy_poll(events);
case RND_DEV_URANDOM:
return events & (POLLIN|POLLRDNORM | POLLOUT|POLLWRNORM);
default:
return 0;
}
}
static int
random_kqfilter(dev_t dev, struct knote *kn)
{
/* Validate the event filter. */
switch (kn->kn_filter) {
case EVFILT_READ:
case EVFILT_WRITE:
break;
default:
return EINVAL;
}
/* /dev/random may block; /dev/urandom never does. */
switch (minor(dev)) {
case RND_DEV_RANDOM:
if (kn->kn_filter == EVFILT_READ)
return entropy_kqfilter(kn);
/* FALLTHROUGH */
case RND_DEV_URANDOM:
kn->kn_fop = &seltrue_filtops;
return 0;
default:
return ENXIO;
}
}
/*
* random_read(dev, uio, flags)
*
* Generate data from a PRNG seeded from the entropy pool.
*
* - If /dev/random, block until we have full entropy, or fail
* with EWOULDBLOCK, and if `depleting' entropy, return at most
* the entropy pool's capacity at once.
*
* - If /dev/urandom, generate data from whatever is in the
* entropy pool now.
*
* On interrupt, return a short read, but not shorter than 256
* bytes (actually, no shorter than RANDOM_BUFSIZE bytes, which is
* 512 for hysterical raisins).
*/
static int
random_read(dev_t dev, struct uio *uio, int flags)
{
int gflags;
/* Set the appropriate GRND_* mode. */
switch (minor(dev)) {
case RND_DEV_RANDOM:
gflags = GRND_RANDOM;
break;
case RND_DEV_URANDOM:
gflags = GRND_INSECURE;
break;
default:
return ENXIO;
}
/*
* Set GRND_NONBLOCK if the user requested IO_NDELAY (i.e., the
* file was opened with O_NONBLOCK).
*/
if (flags & IO_NDELAY)
gflags |= GRND_NONBLOCK;
/* Defer to getrandom. */
return dogetrandom(uio, gflags);
}
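/*
 * In other words (a paraphrase, not an additional code path): reading
 * /dev/urandom behaves like getrandom(buf, len, GRND_INSECURE), and a
 * non-blocking read of /dev/random behaves like
 * getrandom(buf, len, GRND_RANDOM|GRND_NONBLOCK).
 */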
/*
* random_write(dev, uio, flags)
*
* Enter data from uio into the entropy pool.
*
* Assume privileged users provide full entropy, and unprivileged
* users provide no entropy. If you have a nonuniform source of
* data with n bytes of min-entropy, hash it with an XOF like
* SHAKE128 into exactly n bytes first.
*/
static int
random_write(dev_t dev, struct uio *uio, int flags)
{
kauth_cred_t cred = kauth_cred_get();
uint8_t *buf;
bool privileged = false, any = false;
int error = 0;
/* Verify user's authorization to affect the entropy pool. */
error = kauth_authorize_device(cred, KAUTH_DEVICE_RND_ADDDATA,
NULL, NULL, NULL, NULL);
if (error)
return error;
/*
* Check whether user is privileged. If so, assume user
* furnishes full-entropy data; if not, accept user's data but
* assume it has zero entropy when we do accounting. If you
* want to specify less entropy, use ioctl(RNDADDDATA).
*/
if (kauth_authorize_device(cred, KAUTH_DEVICE_RND_ADDDATA_ESTIMATE,
NULL, NULL, NULL, NULL) == 0)
privileged = true;
/* Get a buffer for transfers. */
buf = kmem_alloc(RANDOM_BUFSIZE, KM_SLEEP);
/* Consume data. */
while (uio->uio_resid) {
size_t n = MIN(uio->uio_resid, RANDOM_BUFSIZE);
/* Transfer n bytes in and enter them into the pool. */
error = uiomove(buf, n, uio);
if (error)
break;
rnd_add_data(&user_rndsource, buf, n, privileged ? n*NBBY : 0);
any = true;
/* Now's a good time to yield if needed. */
preempt_point();
/* Check for interruption. */
if (__predict_false(curlwp->l_flag & LW_PENDSIG) &&
sigispending(curlwp, 0)) {
error = EINTR;
break;
}
}
/* Zero the buffer and free it. */
explicit_memset(buf, 0, RANDOM_BUFSIZE);
kmem_free(buf, RANDOM_BUFSIZE);
/* If we added anything, consolidate entropy now. */
if (any)
entropy_consolidate();
return error;
}
/* $NetBSD: bufq_impl.h,v 1.10 2016/11/16 00:46:46 pgoyette Exp $ */
/* NetBSD: bufq.h,v 1.3 2005/03/31 11:28:53 yamt Exp */
/* NetBSD: buf.h,v 1.75 2004/09/18 16:40:11 yamt Exp */
/*-
* Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)buf.h 8.9 (Berkeley) 3/30/95
*/
#if !defined(_KERNEL)
#error not supposed to be exposed to userland.
#endif
struct bufq_strat;
/*
* Device driver buffer queue.
*/
struct bufq_state {
void (*bq_put)(struct bufq_state *, struct buf *);
struct buf *(*bq_get)(struct bufq_state *, int);
struct buf *(*bq_cancel)(struct bufq_state *, struct buf *);
void (*bq_fini)(struct bufq_state *);
void *bq_private;
int bq_flags; /* Flags from bufq_alloc() */
struct bufq_strat *bq_strat;
};
static __inline void *bufq_private(const struct bufq_state *) __unused;
static __inline bool buf_inorder(const struct buf *, const struct buf *, int)
__unused;
#include <sys/null.h> /* for NULL */
static __inline void *
bufq_private(const struct bufq_state *bufq)
{
return bufq->bq_private;
}
/*
* Check whether two bufs are in ascending order.
*
* This function considers a NULL buf to come after any non-NULL buf.
*
* It returns false if the two are the "same".
*/
static __inline bool
buf_inorder(const struct buf *bp, const struct buf *bq, int sortby)
{
KASSERT(bp != NULL || bq != NULL);
if (bp == NULL || bq == NULL)
return (bq == NULL);
if (sortby == BUFQ_SORT_CYLINDER) {
if (bp->b_cylinder != bq->b_cylinder)
return bp->b_cylinder < bq->b_cylinder;
else
return bp->b_rawblkno < bq->b_rawblkno;
} else
return bp->b_rawblkno < bq->b_rawblkno;
}
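/*
 * Illustrative sketch (the queue names are assumptions, not from this
 * file): a sorting strategy's bq_put typically walks its queue and
 * inserts the new buffer in front of the first element that sorts
 * after it, e.g.:
 *
 *	TAILQ_FOREACH(bq, &q->q_head, b_actq)
 *		if (buf_inorder(bp, bq, sortby))
 *			break;
 *	if (bq != NULL)
 *		TAILQ_INSERT_BEFORE(bq, bp, b_actq);
 *	else
 *		TAILQ_INSERT_TAIL(&q->q_head, bp, b_actq);
 */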
struct bufq_strat {
const char *bs_name;
void (*bs_initfn)(struct bufq_state *);
int bs_prio;
int bs_refcnt;
SLIST_ENTRY(bufq_strat) bs_next;
};
#define BUFQ_DEFINE(name, prio, initfn) \
static struct bufq_strat bufq_strat_##name = { \
.bs_name = #name, \
.bs_prio = prio, \
.bs_initfn = initfn, \
.bs_refcnt = 0 \
};
int bufq_register(struct bufq_strat *);
int bufq_unregister(struct bufq_strat *);
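/*
 * Illustrative sketch (the "mystrat" name is hypothetical): a strategy
 * declares itself with BUFQ_DEFINE and registers the resulting
 * bufq_strat_<name> structure when it is loaded, roughly:
 *
 *	static void bufq_mystrat_init(struct bufq_state *);
 *	BUFQ_DEFINE(mystrat, 20, bufq_mystrat_init);
 *	...
 *	error = bufq_register(&bufq_strat_mystrat);
 */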
/* $NetBSD: in6_print.c,v 1.1 2014/12/02 19:36:58 christos Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#include <sys/types.h>
#ifdef _KERNEL
__KERNEL_RCSID(0, "$NetBSD: in6_print.c,v 1.1 2014/12/02 19:36:58 christos Exp $");
#include <sys/systm.h>
#else
__RCSID("$NetBSD: in6_print.c,v 1.1 2014/12/02 19:36:58 christos Exp $");
#include <stdio.h>
#define s6_addr32 __u6_addr.__u6_addr32
static const uint8_t hexdigits[] = "0123456789abcdef";
#endif
#include <netinet/in.h>
int
in6_print(char *buf, size_t len, const struct in6_addr *ia6)
{
int i;
char *bp;
char *cp, *ecp;
const uint16_t *a;
const uint8_t *d;
int dcolon = 0;
if (IN6_IS_ADDR_V4MAPPED(ia6)) {
char buf4[INET_ADDRSTRLEN];
struct in_addr ia = { .s_addr = ia6->s6_addr32[3] };
in_print(buf4, sizeof(buf4), &ia);
return snprintf(buf, len, "::ffff:%s", buf4);
}
#define ADDC(c) do { \
if (cp >= ecp) {\
cp++; \
} else \
*cp++ = (char)(c); \
} while (/*CONSTCOND*/0)
#define ADDX(v) do { \
uint8_t n = hexdigits[(v)]; \
ADDC(n); \
if (cp == bp && n == '0') \
cp--; \
} while (/*CONSTCOND*/0)
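/*
 * ADDC appends a character only while there is room, but always
 * advances cp, so the final cp - buf is the length the untruncated
 * string would need (snprintf-style). ADDX appends one hex digit and
 * backs it out again while the current 16-bit group has produced only
 * zeros (cp has not moved past bp), which strips leading zeros from
 * each group; all-zero groups never reach ADDX because they are
 * handled by the *a == 0 branch in the loop below.
 */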
cp = buf;
ecp = buf + len;
a = (const uint16_t *)ia6;
for (i = 0; i < 8; i++) {
if (dcolon == 1) {
if (*a == 0) {
if (i == 7)
ADDC(':');
a++;
continue;
} else
dcolon = 2;
}
if (*a == 0) {
if (dcolon == 0 && *(a + 1) == 0) {
if (i == 0)
ADDC(':');
ADDC(':');
dcolon = 1;
} else {
ADDC('0');
ADDC(':');
}
a++;
continue;
}
d = (const u_char *)a;
bp = cp + 1;
ADDX((u_int)*d >> 4);
ADDX(*d & 0xf);
d++;
ADDX((u_int)*d >> 4);
ADDX(*d & 0xf);
ADDC(':');
a++;
}
if (cp > buf)
--cp;
if (ecp > buf) {
if (cp < ecp)
*cp = '\0';
else
*--ecp = '\0';
}
return (int)(cp - buf);
}
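/*
 * Illustrative sketch (not part of this file):
 *
 *	struct in6_addr ia6 = in6addr_loopback;
 *	char abuf[INET6_ADDRSTRLEN];
 *
 *	in6_print(abuf, sizeof(abuf), &ia6);
 *
 * Like snprintf, the return value is the length the full string
 * required, so truncation can be detected by comparing it against the
 * buffer size.
 */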
int
sin6_print(char *buf, size_t len, const void *v)
{
const struct sockaddr_in6 *sin6 = v;
const struct in6_addr *ia6 = &sin6->sin6_addr;
char abuf[INET6_ADDRSTRLEN];
if (!sin6->sin6_port)
return in6_print(buf, len, ia6);
in6_print(abuf, sizeof(abuf), ia6);
return snprintf(buf, len, "[%s]:%hu", abuf, ntohs(sin6->sin6_port));
}
/* $NetBSD: kern_stub.c,v 1.50 2020/08/01 02:04:55 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_xxx.c 8.3 (Berkeley) 3/29/95
*/
/*
* Stubs for system calls and facilities not included in the system.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_stub.c,v 1.50 2020/08/01 02:04:55 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ktrace.h"
#include "opt_sysv.h"
#include "opt_modular.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fstypes.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/ktrace.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/module.h>
#include <sys/bus.h>
#include <sys/userconf.h>
bool default_bus_space_is_equal(bus_space_tag_t, bus_space_tag_t);
bool default_bus_space_handle_is_equal(bus_space_tag_t, bus_space_handle_t,
bus_space_handle_t);
/*
* SYSV Semaphores, Shared Memory, Message Queues
*/
#ifndef MODULAR
#ifndef SYSVMSG
__strong_alias(msgctl1,enosys);
#endif
#ifndef SYSVSHM
__strong_alias(shmctl1,enosys);
#endif
#ifndef SYSVSEM
__strong_alias(semctl1,enosys);
#endif
#endif
/*
* ktrace stubs. ktruser() goes to enosys as we want to fail the syscall,
* but not kill the process: utrace() is a debugging feature.
*/
#ifndef KTRACE
__strong_alias(ktr_csw,nullop); /* Probes */
__strong_alias(ktr_emul,nullop);
__strong_alias(ktr_geniov,nullop);
__strong_alias(ktr_genio,nullop);
__strong_alias(ktr_mibio,nullop);
__strong_alias(ktr_namei,nullop);
__strong_alias(ktr_namei2,nullop);
__strong_alias(ktr_psig,nullop);
__strong_alias(ktr_syscall,nullop);
__strong_alias(ktr_sysret,nullop);
__strong_alias(ktr_kuser,nullop);
__strong_alias(ktr_mib,nullop);
__strong_alias(ktr_execarg,nullop);
__strong_alias(ktr_execenv,nullop);
__strong_alias(ktr_execfd,nullop);
__strong_alias(sys_fktrace,sys_nosys); /* Syscalls */
__strong_alias(sys_ktrace,sys_nosys);
__strong_alias(sys_utrace,sys_nosys);
int ktrace_on; /* Misc */
__strong_alias(ktruser,enosys);
__strong_alias(ktr_point,nullop);
#endif /* KTRACE */
__weak_alias(device_register, voidop);
__weak_alias(device_register_post_config, voidop);
__weak_alias(spldebug_start, voidop);
__weak_alias(spldebug_stop, voidop);
__weak_alias(machdep_init,nullop);
__weak_alias(pci_chipset_tag_create, eopnotsupp);
__weak_alias(pci_chipset_tag_destroy, voidop);
__weak_alias(bus_space_reserve, eopnotsupp);
__weak_alias(bus_space_reserve_subregion, eopnotsupp);
__weak_alias(bus_space_release, voidop);
__weak_alias(bus_space_reservation_map, eopnotsupp);
__weak_alias(bus_space_reservation_unmap, voidop);
__weak_alias(bus_dma_tag_create, eopnotsupp);
__weak_alias(bus_dma_tag_destroy, voidop);
__weak_alias(bus_space_tag_create, eopnotsupp);
__weak_alias(bus_space_tag_destroy, voidop);
__strict_weak_alias(bus_space_is_equal, default_bus_space_is_equal);
__strict_weak_alias(bus_space_handle_is_equal,
default_bus_space_handle_is_equal);
__weak_alias(userconf_bootinfo, voidop);
__weak_alias(userconf_init, voidop);
__weak_alias(userconf_prompt, voidop);
__weak_alias(kobj_renamespace, nullop);
__weak_alias(interrupt_get_count, nullop);
__weak_alias(interrupt_get_assigned, voidop);
__weak_alias(interrupt_get_available, voidop);
__weak_alias(interrupt_get_devname, voidop);
__weak_alias(interrupt_construct_intrids, nullret);
__weak_alias(interrupt_destruct_intrids, voidop);
__weak_alias(interrupt_distribute, eopnotsupp);
__weak_alias(interrupt_distribute_handler, eopnotsupp);
/*
* Scheduler activations system calls. These need to remain until libc's
* major version is bumped.
*/
__strong_alias(sys_sa_register,sys_nosys);
__strong_alias(sys_sa_stacks,sys_nosys);
__strong_alias(sys_sa_enable,sys_nosys);
__strong_alias(sys_sa_setconcurrency,sys_nosys);
__strong_alias(sys_sa_yield,sys_nosys);
__strong_alias(sys_sa_preempt,sys_nosys);
__strong_alias(sys_sa_unblockyield,sys_nosys);
/*
* Stubs for compat_netbsd32.
*/
__strong_alias(dosa_register,sys_nosys);
__strong_alias(sa_stacks1,sys_nosys);
/*
* Stubs for drivers. See sys/conf.h.
*/
__strong_alias(devenodev,enodev);
__strong_alias(deveopnotsupp,eopnotsupp);
__strong_alias(devnullop,nullop);
__strong_alias(ttyenodev,enodev);
__strong_alias(ttyvenodev,voidop);
__strong_alias(ttyvnullop,nullop);
/*
* Stubs for architectures that do not support kernel preemption.
*/
#ifndef __HAVE_PREEMPTION
bool
cpu_kpreempt_enter(uintptr_t where, int s)
{
return false;
}
void
cpu_kpreempt_exit(uintptr_t where)
{
}
bool
cpu_kpreempt_disabled(void)
{
return true;
}
#else
# ifndef MULTIPROCESSOR
# error __HAVE_PREEMPTION requires MULTIPROCESSOR
# endif
#endif /* !__HAVE_PREEMPTION */
int
sys_nosys(struct lwp *l, const void *v, register_t *retval)
{
mutex_enter(&proc_lock);
psignal(l->l_proc, SIGSYS);
mutex_exit(&proc_lock);
return ENOSYS;
}
/*
* Unsupported device function (e.g. writing to read-only device).
*/
int
enodev(void)
{
return (ENODEV);
}
/*
* Unconfigured device function; driver not configured.
*/
int
enxio(void)
{
return (ENXIO);
}
/*
* Unsupported ioctl function.
*/
int
enoioctl(void)
{
return (ENOTTY);
}
/*
* Unsupported system function.
* This is used for an otherwise-reasonable operation
* that is not supported by the current system binary.
*/
int
enosys(void)
{
return (ENOSYS);
}
/*
* Return error for operation not supported
* on a specific object or file type.
*/
int
eopnotsupp(void)
{
return (EOPNOTSUPP);
}
/*
* Generic null operation, void return value.
*/
void
voidop(void)
{
}
/*
* Generic null operation, always returns success.
*/
int
nullop(void *v)
{
return (0);
}
/*
* Generic null operation, always returns null.
*/
void *
nullret(void)
{
return (NULL);
}
bool
default_bus_space_handle_is_equal(bus_space_tag_t t,
bus_space_handle_t h1, bus_space_handle_t h2)
{
return memcmp(&h1, &h2, sizeof(h1)) == 0;
}
bool
default_bus_space_is_equal(bus_space_tag_t t1, bus_space_tag_t t2)
{
return memcmp(&t1, &t2, sizeof(t1)) == 0;
}
/* Stubs for architectures with no kernel FPU access. */
__weak_alias(kthread_fpu_enter_md, voidop);
__weak_alias(kthread_fpu_exit_md, voidop);
/* $NetBSD: bus_private.h,v 1.16 2022/01/22 15:10:32 skrll Exp $ */
/* NetBSD: bus.h,v 1.8 2005/03/09 19:04:46 matt Exp */
/*-
* Copyright (c) 1996, 1997, 1998, 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 Charles M. Hannum. All rights reserved.
* Copyright (c) 1996 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(_X86_BUS_PRIVATE_H_)
#define _X86_BUS_PRIVATE_H_
/*
 * Cookie used for bounce buffers. A pointer to one of these is stashed in
* the DMA map.
*/
struct x86_bus_dma_cookie {
int id_flags; /* flags; see below */
/*
* Information about the original buffer used during
 * DMA map syncs. Note that id_origbuflen is only used
 * for X86_DMA_BUFTYPE_LINEAR buffers.
*/
void *id_origbuf; /* pointer to orig buffer if
bouncing */
bus_size_t id_origbuflen; /* ...and size */
int id_buftype; /* type of buffer */
void *id_bouncebuf; /* pointer to the bounce buffer */
bus_size_t id_bouncebuflen; /* ...and size */
int id_nbouncesegs; /* number of valid bounce segs */
bus_dma_segment_t id_bouncesegs[0]; /* array of bounce buffer
physical memory segments */
};
/* id_flags */
#define X86_DMA_MIGHT_NEED_BOUNCE 0x01 /* may need bounce buffers */
#define X86_DMA_HAS_BOUNCE 0x02 /* has bounce buffers */
#define X86_DMA_IS_BOUNCING 0x04 /* is bouncing current xfer */
/* id_buftype */
#define X86_DMA_BUFTYPE_INVALID 0
#define X86_DMA_BUFTYPE_LINEAR 1
#define X86_DMA_BUFTYPE_MBUF 2
#define X86_DMA_BUFTYPE_UIO 3
#define X86_DMA_BUFTYPE_RAW 4
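/*
 * Illustrative sketch (not part of the original source): how a bus_dma
 * back-end might consult this cookie before syncing a write.  The
 * function name and the PREWRITE-style copy direction are assumptions
 * made only for illustration.
 */
#if 0
static void
example_bounce_prewrite(struct x86_bus_dma_cookie *cookie)
{

	/* Nothing to do unless this transfer is actually bouncing. */
	if ((cookie->id_flags & X86_DMA_IS_BOUNCING) == 0)
		return;
	/* id_origbuflen is only meaningful for linear buffers. */
	if (cookie->id_buftype == X86_DMA_BUFTYPE_LINEAR)
		memcpy(cookie->id_bouncebuf, cookie->id_origbuf,
		    cookie->id_origbuflen);
}
#endif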
/*
* default address translation macros, which are appropriate where
* paddr_t == bus_addr_t.
*/
#if !defined(_BUS_PHYS_TO_BUS)
#define _BUS_PHYS_TO_BUS(pa) ((bus_addr_t)(pa))
#endif /* !defined(_BUS_PHYS_TO_BUS) */
#if !defined(_BUS_BUS_TO_PHYS)
#define _BUS_BUS_TO_PHYS(ba) ((paddr_t)(ba))
#endif /* !defined(_BUS_BUS_TO_PHYS) */
#if !defined(_BUS_VM_PAGE_TO_BUS)
#define _BUS_VM_PAGE_TO_BUS(pg) _BUS_PHYS_TO_BUS(VM_PAGE_TO_PHYS(pg))
#endif /* !defined(_BUS_VM_PAGE_TO_BUS) */
#if !defined(_BUS_BUS_TO_VM_PAGE)
#define _BUS_BUS_TO_VM_PAGE(ba) PHYS_TO_VM_PAGE(ba)
#endif /* !defined(_BUS_BUS_TO_VM_PAGE) */
#if !defined(_BUS_PMAP_ENTER)
#define _BUS_PMAP_ENTER(pmap, va, ba, prot, flags) \
pmap_enter(pmap, va, ba, prot, flags)
#endif /* _BUS_PMAP_ENTER */
#if !defined(_BUS_VIRT_TO_BUS)
#include <uvm/uvm_extern.h>
static __inline bus_addr_t _bus_virt_to_bus(struct pmap *, vaddr_t);
#define _BUS_VIRT_TO_BUS(pm, va) _bus_virt_to_bus((pm), (va))
static __inline bus_addr_t
_bus_virt_to_bus(struct pmap *pm, vaddr_t va)
{
paddr_t pa;
if (!pmap_extract(pm, va, &pa)) {
panic("_bus_virt_to_bus");
}
return _BUS_PHYS_TO_BUS(pa);
}
#endif /* !defined(_BUS_VIRT_TO_BUS) */
/*
* by default, the end address of RAM visible on bus is the same as the
* largest physical address.
*/
#ifndef _BUS_AVAIL_END
#define _BUS_AVAIL_END (avail_end - 1)
#endif
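/*
 * Illustrative sketch (hypothetical, not from the original source): a
 * port whose bus addresses are offset from physical addresses could
 * predefine the translation macros above before this header is pulled
 * in.  MYPORT_BUS_OFFSET is an invented constant used only to show the
 * shape of such an override.
 */
#if 0
#define MYPORT_BUS_OFFSET	0x80000000UL
#define _BUS_PHYS_TO_BUS(pa)	((bus_addr_t)(pa) + MYPORT_BUS_OFFSET)
#define _BUS_BUS_TO_PHYS(ba)	((paddr_t)((ba) - MYPORT_BUS_OFFSET))
#endif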
struct x86_bus_dma_tag {
bus_dma_tag_t bdt_super;
/* bdt_present: bitmap indicating overrides present (1) in *this* tag,
* bdt_exists: bitmap indicating overrides present (1) in *this* tag
* or in an ancestor's tag (follow bdt_super to ancestors)
*/
uint64_t bdt_present;
uint64_t bdt_exists;
const struct bus_dma_overrides *bdt_ov;
void *bdt_ctx;
/*
* The `bounce threshold' is checked while we are loading
* the DMA map. If the physical address of the segment
* exceeds the threshold, an error will be returned. The
* caller can then take whatever action is necessary to
* bounce the transfer. If this value is 0, it will be
* ignored.
*/
int _tag_needs_free;
bus_addr_t _bounce_thresh;
bus_addr_t _bounce_alloc_lo;
bus_addr_t _bounce_alloc_hi;
int (*_may_bounce)(bus_dma_tag_t, bus_dmamap_t, int, int *);
};
#endif /* !defined(_X86_BUS_PRIVATE_H_) */
/* $NetBSD: subr_disk.c,v 1.137 2023/05/09 12:04:04 riastradh Exp $ */
/*-
* Copyright (c) 1996, 1997, 1999, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_disk.c,v 1.137 2023/05/09 12:04:04 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/sysctl.h>
#include <lib/libkern/libkern.h>
/*
* Disk error is the preface to plaintive error messages
* about failing disk transfers. It prints messages of the form
hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
* if the offset of the error in the transfer and a disk label
* are both available. blkdone should be -1 if the position of the error
* is unknown; the disklabel pointer may be null from drivers that have not
* been converted to use them. The message is printed with printf
* if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
* The message should be completed (with at least a newline) with printf
* or addlog, respectively. There is no trailing space.
*/
#ifndef PRIdaddr
#define PRIdaddr PRId64
#endif
void
diskerr(const struct buf *bp, const char *dname, const char *what, int pri,
int blkdone, const struct disklabel *lp)
{
int unit = DISKUNIT(bp->b_dev), part = DISKPART(bp->b_dev);
void (*pr)(const char *, ...) __printflike(1, 2);
char partname = 'a' + part;
daddr_t sn;
if (/*CONSTCOND*/0)
		/* The compiler will report an error here if the format is wrong... */
printf("%" PRIdaddr, bp->b_blkno);
if (pri != LOG_PRINTF) {
static const char fmt[] = "";
log(pri, fmt);
pr = addlog;
} else
pr = printf;
(*pr)("%s%d%c: %s %sing fsbn ", dname, unit, partname, what,
bp->b_flags & B_READ ? "read" : "writ");
sn = bp->b_blkno;
if (bp->b_bcount <= DEV_BSIZE)
(*pr)("%" PRIdaddr, sn);
else {
if (blkdone >= 0) {
sn += blkdone;
(*pr)("%" PRIdaddr " of ", sn);
}
(*pr)("%" PRIdaddr "-%" PRIdaddr "", bp->b_blkno,
bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
}
if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
sn += lp->d_partitions[part].p_offset;
(*pr)(" (%s%d bn %" PRIdaddr "; cn %" PRIdaddr "",
dname, unit, sn, sn / lp->d_secpercyl);
sn %= lp->d_secpercyl;
(*pr)(" tn %" PRIdaddr " sn %" PRIdaddr ")",
sn / lp->d_nsectors, sn % lp->d_nsectors);
}
}
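/*
 * Illustrative call (not from the original source), following the usage
 * described above: "bp", "lp" and the "wd" name are placeholders from a
 * hypothetical driver.  With LOG_PRINTF the message goes to printf() and
 * the caller finishes the line itself.
 */
#if 0
	diskerr(bp, "wd", "hard error", LOG_PRINTF, -1, lp);
	printf("\n");
#endif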
/*
* Searches the iostatlist for the disk corresponding to the
* name provided.
*/
struct disk *
disk_find(const char *name)
{
struct io_stats *stat;
stat = iostat_find(name);
if ((stat != NULL) && (stat->io_type == IOSTAT_DISK))
return stat->io_parent;
return (NULL);
}
void
disk_init(struct disk *diskp, const char *name, const struct dkdriver *driver)
{
u_int blocksize = DEV_BSIZE;
/*
* Initialize the wedge-related locks and other fields.
*/
mutex_init(&diskp->dk_rawlock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&diskp->dk_openlock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&diskp->dk_wedges);
diskp->dk_nwedges = 0;
diskp->dk_labelsector = LABELSECTOR;
diskp->dk_blkshift = DK_BSIZE2BLKSHIFT(blocksize);
diskp->dk_byteshift = DK_BSIZE2BYTESHIFT(blocksize);
diskp->dk_name = name;
diskp->dk_driver = driver;
}
/*
* Rename a disk.
*/
void
disk_rename(struct disk *diskp, const char *name)
{
diskp->dk_name = name;
iostat_rename(diskp->dk_stats, diskp->dk_name);
}
/*
* Attach a disk.
*/
void
disk_attach(struct disk *diskp)
{
/*
* Allocate and initialize the disklabel structures.
*/
diskp->dk_label = kmem_zalloc(sizeof(struct disklabel), KM_SLEEP);
diskp->dk_cpulabel = kmem_zalloc(sizeof(struct cpu_disklabel),
KM_SLEEP);
/*
* Set up the stats collection.
*/
diskp->dk_stats = iostat_alloc(IOSTAT_DISK, diskp, diskp->dk_name);
}
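/*
 * Illustrative attach-time sequence (not from the original source):
 * "sc", "self" and "xd_dkdriver" are hypothetical names from a driver's
 * attach routine.
 */
#if 0
	disk_init(&sc->sc_dk, device_xname(self), &xd_dkdriver);
	disk_attach(&sc->sc_dk);
	/*
	 * ...then probe the media, fill in sc->sc_dk.dk_geom and call
	 * disk_set_info() (below) before discovering wedges.
	 */
#endif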
int
disk_begindetach(struct disk *dk, int (*lastclose)(device_t),
device_t self, int flags)
{
int rc;
rc = 0;
mutex_enter(&dk->dk_openlock);
if (dk->dk_openmask == 0)
; /* nothing to do */
else if ((flags & DETACH_FORCE) == 0)
rc = EBUSY;
else if (lastclose != NULL)
rc = (*lastclose)(self);
mutex_exit(&dk->dk_openlock);
return rc;
}
/*
* Detach a disk.
*/
void
disk_detach(struct disk *diskp)
{
/*
* Remove from the drivelist.
*/
iostat_free(diskp->dk_stats);
/*
* Release the disk-info dictionary.
*/
if (diskp->dk_info) {
prop_object_release(diskp->dk_info);
diskp->dk_info = NULL;
}
/*
* Free the space used by the disklabel structures.
*/
kmem_free(diskp->dk_label, sizeof(*diskp->dk_label));
kmem_free(diskp->dk_cpulabel, sizeof(*diskp->dk_cpulabel));
}
void
disk_destroy(struct disk *diskp)
{
mutex_destroy(&diskp->dk_openlock);
mutex_destroy(&diskp->dk_rawlock);
}
/*
* Mark the disk as having work queued for metrics collection.
*/
void
disk_wait(struct disk *diskp)
{
iostat_wait(diskp->dk_stats);
}
/*
* Mark the disk as busy for metrics collection.
*/
void
disk_busy(struct disk *diskp)
{
iostat_busy(diskp->dk_stats);
}
/*
* Finished disk operations, gather metrics.
*/
void
disk_unbusy(struct disk *diskp, long bcount, int read)
{
iostat_unbusy(diskp->dk_stats, bcount, read);
}
/*
* Return true if disk has an I/O operation in flight.
*/
bool
disk_isbusy(struct disk *diskp)
{
return iostat_isbusy(diskp->dk_stats);
}
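/*
 * Illustrative accounting pattern (not from the original source): a
 * driver brackets each transfer with disk_busy()/disk_unbusy() so the
 * statistics above feed iostat(8).  "sc" is a hypothetical softc.
 */
#if 0
	/* In the start/strategy path, before handing bp to the hardware: */
	disk_busy(&sc->sc_dk);

	/* In the completion path: */
	disk_unbusy(&sc->sc_dk, bp->b_bcount - bp->b_resid,
	    (bp->b_flags & B_READ) != 0);
	biodone(bp);
#endif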
/*
* Bounds checking against the media size, used for the raw partition.
 * secsize, mediasize and b_blkno must all be in the same units.
* Possibly this has to be DEV_BSIZE (512).
*/
int
bounds_check_with_mediasize(struct buf *bp, int secsize, uint64_t mediasize)
{
int64_t sz;
if (bp->b_blkno < 0) {
/* Reject negative offsets immediately. */
bp->b_error = EINVAL;
return 0;
}
sz = howmany((int64_t)bp->b_bcount, secsize);
/*
* bp->b_bcount is a 32-bit value, and we rejected a negative
* bp->b_blkno already, so "bp->b_blkno + sz" cannot overflow.
*/
if (bp->b_blkno + sz > mediasize) {
sz = mediasize - bp->b_blkno;
if (sz == 0) {
/* If exactly at end of disk, return EOF. */
bp->b_resid = bp->b_bcount;
return 0;
}
if (sz < 0) {
/* If past end of disk, return EINVAL. */
bp->b_error = EINVAL;
return 0;
}
/* Otherwise, truncate request. */
bp->b_bcount = sz * secsize;
}
return 1;
}
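/*
 * Illustrative use in a strategy routine (not from the original source):
 * a return of 0 means the request was already completed here (error or
 * EOF recorded in bp), so the caller only has to finish it.
 * "sc->sc_sectors" is a hypothetical media size in DEV_BSIZE units.
 */
#if 0
	if (bounds_check_with_mediasize(bp, DEV_BSIZE, sc->sc_sectors) == 0) {
		biodone(bp);
		return;
	}
	/* Otherwise start or queue the (possibly truncated) transfer. */
#endif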
/*
* Determine the size of the transfer, and make sure it is
* within the boundaries of the partition. Adjust transfer
* if needed, and signal errors or early completion.
*/
int
bounds_check_with_label(struct disk *dk, struct buf *bp, int wlabel)
{
struct disklabel *lp = dk->dk_label;
struct partition *p = lp->d_partitions + DISKPART(bp->b_dev);
uint64_t p_size, p_offset, labelsector;
int64_t sz;
if (bp->b_blkno < 0) {
/* Reject negative offsets immediately. */
bp->b_error = EINVAL;
return -1;
}
/* Protect against division by zero. XXX: Should never happen?!?! */
if ((lp->d_secsize / DEV_BSIZE) == 0 || lp->d_secpercyl == 0) {
bp->b_error = EINVAL;
return -1;
}
p_size = (uint64_t)p->p_size << dk->dk_blkshift;
p_offset = (uint64_t)p->p_offset << dk->dk_blkshift;
#if RAW_PART == 3
labelsector = lp->d_partitions[2].p_offset;
#else
labelsector = lp->d_partitions[RAW_PART].p_offset;
#endif
labelsector = (labelsector + dk->dk_labelsector) << dk->dk_blkshift;
sz = howmany((int64_t)bp->b_bcount, DEV_BSIZE);
/*
* bp->b_bcount is a 32-bit value, and we rejected a negative
* bp->b_blkno already, so "bp->b_blkno + sz" cannot overflow.
*/
if (bp->b_blkno + sz > p_size) {
sz = p_size - bp->b_blkno;
if (sz == 0) {
/* If exactly at end of disk, return EOF. */
bp->b_resid = bp->b_bcount;
return 0;
}
if (sz < 0) {
/* If past end of disk, return EINVAL. */
bp->b_error = EINVAL;
return -1;
}
/* Otherwise, truncate request. */
bp->b_bcount = sz << DEV_BSHIFT;
}
/* Overwriting disk label? */
if (bp->b_blkno + p_offset <= labelsector &&
bp->b_blkno + p_offset + sz > labelsector &&
(bp->b_flags & B_READ) == 0 && !wlabel) {
bp->b_error = EROFS;
return -1;
}
/* calculate cylinder for disksort to order transfers with */
bp->b_cylinder = (bp->b_blkno + p->p_offset) /
(lp->d_secsize / DEV_BSIZE) / lp->d_secpercyl;
return 1;
}
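/*
 * Illustrative use (not from the original source): unlike the mediasize
 * variant, this returns 1 to proceed, 0 for EOF and -1 with bp->b_error
 * already set, so both non-positive results simply end the request.
 * "sc" and "wlabel" are hypothetical.
 */
#if 0
	if (bounds_check_with_label(&sc->sc_dk, bp, wlabel) <= 0) {
		biodone(bp);
		return;
	}
#endif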
int
disk_read_sectors(void (*strat)(struct buf *), const struct disklabel *lp,
struct buf *bp, unsigned int sector, int count)
{
if ((lp->d_secsize / DEV_BSIZE) == 0 || lp->d_secpercyl == 0)
return EINVAL;
bp->b_blkno = btodb((off_t)sector * lp->d_secsize);
bp->b_bcount = count * lp->d_secsize;
bp->b_flags = (bp->b_flags & ~B_WRITE) | B_READ;
bp->b_oflags &= ~BO_DONE;
bp->b_cylinder = sector / lp->d_secpercyl;
(*strat)(bp);
return biowait(bp);
}
const char *
convertdisklabel(struct disklabel *lp, void (*strat)(struct buf *),
struct buf *bp, uint32_t secperunit)
{
struct partition rp, *altp, *p;
int geom_ok;
const char *str;
memset(&rp, 0, sizeof(rp));
rp.p_size = secperunit;
rp.p_fstype = FS_UNUSED;
/* If we can seek to d_secperunit - 1, believe the disk geometry. */
if (secperunit != 0 &&
disk_read_sectors(strat, lp, bp, secperunit - 1, 1) == 0)
geom_ok = 1;
else
geom_ok = 0;
#if 0
printf("%s: secperunit (%" PRIu32 ") %s\n", __func__,
secperunit, geom_ok ? "ok" : "not ok");
#endif
p = &lp->d_partitions[RAW_PART];
if (RAW_PART == 'c' - 'a')
altp = &lp->d_partitions['d' - 'a'];
else
altp = &lp->d_partitions['c' - 'a'];
if (lp->d_npartitions > RAW_PART && p->p_offset == 0 && p->p_size != 0)
return NULL; /* already a raw partition */
else if (lp->d_npartitions > MAX('c', 'd') - 'a' &&
altp->p_offset == 0 && altp->p_size != 0) {
/* alternate partition ('c' or 'd') is suitable for raw slot,
* swap with 'd' or 'c'.
*/
rp = *p;
*p = *altp;
*altp = rp;
return NULL;
} else if (lp->d_npartitions <= RAW_PART &&
lp->d_npartitions > 'c' - 'a') {
/* No raw partition is present, but the alternate is present.
* Copy alternate to raw partition.
*/
lp->d_npartitions = RAW_PART + 1;
*p = *altp;
return NULL;
} else if (!geom_ok)
str = "no raw partition and disk reports bad geometry";
else if (lp->d_npartitions <= RAW_PART) {
memset(&lp->d_partitions[lp->d_npartitions], 0,
sizeof(struct partition) * (RAW_PART - lp->d_npartitions));
*p = rp;
lp->d_npartitions = RAW_PART + 1;
return NULL;
} else if (lp->d_npartitions < MAXPARTITIONS) {
memmove(p + 1, p,
sizeof(struct partition) * (lp->d_npartitions - RAW_PART));
*p = rp;
lp->d_npartitions++;
return NULL;
} else
str = "no raw partition and partition table is full";
#ifdef DIAGNOSTIC
printf("Bad partition: %s\n", str);
printf("type = %u, subtype = %u, typename = %s\n",
lp->d_type, lp->d_subtype, lp->d_typename);
printf("secsize = %u, nsectors = %u, ntracks = %u\n",
lp->d_secsize, lp->d_nsectors, lp->d_ntracks);
printf("ncylinders = %u, secpercyl = %u, secperunit = %u\n",
lp->d_ncylinders, lp->d_secpercyl, lp->d_secperunit);
printf("npartitions = %u\n", lp->d_npartitions);
for (size_t i = 0; i < MIN(lp->d_npartitions, MAXPARTITIONS); i++) {
p = &lp->d_partitions[i];
printf("\t%c: offset = %u size = %u fstype = %u\n",
(char)(i + 'a'), p->p_offset, p->p_size, p->p_fstype);
}
#endif
return str;
}
/*
* disk_ioctl --
* Generic disk ioctl handling.
*/
int
disk_ioctl(struct disk *dk, dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
struct dkwedge_info *dkw;
struct partinfo *pi;
struct partition *dp;
#ifdef __HAVE_OLD_DISKLABEL
struct disklabel newlabel;
#endif
switch (cmd) {
case DIOCGDISKINFO: {
prop_dictionary_t disk_info;
int error;
mutex_enter(&dk->dk_openlock);
if ((disk_info = dk->dk_info) == NULL) {
error = ENOTSUP;
} else {
prop_object_retain(disk_info);
error = 0;
}
mutex_exit(&dk->dk_openlock);
if (error)
return error;
error = prop_dictionary_copyout_ioctl(data, cmd, disk_info);
prop_object_release(disk_info);
return error;
}
case DIOCGSECTORSIZE:
*(u_int *)data = dk->dk_geom.dg_secsize;
return 0;
case DIOCGMEDIASIZE:
*(off_t *)data = (off_t)dk->dk_geom.dg_secsize *
dk->dk_geom.dg_secperunit;
return 0;
default:
break;
}
if (dev == NODEV)
return EPASSTHROUGH;
/* The following should be moved to dk_ioctl */
switch (cmd) {
case DIOCGDINFO:
if (dk->dk_label == NULL)
return EBUSY;
memcpy(data, dk->dk_label, sizeof (*dk->dk_label));
return 0;
#ifdef __HAVE_OLD_DISKLABEL
case ODIOCGDINFO:
if (dk->dk_label == NULL)
return EBUSY;
memcpy(&newlabel, dk->dk_label, sizeof(newlabel));
if (newlabel.d_npartitions > OLDMAXPARTITIONS)
return ENOTTY;
memcpy(data, &newlabel, sizeof(struct olddisklabel));
return 0;
#endif
case DIOCGPARTINFO:
pi = data;
memset(pi, 0, sizeof(*pi));
pi->pi_secsize = dk->dk_geom.dg_secsize;
pi->pi_bsize = MAX(BLKDEV_IOSIZE, pi->pi_secsize);
if (DISKPART(dev) == RAW_PART) {
pi->pi_size = dk->dk_geom.dg_secperunit;
return 0;
}
if (dk->dk_label == NULL)
return EBUSY;
dp = &dk->dk_label->d_partitions[DISKPART(dev)];
pi->pi_offset = dp->p_offset;
pi->pi_size = dp->p_size;
pi->pi_fstype = dp->p_fstype;
pi->pi_frag = dp->p_frag;
pi->pi_fsize = dp->p_fsize;
pi->pi_cpg = dp->p_cpg;
/*
* dholland 20130616: XXX this logic should not be
* here. It is here because the old buffer cache
* demands that all accesses to the same blocks need
* to be the same size; but it only works for FFS and
* nowadays I think it'll fail silently if the size
* info in the disklabel is wrong. (Or missing.) The
* buffer cache needs to be smarter; or failing that
* we need a reliable way here to get the right block
* size; or a reliable way to guarantee that (a) the
* fs is not mounted when we get here and (b) any
* buffers generated here will get purged when the fs
* does get mounted.
*/
		if (dp->p_fstype == FS_BSDFFS && dp->p_frag != 0 &&
		    dp->p_fsize != 0)
			pi->pi_bsize = dp->p_frag * dp->p_fsize;
return 0;
case DIOCAWEDGE:
if ((flag & FWRITE) == 0)
return EBADF;
dkw = data;
strlcpy(dkw->dkw_parent, dk->dk_name, sizeof(dkw->dkw_parent));
return dkwedge_add(dkw);
case DIOCDWEDGE:
if ((flag & FWRITE) == 0)
return EBADF;
dkw = data;
strlcpy(dkw->dkw_parent, dk->dk_name, sizeof(dkw->dkw_parent));
return dkwedge_del(dkw);
case DIOCLWEDGES:
return dkwedge_list(dk, data, l);
case DIOCMWEDGES:
if ((flag & FWRITE) == 0)
return EBADF;
dkwedge_discover(dk);
return 0;
case DIOCRMWEDGES:
if ((flag & FWRITE) == 0)
return EBADF;
dkwedge_delidle(dk);
return 0;
default:
return EPASSTHROUGH;
}
}
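/*
 * Illustrative delegation pattern (not from the original source): a
 * driver's ioctl entry point tries the generic handler first and only
 * deals with commands that come back EPASSTHROUGH.  "sc" is a
 * hypothetical softc.
 */
#if 0
	error = disk_ioctl(&sc->sc_dk, dev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return error;
	switch (cmd) {
	/* driver-specific ioctls go here */
	default:
		return ENOTTY;
	}
#endif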
/*
* disk_set_info --
* Canonicalize dk->dk_geom and set some parameters.
*
* If disk_set_info can happen concurrently with disk_ioctl in a
* driver, the driver must serialize calls to disk_set_info with
* dk_openlock.
*/
void
disk_set_info(device_t dev, struct disk *dk, const char *type)
{
struct disk_geom *dg = &dk->dk_geom;
if (dg->dg_secsize == 0) {
#ifdef DIAGNOSTIC
printf("%s: fixing 0 sector size\n", dk->dk_name);
#endif
dg->dg_secsize = DEV_BSIZE;
}
dk->dk_blkshift = DK_BSIZE2BLKSHIFT(dg->dg_secsize);
dk->dk_byteshift = DK_BSIZE2BYTESHIFT(dg->dg_secsize);
if (dg->dg_secperunit == 0) {
#ifdef DIAGNOSTIC
if (dg->dg_ncylinders == 0) {
printf("%s: secperunit and ncylinders are zero\n",
dk->dk_name);
}
if (dg->dg_nsectors == 0 || dg->dg_ntracks == 0) {
printf("%s: secperunit and (sectors or tracks) "
"are zero\n", dk->dk_name);
}
#endif
dg->dg_secperunit = (int64_t) dg->dg_nsectors *
dg->dg_ntracks * dg->dg_ncylinders;
}
if (dg->dg_ncylinders == 0) {
if (dg->dg_ntracks && dg->dg_nsectors)
dg->dg_ncylinders = dg->dg_secperunit /
(dg->dg_ntracks * dg->dg_nsectors);
}
prop_dictionary_t disk_info, odisk_info, geom;
disk_info = prop_dictionary_create();
geom = prop_dictionary_create();
prop_dictionary_set_uint64(geom, "sectors-per-unit",
dg->dg_secperunit);
prop_dictionary_set_uint32(geom, "sector-size", dg->dg_secsize);
if (dg->dg_nsectors)
prop_dictionary_set_uint16(geom, "sectors-per-track",
dg->dg_nsectors);
if (dg->dg_ntracks)
prop_dictionary_set_uint16(geom, "tracks-per-cylinder",
dg->dg_ntracks);
if (dg->dg_ncylinders)
prop_dictionary_set_uint64(geom, "cylinders-per-unit",
dg->dg_ncylinders);
prop_dictionary_set(disk_info, "geometry", geom);
if (type)
prop_dictionary_set_string_nocopy(disk_info, "type", type);
prop_object_release(geom);
odisk_info = dk->dk_info;
dk->dk_info = disk_info;
if (dev)
prop_dictionary_set(device_properties(dev), "disk-info",
disk_info);
/*
* Don't release disk_info here; we keep a reference to it.
* disk_detach() will release it when we go away.
*/
if (odisk_info)
prop_object_release(odisk_info);
}
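/*
 * Illustrative geometry setup (not from the original source): the driver
 * fills in at least dg_secperunit and dg_secsize and lets disk_set_info()
 * derive or canonicalize the rest.  "sc", "self" and the numbers are
 * placeholders.
 */
#if 0
	struct disk_geom *dg = &sc->sc_dk.dk_geom;

	memset(dg, 0, sizeof(*dg));
	dg->dg_secperunit = sc->sc_capacity;	/* in sectors */
	dg->dg_secsize = 512;
	dg->dg_nsectors = 63;
	dg->dg_ntracks = 16;
	disk_set_info(self, &sc->sc_dk, "hypothetical-disk");
#endif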
int
disklabel_dev_unit(dev_t dev)
{
return DISKUNIT(dev);
}
/* $NetBSD: vfs_syscalls_20.c,v 1.46 2020/06/28 14:37:53 christos Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_20.c,v 1.46 2020/06/28 14:37:53 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/dirent.h>
#include <sys/sysctl.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/vfs_syscalls.h>
#include <compat/common/compat_mod.h>
#include <compat/sys/mount.h>
#include <compat/sys/statvfs.h>
static const struct syscall_package vfs_syscalls_20_syscalls[] = {
{ SYS_compat_20_fhstatfs, 0, (sy_call_t *)compat_20_sys_fhstatfs },
{ SYS_compat_20_fstatfs, 0, (sy_call_t *)compat_20_sys_fstatfs },
{ SYS_compat_20_getfsstat, 0, (sy_call_t *)compat_20_sys_getfsstat },
{ SYS_compat_20_statfs, 0, (sy_call_t *)compat_20_sys_statfs },
{ 0, 0, NULL }
};
/*
* Get filesystem statistics.
*/
/* ARGSUSED */
int
compat_20_sys_statfs(struct lwp *l, const struct compat_20_sys_statfs_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(struct statfs12 *) buf;
} */
struct mount *mp;
struct statvfs *sbuf;
int error;
struct vnode *vp;
error = namei_simple_user(SCARG(uap, path),
NSM_FOLLOW_TRYEMULROOT, &vp);
if (error != 0)
return error;
mp = vp->v_mount;
sbuf = STATVFSBUF_GET();
if ((error = dostatvfs(mp, sbuf, l, 0, 1)) != 0)
goto done;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
done:
vrele(vp);
STATVFSBUF_PUT(sbuf);
return error;
}
/*
* Get filesystem statistics.
*/
/* ARGSUSED */
int
compat_20_sys_fstatfs(struct lwp *l, const struct compat_20_sys_fstatfs_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct statfs12 *) buf;
} */
struct file *fp;
struct mount *mp;
struct statvfs *sbuf;
int error;
/* fd_getvnode() will use the descriptor for us */
if ((error = fd_getvnode(SCARG(uap, fd), &fp)) != 0)
return (error);
mp = fp->f_vnode->v_mount;
sbuf = STATVFSBUF_GET();
if ((error = dostatvfs(mp, sbuf, l, 0, 1)) != 0)
goto out;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
out:
fd_putfile(SCARG(uap, fd));
STATVFSBUF_PUT(sbuf);
return error;
}
/*
* Get statistics on all filesystems.
*/
int
compat_20_sys_getfsstat(struct lwp *l, const struct compat_20_sys_getfsstat_args *uap, register_t *retval)
{
/* {
syscallarg(struct statfs12 *) buf;
syscallarg(long) bufsize;
syscallarg(int) flags;
} */
return do_sys_getvfsstat(l, SCARG(uap, buf), SCARG(uap, bufsize),
SCARG(uap, flags), statvfs_to_statfs12_copy,
sizeof(struct statfs12), retval);
}
int
compat_20_sys_fhstatfs(struct lwp *l, const struct compat_20_sys_fhstatfs_args *uap, register_t *retval)
{
/* {
syscallarg(const struct compat_30_fhandle *) fhp;
syscallarg(struct statfs12 *) buf;
} */
struct statvfs *sbuf;
struct compat_30_fhandle fh;
struct mount *mp;
struct vnode *vp;
int error;
/*
* Must be super user
*/
if ((error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_FILEHANDLE, 0, NULL, NULL, NULL)))
return (error);
if ((error = copyin(SCARG(uap, fhp), &fh, sizeof(fh))) != 0)
return (error);
if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
error = VFS_FHTOVP(mp, (struct fid*)&fh.fh_fid, LK_EXCLUSIVE, &vp);
if (error != 0)
return (error);
mp = vp->v_mount;
VOP_UNLOCK(vp);
sbuf = STATVFSBUF_GET();
if ((error = VFS_STATVFS(mp, sbuf)) != 0)
goto out;
error = statvfs_to_statfs12_copy(sbuf, SCARG(uap, buf), 0);
out:
vrele(vp);
STATVFSBUF_PUT(sbuf);
return error;
}
int
vfs_syscalls_20_init(void)
{
return syscall_establish(NULL, vfs_syscalls_20_syscalls);
}
int
vfs_syscalls_20_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_20_syscalls);
}
/* $NetBSD: ufs_inode.c,v 1.112 2020/09/05 16:30:13 riastradh Exp $ */
/*
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_inode.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_inode.c,v 1.112 2020/09/05 16:30:13 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#include "opt_wapbl.h"
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/wapbl.h>
#include <sys/kmem.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_wapbl.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#ifdef UFS_EXTATTR
#include <ufs/ufs/extattr.h>
#endif
#ifdef UVMHIST
#include <uvm/uvm.h>
#endif
#include <uvm/uvm_page.h>
#include <uvm/uvm_stat.h>
/*
* Last reference to an inode. If necessary, write or delete it.
*/
int
ufs_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
		bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct inode *ip = VTOI(vp);
struct mount *mp = vp->v_mount;
mode_t mode;
int allerror = 0, error;
bool wapbl_locked = false;
UFS_WAPBL_JUNLOCK_ASSERT(mp);
/*
* Ignore inodes related to stale file handles.
*/
if (ip->i_mode == 0)
goto out;
if (ip->i_nlink <= 0 && (mp->mnt_flag & MNT_RDONLY) == 0) {
#ifdef UFS_EXTATTR
ufs_extattr_vnode_inactive(vp, curlwp);
#endif
/*
* All file blocks must be freed before we can let the vnode
		 * be reclaimed, so we cannot postpone the full truncation any further.
*/
ufs_truncate_all(vp);
#if defined(QUOTA) || defined(QUOTA2)
error = UFS_WAPBL_BEGIN(mp);
if (error) {
allerror = error;
} else {
wapbl_locked = true;
(void)chkiq(ip, -1, NOCRED, 0);
}
#endif
DIP_ASSIGN(ip, rdev, 0);
mode = ip->i_mode;
ip->i_mode = 0;
ip->i_omode = mode;
DIP_ASSIGN(ip, mode, 0);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Defer final inode free and update to ufs_reclaim().
*/
}
if (ip->i_flag & (IN_CHANGE | IN_UPDATE | IN_MODIFIED)) {
		if (! wapbl_locked) {
			error = UFS_WAPBL_BEGIN(mp);
if (error) {
allerror = error;
goto out;
}
wapbl_locked = true;
}
UFS_UPDATE(vp, NULL, NULL, 0);
}
out:
	if (wapbl_locked)
		UFS_WAPBL_END(mp);
/*
* If we are done with the inode, reclaim it
* so that it can be reused immediately.
*/
*ap->a_recycle = (ip->i_mode == 0);
if (ip->i_mode == 0 && (DIP(ip, size) != 0 || DIP(ip, blocks) != 0)) {
printf("%s: unlinked ino %" PRId64 " on \"%s\" has"
" non zero size %" PRIx64 " or blocks %" PRIx64
" with allerror %d\n",
__func__, ip->i_number, mp->mnt_stat.f_mntonname,
DIP(ip, size), DIP(ip, blocks), allerror);
panic("%s: dirty filesystem?", __func__);
}
return (allerror);
}
/*
* Reclaim an inode so that it can be used for other purposes.
*/
int
ufs_reclaim(struct vnode *vp)
{
struct inode *ip = VTOI(vp);
	if (!UFS_WAPBL_BEGIN(vp->v_mount)) {
		UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
		UFS_WAPBL_END(vp->v_mount);
}
UFS_UPDATE(vp, NULL, NULL, UPDATE_CLOSE);
	if (ip->i_devvp) {
		vrele(ip->i_devvp);
ip->i_devvp = 0;
}
#if defined(QUOTA) || defined(QUOTA2)
ufsquota_free(ip);
#endif
#ifdef UFS_DIRHASH
	if (ip->i_dirhash != NULL)
		ufsdirhash_free(ip);
#endif
return (0);
}
/*
* allocate a range of blocks in a file.
* after this function returns, any page entirely contained within the range
* will map to invalid data and thus must be overwritten before it is made
* accessible to others.
*/
int
ufs_balloc_range(struct vnode *vp, off_t off, off_t len, kauth_cred_t cred,
int flags)
{
off_t neweof; /* file size after the operation */
off_t neweob; /* offset next to the last block after the operation */
off_t pagestart; /* starting offset of range covered by pgs */
off_t eob; /* offset next to allocated blocks */
struct uvm_object *uobj;
int i, delta, error, npages;
int bshift = vp->v_mount->mnt_fs_bshift;
int bsize = 1 << bshift;
int ppb = MAX(bsize >> PAGE_SHIFT, 1);
struct vm_page **pgs;
size_t pgssize;
UVMHIST_FUNC("ufs_balloc_range"); UVMHIST_CALLED(ubchist);
UVMHIST_LOG(ubchist, "vp %#jx off 0x%jx len 0x%jx u_size 0x%jx",
(uintptr_t)vp, off, len, vp->v_size);
neweof = MAX(vp->v_size, off + len);
GOP_SIZE(vp, neweof, &neweob, 0);
error = 0;
uobj = &vp->v_uobj;
/*
* read or create pages covering the range of the allocation and
* keep them locked until the new block is allocated, so there
* will be no window where the old contents of the new block are
* visible to racing threads.
*/
pagestart = trunc_page(off) & ~(bsize - 1);
npages = MIN(ppb, (round_page(neweob) - pagestart) >> PAGE_SHIFT);
pgssize = npages * sizeof(struct vm_page *);
pgs = kmem_zalloc(pgssize, KM_SLEEP);
/*
* adjust off to be block-aligned.
*/
delta = off & (bsize - 1);
off -= delta;
len += delta;
genfs_node_wrlock(vp);
rw_enter(uobj->vmobjlock, RW_WRITER);
error = VOP_GETPAGES(vp, pagestart, pgs, &npages, 0,
VM_PROT_WRITE, 0, PGO_SYNCIO | PGO_PASTEOF | PGO_NOBLOCKALLOC |
PGO_NOTIMESTAMP | PGO_GLOCKHELD);
if (error) {
genfs_node_unlock(vp);
goto out;
}
/*
* now allocate the range.
*/
error = GOP_ALLOC(vp, off, len, flags, cred);
genfs_node_unlock(vp);
/*
* if the allocation succeeded, mark all the pages dirty
* and clear PG_RDONLY on any pages that are now fully backed
* by disk blocks. if the allocation failed, we do not invalidate
* the pages since they might have already existed and been dirty,
* in which case we need to keep them around. if we created the pages,
* they will be clean and read-only, and leaving such pages
* in the cache won't cause any problems.
*/
GOP_SIZE(vp, off + len, &eob, 0);
rw_enter(uobj->vmobjlock, RW_WRITER);
	for (i = 0; i < npages; i++) {
		KASSERT((pgs[i]->flags & PG_RELEASED) == 0);
		if (!error) {
			if (off <= pagestart + (i << PAGE_SHIFT) &&
pagestart + ((i + 1) << PAGE_SHIFT) <= eob) {
pgs[i]->flags &= ~PG_RDONLY;
}
uvm_pagemarkdirty(pgs[i], UVM_PAGE_STATUS_DIRTY);
}
uvm_pagelock(pgs[i]);
uvm_pageactivate(pgs[i]);
uvm_pageunlock(pgs[i]);
}
uvm_page_unbusy(pgs, npages);
rw_exit(uobj->vmobjlock);
out:
kmem_free(pgs, pgssize);
return error;
}
int
ufs_truncate_retry(struct vnode *vp, int ioflag, uint64_t newsize,
kauth_cred_t cred)
{
struct inode *ip = VTOI(vp);
struct mount *mp = vp->v_mount;
int error = 0;
UFS_WAPBL_JUNLOCK_ASSERT(mp);
/*
* Truncate might temporarily fail, loop until done.
*/
do {
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto out;
error = UFS_TRUNCATE(vp, newsize, ioflag, cred);
		UFS_WAPBL_END(mp);
		if (error != 0 && error != EAGAIN)
goto out;
} while (ip->i_size != newsize);
out:
return error;
}
/* truncate all the data of the inode including extended attributes */
int
ufs_truncate_all(struct vnode *vp)
{
struct inode *ip = VTOI(vp);
off_t isize = ip->i_size;
	if (ip->i_ump->um_fstype == UFS2)
		isize += ip->i_ffs2_extsize;
	if (isize == 0)
return 0;
return ufs_truncate_retry(vp, IO_NORMAL | IO_EXT, 0, NOCRED);
}
/* $NetBSD: sched_4bsd.c,v 1.46 2022/10/26 23:24:09 riastradh Exp $ */
/*
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2019, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
* Daniel Sieger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.46 2022/10/26 23:24:09 riastradh Exp $");
#include "opt_ddb.h"
#include "opt_lockdebug.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/lockdebug.h>
#include <sys/intr.h>
#include <sys/atomic.h>
static void updatepri(struct lwp *);
static void resetpriority(struct lwp *);
/* Number of hardclock ticks per sched_tick() */
u_int sched_rrticks __read_mostly;
/*
* Force switch among equal priority processes every 100ms.
* Called from hardclock every hz/10 == sched_rrticks hardclock ticks.
*/
/* ARGSUSED */
void
sched_tick(struct cpu_info *ci)
{
struct schedstate_percpu *spc = &ci->ci_schedstate;
pri_t pri = PRI_NONE;
lwp_t *l;
spc->spc_ticks = sched_rrticks;
if (CURCPU_IDLE_P()) {
spc_lock(ci);
sched_resched_cpu(ci, MAXPRI_KTHREAD, true);
/* spc now unlocked */
return;
}
l = ci->ci_onproc;
if (l == NULL) {
return;
}
/*
* Can only be spc_lwplock or a turnstile lock at this point
	 * (if we interrupted the priority inheritance trylock dance).
*/
KASSERT(l->l_mutex != spc->spc_mutex);
switch (l->l_class) {
case SCHED_FIFO:
/* No timeslicing for FIFO jobs. */
break;
case SCHED_RR:
/* Force it into mi_switch() to look for other jobs to run. */
pri = MAXPRI_KERNEL_RT;
break;
default:
if (spc->spc_flags & SPCF_SHOULDYIELD) {
/*
* Process is stuck in kernel somewhere, probably
* due to buggy or inefficient code. Force a
* kernel preemption.
*/
pri = MAXPRI_KERNEL_RT;
} else if (spc->spc_flags & SPCF_SEENRR) {
/*
* The process has already been through a roundrobin
* without switching and may be hogging the CPU.
* Indicate that the process should yield.
*/
pri = MAXPRI_KTHREAD;
spc->spc_flags |= SPCF_SHOULDYIELD;
} else if ((spc->spc_flags & SPCF_1STCLASS) == 0) {
/*
* For SMT or asymmetric systems push a little
* harder: if this is not a 1st class CPU, try to
* find a better one to run this LWP.
*/
pri = MAXPRI_KTHREAD;
spc->spc_flags |= SPCF_SHOULDYIELD;
} else {
spc->spc_flags |= SPCF_SEENRR;
}
break;
}
if (pri != PRI_NONE) {
spc_lock(ci);
sched_resched_cpu(ci, pri, true);
/* spc now unlocked */
}
}
/*
* Why PRIO_MAX - 2? From setpriority(2):
*
* prio is a value in the range -20 to 20. The default priority is
* 0; lower priorities cause more favorable scheduling. A value of
* 19 or 20 will schedule a process only when nothing at priority <=
* 0 is runnable.
*
* This gives estcpu influence over 18 priority levels, and leaves nice
* with 40 levels. One way to think about it is that nice has 20 levels
* either side of estcpu's 18.
*/
#define ESTCPU_SHIFT 11
#define ESTCPU_MAX ((PRIO_MAX - 2) << ESTCPU_SHIFT)
#define ESTCPU_ACCUM (1 << (ESTCPU_SHIFT - 1))
#define ESTCPULIM(e) uimin((e), ESTCPU_MAX)
/*
* The main parameter used by this algorithm is 'l_estcpu'. It is an estimate
* of the recent CPU utilization of the thread.
*
* l_estcpu is:
* - increased each time the hardclock ticks and the thread is found to
* be executing, in sched_schedclock() called from hardclock()
* - decreased (filtered) on each sched tick, in sched_pstats_hook()
* If the lwp is sleeping for more than a second, we don't touch l_estcpu: it
* will be updated in sched_setrunnable() when the lwp wakes up, in burst mode
 * (i.e., we decrease it n times).
*
* Note that hardclock updates l_estcpu and l_cpticks independently.
*
* -----------------------------------------------------------------------------
*
* Here we describe how l_estcpu is decreased.
*
* Constants for digital decay (filter):
* 90% of l_estcpu usage in (5 * loadavg) seconds
*
* We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds. That is, we
* want to compute a value of decay such that the following loop:
* for (i = 0; i < (5 * loadavg); i++)
* l_estcpu *= decay;
* will result in
* l_estcpu *= 0.1;
* for all values of loadavg.
*
* Mathematically this loop can be expressed by saying:
* decay ** (5 * loadavg) ~= .1
*
* And finally, the corresponding value of decay we're using is:
* decay = (2 * loadavg) / (2 * loadavg + 1)
*
* -----------------------------------------------------------------------------
*
* Now, let's prove that the value of decay stated above will always fulfill
* the equation:
* decay ** (5 * loadavg) ~= .1
*
* If we compute b as:
* b = 2 * loadavg
* then
* decay = b / (b + 1)
*
* We now need to prove two things:
* 1) Given [factor ** (5 * loadavg) =~ .1], prove [factor == b/(b+1)].
* 2) Given [b/(b+1) ** power =~ .1], prove [power == (5 * loadavg)].
*
* Facts:
* * For x real: exp(x) = 0! + x**1/1! + x**2/2! + ...
* Therefore, for x close to zero, exp(x) =~ 1 + x.
* In turn, for b large enough, exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
*
* * For b large enough, (b-1)/b =~ b/(b+1).
*
* * For x belonging to [-1;1[, ln(1-x) = - x - x**2/2 - x**3/3 - ...
* Therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
*
* * ln(0.1) =~ -2.30
*
* Proof of (1):
* factor ** (5 * loadavg) =~ 0.1
* => ln(factor) =~ -2.30 / (5 * loadavg)
* => factor =~ exp(-1 / ((5 / 2.30) * loadavg))
* =~ exp(-1 / (2 * loadavg))
* =~ exp(-1 / b)
* =~ (b - 1) / b
* =~ b / (b + 1)
* =~ (2 * loadavg) / ((2 * loadavg) + 1)
*
* Proof of (2):
* (b / (b + 1)) ** power =~ .1
* => power * ln(b / (b + 1)) =~ -2.30
* => power * (-1 / (b + 1)) =~ -2.30
* => power =~ 2.30 * (b + 1)
* => power =~ 4.60 * loadavg + 2.30
* => power =~ 5 * loadavg
*
* Conclusion: decay = (2 * loadavg) / (2 * loadavg + 1)
*/
/* See calculations above */
#define loadfactor(loadavg) (2 * (loadavg))
static fixpt_t
decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
{
if (estcpu == 0) {
return 0;
}
#if !defined(_LP64)
/* avoid 64bit arithmetics. */
#define FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
return estcpu * loadfac / (loadfac + FSCALE);
}
#endif
return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
}
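/*
 * Worked example (illustrative, assuming a load average of 1): loadfac
 * is then 2 * FSCALE, so decay_cpu() scales estcpu by 2/3 per call.
 * After 5 calls the estimate is multiplied by (2/3)^5 =~ 0.13, i.e.
 * roughly 90% of it is forgotten in about 5 * loadavg seconds, which
 * matches the derivation above.
 */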
static fixpt_t
decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
{
/*
* For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT),
* if we slept for at least seven times the loadfactor, we will decay
* l_estcpu to less than (1 << ESTCPU_SHIFT), and therefore we can
* return zero directly.
*
* Note that our ESTCPU_MAX is actually much smaller than
* (255 << ESTCPU_SHIFT).
*/
if ((n << FSHIFT) >= 7 * loadfac) {
return 0;
}
	while (estcpu != 0 && n > 1) {
		estcpu = decay_cpu(loadfac, estcpu);
n--;
}
return estcpu;
}
/*
* sched_pstats_hook:
*
* Periodically called from sched_pstats(); used to recalculate priorities.
*/
void
sched_pstats_hook(struct lwp *l, int batch)
{
fixpt_t loadfac;
/*
* If the LWP has slept an entire second, stop recalculating
* its priority until it wakes up.
*/
KASSERT(lwp_locked(l, NULL));
if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
l->l_stat == LSSUSPENDED) {
if (l->l_slptime > 1) {
return;
}
}
loadfac = loadfactor(averunnable.ldavg[0]);
l->l_estcpu = decay_cpu(loadfac, l->l_estcpu);
resetpriority(l);
}
/*
* Recalculate the priority of an LWP after it has slept for a while.
*/
static void
updatepri(struct lwp *l)
{
fixpt_t loadfac;
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_slptime > 1);
loadfac = loadfactor(averunnable.ldavg[0]);
l->l_slptime--; /* the first time was done in sched_pstats */
	l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime);
	resetpriority(l);
}
void
sched_rqinit(void)
{
}
void
sched_setrunnable(struct lwp *l)
{

	if (l->l_slptime > 1)
		updatepri(l);
}
void
sched_nice(struct proc *p, int n)
{
struct lwp *l;
KASSERT(mutex_owned(p->p_lock));
p->p_nice = n;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
lwp_lock(l);
resetpriority(l);
lwp_unlock(l);
}
}
/*
* Recompute the priority of an LWP. Arrange to reschedule if
* the resulting priority is better than that of the current LWP.
*/
static void
resetpriority(struct lwp *l)
{
pri_t pri;
struct proc *p = l->l_proc;
	KASSERT(lwp_locked(l, NULL));
	if (l->l_class != SCHED_OTHER)
return;
/* See comments above ESTCPU_SHIFT definition. */
pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice;
pri = imax(pri, 0);
	if (pri != l->l_priority)
		lwp_changepri(l, pri);
}
/*
* We adjust the priority of the current LWP. The priority of a LWP
* gets worse as it accumulates CPU time. The CPU usage estimator (l_estcpu)
* is increased here. The formula for computing priorities will compute a
* different value each time l_estcpu increases. This can cause a switch,
* but unless the priority crosses a PPQ boundary the actual queue will not
* change. The CPU usage estimator ramps up quite quickly when the process
* is running (linearly), and decays away exponentially, at a rate which is
* proportionally slower when the system is busy. The basic principle is
* that the system will 90% forget that the process used a lot of CPU time
* in (5 * loadavg) seconds. This causes the system to favor processes which
* haven't run much recently, and to round-robin among other processes.
*/
void
sched_schedclock(struct lwp *l)
{
if (l->l_class != SCHED_OTHER)
return;
KASSERT(!CURCPU_IDLE_P());
l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM);
lwp_lock(l);
resetpriority(l);
lwp_unlock(l);
}
/*
* sched_proc_fork:
*
* Inherit the parent's scheduler history.
*/
void
sched_proc_fork(struct proc *parent, struct proc *child)
{
lwp_t *pl;
KASSERT(mutex_owned(parent->p_lock));
pl = LIST_FIRST(&parent->p_lwps);
child->p_estcpu_inherited = pl->l_estcpu;
child->p_forktime = sched_pstats_ticks;
}
/*
* sched_proc_exit:
*
* Chargeback parents for the sins of their children.
*/
void
sched_proc_exit(struct proc *parent, struct proc *child)
{
fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
fixpt_t estcpu;
lwp_t *pl, *cl;
/* XXX Only if parent != init?? */
mutex_enter(parent->p_lock);
pl = LIST_FIRST(&parent->p_lwps);
cl = LIST_FIRST(&child->p_lwps);
estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
sched_pstats_ticks - child->p_forktime);
	if (cl->l_estcpu > estcpu) {
		lwp_lock(pl);
pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu);
lwp_unlock(pl);
}
mutex_exit(parent->p_lock);
}
void
sched_wakeup(struct lwp *l)
{
}
void
sched_slept(struct lwp *l)
{
}
void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{
l2->l_estcpu = l1->l_estcpu;
}
void
sched_lwp_collect(struct lwp *t)
{
lwp_t *l;
/* Absorb estcpu value of collected LWP. */
l = curlwp;
lwp_lock(l);
l->l_estcpu += t->l_estcpu;
lwp_unlock(l);
}
void
sched_oncpu(lwp_t *l)
{
}
void
sched_newts(lwp_t *l)
{
}
/*
* Sysctl nodes and initialization.
*/
static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
struct sysctlnode node;
int rttsms = hztoms(sched_rrticks);
node = *rnode;
node.sysctl_data = &rttsms;
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup")
{
const struct sysctlnode *node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sched",
SYSCTL_DESCR("Scheduler options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node == NULL)
return;
sched_rrticks = hz / 10;
sysctl_createv(NULL, 0, &node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "name", NULL,
NULL, 0, __UNCONST("4.4BSD"), 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(NULL, 0, &node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "rtts",
SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
sysctl_sched_rtts, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
}
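/*
 * Illustrative view from userland (not part of this file): the nodes
 * created above show up as kern.sched.name and kern.sched.rtts, e.g.
 *
 *	$ sysctl kern.sched.name kern.sched.rtts
 *	kern.sched.name = 4.4BSD
 *	kern.sched.rtts = 100
 *
 * where the rtts value assumes hz = 100, i.e. hztoms(hz / 10) = 100ms.
 */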
/* $NetBSD: tmpfs_fifoops.c,v 1.15 2021/07/19 01:30:25 dholland Exp $ */
/*
* Copyright (c) 2005 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* tmpfs vnode interface for named pipes.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_fifoops.c,v 1.15 2021/07/19 01:30:25 dholland Exp $");
#include <sys/param.h>
#include <sys/vnode.h>
#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_fifoops.h>
/*
* vnode operations vector used for fifos stored in a tmpfs file system.
*/
int (**tmpfs_fifoop_p)(void *);
const struct vnodeopv_entry_desc tmpfs_fifoop_entries[] = {
{ &vop_default_desc, vn_default_error },
GENFS_FIFOOP_ENTRIES,
{ &vop_close_desc, tmpfs_fifo_close },
{ &vop_access_desc, tmpfs_access },
{ &vop_accessx_desc, genfs_accessx },
{ &vop_getattr_desc, tmpfs_getattr },
{ &vop_setattr_desc, tmpfs_setattr },
{ &vop_read_desc, tmpfs_fifo_read },
{ &vop_write_desc, tmpfs_fifo_write },
{ &vop_fcntl_desc, genfs_fcntl },
{ &vop_fsync_desc, vn_fifo_bypass },
{ &vop_inactive_desc, tmpfs_inactive },
{ &vop_reclaim_desc, tmpfs_reclaim },
{ &vop_lock_desc, genfs_lock },
{ &vop_unlock_desc, genfs_unlock },
{ &vop_strategy_desc, vn_fifo_bypass },
{ &vop_print_desc, tmpfs_print },
{ &vop_islocked_desc, genfs_islocked },
{ &vop_bwrite_desc, genfs_nullop },
{ NULL, NULL }
};
const struct vnodeopv_desc tmpfs_fifoop_opv_desc = {
&tmpfs_fifoop_p, tmpfs_fifoop_entries
};
int
tmpfs_fifo_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap __unused = v;
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_close), v);
}
int
tmpfs_fifo_read(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
tmpfs_update(vp, TMPFS_UPDATE_ATIME);
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), v);
}
int
tmpfs_fifo_write(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
vnode_t *vp = ap->a_vp;
tmpfs_update(vp, TMPFS_UPDATE_MTIME);
return VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), v);
}
/* $NetBSD: secmodel_suser.c,v 1.58 2024/03/01 22:01:03 andvar Exp $ */
/*-
* Copyright (c) 2006 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This file contains kauth(9) listeners needed to implement the traditional
* NetBSD superuser access restrictions.
*
* There are two main kinds of resource a request can be issued against:
* user-owned and system-owned. For the former, traditional Unix access checks
* are done, as well as superuser checks; if needed, the request context is
* examined before a decision is made. For the latter, usually only superuser
* checks are done, as normal users are not allowed to access system resources.
*/
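/*
 * Illustrative sketch (not part of this file): every listener below follows
 * the same decision shape -- allow the request when the credential's
 * effective uid is 0, otherwise defer so that other secmodels may decide.
 * The action name used here is a placeholder, not a real kauth(9) action.
 *
 *	static int
 *	example_cb(kauth_cred_t cred, kauth_action_t action,
 *	    void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
 *	{
 *		int result = KAUTH_RESULT_DEFER;
 *
 *		if (action == KAUTH_EXAMPLE_ACTION &&
 *		    kauth_cred_geteuid(cred) == 0)
 *			result = KAUTH_RESULT_ALLOW;
 *		return result;
 *	}
 */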
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: secmodel_suser.c,v 1.58 2024/03/01 22:01:03 andvar Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/mutex.h>
#include <sys/mount.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <secmodel/secmodel.h>
#include <secmodel/suser/suser.h>
MODULE(MODULE_CLASS_SECMODEL, suser, NULL);
static kauth_listener_t l_generic, l_system, l_process, l_network, l_machdep,
l_device, l_vnode;
static secmodel_t suser_sm;
SYSCTL_SETUP(sysctl_security_suser_setup, "secmodel_user sysctl")
{
const struct sysctlnode *rnode;
sysctl_createv(clog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "models", NULL,
NULL, 0, NULL, 0,
CTL_SECURITY, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "suser", NULL,
NULL, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "name", NULL,
NULL, 0, __UNCONST(SECMODEL_SUSER_NAME), 0,
CTL_CREATE, CTL_EOL);
}
void
secmodel_suser_init(void)
{
}
void
secmodel_suser_start(void)
{
l_generic = kauth_listen_scope(KAUTH_SCOPE_GENERIC,
secmodel_suser_generic_cb, NULL);
l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
secmodel_suser_system_cb, NULL);
l_process = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
secmodel_suser_process_cb, NULL);
l_network = kauth_listen_scope(KAUTH_SCOPE_NETWORK,
secmodel_suser_network_cb, NULL);
l_machdep = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
secmodel_suser_machdep_cb, NULL);
l_device = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
secmodel_suser_device_cb, NULL);
l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE,
secmodel_suser_vnode_cb, NULL);
}
void
secmodel_suser_stop(void)
{
kauth_unlisten_scope(l_generic);
kauth_unlisten_scope(l_system);
kauth_unlisten_scope(l_process);
kauth_unlisten_scope(l_network);
kauth_unlisten_scope(l_machdep);
kauth_unlisten_scope(l_device);
kauth_unlisten_scope(l_vnode);
}
static bool
suser_isroot(kauth_cred_t cred)
{
return kauth_cred_geteuid(cred) == 0;
}
static int
suser_eval(const char *what, void *arg, void *ret)
{
int error = 0;
if (strcasecmp(what, "is-root") == 0) {
kauth_cred_t cred = arg;
bool *bp = ret;
*bp = suser_isroot(cred);
} else {
error = ENOENT;
}
return error;
}
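/*
 * Usage sketch (an assumption for illustration, not code from this file):
 * other kernel components can query this evaluator through secmodel_eval(),
 * e.g.
 *
 *	bool isroot;
 *
 *	if (secmodel_eval(SECMODEL_SUSER_ID, "is-root", cred, &isroot) == 0 &&
 *	    isroot)
 *		... the credential carries traditional superuser privilege ...
 *
 * An unrecognized query string makes suser_eval() return ENOENT.
 */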
static int
suser_modcmd(modcmd_t cmd, void *arg)
{
int error = 0;
switch (cmd) {
case MODULE_CMD_INIT:
error = secmodel_register(&suser_sm,
SECMODEL_SUSER_ID, SECMODEL_SUSER_NAME,
NULL, suser_eval, NULL);
if (error != 0)
printf("suser_modcmd::init: secmodel_register "
"returned %d\n", error);
secmodel_suser_init();
secmodel_suser_start();
break;
case MODULE_CMD_FINI:
secmodel_suser_stop();
error = secmodel_deregister(suser_sm);
if (error != 0)
printf("suser_modcmd::fini: secmodel_deregister "
"returned %d\n", error);
break;
case MODULE_CMD_AUTOUNLOAD:
error = EPERM;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Generic
* Responsibility: Superuser access
*/
int
secmodel_suser_generic_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_GENERIC_ISSUSER:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: System
* Responsibility: Superuser access
*/
int
secmodel_suser_system_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
enum kauth_system_req req;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
switch (action) {
case KAUTH_SYSTEM_CPU:
switch (req) {
case KAUTH_REQ_SYSTEM_CPU_SETSTATE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_DEVMAPPER:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_FS_QUOTA:
switch (req) {
case KAUTH_REQ_SYSTEM_FS_QUOTA_GET:
case KAUTH_REQ_SYSTEM_FS_QUOTA_ONOFF:
case KAUTH_REQ_SYSTEM_FS_QUOTA_MANAGE:
case KAUTH_REQ_SYSTEM_FS_QUOTA_NOLIMIT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SYSVIPC:
switch (req) {
case KAUTH_REQ_SYSTEM_SYSVIPC_BYPASS:
case KAUTH_REQ_SYSTEM_SYSVIPC_SHM_LOCK:
case KAUTH_REQ_SYSTEM_SYSVIPC_SHM_UNLOCK:
case KAUTH_REQ_SYSTEM_SYSVIPC_MSGQ_OVERSIZE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_MOUNT:
switch (req) {
case KAUTH_REQ_SYSTEM_MOUNT_DEVICE:
case KAUTH_REQ_SYSTEM_MOUNT_GET:
case KAUTH_REQ_SYSTEM_MOUNT_NEW:
case KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT:
case KAUTH_REQ_SYSTEM_MOUNT_UPDATE:
case KAUTH_REQ_SYSTEM_MOUNT_UMAP:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
default:
break;
}
break;
case KAUTH_SYSTEM_MQUEUE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_PSET:
switch (req) {
case KAUTH_REQ_SYSTEM_PSET_ASSIGN:
case KAUTH_REQ_SYSTEM_PSET_BIND:
case KAUTH_REQ_SYSTEM_PSET_CREATE:
case KAUTH_REQ_SYSTEM_PSET_DESTROY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_TIME:
switch (req) {
case KAUTH_REQ_SYSTEM_TIME_ADJTIME:
case KAUTH_REQ_SYSTEM_TIME_NTPADJTIME:
case KAUTH_REQ_SYSTEM_TIME_TIMECOUNTERS:
case KAUTH_REQ_SYSTEM_TIME_SYSTEM:
case KAUTH_REQ_SYSTEM_TIME_RTCOFFSET:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SEMAPHORE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_SYSCTL:
switch (req) {
case KAUTH_REQ_SYSTEM_SYSCTL_ADD:
case KAUTH_REQ_SYSTEM_SYSCTL_DELETE:
case KAUTH_REQ_SYSTEM_SYSCTL_DESC:
case KAUTH_REQ_SYSTEM_SYSCTL_MODIFY:
case KAUTH_REQ_SYSTEM_SYSCTL_PRVT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_SWAPCTL:
case KAUTH_SYSTEM_ACCOUNTING:
case KAUTH_SYSTEM_REBOOT:
case KAUTH_SYSTEM_CHROOT:
case KAUTH_SYSTEM_FILEHANDLE:
case KAUTH_SYSTEM_MKNOD:
case KAUTH_SYSTEM_SETIDCORE:
case KAUTH_SYSTEM_MODULE:
case KAUTH_SYSTEM_FS_RESERVEDSPACE:
case KAUTH_SYSTEM_MAP_VA_ZERO:
case KAUTH_SYSTEM_FS_EXTATTR:
case KAUTH_SYSTEM_FS_SNAPSHOT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_DEBUG:
break;
case KAUTH_SYSTEM_CHSYSFLAGS:
/* Deprecated. */
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_SYSTEM_VERIEXEC:
switch (req) {
case KAUTH_REQ_SYSTEM_VERIEXEC_ACCESS:
case KAUTH_REQ_SYSTEM_VERIEXEC_MODIFY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_LFS:
switch (req) {
case KAUTH_REQ_SYSTEM_LFS_MARKV:
case KAUTH_REQ_SYSTEM_LFS_BMAPV:
case KAUTH_REQ_SYSTEM_LFS_SEGCLEAN:
case KAUTH_REQ_SYSTEM_LFS_SEGWAIT:
case KAUTH_REQ_SYSTEM_LFS_FCNTL:
if (isroot)
result = KAUTH_RESULT_ALLOW;
default:
break;
}
break;
case KAUTH_SYSTEM_INTR:
switch (req) {
case KAUTH_REQ_SYSTEM_INTR_AFFINITY:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_SYSTEM_KERNADDR:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Process
* Responsibility: Superuser access
*/
int
secmodel_suser_process_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_PROCESS_SIGNAL:
case KAUTH_PROCESS_KTRACE:
case KAUTH_PROCESS_PROCFS:
case KAUTH_PROCESS_PTRACE:
case KAUTH_PROCESS_SCHEDULER_GETPARAM:
case KAUTH_PROCESS_SCHEDULER_SETPARAM:
case KAUTH_PROCESS_SCHEDULER_GETAFFINITY:
case KAUTH_PROCESS_SCHEDULER_SETAFFINITY:
case KAUTH_PROCESS_SETID:
case KAUTH_PROCESS_KEVENT_FILTER:
case KAUTH_PROCESS_NICE:
case KAUTH_PROCESS_FORK:
case KAUTH_PROCESS_CORENAME:
case KAUTH_PROCESS_STOPFLAG:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_PROCESS_CANSEE: {
unsigned long req;
req = (unsigned long)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_CANSEE_ARGS:
case KAUTH_REQ_PROCESS_CANSEE_ENTRY:
case KAUTH_REQ_PROCESS_CANSEE_OPENFILES:
case KAUTH_REQ_PROCESS_CANSEE_EPROC:
case KAUTH_REQ_PROCESS_CANSEE_KPTR:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
case KAUTH_REQ_PROCESS_CANSEE_ENV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_PROCESS_RLIMIT: {
enum kauth_process_req req;
req = (enum kauth_process_req)(uintptr_t)arg1;
switch (req) {
case KAUTH_REQ_PROCESS_RLIMIT_SET:
case KAUTH_REQ_PROCESS_RLIMIT_GET:
case KAUTH_REQ_PROCESS_RLIMIT_BYPASS:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Network
* Responsibility: Superuser access
*/
int
secmodel_suser_network_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
enum kauth_network_req req;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
req = (enum kauth_network_req)(uintptr_t)arg0;
switch (action) {
case KAUTH_NETWORK_ALTQ:
switch (req) {
case KAUTH_REQ_NETWORK_ALTQ_AFMAP:
case KAUTH_REQ_NETWORK_ALTQ_BLUE:
case KAUTH_REQ_NETWORK_ALTQ_CBQ:
case KAUTH_REQ_NETWORK_ALTQ_CDNR:
case KAUTH_REQ_NETWORK_ALTQ_CONF:
case KAUTH_REQ_NETWORK_ALTQ_FIFOQ:
case KAUTH_REQ_NETWORK_ALTQ_HFSC:
case KAUTH_REQ_NETWORK_ALTQ_JOBS:
case KAUTH_REQ_NETWORK_ALTQ_PRIQ:
case KAUTH_REQ_NETWORK_ALTQ_RED:
case KAUTH_REQ_NETWORK_ALTQ_RIO:
case KAUTH_REQ_NETWORK_ALTQ_WFQ:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_BIND:
switch (req) {
case KAUTH_REQ_NETWORK_BIND_PORT:
case KAUTH_REQ_NETWORK_BIND_PRIVPORT:
case KAUTH_REQ_NETWORK_BIND_ANYADDR:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_FIREWALL:
switch (req) {
case KAUTH_REQ_NETWORK_FIREWALL_FW:
case KAUTH_REQ_NETWORK_FIREWALL_NAT:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_FORWSRCRT:
case KAUTH_NETWORK_ROUTE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_NETWORK_INTERFACE:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_GET:
case KAUTH_REQ_NETWORK_INTERFACE_SET:
case KAUTH_REQ_NETWORK_INTERFACE_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_SETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_FIRMWARE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_BRIDGE:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_BRIDGE_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_BRIDGE_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_PPP:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_PPP_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_PVC:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_PVC_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_SLIP:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_SLIP_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_TUN:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_TUN_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_IPV6:
switch (req) {
case KAUTH_REQ_NETWORK_IPV6_HOPBYHOP:
case KAUTH_REQ_NETWORK_IPV6_JOIN_MULTICAST:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_NFS:
switch (req) {
case KAUTH_REQ_NETWORK_NFS_EXPORT:
case KAUTH_REQ_NETWORK_NFS_SVC:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_SMB:
switch (req) {
case KAUTH_REQ_NETWORK_SMB_SHARE_ACCESS:
case KAUTH_REQ_NETWORK_SMB_SHARE_CREATE:
case KAUTH_REQ_NETWORK_SMB_VC_ACCESS:
case KAUTH_REQ_NETWORK_SMB_VC_CREATE:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_INTERFACE_WG:
switch (req) {
case KAUTH_REQ_NETWORK_INTERFACE_WG_GETPRIV:
case KAUTH_REQ_NETWORK_INTERFACE_WG_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
case KAUTH_NETWORK_SOCKET:
switch (req) {
case KAUTH_REQ_NETWORK_SOCKET_DROP:
case KAUTH_REQ_NETWORK_SOCKET_OPEN:
case KAUTH_REQ_NETWORK_SOCKET_RAWSOCK:
case KAUTH_REQ_NETWORK_SOCKET_SETPRIV:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_NETWORK_SOCKET_CANSEE:
if (isroot) {
result = KAUTH_RESULT_ALLOW;
break;
}
break;
default:
break;
}
break;
case KAUTH_NETWORK_IPSEC:
switch (req) {
case KAUTH_REQ_NETWORK_IPSEC_BYPASS:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Machdep
* Responsibility: Superuser access
*/
int
secmodel_suser_machdep_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_MACHDEP_CPU_UCODE_APPLY:
case KAUTH_MACHDEP_IOPERM_GET:
case KAUTH_MACHDEP_LDT_GET:
case KAUTH_MACHDEP_LDT_SET:
case KAUTH_MACHDEP_MTRR_GET:
case KAUTH_MACHDEP_CACHEFLUSH:
case KAUTH_MACHDEP_IOPERM_SET:
case KAUTH_MACHDEP_IOPL:
case KAUTH_MACHDEP_MTRR_SET:
case KAUTH_MACHDEP_NVRAM:
case KAUTH_MACHDEP_UNMANAGEDMEM:
case KAUTH_MACHDEP_PXG:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_MACHDEP_SVS_DISABLE:
/* Deprecated. */
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
/*
* kauth(9) listener
*
* Security model: Traditional NetBSD
* Scope: Device
* Responsibility: Superuser access
*/
int
secmodel_suser_device_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_DEVICE_BLUETOOTH_SETPRIV:
case KAUTH_DEVICE_BLUETOOTH_SEND:
case KAUTH_DEVICE_BLUETOOTH_RECV:
case KAUTH_DEVICE_TTY_OPEN:
case KAUTH_DEVICE_TTY_PRIVSET:
case KAUTH_DEVICE_TTY_STI:
case KAUTH_DEVICE_TTY_VIRTUAL:
case KAUTH_DEVICE_RND_ADDDATA:
case KAUTH_DEVICE_RND_ADDDATA_ESTIMATE:
case KAUTH_DEVICE_RND_GETPRIV:
case KAUTH_DEVICE_RND_SETPRIV:
case KAUTH_DEVICE_WSCONS_KEYBOARD_BELL:
case KAUTH_DEVICE_WSCONS_KEYBOARD_KEYREPEAT:
case KAUTH_DEVICE_NVMM_CTL:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_DEVICE_BLUETOOTH_BCSP:
case KAUTH_DEVICE_BLUETOOTH_BTUART: {
enum kauth_device_req req;
req = (enum kauth_device_req)(uintptr_t)arg0;
switch (req) {
case KAUTH_REQ_DEVICE_BLUETOOTH_BCSP_ADD:
case KAUTH_REQ_DEVICE_BLUETOOTH_BTUART_ADD:
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
}
case KAUTH_DEVICE_GPIO_PINSET:
/*
* root can access gpio pins; secmodel_securelevel can veto
* this decision.
*/
if (isroot)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
return (result);
}
int
secmodel_suser_vnode_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
bool isroot;
int result;
isroot = suser_isroot(cred);
result = KAUTH_RESULT_DEFER;
if (isroot) {
/* Superuser can execute only if the file's executable. */
if ((action & KAUTH_VNODE_EXECUTE) == 0 ||
(action & KAUTH_VNODE_IS_EXEC))
result = KAUTH_RESULT_ALLOW;
}
return (result);
}
/* $NetBSD: netbsd32_exec_aout.c,v 1.31 2021/01/19 03:20:13 simonb Exp $ */
/* from: NetBSD: exec_aout.c,v 1.15 1996/09/26 23:34:46 cgd Exp */
/*
* Copyright (c) 1998, 2001 Matthew R. Green.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993, 1994 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: netbsd32_exec_aout.c,v 1.31 2021/01/19 03:20:13 simonb Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_aout.h>
#include <sys/resourcevar.h>
#include <sys/signal.h>
#include <sys/signalvar.h>
#include <compat/netbsd32/netbsd32.h>
#ifndef EXEC_AOUT
#define EXEC_AOUT
#endif
#include <compat/netbsd32/netbsd32_exec.h>
#include <machine/frame.h>
#include <machine/netbsd32_machdep.h>
#ifdef COMPAT_NOMID
static int netbsd32_exec_aout_nomid(struct lwp *, struct exec_package *);
#endif
/*
* exec_netbsd32_makecmds(): Check if it's a netbsd32 a.out format
* executable.
*
* Given a lwp pointer and an exec package pointer, see if the referent
* of the epp is in netbsd32 a.out format. Check 'standard' magic
* numbers for this architecture.
*
* This function is
* responsible for creating a set of vmcmds which can be used to build
* the process's vm space and inserting them into the exec package.
*/
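/*
 * Worked example of the midmag decomposition performed below, using
 * hypothetical values (a machine id of 0x8b and ZMAGIC's traditional value
 * 0x10b): if ntohl(execp->a_midmag) == 0x008b010b, then
 *
 *	mid    = (0x008b010b >> 16) & 0x3ff  = 0x8b
 *	magic  =  0x008b010b & 0xffff        = 0x10b
 *	midmag = (0x8b << 16) | 0x10b        = 0x008b010b
 *
 * and the switch below compares this key against
 * (NETBSD32_MID_MACHINE << 16) | {Z,N,O}MAGIC.
 */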
int
exec_netbsd32_makecmds(struct lwp *l, struct exec_package *epp)
{
netbsd32_u_long midmag, magic;
u_short mid;
int error;
struct netbsd32_exec *execp = epp->ep_hdr;
if (epp->ep_hdrvalid < sizeof(struct netbsd32_exec))
return ENOEXEC;
midmag = (netbsd32_u_long)ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0x3ff;
magic = midmag & 0xffff;
midmag = mid << 16 | magic;
/* this is already needed by setup_stack() */
epp->ep_flags |= EXEC_32;
switch (midmag) {
case (NETBSD32_MID_MACHINE << 16) | ZMAGIC:
error = netbsd32_exec_aout_prep_zmagic(l, epp);
break;
case (NETBSD32_MID_MACHINE << 16) | NMAGIC:
error = netbsd32_exec_aout_prep_nmagic(l, epp);
break;
case (NETBSD32_MID_MACHINE << 16) | OMAGIC:
error = netbsd32_exec_aout_prep_omagic(l, epp);
break;
default:
#ifdef COMPAT_NOMID
error = netbsd32_exec_aout_nomid(l, epp);
#else
error = ENOEXEC;
#endif
break;
}
if (error) {
kill_vmcmds(&epp->ep_vmcmds);
epp->ep_flags &= ~EXEC_32;
} else
epp->ep_flags &= ~EXEC_TOPDOWN_VM;
return error;
}
/*
* netbsd32_exec_aout_prep_zmagic(): Prepare a 'native' ZMAGIC binary's
* exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
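/*
 * A sketch of the resulting layout with hypothetical header values
 * (AOUT_LDPGSZ = 0x1000, a_text = 0x4000, a_data = 0x2000, a_bss = 0x1000):
 *
 *	ep_taddr = 0x1000		text, paged in from file offset 0
 *	ep_daddr = 0x1000 + 0x4000	data, paged in from file offset a_text
 *		 = 0x5000
 *	ep_dsize = 0x2000 + 0x1000	data + bss
 *	bss	 = zero-fill at 0x5000 + 0x2000 = 0x7000, length 0x1000
 */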
int
netbsd32_exec_aout_prep_zmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
error = vn_marktext(epp->ep_vp);
if (error)
return error;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, 0, VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_nmagic(): Prepare a 'native' NMAGIC binary's
* exec package
*/
int
netbsd32_exec_aout_prep_nmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_omagic(): Prepare a 'native' OMAGIC binary's
* exec package
*/
int
netbsd32_exec_aout_prep_omagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
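/*
 * Worked example with hypothetical sizes (PAGE_SIZE = 0x1000, a_text = 0x1800,
 * a_data = 0xc00, a_bss = 0): ep_dsize starts as 0xc00, and 0x800 bytes of
 * data fall within the partially filled last text page, so
 *
 *	dsize = 0xc00 + 0x1800 - roundup(0x1800, 0x1000)
 *	      = 0xc00 + 0x1800 - 0x2000 = 0x400
 *
 * and ep_dsize shrinks accordingly; a negative result would be clamped to 0.
 */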
dsize = epp->ep_dsize + execp->a_text - roundup(execp->a_text,
PAGE_SIZE);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
#ifdef COMPAT_NOMID
/*
* netbsd32_exec_aout_prep_oldzmagic():
* Prepare the vmcmds to build a vmspace for an old ZMAGIC
* binary. [386BSD/BSDI/4.4BSD/NetBSD0.8]
*
* Cloned from exec_aout_prep_zmagic() in kern/exec_aout.c; a more verbose
* description of operation is there.
* There were copies of this in the mac68k, hp300, and i386 ports.
*/
static int
netbsd32_exec_aout_prep_oldzmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
error = vn_marktext(epp->ep_vp);
if (error)
return error;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, PAGE_SIZE, /* XXX CLBYTES? */
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, execp->a_data,
epp->ep_daddr, epp->ep_vp,
execp->a_text + PAGE_SIZE, /* XXX CLBYTES? */
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_oldnmagic():
* Prepare the vmcmds to build a vmspace for an old NMAGIC
* binary. [BSDI]
*
* Cloned from exec_aout_prep_nmagic() in kern/exec_aout.c; with text starting
* at 0.
* XXX: There must be a better way to share this code.
*/
static int
netbsd32_exec_aout_prep_oldnmagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
epp->ep_vm_minaddr = exec_vm_minaddr(VM_MIN_ADDRESS);
epp->ep_vm_maxaddr = VM_MAXUSER_ADDRESS32;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct netbsd32_exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* netbsd32_exec_aout_prep_oldomagic():
* Prepare the vmcmds to build a vmspace for an old OMAGIC
* binary. [BSDI]
*
* Cloned from exec_aout_prep_omagic() in kern/exec_aout.c; with text starting
* at 0.
* XXX: There must be a better way to share this code.
*/
static int
netbsd32_exec_aout_prep_oldomagic(struct lwp *l, struct exec_package *epp)
{
struct netbsd32_exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = 0;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct netbsd32_exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = roundup(epp->ep_daddr + execp->a_data, PAGE_SIZE);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
dsize = epp->ep_dsize + execp->a_text - roundup(execp->a_text,
PAGE_SIZE);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
static int
netbsd32_exec_aout_nomid(struct lwp *l, struct exec_package *epp)
{
int error;
u_long midmag, magic;
u_short mid;
struct exec *execp = epp->ep_hdr;
/* check on validity of epp->ep_hdr performed by exec_netbsd32_makecmds */
midmag = ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0xffff;
magic = midmag & 0xffff;
if (magic == 0) {
magic = (execp->a_midmag & 0xffff);
mid = MID_ZERO;
}
midmag = mid << 16 | magic;
switch (midmag) {
case (MID_ZERO << 16) | ZMAGIC:
/*
* 386BSD's ZMAGIC format:
*/
return netbsd32_exec_aout_prep_oldzmagic(l, epp);
break;
case (MID_ZERO << 16) | QMAGIC:
/*
* BSDI's QMAGIC format:
* same as new ZMAGIC format, but with different magic number
*/
return netbsd32_exec_aout_prep_zmagic(l, epp);
break;
case (MID_ZERO << 16) | NMAGIC:
/*
* BSDI's NMAGIC format:
* same as NMAGIC format, but with different magic number
* and with text starting at 0.
*/
return netbsd32_exec_aout_prep_oldnmagic(l, epp);
case (MID_ZERO << 16) | OMAGIC:
/*
* BSDI's OMAGIC format:
* same as OMAGIC format, but with different magic number
* and with text starting at 0.
*/
return netbsd32_exec_aout_prep_oldomagic(l, epp);
default:
return ENOEXEC;
}
return error;
}
#endif
/* $NetBSD: uvm_device.c,v 1.80 2022/07/07 13:27:02 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_device.c,v 1.1.2.9 1998/02/06 05:11:47 chs Exp
*/
/*
* uvm_device.c: the device pager.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_device.c,v 1.80 2022/07/07 13:27:02 riastradh Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <uvm/uvm.h>
#include <uvm/uvm_device.h>
#include <uvm/uvm_pmap.h>
/*
* private global data structure
*
* we keep a list of active device objects in the system.
*/
LIST_HEAD(udv_list_struct, uvm_device);
static struct udv_list_struct udv_list;
static kmutex_t udv_lock __cacheline_aligned;
/*
* functions
*/
static void udv_init(void);
static void udv_reference(struct uvm_object *);
static void udv_detach(struct uvm_object *);
static int udv_fault(struct uvm_faultinfo *, vaddr_t,
struct vm_page **, int, int, vm_prot_t,
int);
/*
* master pager structure
*/
const struct uvm_pagerops uvm_deviceops = {
.pgo_init = udv_init,
.pgo_reference = udv_reference,
.pgo_detach = udv_detach,
.pgo_fault = udv_fault,
};
/*
* the ops!
*/
/*
* udv_init
*
* init pager private data structures.
*/
static void
udv_init(void)
{
LIST_INIT(&udv_list);
mutex_init(&udv_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* udv_attach
*
* get a VM object that is associated with a device. allocate a new
* one if needed.
*
* => caller must _not_ already be holding the lock on the uvm_object.
* => in fact, nothing should be locked so that we can sleep here.
*/
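/*
 * The body below is an instance of a common lookup-or-allocate pattern,
 * sketched here with generic names (not code from this file):
 *
 *	for (;;) {
 *		lock(list_lock);
 *		obj = find(list, key);
 *		if (obj != NULL) {
 *			take a reference, sleeping if the entry is held;
 *			unlock(list_lock);
 *			return obj;
 *		}
 *		unlock(list_lock);
 *		new = allocate();		may sleep
 *		lock(list_lock);
 *		if (find(list, key) != NULL) {	lost the race; retry
 *			unlock(list_lock);
 *			free(new);
 *			continue;
 *		}
 *		insert(list, new);
 *		unlock(list_lock);
 *		return new;
 *	}
 */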
struct uvm_object *
udv_attach(dev_t device, vm_prot_t accessprot,
voff_t off, /* used only for access check */
vsize_t size /* used only for access check */)
{
struct uvm_device *udv, *lcv;
const struct cdevsw *cdev;
dev_mmap_t *mapfn;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "(device=%#jx)", device,0,0,0);
KASSERT(size > 0);
/*
* before we do anything, ensure this device supports mmap
*/
cdev = cdevsw_lookup(device);
if (cdev == NULL) {
return NULL;
}
mapfn = cdev->d_mmap;
if (mapfn == NULL || mapfn == nommap) {
return NULL;
}
/*
* Negative offsets on the object are not allowed, unless the
* device has affirmatively set D_NEGOFFSAFE.
*/
if ((cdev->d_flag & D_NEGOFFSAFE) == 0 && off != UVM_UNKNOWN_OFFSET) {
if (off < 0)
return NULL;
#if SIZE_MAX > UINT32_MAX /* XXX -Wtype-limits */
if (size > __type_max(voff_t))
return NULL;
#endif
if (off > __type_max(voff_t) - size)
return NULL;
}
/*
* Check that the specified range of the device allows the
* desired protection.
*
* XXX assumes VM_PROT_* == PROT_*
* XXX clobbers off and size, but nothing else here needs them.
*/
do {
KASSERTMSG((off % PAGE_SIZE) == 0, "off=%jd", (intmax_t)off);
KASSERTMSG(size >= PAGE_SIZE, "size=%"PRIuVSIZE, size);
if (cdev_mmap(device, off, accessprot) == -1)
return NULL;
KASSERT(off <= __type_max(voff_t) - PAGE_SIZE ||
(cdev->d_flag & D_NEGOFFSAFE) != 0);
if (__predict_false(off > __type_max(voff_t) - PAGE_SIZE)) {
/*
* off += PAGE_SIZE, with two's-complement
* wraparound, or
*
* off += PAGE_SIZE - 2*(VOFF_MAX + 1).
*/
CTASSERT(MIN_PAGE_SIZE >= 2);
off -= __type_max(voff_t);
off += PAGE_SIZE - 2;
off -= __type_max(voff_t);
} else {
off += PAGE_SIZE;
}
size -= PAGE_SIZE;
} while (size != 0);
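/*
 * Worked example of the wraparound step above, assuming a 64-bit signed
 * voff_t (so VOFF_MAX = 2^63 - 1) and PAGE_SIZE = 0x1000: if
 * off == 2^63 - 0x1000, the plain "off += PAGE_SIZE" would overflow, so the
 * loop instead computes
 *
 *	off -= VOFF_MAX;	off becomes -0xfff
 *	off += PAGE_SIZE - 2;	off becomes -1
 *	off -= VOFF_MAX;	off becomes -2^63
 *
 * which equals the two's-complement wrap of off + PAGE_SIZE without any
 * intermediate signed overflow.
 */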
/*
* keep looping until we get it
*/
for (;;) {
/*
* first, attempt to find it on the main list
*/
mutex_enter(&udv_lock);
LIST_FOREACH(lcv, &udv_list, u_list) {
if (device == lcv->u_device)
break;
}
/*
* got it on main list. put a hold on it and unlock udv_lock.
*/
if (lcv) {
/*
* if someone else has a hold on it, sleep and start
* over again.
*/
if (lcv->u_flags & UVM_DEVICE_HOLD) {
lcv->u_flags |= UVM_DEVICE_WANTED;
UVM_UNLOCK_AND_WAIT(lcv, &udv_lock, false,
"udv_attach",0);
continue;
}
/* we are now holding it */
lcv->u_flags |= UVM_DEVICE_HOLD;
mutex_exit(&udv_lock);
/*
* bump reference count, unhold, return.
*/
rw_enter(lcv->u_obj.vmobjlock, RW_WRITER);
lcv->u_obj.uo_refs++;
rw_exit(lcv->u_obj.vmobjlock);
mutex_enter(&udv_lock);
if (lcv->u_flags & UVM_DEVICE_WANTED)
wakeup(lcv);
lcv->u_flags &= ~(UVM_DEVICE_WANTED|UVM_DEVICE_HOLD);
mutex_exit(&udv_lock);
return &lcv->u_obj;
}
/*
* Did not find it on main list. Need to allocate a new one.
*/
mutex_exit(&udv_lock);
/* Note: both calls may allocate memory and sleep. */
udv = kmem_alloc(sizeof(*udv), KM_SLEEP);
uvm_obj_init(&udv->u_obj, &uvm_deviceops, true, 1);
mutex_enter(&udv_lock);
/*
* now we have to double check to make sure no one added it
* to the list while we were sleeping...
*/
LIST_FOREACH(lcv, &udv_list, u_list) {
if (device == lcv->u_device)
break;
}
/*
* did we lose a race to someone else?
* free our memory and retry.
*/
if (lcv) {
mutex_exit(&udv_lock);
uvm_obj_destroy(&udv->u_obj, true);
kmem_free(udv, sizeof(*udv));
continue;
}
/*
* we have it! init the data structures, add to list
* and return.
*/
udv->u_flags = 0;
udv->u_device = device;
LIST_INSERT_HEAD(&udv_list, udv, u_list);
mutex_exit(&udv_lock);
return &udv->u_obj;
}
/*NOTREACHED*/
}
/*
* udv_reference
*
* add a reference to a VM object. Note that the reference count must
* already be one (the passed in reference) so there is no chance of the
* udv being released or locked out here.
*
* => caller must call with object unlocked.
*/
static void
udv_reference(struct uvm_object *uobj)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
rw_enter(uobj->vmobjlock, RW_WRITER);
uobj->uo_refs++;
UVMHIST_LOG(maphist, "<- done (uobj=%#jx, ref = %jd)",
(uintptr_t)uobj, uobj->uo_refs,0,0);
rw_exit(uobj->vmobjlock);
}
/*
* udv_detach
*
* remove a reference to a VM object.
*
* => caller must call with object unlocked and map locked.
*/
static void
udv_detach(struct uvm_object *uobj)
{
struct uvm_device *udv = (struct uvm_device *)uobj;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* loop until done
*/
again:
rw_enter(uobj->vmobjlock, RW_WRITER);
if (uobj->uo_refs > 1) {
uobj->uo_refs--;
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(maphist," <- done, uobj=%#jx, ref=%jd",
(uintptr_t)uobj,uobj->uo_refs,0,0);
return;
}
/*
* is it being held? if so, wait until others are done.
*/
mutex_enter(&udv_lock);
if (udv->u_flags & UVM_DEVICE_HOLD) {
udv->u_flags |= UVM_DEVICE_WANTED;
rw_exit(uobj->vmobjlock);
UVM_UNLOCK_AND_WAIT(udv, &udv_lock, false, "udv_detach",0);
goto again;
}
/*
* got it! nuke it now.
*/
LIST_REMOVE(udv, u_list);
if (udv->u_flags & UVM_DEVICE_WANTED)
wakeup(udv);
mutex_exit(&udv_lock);
rw_exit(uobj->vmobjlock);
uvm_obj_destroy(uobj, true);
kmem_free(udv, sizeof(*udv));
UVMHIST_LOG(maphist," <- done, freed uobj=%#jx", (uintptr_t)uobj,
0, 0, 0);
}
/*
* udv_fault: non-standard fault routine for device "pages"
*
* => rather than having a "get" function, we have a fault routine
* since we don't return vm_pages we need full control over the
* pmap_enter map-in
* => all the usual fault data structures are locked by the caller
* (i.e. maps(read), amap (if any), uobj)
* => on return, we unlock all fault data structures
* => flags: PGO_ALLPAGES: get all of the pages
* PGO_LOCKED: fault data structures are locked
* XXX: currently PGO_LOCKED is always required ... consider removing
* it as a flag
* => NOTE: vaddr is the VA of pps[0] in ufi->entry, _NOT_ pps[centeridx]
*/
static int
udv_fault(struct uvm_faultinfo *ufi, vaddr_t vaddr, struct vm_page **pps,
int npages, int centeridx, vm_prot_t access_type,
int flags)
{
struct vm_map_entry *entry = ufi->entry;
struct uvm_object *uobj = entry->object.uvm_obj;
struct uvm_device *udv = (struct uvm_device *)uobj;
vaddr_t curr_va;
off_t curr_offset;
paddr_t paddr, mdpgno;
u_int mmapflags;
int lcv, retval;
dev_t device;
vm_prot_t mapprot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
UVMHIST_LOG(maphist," flags=%#jx", flags,0,0,0);
/*
* we do not allow device mappings to be mapped copy-on-write
* so we kill any attempt to do so here.
*/
if (UVM_ET_ISCOPYONWRITE(entry)) {
UVMHIST_LOG(maphist, "<- failed -- COW entry (etype=%#jx)",
entry->etype, 0,0,0);
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj);
return EIO;
}
/*
* get device map function.
*/
device = udv->u_device;
if (cdevsw_lookup(device) == NULL) {
/* XXX This should not happen */
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj);
return EIO;
}
/*
* now we must determine the offset in udv to use and the VA to
* use for pmap_enter. note that we always use orig_map's pmap
* for pmap_enter (even if we have a submap). since virtual
* addresses in a submap must match the main map, this is ok.
*/
/* udv offset = (offset from start of entry) + entry's offset */
curr_offset = entry->offset + (vaddr - entry->start);
/* pmap va = vaddr (virtual address of pps[0]) */
curr_va = vaddr;
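/*
 * Worked example with hypothetical values: for an entry with
 * entry->start = 0x20000 and entry->offset = 0x8000, a fault delivering
 * vaddr = 0x23000 for pps[0] yields
 *
 *	curr_offset = 0x8000 + (0x23000 - 0x20000) = 0xb000
 *
 * as the device offset handed to cdev_mmap() for the first page; both
 * curr_offset and curr_va then advance by PAGE_SIZE per iteration below.
 */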
/*
* loop over the page range entering in as needed
*/
retval = 0;
for (lcv = 0 ; lcv < npages ; lcv++, curr_offset += PAGE_SIZE,
curr_va += PAGE_SIZE) {
if ((flags & PGO_ALLPAGES) == 0 && lcv != centeridx)
continue;
if (pps[lcv] == PGO_DONTCARE)
continue;
mdpgno = cdev_mmap(device, curr_offset, access_type);
if (mdpgno == -1) {
retval = EIO;
break;
}
paddr = pmap_phys_address(mdpgno);
mmapflags = pmap_mmap_flags(mdpgno);
mapprot = ufi->entry->protection;
UVMHIST_LOG(maphist,
" MAPPING: device: pm=%#jx, va=%#jx, pa=%#jx, at=%jd",
(uintptr_t)ufi->orig_map->pmap, curr_va, paddr, mapprot);
if (pmap_enter(ufi->orig_map->pmap, curr_va, paddr, mapprot,
PMAP_CANFAIL | mapprot | mmapflags) != 0) {
/*
* pmap_enter() didn't have the resource to
* enter this mapping. Unlock everything,
* wait for the pagedaemon to free up some
* pages, and then tell uvm_fault() to start
* the fault again.
*
* XXX Needs some rethinking for the PGO_ALLPAGES
* XXX case.
*/
pmap_update(ufi->orig_map->pmap); /* sync what we have so far */
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap,
uobj);
return ENOMEM;
}
}
pmap_update(ufi->orig_map->pmap);
uvmfault_unlockall(ufi, ufi->entry->aref.ar_amap, uobj);
return retval;
}
/* $NetBSD: if43_20.c,v 1.5 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 2018 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Paul Goyette
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if43_20.c,v 1.5 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/mbuf.h> /* for MLEN */
#include <sys/protosw.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/bpf.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <net/if_gre.h>
#include <net/if_tap.h>
#include <net80211/ieee80211_ioctl.h>
#include <compat/common/compat_mod.h>
static int
if43_cvtcmd_20(u_long ncmd)
{
switch (ncmd) {
case OSIOCG80211STATS:
case OSIOCG80211ZSTATS:
return 0;
default:
return EINVAL;
}
}
void
if43_20_init(void)
{
MODULE_HOOK_SET(if43_cvtcmd_20_hook, if43_cvtcmd_20);
}
void
if43_20_fini(void)
{
MODULE_HOOK_UNSET(if43_cvtcmd_20_hook);
}
/* $NetBSD: uipc_syscalls_50.c,v 1.12 2022/09/28 15:32:09 msaitoh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/msg.h>
#include <sys/sysctl.h>
#include <sys/syscallargs.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <compat/net/if.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_mod.h>
/*ARGSUSED*/
static int
compat_ifdatareq(struct lwp *l, u_long cmd, void *data)
{
struct if_data ifi;
struct ifdatareq50 *ifdr = data;
struct ifnet *ifp;
int error;
/* Validate arguments. */
switch (cmd) {
case OSIOCGIFDATA:
case OSIOCZIFDATA:
break;
default:
return ENOSYS;
}
ifp = ifunit(ifdr->ifdr_name);
if (ifp == NULL)
return ENXIO;
/* Do work. */
switch (cmd) {
case OSIOCGIFDATA:
if_export_if_data(ifp, &ifi, false);
ifdatan2o(&ifdr->ifdr_data, &ifi);
return 0;
case OSIOCZIFDATA:
if (l != NULL) {
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp,
(void *)cmd, NULL);
if (error != 0)
return error;
}
if_export_if_data(ifp, &ifi, true);
ifdatan2o(&ifdr->ifdr_data, &ifi);
/* XXX if_lastchange? */
return 0;
default:
/* Impossible due to above validation, but makes gcc happy. */
return ENOSYS;
}
}
void
uipc_syscalls_50_init(void)
{
MODULE_HOOK_SET(uipc_syscalls_50_hook, compat_ifdatareq);
}
void
uipc_syscalls_50_fini(void)
{
MODULE_HOOK_UNSET(uipc_syscalls_50_hook);
}
/* $NetBSD: kern_lock.c,v 1.188 2024/01/14 11:46:05 andvar Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_lock.c,v 1.188 2024/01/14 11:46:05 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_lockdebug.h"
#endif
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/lock.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/cpu.h>
#include <sys/syslog.h>
#include <sys/atomic.h>
#include <sys/lwp.h>
#include <sys/pserialize.h>
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
#include <sys/ksyms.h>
#endif
#include <machine/lock.h>
#include <dev/lockstat.h>
#define RETURN_ADDRESS (uintptr_t)__builtin_return_address(0)
bool kernel_lock_dodebug;
__cpu_simple_lock_t kernel_lock[CACHE_LINE_SIZE / sizeof(__cpu_simple_lock_t)]
__cacheline_aligned;
void
assert_sleepable(void)
{
const char *reason;
long pctr;
bool idle;
if (__predict_false(panicstr != NULL)) {
return;
}
LOCKDEBUG_BARRIER(kernel_lock, 1);
/*
* Avoid disabling/re-enabling preemption here since this
* routine may be called in delicate situations.
*/
do {
pctr = lwp_pctr();
idle = CURCPU_IDLE_P();
} while (__predict_false(pctr != lwp_pctr()));
reason = NULL;
if (__predict_false(idle) && !cold) {
reason = "idle";
goto panic;
}
if (__predict_false(cpu_intr_p())) {
reason = "interrupt";
goto panic;
}
if (__predict_false(cpu_softintr_p())) {
reason = "softint";
goto panic;
}
if (__predict_false(!pserialize_not_in_read_section())) {
reason = "pserialize";
goto panic;
}
return;
panic:
panic("%s: %s caller=%p", __func__, reason, (void *)RETURN_ADDRESS);
}
/*
* Functions for manipulating the kernel_lock. We put them here
* so that they show up in profiles.
*/
#define _KERNEL_LOCK_ABORT(msg) \
LOCKDEBUG_ABORT(__func__, __LINE__, kernel_lock, &_kernel_lock_ops, msg)
#ifdef LOCKDEBUG
#define _KERNEL_LOCK_ASSERT(cond) \
do { \
if (!(cond)) \
_KERNEL_LOCK_ABORT("assertion failed: " #cond); \
} while (/* CONSTCOND */ 0)
#else
#define _KERNEL_LOCK_ASSERT(cond) /* nothing */
#endif
static void _kernel_lock_dump(const volatile void *, lockop_printer_t);
lockops_t _kernel_lock_ops = {
.lo_name = "Kernel lock",
.lo_type = LOCKOPS_SPIN,
.lo_dump = _kernel_lock_dump,
};
#ifdef LOCKDEBUG
#ifdef DDB
#include <ddb/ddb.h>
#endif
static void
kernel_lock_trace_ipi(void *cookie)
{
printf("%s[%d %s]: hogging kernel lock\n", cpu_name(curcpu()),
curlwp->l_lid,
curlwp->l_name ? curlwp->l_name : curproc->p_comm);
#ifdef DDB
db_stacktrace();
#endif
}
#endif
/*
* Initialize the kernel lock.
*/
void
kernel_lock_init(void)
{
__cpu_simple_lock_init(kernel_lock);
kernel_lock_dodebug = LOCKDEBUG_ALLOC(kernel_lock, &_kernel_lock_ops,
RETURN_ADDRESS);
}
CTASSERT(CACHE_LINE_SIZE >= sizeof(__cpu_simple_lock_t));
/*
* Print debugging information about the kernel lock.
*/
static void
_kernel_lock_dump(const volatile void *junk, lockop_printer_t pr)
{
struct cpu_info *ci = curcpu();
(void)junk;
pr("curcpu holds : %18d wanted by: %#018lx\n",
ci->ci_biglock_count, (long)ci->ci_biglock_wanted);
}
/*
* Acquire 'nlocks' holds on the kernel lock.
*
* Although it may not look it, this is one of the most central, intricate
* routines in the kernel, and tons of code elsewhere depends on its exact
* behaviour. If you change something in here, expect it to bite you in the
* rear.
*/
void
_kernel_lock(int nlocks)
{
struct cpu_info *ci;
LOCKSTAT_TIMER(spintime);
LOCKSTAT_FLAG(lsflag);
struct lwp *owant;
#ifdef LOCKDEBUG
static struct cpu_info *kernel_lock_holder;
u_int spins = 0;
u_int starttime = getticks();
#endif
int s;
struct lwp *l = curlwp;
_KERNEL_LOCK_ASSERT(nlocks > 0);
s = splvm();
ci = curcpu();
if (ci->ci_biglock_count != 0) {
_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
ci->ci_biglock_count += nlocks;
l->l_blcnt += nlocks;
splx(s);
return;
}
_KERNEL_LOCK_ASSERT(l->l_blcnt == 0);
LOCKDEBUG_WANTLOCK(kernel_lock_dodebug, kernel_lock, RETURN_ADDRESS,
0);
if (__predict_true(__cpu_simple_lock_try(kernel_lock))) {
#ifdef LOCKDEBUG
kernel_lock_holder = curcpu();
#endif
ci->ci_biglock_count = nlocks;
l->l_blcnt = nlocks;
LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
RETURN_ADDRESS, 0);
splx(s);
return;
}
/*
* To remove the ordering constraint between adaptive mutexes
* and kernel_lock we must make it appear as if this thread is
* blocking. For non-interlocked mutex release, a store fence
* is required to ensure that the result of any mutex_exit()
* by the current LWP becomes visible on the bus before the set
* of ci->ci_biglock_wanted becomes visible.
*
* This membar_producer matches the membar_consumer in
* mutex_vector_enter.
*
* That way, if l has just released a mutex, mutex_vector_enter
* can't see this store ci->ci_biglock_wanted := l until it
* will also see the mutex_exit store mtx->mtx_owner := 0 which
* clears the has-waiters bit.
*/
membar_producer();
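/*
 * Abstract shape of the pairing described above (a sketch, not the actual
 * code of mutex_vector_enter):
 *
 *	this CPU				other CPU
 *	  mtx->mtx_owner = 0;	(mutex_exit)	  w = ci->ci_biglock_wanted;
 *	  membar_producer();			  membar_consumer();
 *	  ci->ci_biglock_wanted = l;		  o = mtx->mtx_owner;
 *
 * If the other CPU observes w == l, the barriers guarantee it also observes
 * o == 0, i.e. it cannot see the new biglock_wanted value while still seeing
 * a stale mutex owner.
 */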
owant = ci->ci_biglock_wanted;
atomic_store_relaxed(&ci->ci_biglock_wanted, l);
#if defined(DIAGNOSTIC) && !defined(LOCKDEBUG)
l->l_ld_wanted = __builtin_return_address(0);
#endif
/*
* Spin until we acquire the lock. Once we have it, record the
* time spent with lockstat.
*/
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_START_TIMER(lsflag, spintime);
do {
splx(s);
while (__SIMPLELOCK_LOCKED_P(kernel_lock)) {
#ifdef LOCKDEBUG
if (SPINLOCK_SPINOUT(spins) && start_init_exec &&
(getticks() - starttime) > 10*hz) {
ipi_msg_t msg = {
.func = kernel_lock_trace_ipi,
};
kpreempt_disable();
ipi_unicast(&msg, kernel_lock_holder);
ipi_wait(&msg);
kpreempt_enable();
_KERNEL_LOCK_ABORT("spinout");
}
#endif
SPINLOCK_BACKOFF_HOOK;
SPINLOCK_SPIN_HOOK;
}
s = splvm();
} while (!__cpu_simple_lock_try(kernel_lock));
ci->ci_biglock_count = nlocks;
l->l_blcnt = nlocks;
LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKDEBUG_LOCKED(kernel_lock_dodebug, kernel_lock, NULL,
RETURN_ADDRESS, 0);
if (owant == NULL) {
LOCKSTAT_EVENT_RA(lsflag, kernel_lock,
LB_KERNEL_LOCK | LB_SPIN, 1, spintime, RETURN_ADDRESS);
}
LOCKSTAT_EXIT(lsflag);
splx(s);
/*
* Now that we have kernel_lock, reset ci_biglock_wanted. This
* store must be visible on other CPUs before a mutex_exit() on
* this CPU can test the has-waiters bit.
*
* This membar_enter matches the membar_enter in
* mutex_vector_enter. (Yes, not membar_exit -- the legacy
* naming is confusing, but store-before-load usually pairs
* with store-before-load, in the extremely rare cases where it
* is used at all.)
*
* That way, mutex_vector_enter can't see this store
* ci->ci_biglock_wanted := owant until it has set the
* has-waiters bit.
*/
(void)atomic_swap_ptr(&ci->ci_biglock_wanted, owant);
#ifndef __HAVE_ATOMIC_AS_MEMBAR
membar_enter();
#endif
#ifdef LOCKDEBUG
kernel_lock_holder = curcpu();
#endif
}
/*
* Release 'nlocks' holds on the kernel lock. If 'nlocks' is zero, release
* all holds.
*/
void
_kernel_unlock(int nlocks, int *countp)
{
struct cpu_info *ci;
u_int olocks;
int s;
struct lwp *l = curlwp;
_KERNEL_LOCK_ASSERT(nlocks < 2);
olocks = l->l_blcnt;
if (olocks == 0) {
_KERNEL_LOCK_ASSERT(nlocks <= 0);
if (countp != NULL)
*countp = 0;
return;
}
_KERNEL_LOCK_ASSERT(__SIMPLELOCK_LOCKED_P(kernel_lock));
if (nlocks == 0)
nlocks = olocks;
else if (nlocks == -1) {
nlocks = 1;
_KERNEL_LOCK_ASSERT(olocks == 1);
}
s = splvm();
ci = curcpu();
_KERNEL_LOCK_ASSERT(ci->ci_biglock_count >= l->l_blcnt);
if (ci->ci_biglock_count == nlocks) {
LOCKDEBUG_UNLOCKED(kernel_lock_dodebug, kernel_lock,
RETURN_ADDRESS, 0);
ci->ci_biglock_count = 0;
__cpu_simple_unlock(kernel_lock);
l->l_blcnt -= nlocks;
splx(s);
if (l->l_dopreempt)
kpreempt(0);
} else {
ci->ci_biglock_count -= nlocks;
l->l_blcnt -= nlocks;
splx(s);
}
if (countp != NULL)
*countp = olocks;
}
bool
_kernel_locked_p(void)
{
return __SIMPLELOCK_LOCKED_P(kernel_lock);
}
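/*
 * Illustrative sketch (not part of this file's logic): callers normally
 * do not use _kernel_lock()/_kernel_unlock() directly but go through
 * the standard KERNEL_LOCK()/KERNEL_UNLOCK_ONE() wrappers.  The
 * function name below is hypothetical.
 */
#if 0
static void
example_biglock_section(void)
{

        KERNEL_LOCK(1, NULL);           /* acquire one hold of kernel_lock */
        /* ... code still covered by the big-lock protocol ... */
        KERNEL_UNLOCK_ONE(NULL);        /* release that single hold */
}
#endif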
/* $NetBSD: kern_ntptime.c,v 1.64 2022/10/26 23:23:52 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
***********************************************************************
* *
* Copyright (c) David L. Mills 1993-2001 *
* *
* Permission to use, copy, modify, and distribute this software and *
* its documentation for any purpose and without fee is hereby *
* granted, provided that the above copyright notice appears in all *
* copies and that both the copyright notice and this permission *
* notice appear in supporting documentation, and that the name *
* University of Delaware not be used in advertising or publicity *
* pertaining to distribution of the software without specific, *
* written prior permission. The University of Delaware makes no *
* representations about the suitability this software for any *
* purpose. It is provided "as is" without express or implied *
* warranty. *
* *
**********************************************************************/
/*
* Adapted from the original sources for FreeBSD and timecounters by:
* Poul-Henning Kamp <phk@FreeBSD.org>.
*
* The 32bit version of the "LP" macros seems a bit past its "sell by"
* date so I have retained only the 64bit version and included it directly
* in this file.
*
* Only minor changes done to interface with the timecounters over in
* sys/kern/kern_clock.c. Some of the comments below may be (even more)
* confusing and/or plain wrong in that context.
*/
#include <sys/cdefs.h>
/* __FBSDID("$FreeBSD: src/sys/kern/kern_ntptime.c,v 1.59 2005/05/28 14:34:41 rwatson Exp $"); */
__KERNEL_RCSID(0, "$NetBSD: kern_ntptime.c,v 1.64 2022/10/26 23:23:52 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#endif
#include <sys/param.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <compat/sys/timex.h>
/*
* Single-precision macros for 64-bit machines
*/
typedef int64_t l_fp;
#define L_ADD(v, u) ((v) += (u))
#define L_SUB(v, u) ((v) -= (u))
#define L_ADDHI(v, a) ((v) += (int64_t)(a) << 32)
#define L_NEG(v) ((v) = -(v))
#define L_RSHIFT(v, n) \
do { \
if ((v) < 0) \
(v) = -(-(v) >> (n)); \
else \
(v) = (v) >> (n); \
} while (0)
#define L_MPY(v, a) ((v) *= (a))
#define L_CLR(v) ((v) = 0)
#define L_ISNEG(v) ((v) < 0)
#define L_LINT(v, a) ((v) = (int64_t)((uint64_t)(a) << 32))
#define L_GINT(v) ((v) < 0 ? -(-(v) >> 32) : (v) >> 32)
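/*
 * Worked example (illustrative only): an l_fp value stores whole
 * nanoseconds in the upper 32 bits and a binary fraction in the lower
 * 32 bits, so L_LINT(v, 250) yields v = 250 << 32 and L_GINT(v)
 * recovers 250.  L_RSHIFT(v, n) divides by 2^n, rounding toward zero
 * for negative values, which is why it carries the explicit sign test
 * above.  The helper name below is hypothetical and the block is not
 * compiled.
 */
#if 0
static void
example_l_fp_usage(void)
{
        l_fp v;

        L_LINT(v, 250);                 /* v represents 250 ns */
        L_RSHIFT(v, 2);                 /* v now represents 62.5 ns */
        KASSERT(L_GINT(v) == 62);       /* integer part truncates to 62 */
}
#endif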
#ifdef NTP
/*
* Generic NTP kernel interface
*
* These routines constitute the Network Time Protocol (NTP) interfaces
* for user and daemon application programs. The ntp_gettime() routine
* provides the time, maximum error (synch distance) and estimated error
* (dispersion) to client user application programs. The ntp_adjtime()
* routine is used by the NTP daemon to adjust the system clock to an
* externally derived time. The time offset and related variables set by
* this routine are used by other routines in this module to adjust the
* phase and frequency of the clock discipline loop which controls the
* system clock.
*
* When the kernel time is reckoned directly in nanoseconds (NTP_NANO
* defined), the time at each tick interrupt is derived directly from
* the kernel time variable. When the kernel time is reckoned in
* microseconds, (NTP_NANO undefined), the time is derived from the
* kernel time variable together with a variable representing the
* leftover nanoseconds at the last tick interrupt. In either case, the
* current nanosecond time is reckoned from these values plus an
* interpolated value derived by the clock routines in another
* architecture-specific module. The interpolation can use either a
* dedicated counter or a processor cycle counter (PCC) implemented in
* some architectures.
*
* Note that all routines must run at priority splclock or higher.
*/
/*
* Phase/frequency-lock loop (PLL/FLL) definitions
*
* The nanosecond clock discipline uses two variable types, time
* variables and frequency variables. Both types are represented as 64-
* bit fixed-point quantities with the decimal point between two 32-bit
* halves. On a 32-bit machine, each half is represented as a single
* word and mathematical operations are done using multiple-precision
* arithmetic. On a 64-bit machine, ordinary computer arithmetic is
* used.
*
* A time variable is a signed 64-bit fixed-point number in ns and
* fraction. It represents the remaining time offset to be amortized
* over succeeding tick interrupts. The maximum time offset is about
* 0.5 s and the resolution is about 2.3e-10 ns.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s| ns |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
* A frequency variable is a signed 64-bit fixed-point number in ns/s
* and fraction. It represents the ns and fraction to be added to the
* kernel time variable at each second. The maximum frequency offset is
* about +-500000 ns/s and the resolution is about 2.3e-10 ns/s.
*
* 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 3 3
* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* |s s s s s s s s s s s s s| ns/s |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
* | fraction |
* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*/
/*
* The following variables establish the state of the PLL/FLL and the
* residual time and frequency offset of the local clock.
*/
#define SHIFT_PLL 4 /* PLL loop gain (shift) */
#define SHIFT_FLL 2 /* FLL loop gain (shift) */
static int time_state = TIME_OK; /* clock state */
static int time_status = STA_UNSYNC; /* clock status bits */
static long time_tai; /* TAI offset (s) */
static long time_monitor; /* last time offset scaled (ns) */
static long time_constant; /* poll interval (shift) (s) */
static long time_precision = 1; /* clock precision (ns) */
static long time_maxerror = MAXPHASE / 1000; /* maximum error (us) */
static long time_esterror = MAXPHASE / 1000; /* estimated error (us) */
static time_t time_reftime; /* time at last adjustment (s) */
static l_fp time_offset; /* time offset (ns) */
static l_fp time_freq; /* frequency offset (ns/s) */
#endif /* NTP */
static l_fp time_adj; /* tick adjust (ns/s) */
int64_t time_adjtime; /* correction from adjtime(2) (usec) */
#ifdef NTP
#ifdef PPS_SYNC
/*
* The following variables are used when a pulse-per-second (PPS) signal
* is available and connected via a modem control lead. They establish
* the engineering parameters of the clock discipline loop when
* controlled by the PPS signal.
*/
#define PPS_FAVG 2 /* min freq avg interval (s) (shift) */
#define PPS_FAVGDEF 8 /* default freq avg int (s) (shift) */
#define PPS_FAVGMAX 15 /* max freq avg interval (s) (shift) */
#define PPS_PAVG 4 /* phase avg interval (s) (shift) */
#define PPS_VALID 120 /* PPS signal watchdog max (s) */
#define PPS_MAXWANDER 100000 /* max PPS wander (ns/s) */
#define PPS_POPCORN 2 /* popcorn spike threshold (shift) */
static struct timespec pps_tf[3]; /* phase median filter */
static l_fp pps_freq; /* scaled frequency offset (ns/s) */
static long pps_fcount; /* frequency accumulator */
static long pps_jitter; /* nominal jitter (ns) */
static long pps_stabil; /* nominal stability (scaled ns/s) */
static long pps_lastsec; /* time at last calibration (s) */
static int pps_valid; /* signal watchdog counter */
static int pps_shift = PPS_FAVG; /* interval duration (s) (shift) */
static int pps_shiftmax = PPS_FAVGDEF; /* max interval duration (s) (shift) */
static int pps_intcnt; /* wander counter */
/*
* PPS signal quality monitors
*/
static long pps_calcnt; /* calibration intervals */
static long pps_jitcnt; /* jitter limit exceeded */
static long pps_stbcnt; /* stability limit exceeded */
static long pps_errcnt; /* calibration errors */
#endif /* PPS_SYNC */
/*
* End of phase/frequency-lock loop (PLL/FLL) definitions
*/
static void hardupdate(long offset);
/*
* ntp_gettime() - NTP user application interface
*/
void
ntp_gettime(struct ntptimeval *ntv)
{
memset(ntv, 0, sizeof(*ntv));
mutex_spin_enter(&timecounter_lock);
nanotime(&ntv->time);
ntv->maxerror = time_maxerror;
ntv->esterror = time_esterror;
ntv->tai = time_tai;
ntv->time_state = time_state;
mutex_spin_exit(&timecounter_lock);
}
/* ARGSUSED */
/*
* ntp_adjtime() - NTP daemon application interface
*/
int
sys_ntp_adjtime(struct lwp *l, const struct sys_ntp_adjtime_args *uap, register_t *retval)
{
/* {
syscallarg(struct timex *) tp;
} */
struct timex ntv;
int error;
error = copyin((void *)SCARG(uap, tp), (void *)&ntv, sizeof(ntv));
if (error != 0)
return (error);
if (ntv.modes != 0 && (error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_NTPADJTIME, NULL,
NULL, NULL)) != 0)
return (error);
ntp_adjtime1(&ntv);
error = copyout((void *)&ntv, (void *)SCARG(uap, tp), sizeof(ntv));
if (!error)
*retval = ntp_timestatus();
return error;
}
void
ntp_adjtime1(struct timex *ntv)
{
long freq;
int modes;
/*
* Update selected clock variables - only the superuser can
* change anything. Note that there is no error checking here on
* the assumption the superuser should know what it is doing.
* Note that either the time constant or TAI offset are loaded
* from the ntv.constant member, depending on the mode bits. If
* the STA_PLL bit in the status word is cleared, the state and
* status words are reset to the initial values at boot.
*/
mutex_spin_enter(&timecounter_lock);
modes = ntv->modes;
if (modes != 0)
/* We need to save the system time during shutdown */
time_adjusted |= 2;
if (modes & MOD_MAXERROR)
time_maxerror = ntv->maxerror;
if (modes & MOD_ESTERROR)
time_esterror = ntv->esterror;
if (modes & MOD_STATUS) {
if (time_status & STA_PLL && !(ntv->status & STA_PLL)) {
time_state = TIME_OK;
time_status = STA_UNSYNC;
#ifdef PPS_SYNC
pps_shift = PPS_FAVG;
#endif /* PPS_SYNC */
}
time_status &= STA_RONLY;
time_status |= ntv->status & ~STA_RONLY;
}
if (modes & MOD_TIMECONST) {
if (ntv->constant < 0)
time_constant = 0;
else if (ntv->constant > MAXTC)
time_constant = MAXTC;
else
time_constant = ntv->constant;
}
if (modes & MOD_TAI) {
if (ntv->constant > 0) /* XXX zero & negative numbers ? */
time_tai = ntv->constant;
}
#ifdef PPS_SYNC
if (modes & MOD_PPSMAX) {
if (ntv->shift < PPS_FAVG)
pps_shiftmax = PPS_FAVG;
else if (ntv->shift > PPS_FAVGMAX)
pps_shiftmax = PPS_FAVGMAX;
else
pps_shiftmax = ntv->shift;
}
#endif /* PPS_SYNC */
if (modes & MOD_NANO)
time_status |= STA_NANO;
if (modes & MOD_MICRO)
time_status &= ~STA_NANO;
if (modes & MOD_CLKB)
time_status |= STA_CLK;
if (modes & MOD_CLKA)
time_status &= ~STA_CLK;
if (modes & MOD_FREQUENCY) {
freq = MIN(INT32_MAX, MAX(INT32_MIN, ntv->freq));
freq = (freq * (int64_t)1000) >> 16;
if (freq > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (freq < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
else {
/*
* ntv.freq is [PPM * 2^16] = [us/s * 2^16]
* time_freq is [ns/s * 2^32]
*/
time_freq = ntv->freq * 1000LL * 65536LL;
}
#ifdef PPS_SYNC
pps_freq = time_freq;
#endif /* PPS_SYNC */
}
if (modes & MOD_OFFSET) {
if (time_status & STA_NANO) {
hardupdate(ntv->offset);
} else {
long offset = ntv->offset;
offset = MIN(offset, MAXPHASE/1000);
offset = MAX(offset, -MAXPHASE/1000);
hardupdate(offset * 1000);
}
}
/*
* Retrieve all clock variables. Note that the TAI offset is
* returned only by ntp_gettime();
*/
if (time_status & STA_NANO)
ntv->offset = L_GINT(time_offset);
else
ntv->offset = L_GINT(time_offset) / 1000; /* XXX rounding ? */
if (time_freq < 0)
ntv->freq = L_GINT(-((-time_freq / 1000LL) << 16));
else
ntv->freq = L_GINT((time_freq / 1000LL) << 16);
ntv->maxerror = time_maxerror;
ntv->esterror = time_esterror;
ntv->status = time_status;
ntv->constant = time_constant;
if (time_status & STA_NANO)
ntv->precision = time_precision;
else
ntv->precision = time_precision / 1000;
ntv->tolerance = MAXFREQ * SCALE_PPM;
#ifdef PPS_SYNC
ntv->shift = pps_shift;
ntv->ppsfreq = L_GINT((pps_freq / 1000LL) << 16);
if (time_status & STA_NANO)
ntv->jitter = pps_jitter;
else
ntv->jitter = pps_jitter / 1000;
ntv->stabil = pps_stabil;
ntv->calcnt = pps_calcnt;
ntv->errcnt = pps_errcnt;
ntv->jitcnt = pps_jitcnt;
ntv->stbcnt = pps_stbcnt;
#endif /* PPS_SYNC */
mutex_spin_exit(&timecounter_lock);
}
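/*
 * Worked example (illustrative, not compiled): the MOD_FREQUENCY path
 * above converts ntv->freq, which is in PPM scaled by 2^16 (us/s << 16),
 * into time_freq, which is in ns/s scaled by 2^32.  A daemon asking for
 * +12.5 PPM passes freq = 12.5 * 65536 = 819200; multiplying by 1000
 * (us -> ns) and by 65536 (2^16 -> 2^32 rescaling) gives an l_fp value
 * of 12500 << 32, i.e. +12500 ns/s.  The helper name is hypothetical.
 */
#if 0
static void
example_mod_frequency_scaling(void)
{
        long freq = 819200;                     /* +12.5 PPM, scaled 2^16 */
        l_fp f = (int64_t)freq * 1000LL * 65536LL;

        KASSERT(L_GINT(f) == 12500);            /* 12500 ns/s */
}
#endif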
#endif /* NTP */
/*
* second_overflow() - called after ntp_tick_adjust()
*
* This routine is ordinarily called immediately following the above
* routine ntp_tick_adjust(). While these two routines are normally
* combined, they are separated here only for the purposes of
* simulation.
*/
void
ntp_update_second(int64_t *adjustment, time_t *newsec)
{
int tickrate;
l_fp ftemp; /* 32/64-bit temporary */
KASSERT(mutex_owned(&timecounter_lock));
#ifdef NTP
/*
* On rollover of the second both the nanosecond and microsecond
* clocks are updated and the state machine cranked as
* necessary. The phase adjustment to be used for the next
* second is calculated and the maximum error is increased by
* the tolerance.
*/
time_maxerror += MAXFREQ / 1000;
/*
* Leap second processing. If in leap-insert state at
* the end of the day, the system clock is set back one
* second; if in leap-delete state, the system clock is
* set ahead one second. The nano_time() routine or
* external clock driver will insure that reported time
* is always monotonic.
*/
switch (time_state) {
/*
* No warning.
*/
case TIME_OK:
if (time_status & STA_INS)
time_state = TIME_INS;
else if (time_status & STA_DEL)
time_state = TIME_DEL;
break;
/*
* Insert second 23:59:60 following second
* 23:59:59.
*/
case TIME_INS:
if (!(time_status & STA_INS))
time_state = TIME_OK;
else if ((*newsec) % 86400 == 0) {
(*newsec)--;
time_state = TIME_OOP;
time_tai++;
}
break;
/*
* Delete second 23:59:59.
*/
case TIME_DEL:
if (!(time_status & STA_DEL))
time_state = TIME_OK;
else if (((*newsec) + 1) % 86400 == 0) {
(*newsec)++;
time_tai--;
time_state = TIME_WAIT;
}
break;
/*
* Insert second in progress.
*/
case TIME_OOP:
time_state = TIME_WAIT;
break;
/*
* Wait for status bits to clear.
*/
case TIME_WAIT:
if (!(time_status & (STA_INS | STA_DEL)))
time_state = TIME_OK;
}
/*
* Compute the total time adjustment for the next second
* in ns. The offset is reduced by a factor depending on
* whether the PPS signal is operating. Note that the
* value is in effect scaled by the clock frequency,
* since the adjustment is added at each tick interrupt.
*/
ftemp = time_offset;
#ifdef PPS_SYNC
/* XXX even if PPS signal dies we should finish adjustment ? */
if (time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)
L_RSHIFT(ftemp, pps_shift);
else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#else
L_RSHIFT(ftemp, SHIFT_PLL + time_constant);
#endif /* PPS_SYNC */
time_adj = ftemp;
L_SUB(time_offset, ftemp);
L_ADD(time_adj, time_freq);
#ifdef PPS_SYNC
if (pps_valid > 0)
pps_valid--;
else
time_status &= ~STA_PPSSIGNAL;
#endif /* PPS_SYNC */
#else /* !NTP */
L_CLR(time_adj);
#endif /* !NTP */
/*
* Apply any correction from adjtime(2). If the clock is more than
* one second off, slew at 5 ms/s (5000 PPM); otherwise slew at
* 500 us/s (500 PPM) until only the final < 500 us of the last
* second remains to be applied.
*/
if (time_adjtime != 0) {
if (time_adjtime > 1000000)
tickrate = 5000;
else if (time_adjtime < -1000000)
tickrate = -5000;
else if (time_adjtime > 500)
tickrate = 500;
else if (time_adjtime < -500)
tickrate = -500;
else
tickrate = time_adjtime;
time_adjtime -= tickrate;
L_LINT(ftemp, tickrate * 1000);
L_ADD(time_adj, ftemp);
}
*adjustment = time_adj;
}
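/*
 * Illustrative sketch (not compiled into the kernel): the adjtime(2)
 * slew above hands out at most 5000 us of correction per second while
 * more than one second remains, 500 us/s below that, and the final
 * sub-500 us residue in one last second.  The hypothetical helper
 * below just replays that schedule to show how long a correction
 * takes; a 3 s correction needs 400 + 1999 + 1 = 2400 seconds.
 */
#if 0
static int64_t
example_adjtime_slew_seconds(int64_t adj /* us to slew */)
{
        int64_t seconds = 0;

        while (adj != 0) {
                int64_t rate;

                if (adj > 1000000)
                        rate = 5000;
                else if (adj < -1000000)
                        rate = -5000;
                else if (adj > 500)
                        rate = 500;
                else if (adj < -500)
                        rate = -500;
                else
                        rate = adj;
                adj -= rate;
                seconds++;
        }
        return seconds;
}
#endif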
/*
* ntp_init() - initialize variables and structures
*
* This routine must be called after the kernel variables hz and tick
* are set or changed and before the next tick interrupt. In this
* particular implementation, these values are assumed set elsewhere in
* the kernel. The design allows the clock frequency and tick interval
* to be changed while the system is running. So, this routine should
* probably be integrated with the code that does that.
*/
void
ntp_init(void)
{
/*
* The following variables are initialized only at startup. Only
* those structures not cleared by the compiler need to be
* initialized, and these only in the simulator. In the actual
* kernel, any nonzero values here will quickly evaporate.
*/
L_CLR(time_adj);
#ifdef NTP
L_CLR(time_offset);
L_CLR(time_freq);
#ifdef PPS_SYNC
pps_tf[0].tv_sec = pps_tf[0].tv_nsec = 0;
pps_tf[1].tv_sec = pps_tf[1].tv_nsec = 0;
pps_tf[2].tv_sec = pps_tf[2].tv_nsec = 0;
pps_fcount = 0;
L_CLR(pps_freq);
#endif /* PPS_SYNC */
#endif
}
#ifdef NTP
/*
* hardupdate() - local clock update
*
* This routine is called by ntp_adjtime() to update the local clock
* phase and frequency. The implementation is of an adaptive-parameter,
* hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
* time and frequency offset estimates for each call. If the kernel PPS
* discipline code is configured (PPS_SYNC), the PPS signal itself
* determines the new time offset, instead of the calling argument.
* Presumably, calls to ntp_adjtime() occur only when the caller
* believes the local clock is valid within some bound (+-128 ms with
* NTP). If the caller's time is far different than the PPS time, an
* argument will ensue, and it's not clear who will lose.
*
* For uncompensated quartz crystal oscillators and nominal update
* intervals less than 256 s, operation should be in phase-lock mode,
* where the loop is disciplined to phase. For update intervals greater
* than 1024 s, operation should be in frequency-lock mode, where the
* loop is disciplined to frequency. Between 256 s and 1024 s, the mode
* is selected by the STA_MODE status bit.
*
* Note: splclock() is in effect.
*/
void
hardupdate(long offset)
{
long mtemp;
l_fp ftemp;
KASSERT(mutex_owned(&timecounter_lock));
/*
* Select how the phase is to be controlled and from which
* source. If the PPS signal is present and enabled to
* discipline the time, the PPS offset is used; otherwise, the
* argument offset is used.
*/
if (!(time_status & STA_PLL))
return;
if (!(time_status & STA_PPSTIME && time_status &
STA_PPSSIGNAL)) {
if (offset > MAXPHASE)
time_monitor = MAXPHASE;
else if (offset < -MAXPHASE)
time_monitor = -MAXPHASE;
else
time_monitor = offset;
L_LINT(time_offset, time_monitor);
}
/*
* Select how the frequency is to be controlled and in which
* mode (PLL or FLL). If the PPS signal is present and enabled
* to discipline the frequency, the PPS frequency is used;
* otherwise, the argument offset is used to compute it.
*/
if (time_status & STA_PPSFREQ && time_status & STA_PPSSIGNAL) {
time_reftime = time_second;
return;
}
if (time_status & STA_FREQHOLD || time_reftime == 0)
time_reftime = time_second;
mtemp = time_second - time_reftime;
L_LINT(ftemp, time_monitor);
L_RSHIFT(ftemp, (SHIFT_PLL + 2 + time_constant) << 1);
L_MPY(ftemp, mtemp);
L_ADD(time_freq, ftemp);
time_status &= ~STA_MODE;
if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp >
MAXSEC)) {
L_LINT(ftemp, (time_monitor << 4) / mtemp);
L_RSHIFT(ftemp, SHIFT_FLL + 4);
L_ADD(time_freq, ftemp);
time_status |= STA_MODE;
}
time_reftime = time_second;
if (L_GINT(time_freq) > MAXFREQ)
L_LINT(time_freq, MAXFREQ);
else if (L_GINT(time_freq) < -MAXFREQ)
L_LINT(time_freq, -MAXFREQ);
}
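/*
 * Worked example (illustrative): with SHIFT_PLL = 4 and
 * time_constant = 0, the PLL term above right-shifts the offset by
 * (4 + 2 + 0) << 1 = 12 bits, i.e. divides it by 4096, and then
 * multiplies by the elapsed interval mtemp.  A reported offset of
 * +4096 ns one second after the previous update therefore nudges
 * time_freq by exactly +1 ns/s; larger time constants reduce the
 * gain further.
 */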
#ifdef PPS_SYNC
/*
* hardpps() - discipline CPU clock oscillator to external PPS signal
*
* This routine is called at each PPS interrupt in order to discipline
* the CPU clock oscillator to the PPS signal. It measures the PPS phase
* and leaves it in a handy spot for the hardclock() routine. It
* integrates successive PPS phase differences and calculates the
* frequency offset. This is used in hardclock() to discipline the CPU
* clock oscillator so that intrinsic frequency error is cancelled out.
* The code requires the caller to capture the time and hardware counter
* value at the on-time PPS signal transition.
*
* Note that, on some Unix systems, this routine runs at an interrupt
* priority level higher than the timer interrupt routine hardclock().
* Therefore, the variables used are distinct from the hardclock()
* variables, except for certain exceptions: The PPS frequency pps_freq
* and phase pps_offset variables are determined by this routine and
* updated atomically. The time_tolerance variable can be considered a
* constant, since it is infrequently changed, and then only when the
* PPS signal is disabled. The watchdog counter pps_valid is updated
* once per second by hardclock() and is atomically cleared in this
* routine.
*/
void
hardpps(struct timespec *tsp, /* time at PPS */
long nsec /* hardware counter at PPS */)
{
long u_sec, u_nsec, v_nsec; /* temps */
l_fp ftemp;
KASSERT(mutex_owned(&timecounter_lock));
/*
* The signal is first processed by a range gate and frequency
* discriminator. The range gate rejects noise spikes outside
* the range +-500 us. The frequency discriminator rejects input
* signals with apparent frequency outside the range 1 +-500
* PPM. If two hits occur in the same second, we ignore the
* later hit; if not and a hit occurs outside the range gate,
* keep the later hit for later comparison, but do not process
* it.
*/
time_status |= STA_PPSSIGNAL | STA_PPSJITTER;
time_status &= ~(STA_PPSWANDER | STA_PPSERROR);
pps_valid = PPS_VALID;
u_sec = tsp->tv_sec;
u_nsec = tsp->tv_nsec;
if (u_nsec >= (NANOSECOND >> 1)) {
u_nsec -= NANOSECOND;
u_sec++;
}
v_nsec = u_nsec - pps_tf[0].tv_nsec;
if (u_sec == pps_tf[0].tv_sec && v_nsec < NANOSECOND -
MAXFREQ)
return;
pps_tf[2] = pps_tf[1];
pps_tf[1] = pps_tf[0];
pps_tf[0].tv_sec = u_sec;
pps_tf[0].tv_nsec = u_nsec;
/*
* Compute the difference between the current and previous
* counter values. If the difference exceeds 0.5 s, assume it
* has wrapped around, so correct 1.0 s. If the result exceeds
* the tick interval, the sample point has crossed a tick
* boundary during the last second, so correct the tick. Very
* intricate.
*/
u_nsec = nsec;
if (u_nsec > (NANOSECOND >> 1))
u_nsec -= NANOSECOND;
else if (u_nsec < -(NANOSECOND >> 1))
u_nsec += NANOSECOND;
pps_fcount += u_nsec;
if (v_nsec > MAXFREQ || v_nsec < -MAXFREQ)
return;
time_status &= ~STA_PPSJITTER;
/*
* A three-stage median filter is used to help denoise the PPS
* time. The median sample becomes the time offset estimate; the
* difference between the other two samples becomes the time
* dispersion (jitter) estimate.
*/
if (pps_tf[0].tv_nsec > pps_tf[1].tv_nsec) {
if (pps_tf[1].tv_nsec > pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 0 1 2 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[2].tv_nsec;
} else if (pps_tf[2].tv_nsec > pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 2 0 1 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[1].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 0 2 1 */
u_nsec = pps_tf[0].tv_nsec - pps_tf[1].tv_nsec;
}
} else {
if (pps_tf[1].tv_nsec < pps_tf[2].tv_nsec) {
v_nsec = pps_tf[1].tv_nsec; /* 2 1 0 */
u_nsec = pps_tf[2].tv_nsec - pps_tf[0].tv_nsec;
} else if (pps_tf[2].tv_nsec < pps_tf[0].tv_nsec) {
v_nsec = pps_tf[0].tv_nsec; /* 1 0 2 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[2].tv_nsec;
} else {
v_nsec = pps_tf[2].tv_nsec; /* 1 2 0 */
u_nsec = pps_tf[1].tv_nsec - pps_tf[0].tv_nsec;
}
}
/*
* Nominal jitter is due to PPS signal noise and interrupt
* latency. If it exceeds the popcorn threshold, the sample is
* discarded. Otherwise, if so enabled, the time offset is
* updated. We can tolerate a modest loss of data here without
* much degrading time accuracy.
*/
if (u_nsec > (pps_jitter << PPS_POPCORN)) {
time_status |= STA_PPSJITTER;
pps_jitcnt++;
} else if (time_status & STA_PPSTIME) {
time_monitor = -v_nsec;
L_LINT(time_offset, time_monitor);
}
pps_jitter += (u_nsec - pps_jitter) >> PPS_FAVG;
u_sec = pps_tf[0].tv_sec - pps_lastsec;
if (u_sec < (1 << pps_shift))
return;
/*
* At the end of the calibration interval the difference between
* the first and last counter values becomes the scaled
* frequency. It will later be divided by the length of the
* interval to determine the frequency update. If the frequency
* exceeds a sanity threshold, or if the actual calibration
* interval is not equal to the expected length, the data are
* discarded. We can tolerate a modest loss of data here without
* much degrading frequency accuracy.
*/
pps_calcnt++;
v_nsec = -pps_fcount;
pps_lastsec = pps_tf[0].tv_sec;
pps_fcount = 0;
u_nsec = MAXFREQ << pps_shift;
if (v_nsec > u_nsec || v_nsec < -u_nsec || u_sec != (1 <<
pps_shift)) {
time_status |= STA_PPSERROR;
pps_errcnt++;
return;
}
/*
* Here the raw frequency offset and wander (stability) is
* calculated. If the wander is less than the wander threshold
* for four consecutive averaging intervals, the interval is
* doubled; if it is greater than the threshold for four
* consecutive intervals, the interval is halved. The scaled
* frequency offset is converted to frequency offset. The
* stability metric is calculated as the average of recent
* frequency changes, but is used only for performance
* monitoring.
*/
L_LINT(ftemp, v_nsec);
L_RSHIFT(ftemp, pps_shift);
L_SUB(ftemp, pps_freq);
u_nsec = L_GINT(ftemp);
if (u_nsec > PPS_MAXWANDER) {
L_LINT(ftemp, PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else if (u_nsec < -PPS_MAXWANDER) {
L_LINT(ftemp, -PPS_MAXWANDER);
pps_intcnt--;
time_status |= STA_PPSWANDER;
pps_stbcnt++;
} else {
pps_intcnt++;
}
if (pps_intcnt >= 4) {
pps_intcnt = 4;
if (pps_shift < pps_shiftmax) {
pps_shift++;
pps_intcnt = 0;
}
} else if (pps_intcnt <= -4 || pps_shift > pps_shiftmax) {
pps_intcnt = -4;
if (pps_shift > PPS_FAVG) {
pps_shift--;
pps_intcnt = 0;
}
}
if (u_nsec < 0)
u_nsec = -u_nsec;
pps_stabil += (u_nsec * SCALE_PPM - pps_stabil) >> PPS_FAVG;
/*
* The PPS frequency is recalculated and clamped to the maximum
* MAXFREQ. If enabled, the system clock frequency is updated as
* well.
*/
L_ADD(pps_freq, ftemp);
u_nsec = L_GINT(pps_freq);
if (u_nsec > MAXFREQ)
L_LINT(pps_freq, MAXFREQ);
else if (u_nsec < -MAXFREQ)
L_LINT(pps_freq, -MAXFREQ);
if (time_status & STA_PPSFREQ)
time_freq = pps_freq;
}
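/*
 * Illustrative sketch of the three-stage median filter used above:
 * pick the middle of the last three phase samples and report the
 * spread of the outer two as the jitter estimate.  Hypothetical
 * helper, not used by the kernel and not compiled.
 */
#if 0
static long
example_median3(long a, long b, long c, long *jitter)
{
        long med;

        if (a > b) {
                if (b > c) {            /* a b c */
                        med = b;
                        *jitter = a - c;
                } else if (c > a) {     /* c a b */
                        med = a;
                        *jitter = c - b;
                } else {                /* a c b */
                        med = c;
                        *jitter = a - b;
                }
        } else {
                if (b < c) {            /* c b a */
                        med = b;
                        *jitter = c - a;
                } else if (c < a) {     /* b a c */
                        med = a;
                        *jitter = b - c;
                } else {                /* b c a */
                        med = c;
                        *jitter = b - a;
                }
        }
        return med;
}
#endif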
#endif /* PPS_SYNC */
#endif /* NTP */
#ifdef NTP
int
ntp_timestatus(void)
{
int rv;
/*
* Status word error decode. If any of these conditions
* occur, an error is returned, instead of the status
* word. Most applications will care only about the fact
* the system clock may not be trusted, not about the
* details.
*
* Hardware or software error
*/
mutex_spin_enter(&timecounter_lock);
if ((time_status & (STA_UNSYNC | STA_CLOCKERR)) ||
/*
* PPS signal lost when either time or frequency
* synchronization requested
*/
(time_status & (STA_PPSFREQ | STA_PPSTIME) &&
!(time_status & STA_PPSSIGNAL)) ||
/*
* PPS jitter exceeded when time synchronization
* requested
*/
(time_status & STA_PPSTIME &&
time_status & STA_PPSJITTER) ||
/*
* PPS wander exceeded or calibration error when
* frequency synchronization requested
*/
(time_status & STA_PPSFREQ &&
time_status & (STA_PPSWANDER | STA_PPSERROR)))
rv = TIME_ERROR;
else
rv = time_state;
mutex_spin_exit(&timecounter_lock);
return rv;
}
/*ARGSUSED*/
/*
* ntp_gettime() - NTP user application interface
*/
int
sys___ntp_gettime50(struct lwp *l, const struct sys___ntp_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(struct ntptimeval *) ntvp;
} */
struct ntptimeval ntv;
int error = 0;
if (SCARG(uap, ntvp)) {
ntp_gettime(&ntv);
error = copyout((void *)&ntv, (void *)SCARG(uap, ntvp),
sizeof(ntv));
}
if (!error) {
*retval = ntp_timestatus();
}
return(error);
}
/*
* return information about kernel precision timekeeping
*/
static int
sysctl_kern_ntptime(SYSCTLFN_ARGS)
{
struct sysctlnode node;
struct ntptimeval ntv;
ntp_gettime(&ntv);
node = *rnode;
node.sysctl_data = &ntv;
node.sysctl_size = sizeof(ntv);
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
SYSCTL_SETUP(sysctl_kern_ntptime_setup, "sysctl kern.ntptime node setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "ntptime",
SYSCTL_DESCR("Kernel clock values for NTP"),
sysctl_kern_ntptime, 0, NULL,
sizeof(struct ntptimeval),
CTL_KERN, KERN_NTPTIME, CTL_EOL);
}
#endif /* NTP */
/* $NetBSD: uipc_accf.c,v 1.13 2014/02/25 18:30:11 pooka Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 2000 Paycounter, Inc.
* Copyright (c) 2005 Robert N. M. Watson
* Author: Alfred Perlstein <alfred@paycounter.com>, <alfred@FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_accf.c,v 1.13 2014/02/25 18:30:11 pooka Exp $");
#define ACCEPT_FILTER_MOD
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/rwlock.h>
#include <sys/protosw.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/queue.h>
#include <sys/once.h>
#include <sys/atomic.h>
#include <sys/module.h>
static krwlock_t accept_filter_lock;
static LIST_HEAD(, accept_filter) accept_filtlsthd =
LIST_HEAD_INITIALIZER(&accept_filtlsthd);
/*
* Names of Accept filter sysctl objects
*/
static struct sysctllog *ctllog;
static void
sysctl_net_inet_accf_setup(void)
{
sysctl_createv(&ctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET, CTL_EOL);
sysctl_createv(&ctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "accf",
SYSCTL_DESCR("Accept filters"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET, SO_ACCEPTFILTER, CTL_EOL);
}
int
accept_filt_add(struct accept_filter *filt)
{
struct accept_filter *p;
accept_filter_init();
rw_enter(&accept_filter_lock, RW_WRITER);
LIST_FOREACH(p, &accept_filtlsthd, accf_next) {
if (strcmp(p->accf_name, filt->accf_name) == 0) {
rw_exit(&accept_filter_lock);
return EEXIST;
}
}
LIST_INSERT_HEAD(&accept_filtlsthd, filt, accf_next);
rw_exit(&accept_filter_lock);
return 0;
}
int
accept_filt_del(struct accept_filter *p)
{
rw_enter(&accept_filter_lock, RW_WRITER);
if (p->accf_refcnt != 0) {
rw_exit(&accept_filter_lock);
return EBUSY;
}
LIST_REMOVE(p, accf_next);
rw_exit(&accept_filter_lock);
return 0;
}
struct accept_filter *
accept_filt_get(char *name)
{
struct accept_filter *p;
char buf[32];
u_int gen;
do {
rw_enter(&accept_filter_lock, RW_READER);
LIST_FOREACH(p, &accept_filtlsthd, accf_next) {
if (strcmp(p->accf_name, name) == 0) {
atomic_inc_uint(&p->accf_refcnt);
break;
}
}
rw_exit(&accept_filter_lock);
if (p != NULL) {
break;
}
/* Try to autoload a module to satisfy the request. */
strcpy(buf, "accf_");
strlcat(buf, name, sizeof(buf));
gen = module_gen;
(void)module_autoload(buf, MODULE_CLASS_ANY);
} while (gen != module_gen);
return p;
}
/*
* Accept filter initialization routine.
* This should be called only once.
*/
static int
accept_filter_init0(void)
{
rw_init(&accept_filter_lock);
sysctl_net_inet_accf_setup();
return 0;
}
/*
* Initialization routine: This can also be replaced with
* accept_filt_generic_mod_event for attaching a new accept filter.
*/
void
accept_filter_init(void)
{
static ONCE_DECL(accept_filter_init_once);
RUN_ONCE(&accept_filter_init_once, accept_filter_init0);
}
int
accept_filt_getopt(struct socket *so, struct sockopt *sopt)
{
struct accept_filter_arg afa;
int error;
KASSERT(solocked(so));
if ((so->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
goto out;
}
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
error = EINVAL;
goto out;
}
memset(&afa, 0, sizeof(afa));
strcpy(afa.af_name, so->so_accf->so_accept_filter->accf_name);
if (so->so_accf->so_accept_filter_str != NULL)
strcpy(afa.af_arg, so->so_accf->so_accept_filter_str);
error = sockopt_set(sopt, &afa, sizeof(afa));
out:
return error;
}
/*
* Simple delete case, with socket locked.
*/
int
accept_filt_clear(struct socket *so)
{
struct accept_filter_arg afa;
struct accept_filter *afp;
struct socket *so2, *next;
struct so_accf *af;
KASSERT(solocked(so));
if ((so->so_options & SO_ACCEPTCONN) == 0) {
return EINVAL;
}
if (so->so_accf != NULL) {
/* Break in-flight processing. */
for (so2 = TAILQ_FIRST(&so->so_q0); so2 != NULL; so2 = next) {
next = TAILQ_NEXT(so2, so_qe);
if (so2->so_upcall == NULL) {
continue;
}
so2->so_upcall = NULL;
so2->so_upcallarg = NULL;
so2->so_options &= ~SO_ACCEPTFILTER;
so2->so_rcv.sb_flags &= ~SB_UPCALL;
soisconnected(so2);
}
af = so->so_accf;
afp = af->so_accept_filter;
if (afp != NULL && afp->accf_destroy != NULL) {
(*afp->accf_destroy)(so);
}
if (af->so_accept_filter_str != NULL) {
kmem_free(af->so_accept_filter_str,
sizeof(afa.af_name));
}
kmem_free(af, sizeof(*af));
so->so_accf = NULL;
atomic_dec_uint(&afp->accf_refcnt);
}
so->so_options &= ~SO_ACCEPTFILTER;
return 0;
}
/*
* setsockopt() for accept filters. Called with the socket unlocked,
* will always return it locked.
*/
int
accept_filt_setopt(struct socket *so, const struct sockopt *sopt)
{
struct accept_filter_arg afa;
struct accept_filter *afp;
struct so_accf *newaf;
int error;
accept_filter_init();
if (sopt == NULL || sopt->sopt_size == 0) {
solock(so);
return accept_filt_clear(so);
}
/*
* Pre-allocate any memory we may need later to avoid blocking at
* untimely moments. This does not optimize for invalid arguments.
*/
error = sockopt_get(sopt, &afa, sizeof(afa));
if (error) {
solock(so);
return error;
}
afa.af_name[sizeof(afa.af_name)-1] = '\0';
afa.af_arg[sizeof(afa.af_arg)-1] = '\0';
afp = accept_filt_get(afa.af_name);
if (afp == NULL) {
solock(so);
return ENOENT;
}
/*
* Allocate the new accept filter instance storage. We may
* have to free it again later if we fail to attach it. If
* attached properly, 'newaf' is NULLed to avoid a free()
* while in use.
*/
newaf = kmem_zalloc(sizeof(*newaf), KM_SLEEP);
if (afp->accf_create != NULL && afa.af_name[0] != '\0') {
/*
* FreeBSD did a variable-size allocation here
* with the actual string length from afa.af_name
* but it is so short, why bother tracking it?
* XXX as others have noted, this is an API mistake;
* XXX accept_filter_arg should have a mandatory namelen.
* XXX (but it's a bit too late to fix that now)
*/
newaf->so_accept_filter_str =
kmem_alloc(sizeof(afa.af_name), KM_SLEEP);
strcpy(newaf->so_accept_filter_str, afa.af_name);
}
/*
* Require a listen socket; don't try to replace an existing filter
* without first removing it.
*/
solock(so);
if ((so->so_options & SO_ACCEPTCONN) == 0 || so->so_accf != NULL) {
error = EINVAL;
goto out;
}
/*
* Invoke the accf_create() method of the filter if required. The
* socket lock is held over this call, so create methods for filters
* shouldn't block.
*/
if (afp->accf_create != NULL) {
newaf->so_accept_filter_arg =
(*afp->accf_create)(so, afa.af_arg);
if (newaf->so_accept_filter_arg == NULL) {
error = EINVAL;
goto out;
}
}
newaf->so_accept_filter = afp;
so->so_accf = newaf;
so->so_options |= SO_ACCEPTFILTER;
newaf = NULL;
out:
if (newaf != NULL) {
if (newaf->so_accept_filter_str != NULL)
kmem_free(newaf->so_accept_filter_str,
sizeof(afa.af_name));
kmem_free(newaf, sizeof(*newaf));
atomic_dec_uint(&afp->accf_refcnt);
}
return error;
}
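/*
 * Illustrative userland sketch (not part of the kernel, not compiled):
 * a server attaches an accept filter to a listening socket with
 * setsockopt(2) and SO_ACCEPTFILTER, which lands in
 * accept_filt_setopt() above.  The filter name "dataready" is only an
 * example and must match a filter that is actually available; the
 * function name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int
example_attach_accf(int listenfd)
{
        struct accept_filter_arg afa;

        memset(&afa, 0, sizeof(afa));
        strlcpy(afa.af_name, "dataready", sizeof(afa.af_name));
        return setsockopt(listenfd, SOL_SOCKET, SO_ACCEPTFILTER,
            &afa, sizeof(afa));
}
#endif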
/* $NetBSD: kern_ras.c,v 1.42 2022/08/08 22:31:45 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Gregory McGarry, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_ras.c,v 1.42 2022/08/08 22:31:45 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/ras.h>
#include <sys/xcall.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_extern.h>
#define MAX_RAS_PER_PROC 16
u_int ras_per_proc = MAX_RAS_PER_PROC;
#ifdef DEBUG
int ras_debug = 0;
#define DPRINTF(x) if (ras_debug) printf x
#else
#define DPRINTF(x) /* nothing */
#endif
/*
* Force all CPUs through cpu_switchto(), waiting until complete.
* Context switching will drain the write buffer on the calling
* CPU.
*/
static void
ras_sync(void)
{
/* No need to sync if exiting or single threaded. */
if (curproc->p_nlwps > 1 && ncpu > 1) {
xc_barrier(0);
}
}
/*
* Check the specified address to see if it is within the
* sequence. If it is found, we return the restart address,
* otherwise we return -1. If we do perform a restart, we
* mark the sequence as hit.
*
* No locking required: we disable preemption and ras_sync()
* guarantees that individual entries are valid while we still
* have visibility of them.
*/
void *
ras_lookup(struct proc *p, void *addr)
{
struct ras *rp;
void *startaddr;
lwp_t *l;
startaddr = (void *)-1;
l = curlwp;
KPREEMPT_DISABLE(l);
for (rp = p->p_raslist; rp != NULL; rp = rp->ras_next) {
if (addr > rp->ras_startaddr && addr < rp->ras_endaddr) {
startaddr = rp->ras_startaddr;
DPRINTF(("RAS hit: p=%p %p\n", p, addr));
break;
}
}
KPREEMPT_ENABLE(l);
return startaddr;
}
/*
* During a fork, we copy all of the sequences from parent p1 to
* the child p2.
*
* No locking required as the parent must be paused.
*/
int
ras_fork(struct proc *p1, struct proc *p2)
{
struct ras *rp, *nrp;
for (rp = p1->p_raslist; rp != NULL; rp = rp->ras_next) {
nrp = kmem_alloc(sizeof(*nrp), KM_SLEEP);
nrp->ras_startaddr = rp->ras_startaddr;
nrp->ras_endaddr = rp->ras_endaddr;
nrp->ras_next = p2->p_raslist;
p2->p_raslist = nrp;
}
DPRINTF(("ras_fork: p1=%p, p2=%p\n", p1, p2));
return 0;
}
/*
* Nuke all sequences for this process.
*/
int
ras_purgeall(void)
{
struct ras *rp, *nrp;
proc_t *p;
p = curproc;
if (p->p_raslist == NULL)
return 0;
mutex_enter(&p->p_auxlock);
if ((rp = p->p_raslist) != NULL) {
p->p_raslist = NULL;
ras_sync();
for (; rp != NULL; rp = nrp) {
nrp = rp->ras_next;
kmem_free(rp, sizeof(*rp));
}
}
mutex_exit(&p->p_auxlock);
return 0;
}
#if defined(__HAVE_RAS)
/*
* Install the new sequence. If it already exists, return
* an error.
*/
static int
ras_install(void *addr, size_t len)
{
struct ras *rp;
struct ras *newrp;
void *endaddr;
int nras, error;
proc_t *p;
if (len == 0)
return EINVAL;
if ((uintptr_t)addr < VM_MIN_ADDRESS ||
(uintptr_t)addr > VM_MAXUSER_ADDRESS)
return EINVAL;
if (len > VM_MAXUSER_ADDRESS - (uintptr_t)addr)
return EINVAL;
endaddr = (char *)addr + len;
newrp = kmem_alloc(sizeof(*newrp), KM_SLEEP);
newrp->ras_startaddr = addr;
newrp->ras_endaddr = endaddr;
error = 0;
nras = 0;
p = curproc;
mutex_enter(&p->p_auxlock);
for (rp = p->p_raslist; rp != NULL; rp = rp->ras_next) {
if (++nras >= ras_per_proc) {
error = EINVAL;
break;
}
if (addr < rp->ras_endaddr && endaddr > rp->ras_startaddr) {
error = EEXIST;
break;
}
}
if (rp == NULL) {
newrp->ras_next = p->p_raslist;
p->p_raslist = newrp;
ras_sync();
mutex_exit(&p->p_auxlock);
} else {
mutex_exit(&p->p_auxlock);
kmem_free(newrp, sizeof(*newrp));
}
return error;
}
/*
* Nuke the specified sequence. Both address and len must
* match, otherwise we return an error.
*/
static int
ras_purge(void *addr, size_t len)
{
struct ras *rp, **link;
proc_t *p;
p = curproc;
mutex_enter(&p->p_auxlock);
link = &p->p_raslist;
for (rp = *link; rp != NULL; link = &rp->ras_next, rp = *link) {
if (addr == rp->ras_startaddr &&
(char *)rp->ras_endaddr - (char *)rp->ras_startaddr == len)
break;
}
if (rp != NULL) {
*link = rp->ras_next;
ras_sync();
mutex_exit(&p->p_auxlock);
kmem_free(rp, sizeof(*rp));
return 0;
} else {
mutex_exit(&p->p_auxlock);
return ESRCH;
}
}
#endif /* defined(__HAVE_RAS) */
/*ARGSUSED*/
int
sys_rasctl(struct lwp *l, const struct sys_rasctl_args *uap, register_t *retval)
{
#if defined(__HAVE_RAS)
/* {
syscallarg(void *) addr;
syscallarg(size_t) len;
syscallarg(int) op;
} */
void *addr;
size_t len;
int op;
int error;
/*
* first, extract syscall args from the uap.
*/
addr = (void *)SCARG(uap, addr);
len = (size_t)SCARG(uap, len);
op = SCARG(uap, op);
DPRINTF(("sys_rasctl: p=%p addr=%p, len=%ld, op=0x%x\n",
curproc, addr, (long)len, op));
switch (op) {
case RAS_INSTALL:
error = ras_install(addr, len);
break;
case RAS_PURGE:
error = ras_purge(addr, len);
break;
case RAS_PURGE_ALL:
error = ras_purgeall();
break;
default:
error = EINVAL;
break;
}
return (error);
#else
return (EOPNOTSUPP);
#endif
}
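/*
 * Illustrative userland sketch (not compiled): registering a
 * restartable atomic sequence with rasctl(2), which ends up in
 * sys_rasctl() above.  The labels and the helper name are
 * hypothetical; real users place the labels in assembly so that the
 * start/end addresses bracket exactly the instructions that must be
 * restarted after preemption.
 */
#if 0
#include <sys/types.h>
#include <sys/ras.h>

extern char my_ras_start[], my_ras_end[];       /* assembly labels */

static int
example_install_ras(void)
{
        return rasctl(my_ras_start,
            (size_t)(my_ras_end - my_ras_start), RAS_INSTALL);
}
#endif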
/* $NetBSD: vnd_50.c,v 1.5 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vn.c 1.13 94/04/02$
*
* @(#)vn.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vnd_50.c,v 1.5 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/compat_stub.h>
#include <net/zlib.h>
#include <dev/vndvar.h>
#include <compat/common/compat_mod.h>
static int compat_50_vndioctl(u_long, struct lwp *, void *, int, struct vattr *,
int (*)(struct lwp *, void *, int, struct vattr *));
static int
compat_50_vndioctl(u_long cmd, struct lwp *l, void *data, int unit,
struct vattr *vattr_p,
int (*get)(struct lwp *, void *, int, struct vattr *))
{
struct vnd_user50 *vnu = data;
int error;
if (cmd != VNDIOCGET50)
return EPASSTHROUGH;
error = (*get)(l, data, unit, vattr_p);
if (error != 0)
return error;
vnu->vnu_dev = vattr_p->va_fsid;
vnu->vnu_ino = vattr_p->va_fileid;
return 0;
}
void
vnd_50_init(void)
{
MODULE_HOOK_SET(compat_vndioctl_50_hook, compat_50_vndioctl);
}
void
vnd_50_fini(void)
{
MODULE_HOOK_UNSET(compat_vndioctl_50_hook);
}
/* $NetBSD: subr_time.c,v 1.38 2023/07/08 20:02:10 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
* @(#)kern_time.c 8.4 (Berkeley) 5/26/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_time.c,v 1.38 2023/07/08 20:02:10 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/lwp.h>
#include <sys/timex.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/intr.h>
/*
* Compute number of hz until specified time. Used to compute second
* argument to callout_reset() from an absolute time.
*/
int
tvhzto(const struct timeval *tvp)
{
struct timeval now, tv;
tv = *tvp; /* Don't modify original tvp. */
getmicrotime(&now);
timersub(&tv, &now, &tv);
return tvtohz(&tv);
}
/*
* Compute number of ticks in the specified amount of time.
*/
int
tvtohz(const struct timeval *tv)
{
unsigned long ticks;
long sec, usec;
/*
* If the number of usecs in the whole seconds part of the time
* difference fits in a long, then the total number of usecs will
* fit in an unsigned long. Compute the total and convert it to
* ticks, rounding up and adding 1 to allow for the current tick
* to expire. Rounding also depends on unsigned long arithmetic
* to avoid overflow.
*
* Otherwise, if the number of ticks in the whole seconds part of
* the time difference fits in a long, then convert the parts to
* ticks separately and add, using similar rounding methods and
* overflow avoidance. This method would work in the previous
* case, but it is slightly slower and assumes that hz is integral.
*
* Otherwise, round the time difference down to the maximum
* representable value.
*
* If ints are 32-bit, then the maximum value for any timeout in
* 10ms ticks is 248 days.
*/
sec = tv->tv_sec;
usec = tv->tv_usec;
KASSERT(usec >= 0);
KASSERT(usec < 1000000);
/* catch overflows in conversion time_t->int */
if (tv->tv_sec > INT_MAX)
return INT_MAX;
if (tv->tv_sec < 0)
return 0;
if (sec < 0 || (sec == 0 && usec == 0)) {
/*
* Would expire now or in the past. Return 0 ticks.
* This is different from the legacy tvhzto() interface,
* and callers need to check for it.
*/
ticks = 0;
} else if (sec <= (LONG_MAX / 1000000))
ticks = (((sec * 1000000) + (unsigned long)usec + (tick - 1))
/ tick) + 1;
else if (sec <= (LONG_MAX / hz))
ticks = (sec * hz) +
(((unsigned long)usec + (tick - 1)) / tick) + 1;
else
ticks = LONG_MAX;
if (ticks > INT_MAX)
ticks = INT_MAX;
return ((int)ticks);
}
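/*
 * Worked example (illustrative): with hz = 100 the global "tick" is
 * 10000 us.  A 25 ms timeout, tv = { 0, 25000 }, takes the first
 * branch above: (25000 + 9999) / 10000 = 3 ticks rounded up, plus one
 * for the tick already in progress, so tvtohz() returns 4.
 */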
int
tshzto(const struct timespec *tsp)
{
struct timespec now, ts;
ts = *tsp; /* Don't modify original tsp. */
getnanotime(&now);
timespecsub(&ts, &now, &ts);
return tstohz(&ts);
}
int
tshztoup(const struct timespec *tsp)
{
struct timespec now, ts;
ts = *tsp; /* Don't modify original tsp. */
getnanouptime(&now);
timespecsub(&ts, &now, &ts);
return tstohz(&ts);
}
/*
* Compute number of ticks in the specified amount of time.
*/
int
tstohz(const struct timespec *ts)
{
struct timeval tv;
/*
* usec has great enough resolution for hz, so convert to a
* timeval and use tvtohz() above.
*/
TIMESPEC_TO_TIMEVAL(&tv, ts);
return tvtohz(&tv);
}
/*
* Check that a proposed value to load into the .it_value or
* .it_interval part of an interval timer is acceptable, and
* fix it to have at least minimal value (i.e. if it is less
* than the resolution of the clock, round it up). We don't
* timeout the 0,0 value because this means to disable the
* timer or the interval.
*/
int
itimerfix(struct timeval *tv)
{
if (tv->tv_usec < 0 || tv->tv_usec >= 1000000)
return EINVAL;
if (tv->tv_sec < 0)
return ETIMEDOUT;
if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
tv->tv_usec = tick;
return 0;
}
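/*
 * Worked example (illustrative): with hz = 100, tick = 10000 us, so a
 * requested interval of { 0, 1 } is below the clock resolution and
 * itimerfix() rounds it up to { 0, 10000 }; a { 0, 0 } value is left
 * alone because it means "disable the timer".
 */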
int
itimespecfix(struct timespec *ts)
{
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
return EINVAL;
if (ts->tv_sec < 0)
return ETIMEDOUT;
if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
ts->tv_nsec = tick * 1000;
return 0;
}
int
inittimeleft(struct timespec *ts, struct timespec *sleepts)
{
if (itimespecfix(ts)) {
return -1;
}
KASSERT(ts->tv_sec >= 0);
getnanouptime(sleepts);
return 0;
}
int
gettimeleft(struct timespec *ts, struct timespec *sleepts)
{
struct timespec now, sleptts;
KASSERT(ts->tv_sec >= 0);
/*
* Reduce ts by elapsed time based on monotonic time scale.
*/
getnanouptime(&now);
KASSERT(timespeccmp(sleepts, &now, <=));
timespecsub(&now, sleepts, &sleptts);
*sleepts = now;
if (timespeccmp(ts, &sleptts, <=)) { /* timed out */
timespecclear(ts);
return 0;
}
timespecsub(ts, &sleptts, ts);
return tstohz(ts);
}
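#if 0
/*
 * Illustrative userland analogue (not kernel code) of the
 * inittimeleft()/gettimeleft() pattern: record a monotonic "sleep
 * start" once, then after every wakeup subtract the time actually
 * slept from the remaining budget and restart the clock.  POSIX
 * clock_gettime() stands in for getnanouptime(); the timed-out case
 * is ignored for brevity.
 */
#include <stdio.h>
#include <time.h>

static void
tssub(const struct timespec *a, const struct timespec *b, struct timespec *r)
{
	r->tv_sec = a->tv_sec - b->tv_sec;
	r->tv_nsec = a->tv_nsec - b->tv_nsec;
	if (r->tv_nsec < 0) {
		r->tv_sec--;
		r->tv_nsec += 1000000000L;
	}
}

int
main(void)
{
	struct timespec budget = { .tv_sec = 2, .tv_nsec = 0 };
	struct timespec sleepts, now, slept;

	clock_gettime(CLOCK_MONOTONIC, &sleepts);	/* like inittimeleft() */
	/* ... sleep here, possibly waking early ... */
	clock_gettime(CLOCK_MONOTONIC, &now);		/* like gettimeleft() */
	tssub(&now, &sleepts, &slept);
	sleepts = now;					/* restart the clock */
	tssub(&budget, &slept, &budget);		/* shrink the budget */
	printf("%lld.%09ld s left\n",
	    (long long)budget.tv_sec, budget.tv_nsec);
	return 0;
}
#endif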
void
clock_timeleft(clockid_t clockid, struct timespec *ts, struct timespec *sleepts)
{
struct timespec sleptts;
clock_gettime1(clockid, &sleptts);
timespecadd(ts, sleepts, ts);
timespecsub(ts, &sleptts, ts);
*sleepts = sleptts;
}
int
clock_gettime1(clockid_t clock_id, struct timespec *ts)
{
int error;
struct proc *p;
#define CPUCLOCK_ID_MASK (~(CLOCK_THREAD_CPUTIME_ID|CLOCK_PROCESS_CPUTIME_ID))
if (clock_id & CLOCK_PROCESS_CPUTIME_ID) {
pid_t pid = clock_id & CPUCLOCK_ID_MASK;
struct timeval cputime;
mutex_enter(&proc_lock);
p = pid == 0 ? curproc : proc_find(pid);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
mutex_enter(p->p_lock);
calcru(p, /*usertime*/NULL, /*systime*/NULL, /*intrtime*/NULL,
&cputime);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
// XXX: Perhaps create a special kauth type
error = kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_PTRACE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
if (error)
return error;
TIMEVAL_TO_TIMESPEC(&cputime, ts);
return 0;
} else if (clock_id & CLOCK_THREAD_CPUTIME_ID) {
struct lwp *l;
lwpid_t lid = clock_id & CPUCLOCK_ID_MASK;
struct bintime tm = {0, 0};
p = curproc;
mutex_enter(p->p_lock);
l = lid == 0 ? curlwp : lwp_find(p, lid);
if (l == NULL) {
mutex_exit(p->p_lock);
return ESRCH;
}
addrulwp(l, &tm);
mutex_exit(p->p_lock);
bintime2timespec(&tm, ts);
return 0;
}
switch (clock_id) {
case CLOCK_REALTIME:
nanotime(ts);
break;
case CLOCK_MONOTONIC:
nanouptime(ts);
break;
default:
return EINVAL;
}
return 0;
}
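#if 0
/*
 * Illustrative sketch (not kernel code) of how a per-process CPU-time
 * clock id is treated above: one bit flags the clock class and the
 * remaining bits carry the pid (0 meaning "the calling process").  The
 * flag value and mask below are made up for the example; the real ones
 * come from the system headers.
 */
#include <stdio.h>

#define EX_PROCESS_CPUTIME	0x40000000u	/* hypothetical flag bit */
#define EX_ID_MASK		(~EX_PROCESS_CPUTIME)

int
main(void)
{
	unsigned clock_id = EX_PROCESS_CPUTIME | 1234;	/* pid 1234 */

	if (clock_id & EX_PROCESS_CPUTIME)
		printf("CPU time of pid %u\n", clock_id & EX_ID_MASK);
	return 0;
}
#endif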
/*
* Calculate the delta and convert it from a struct timespec to ticks.
*/
int
ts2timo(clockid_t clock_id, int flags, struct timespec *ts,
int *timo, struct timespec *start)
{
int error;
struct timespec tsd;
if (ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000L)
return EINVAL;
if ((flags & TIMER_ABSTIME) != 0 || start != NULL) {
error = clock_gettime1(clock_id, &tsd);
if (error != 0)
return error;
if (start != NULL)
*start = tsd;
}
if ((flags & TIMER_ABSTIME) != 0) {
if (!timespecsubok(ts, &tsd))
return EINVAL;
timespecsub(ts, &tsd, ts);
}
error = itimespecfix(ts);
if (error != 0)
return error;
if (ts->tv_sec == 0 && ts->tv_nsec == 0)
return ETIMEDOUT;
*timo = tstohz(ts);
KASSERT(*timo > 0);
return 0;
}
bool
timespecaddok(const struct timespec *tsp, const struct timespec *usp)
{
enum { TIME_MIN = __type_min(time_t), TIME_MAX = __type_max(time_t) };
time_t a = tsp->tv_sec;
time_t b = usp->tv_sec;
bool carry;
/*
* Caller is responsible for guaranteeing valid timespec
* inputs. Any user-controlled inputs must be validated or
* adjusted.
*/
KASSERT(tsp->tv_nsec >= 0);
KASSERT(usp->tv_nsec >= 0);
KASSERT(tsp->tv_nsec < 1000000000L);
KASSERT(usp->tv_nsec < 1000000000L);
CTASSERT(1000000000L <= __type_max(long) - 1000000000L);
/*
* Fail if a + b + carry overflows TIME_MAX, or if a + b
* overflows TIME_MIN because timespecadd adds the carry after
* computing a + b.
*
* Break it into two mutually exclusive and exhaustive cases:
* I. a >= 0
* II. a < 0
*/
carry = (tsp->tv_nsec + usp->tv_nsec >= 1000000000L);
if (a >= 0) {
/*
* Case I: a >= 0. If b < 0, then b + 1 <= 0, so
*
* a + b + 1 <= a + 0 <= TIME_MAX,
*
* and
*
* a + b >= 0 + b = b >= TIME_MIN,
*
* so this can't overflow.
*
* If b >= 0, then a + b + carry >= a + b >= 0, so
* negative results and thus results below TIME_MIN are
* impossible; we need only avoid
*
* a + b + carry > TIME_MAX,
*
* which we will do by rejecting if
*
* b > TIME_MAX - a - carry,
*
* which in turn is incidentally always false if b < 0
* so we don't need extra logic to discriminate on the
* b >= 0 and b < 0 cases.
*
* Since 0 <= a <= TIME_MAX, we know
*
* 0 <= TIME_MAX - a <= TIME_MAX,
*
* and hence
*
* -1 <= TIME_MAX - a - 1 < TIME_MAX.
*
* So we can compute TIME_MAX - a - carry (i.e., either
* TIME_MAX - a or TIME_MAX - a - 1) safely without
* overflow.
*/
if (b > TIME_MAX - a - carry)
return false;
} else {
/*
* Case II: a < 0. If b >= 0, then since a + 1 <= 0,
* we have
*
* a + b + 1 <= b <= TIME_MAX,
*
* and
*
* a + b >= a >= TIME_MIN,
*
* so this can't overflow.
*
* If b < 0, then the intermediate a + b is negative
* and the outcome a + b + 1 is nonpositive, so we need
* only avoid
*
* a + b < TIME_MIN,
*
* which we will do by rejecting if
*
* a < TIME_MIN - b.
*
* (Reminder: The carry is added afterward in
* timespecadd, so to avoid overflow it is not enough
* to merely reject a + b + carry < TIME_MIN.)
*
* It is safe to compute the difference TIME_MIN - b
* because b is negative, so the result lies in
* (TIME_MIN, 0].
*/
if (b < 0 && a < TIME_MIN - b)
return false;
}
return true;
}
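#if 0
/*
 * Self-contained sketch (not kernel code) of the overflow test above,
 * with time_t modelled as long long.  The carry from the nanosecond
 * field is folded in before checking against the representable range,
 * mirroring the case analysis in the comments.
 */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

static bool
add_ok(long long a, long long b, bool carry)
{
	if (a >= 0)
		return !(b > LLONG_MAX - a - carry);
	return !(b < 0 && a < LLONG_MIN - b);
}

int
main(void)
{
	printf("%d\n", add_ok(LLONG_MAX, 0, true));	/* 0: would overflow */
	printf("%d\n", add_ok(LLONG_MAX - 1, 0, true));	/* 1: fits exactly */
	return 0;
}
#endif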
bool
timespecsubok(const struct timespec *tsp, const struct timespec *usp)
{
enum { TIME_MIN = __type_min(time_t), TIME_MAX = __type_max(time_t) };
time_t a = tsp->tv_sec, b = usp->tv_sec;
bool borrow;
/*
* Caller is responsible for guaranteeing valid timespec
* inputs. Any user-controlled inputs must be validated or
* adjusted.
*/
KASSERT(tsp->tv_nsec >= 0); KASSERT(usp->tv_nsec >= 0); KASSERT(tsp->tv_nsec < 1000000000L); KASSERT(usp->tv_nsec < 1000000000L);
CTASSERT(1000000000L <= __type_max(long) - 1000000000L);
/*
* Fail if a - b - borrow overflows TIME_MIN, or if a - b
* overflows TIME_MAX because timespecsub subtracts the borrow
* after computing a - b.
*
* Break it into two mutually exclusive and exhaustive cases:
* I. a < 0
* II. a >= 0
*/
borrow = (tsp->tv_nsec - usp->tv_nsec < 0);
if (a < 0) {
/*
* Case I: a < 0. If b < 0, then -b - 1 >= 0, so
*
* a - b - 1 >= a + 0 >= TIME_MIN,
*
* and, since a <= -1, provided that TIME_MIN <=
* -TIME_MAX - 1 so that TIME_MAX <= -TIME_MIN - 1 (in
* fact, equality holds, under the assumption of
* two's-complement arithmetic),
*
* a - b <= -1 - b = -b - 1 <= TIME_MAX,
*
* so this can't overflow.
*/
CTASSERT(TIME_MIN <= -TIME_MAX - 1);
/*
* If b >= 0, then a - b - borrow <= a - b < 0, so
* positive results and thus results above TIME_MAX are
* impossible; we need only avoid
*
* a - b - borrow < TIME_MIN,
*
* which we will do by rejecting if
*
* a < TIME_MIN + b + borrow.
*
* The right-hand side is safe to evaluate for any
* values of b and borrow as long as TIME_MIN +
* TIME_MAX + 1 <= TIME_MAX, i.e., TIME_MIN <= -1.
* (Note: If time_t were unsigned, this would fail!)
*
* Note: Unlike Case I in timespecaddok, this criterion
* does not work for b < 0, nor can the roles of a and
* b in the inequality be reversed (e.g., -b < TIME_MIN
* - a + borrow) without extra cases like checking for
* b = TIME_MIN.
*/
CTASSERT(TIME_MIN < -1);
if (b >= 0 && a < TIME_MIN + b + borrow)
return false;
} else {
/*
* Case II: a >= 0. If b >= 0, then
*
* a - b <= a <= TIME_MAX,
*
* and, provided TIME_MIN <= -TIME_MAX - 1 (in fact,
* equality holds, under the assumption of
* two's-complement arithmetic)
*
* a - b - 1 >= -b - 1 >= -TIME_MAX - 1 >= TIME_MIN,
*
* so this can't overflow.
*/
CTASSERT(TIME_MIN <= -TIME_MAX - 1);
/*
* If b < 0, then a - b >= a >= 0, so negative results
* and thus results below TIME_MIN are impossible; we
* need only avoid
*
* a - b > TIME_MAX,
*
* which we will do by rejecting if
*
* a > TIME_MAX + b.
*
* (Reminder: The borrow is subtracted afterward in
* timespecsub, so to avoid overflow it is not enough
* to merely reject a - b - borrow > TIME_MAX.)
*
* It is safe to compute the sum TIME_MAX + b because b
* is negative, so the result lies in [0, TIME_MAX).
*/
if (b < 0 && a > TIME_MAX + b)
return false;
}
return true;
}
/* $NetBSD: procfs_vfsops.c,v 1.114 2024/01/17 10:21:01 hannken Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vfsops.c 8.7 (Berkeley) 5/10/95
*/
/*
* procfs VFS interface
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_vfsops.c,v 1.114 2024/01/17 10:21:01 hannken Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/fstrans.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/procfs/procfs.h>
#include <uvm/uvm_extern.h> /* for PAGE_SIZE */
MODULE(MODULE_CLASS_VFS, procfs, "ptrace_common");
VFS_PROTOS(procfs);
#define PROCFS_HASHSIZE 256
#define PROCFS_EXEC_HOOK ((void *)1)
#define PROCFS_EXIT_HOOK ((void *)2)
static kauth_listener_t procfs_listener;
static void *procfs_exechook;
static void *procfs_exithook;
LIST_HEAD(hashhead, pfsnode);
static u_long procfs_hashmask;
static struct hashhead *procfs_hashtab;
static kmutex_t procfs_hashlock;
static struct hashhead *
procfs_hashhead(pid_t pid)
{
return &procfs_hashtab[pid & procfs_hashmask];
}
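#if 0
/*
 * Illustrative sketch (not kernel code): with a power-of-two table
 * size such as the 256 buckets requested above, the mask returned by
 * hashinit() is the table size minus one, so a pid is mapped to its
 * bucket with a single AND.
 */
#include <stdio.h>

int
main(void)
{
	unsigned long hashmask = 256 - 1;	/* as for PROCFS_HASHSIZE */
	int pid = 12345;

	printf("pid %d -> bucket %lu\n", pid, pid & hashmask);	/* 57 */
	return 0;
}
#endif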
void
procfs_hashrem(struct pfsnode *pfs)
{
mutex_enter(&procfs_hashlock);
LIST_REMOVE(pfs, pfs_hash);
mutex_exit(&procfs_hashlock);
}
/*
* VFS Operations.
*
* mount system call
*/
/* ARGSUSED */
int
procfs_mount(
struct mount *mp,
const char *path,
void *data,
size_t *data_len)
{
struct lwp *l = curlwp;
struct procfsmount *pmnt;
struct procfs_args *args = data;
int error;
if (args == NULL)
return EINVAL;
if (UIO_MX & (UIO_MX-1)) {
log(LOG_ERR, "procfs: invalid directory entry size");
return (EINVAL);
}
if (mp->mnt_flag & MNT_GETARGS) {
if (*data_len < sizeof *args)
return EINVAL;
pmnt = VFSTOPROC(mp);
if (pmnt == NULL)
return EIO;
args->version = PROCFS_ARGSVERSION;
args->flags = pmnt->pmnt_flags;
*data_len = sizeof *args;
return 0;
}
if (mp->mnt_flag & MNT_UPDATE)
return (EOPNOTSUPP);
if (*data_len >= sizeof *args && args->version != PROCFS_ARGSVERSION)
return EINVAL;
pmnt = kmem_zalloc(sizeof(struct procfsmount), KM_SLEEP);
mp->mnt_stat.f_namemax = PROCFS_MAXNAMLEN;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_data = pmnt;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, "procfs", UIO_SYSSPACE,
mp->mnt_op->vfs_name, mp, l);
if (*data_len >= sizeof *args)
pmnt->pmnt_flags = args->flags;
else
pmnt->pmnt_flags = 0;
mp->mnt_iflag |= IMNT_MPSAFE | IMNT_SHRLOOKUP;
return error;
}
/*
* unmount system call
*/
int
procfs_unmount(struct mount *mp, int mntflags)
{
int error;
int flags = 0;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if ((error = vflush(mp, 0, flags)) != 0)
return (error);
kmem_free(mp->mnt_data, sizeof(struct procfsmount));
mp->mnt_data = NULL;
return 0;
}
int
procfs_root(struct mount *mp, int lktype, struct vnode **vpp)
{
int error;
error = procfs_allocvp(mp, vpp, 0, PFSroot, -1);
if (error == 0) {
error = vn_lock(*vpp, lktype);
if (error != 0) {
vrele(*vpp);
*vpp = NULL;
}
}
return error;
}
/* ARGSUSED */
int
procfs_start(struct mount *mp, int flags)
{
return (0);
}
/*
* Get file system statistics.
*/
int
procfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
genfs_statvfs(mp, sbp);
sbp->f_bsize = PAGE_SIZE;
sbp->f_frsize = PAGE_SIZE;
sbp->f_iosize = PAGE_SIZE;
sbp->f_blocks = 1;
sbp->f_files = maxproc; /* approx */
sbp->f_ffree = maxproc - atomic_load_relaxed(&nprocs); /* approx */
sbp->f_favail = maxproc - atomic_load_relaxed(&nprocs); /* approx */
return (0);
}
/*ARGSUSED*/
int
procfs_sync(
struct mount *mp,
int waitfor,
kauth_cred_t uc)
{
return (0);
}
/*ARGSUSED*/
int
procfs_vget(struct mount *mp, ino_t ino, int lktype,
struct vnode **vpp)
{
return (EOPNOTSUPP);
}
int
procfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
int error;
struct pfskey pfskey;
struct pfsnode *pfs;
KASSERT(key_len == sizeof(pfskey));
memcpy(&pfskey, key, key_len);
pfs = kmem_alloc(sizeof(*pfs), KM_SLEEP);
pfs->pfs_pid = pfskey.pk_pid;
pfs->pfs_type = pfskey.pk_type;
pfs->pfs_fd = pfskey.pk_fd;
pfs->pfs_vnode = vp;
pfs->pfs_mount = mp;
pfs->pfs_flags = 0;
pfs->pfs_fileno =
PROCFS_FILENO(pfs->pfs_pid, pfs->pfs_type, pfs->pfs_fd);
vp->v_tag = VT_PROCFS;
vp->v_op = procfs_vnodeop_p;
vp->v_data = pfs;
switch (pfs->pfs_type) {
case PFSroot: /* /proc = dr-xr-xr-x */
vp->v_vflag |= VV_ROOT;
/*FALLTHROUGH*/
case PFSproc: /* /proc/N = dr-xr-xr-x */
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VDIR;
break;
case PFStask: /* /proc/N/task = dr-xr-xr-x */
if (pfs->pfs_fd == -1) {
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|
S_IROTH|S_IXOTH;
vp->v_type = VDIR;
break;
}
/*FALLTHROUGH*/
case PFScurproc: /* /proc/curproc = lr-xr-xr-x */
case PFSself: /* /proc/self = lr-xr-xr-x */
case PFScwd: /* /proc/N/cwd = lr-xr-xr-x */
case PFSchroot: /* /proc/N/chroot = lr-xr-xr-x */
case PFSexe: /* /proc/N/exe = lr-xr-xr-x */
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VLNK;
break;
case PFSfd:
if (pfs->pfs_fd == -1) { /* /proc/N/fd = dr-x------ */
pfs->pfs_mode = S_IRUSR|S_IXUSR;
vp->v_type = VDIR;
} else { /* /proc/N/fd/M = [ps-]rw------- */
file_t *fp;
vnode_t *vxp;
struct proc *p;
mutex_enter(&proc_lock);
p = procfs_proc_find(mp, pfs->pfs_pid);
mutex_exit(&proc_lock);
if (p == NULL) {
error = ENOENT;
goto bad;
}
KASSERT(rw_read_held(&p->p_reflock));
if ((fp = fd_getfile2(p, pfs->pfs_fd)) == NULL) {
error = EBADF;
goto bad;
}
pfs->pfs_mode = S_IRUSR|S_IWUSR;
switch (fp->f_type) {
case DTYPE_VNODE:
vxp = fp->f_vnode;
/*
* We make symlinks for directories
* to avoid cycles.
*/
if (vxp->v_type == VDIR ||
procfs_proc_is_linux_compat())
goto symlink;
vp->v_type = vxp->v_type;
break;
case DTYPE_PIPE:
vp->v_type = VFIFO;
break;
case DTYPE_SOCKET:
vp->v_type = VSOCK;
break;
case DTYPE_KQUEUE:
case DTYPE_MISC:
case DTYPE_SEM:
symlink:
pfs->pfs_mode = S_IRUSR|S_IXUSR|S_IRGRP|
S_IXGRP|S_IROTH|S_IXOTH;
vp->v_type = VLNK;
break;
default:
error = EOPNOTSUPP;
closef(fp);
goto bad;
}
closef(fp);
}
break;
case PFSfile: /* /proc/N/file = -rw------- */
case PFSmem: /* /proc/N/mem = -rw------- */
case PFSregs: /* /proc/N/regs = -rw------- */
case PFSfpregs: /* /proc/N/fpregs = -rw------- */
pfs->pfs_mode = S_IRUSR|S_IWUSR;
vp->v_type = VREG;
break;
case PFSnote: /* /proc/N/note = --w------ */
case PFSnotepg: /* /proc/N/notepg = --w------ */
pfs->pfs_mode = S_IWUSR;
vp->v_type = VREG;
break;
case PFSmap: /* /proc/N/map = -r-------- */
case PFSmaps: /* /proc/N/maps = -r-------- */
case PFSauxv: /* /proc/N/auxv = -r-------- */
case PFSenviron: /* /proc/N/environ = -r-------- */
pfs->pfs_mode = S_IRUSR;
vp->v_type = VREG;
break;
case PFSstatus: /* /proc/N/status = -r--r--r-- */
case PFSstat: /* /proc/N/stat = -r--r--r-- */
case PFScmdline: /* /proc/N/cmdline = -r--r--r-- */
case PFSemul: /* /proc/N/emul = -r--r--r-- */
case PFSmeminfo: /* /proc/meminfo = -r--r--r-- */
case PFScpustat: /* /proc/stat = -r--r--r-- */
case PFSdevices: /* /proc/devices = -r--r--r-- */
case PFScpuinfo: /* /proc/cpuinfo = -r--r--r-- */
case PFSuptime: /* /proc/uptime = -r--r--r-- */
case PFSmounts: /* /proc/mounts = -r--r--r-- */
case PFSloadavg: /* /proc/loadavg = -r--r--r-- */
case PFSstatm: /* /proc/N/statm = -r--r--r-- */
case PFSversion: /* /proc/version = -r--r--r-- */
case PFSlimit: /* /proc/limit = -r--r--r-- */
pfs->pfs_mode = S_IRUSR|S_IRGRP|S_IROTH;
vp->v_type = VREG;
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
procfs_machdep_allocvp(vp);
break;
#endif
default:
panic("procfs_allocvp");
}
mutex_enter(&procfs_hashlock);
LIST_INSERT_HEAD(procfs_hashhead(pfs->pfs_pid), pfs, pfs_hash);
mutex_exit(&procfs_hashlock);
uvm_vnp_setsize(vp, 0);
*new_key = &pfs->pfs_key;
return 0;
bad:
vp->v_tag = VT_NON;
vp->v_type = VNON;
vp->v_op = NULL;
vp->v_data = NULL;
kmem_free(pfs, sizeof(*pfs));
return error;
}
void
procfs_init(void)
{
}
void
procfs_reinit(void)
{
}
void
procfs_done(void)
{
}
extern const struct vnodeopv_desc procfs_vnodeop_opv_desc;
const struct vnodeopv_desc * const procfs_vnodeopv_descs[] = {
&procfs_vnodeop_opv_desc,
NULL,
};
struct vfsops procfs_vfsops = {
.vfs_name = MOUNT_PROCFS,
.vfs_min_mount_data = sizeof (struct procfs_args),
.vfs_mount = procfs_mount,
.vfs_start = procfs_start,
.vfs_unmount = procfs_unmount,
.vfs_root = procfs_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = procfs_statvfs,
.vfs_sync = procfs_sync,
.vfs_vget = procfs_vget,
.vfs_loadvnode = procfs_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = procfs_init,
.vfs_reinit = procfs_reinit,
.vfs_done = procfs_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = procfs_vnodeopv_descs
};
static void
procfs_exechook_cb(struct proc *p, void *arg)
{
struct hashhead *head;
struct pfsnode *pfs;
struct mount *mp;
struct pfskey key;
struct vnode *vp;
int error;
if (arg == PROCFS_EXEC_HOOK && !(p->p_flag & PK_SUGID))
return;
head = procfs_hashhead(p->p_pid);
again:
mutex_enter(&procfs_hashlock);
LIST_FOREACH(pfs, head, pfs_hash) {
if (pfs->pfs_pid != p->p_pid)
continue;
mp = pfs->pfs_mount;
key = pfs->pfs_key;
vfs_ref(mp);
mutex_exit(&procfs_hashlock);
error = vcache_get(mp, &key, sizeof(key), &vp);
vfs_rele(mp);
if (error != 0)
goto again;
if (vrecycle(vp))
goto again;
do {
error = vfs_suspend(mp, 0);
} while (error == EINTR || error == ERESTART);
vgone(vp);
if (error == 0)
vfs_resume(mp);
goto again;
}
mutex_exit(&procfs_hashlock);
}
static int
procfs_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
struct pfsnode *pfs;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
pfs = arg1;
if (action != KAUTH_PROCESS_PROCFS)
return result;
switch (pfs->pfs_type) {
case PFSregs:
case PFSfpregs:
case PFSmem:
if (kauth_cred_getuid(cred) != kauth_cred_getuid(p->p_cred) ||
ISSET(p->p_flag, PK_SUGID))
break;
/*FALLTHROUGH*/
default:
result = KAUTH_RESULT_ALLOW;
break;
}
return result;
}
SYSCTL_SETUP(procfs_sysctl_setup, "procfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "procfs",
SYSCTL_DESCR("Process file system"),
NULL, 0, NULL, 0,
CTL_VFS, 12, CTL_EOL);
/*
* XXX the "12" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "12" is the order as taken from sys/mount.h
*/
}
static int
procfs_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&procfs_vfsops);
if (error != 0)
break;
procfs_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
procfs_listener_cb, NULL);
procfs_exechook = exechook_establish(procfs_exechook_cb,
PROCFS_EXEC_HOOK);
procfs_exithook = exithook_establish(procfs_exechook_cb,
PROCFS_EXIT_HOOK);
mutex_init(&procfs_hashlock, MUTEX_DEFAULT, IPL_NONE);
procfs_hashtab = hashinit(PROCFS_HASHSIZE, HASH_LIST, true,
&procfs_hashmask);
break;
case MODULE_CMD_FINI:
error = vfs_detach(&procfs_vfsops);
if (error != 0)
break;
kauth_unlisten_scope(procfs_listener);
exechook_disestablish(procfs_exechook);
exithook_disestablish(procfs_exithook);
mutex_destroy(&procfs_hashlock);
hashdone(procfs_hashtab, HASH_LIST, procfs_hashmask);
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: scsipi_base.c,v 1.189 2022/04/09 23:38:32 riastradh Exp $ */
/*-
* Copyright (c) 1998, 1999, 2000, 2002, 2003, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum; by Jason R. Thorpe of the Numerical Aerospace
* Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scsipi_base.c,v 1.189 2022/04/09 23:38:32 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_scsi.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/errno.h>
#include <sys/device.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/hash.h>
#include <sys/atomic.h>
#include <dev/scsipi/scsi_sdt.h>
#include <dev/scsipi/scsi_spc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsipi_disk.h>
#include <dev/scsipi/scsipiconf.h>
#include <dev/scsipi/scsipi_base.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_message.h>
#include <machine/param.h>
SDT_PROVIDER_DEFINE(scsi);
SDT_PROBE_DEFINE3(scsi, base, tag, get,
"struct scsipi_xfer *"/*xs*/, "uint8_t"/*tag*/, "uint8_t"/*type*/);
SDT_PROBE_DEFINE3(scsi, base, tag, put,
"struct scsipi_xfer *"/*xs*/, "uint8_t"/*tag*/, "uint8_t"/*type*/);
SDT_PROBE_DEFINE3(scsi, base, adapter, request__start,
"struct scsipi_channel *"/*chan*/,
"scsipi_adapter_req_t"/*req*/,
"void *"/*arg*/);
SDT_PROBE_DEFINE3(scsi, base, adapter, request__done,
"struct scsipi_channel *"/*chan*/,
"scsipi_adapter_req_t"/*req*/,
"void *"/*arg*/);
SDT_PROBE_DEFINE1(scsi, base, queue, batch__start,
"struct scsipi_channel *"/*chan*/);
SDT_PROBE_DEFINE2(scsi, base, queue, run,
"struct scsipi_channel *"/*chan*/,
"struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, queue, batch__done,
"struct scsipi_channel *"/*chan*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, execute, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, enqueue, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, done, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, redone, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, complete, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, restart, "struct scsipi_xfer *"/*xs*/);
SDT_PROBE_DEFINE1(scsi, base, xfer, free, "struct scsipi_xfer *"/*xs*/);
static int scsipi_complete(struct scsipi_xfer *);
static void scsipi_request_sense(struct scsipi_xfer *);
static int scsipi_enqueue(struct scsipi_xfer *);
static void scsipi_run_queue(struct scsipi_channel *chan);
static void scsipi_completion_thread(void *);
static void scsipi_get_tag(struct scsipi_xfer *);
static void scsipi_put_tag(struct scsipi_xfer *);
static int scsipi_get_resource(struct scsipi_channel *);
static void scsipi_put_resource(struct scsipi_channel *);
static void scsipi_async_event_max_openings(struct scsipi_channel *,
struct scsipi_max_openings *);
static void scsipi_async_event_channel_reset(struct scsipi_channel *);
static void scsipi_channel_freeze_locked(struct scsipi_channel *, int);
static void scsipi_adapter_lock(struct scsipi_adapter *adapt);
static void scsipi_adapter_unlock(struct scsipi_adapter *adapt);
static void scsipi_update_timeouts(struct scsipi_xfer *xs);
static struct pool scsipi_xfer_pool;
int scsipi_xs_count = 0;
/*
* scsipi_init:
*
* Called when a scsibus or atapibus is attached to the system
* to initialize shared data structures.
*/
void
scsipi_init(void)
{
static int scsipi_init_done;
if (scsipi_init_done)
return;
scsipi_init_done = 1;
/* Initialize the scsipi_xfer pool. */
pool_init(&scsipi_xfer_pool, sizeof(struct scsipi_xfer), 0,
0, 0, "scxspl", NULL, IPL_BIO);
pool_prime(&scsipi_xfer_pool, 1);
scsipi_ioctl_init();
}
/*
* scsipi_channel_init:
*
* Initialize a scsipi_channel when it is attached.
*/
int
scsipi_channel_init(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
int i;
/* Initialize shared data. */
scsipi_init();
/* Initialize the queues. */
TAILQ_INIT(&chan->chan_queue);
TAILQ_INIT(&chan->chan_complete);
for (i = 0; i < SCSIPI_CHAN_PERIPH_BUCKETS; i++)
LIST_INIT(&chan->chan_periphtab[i]);
/*
* Create the asynchronous completion thread.
*/
if (kthread_create(PRI_NONE, 0, NULL, scsipi_completion_thread, chan,
&chan->chan_thread, "%s", chan->chan_name)) {
aprint_error_dev(adapt->adapt_dev, "unable to create completion thread for "
"channel %d\n", chan->chan_channel);
panic("scsipi_channel_init");
}
return 0;
}
/*
* scsipi_channel_shutdown:
*
* Shutdown a scsipi_channel.
*/
void
scsipi_channel_shutdown(struct scsipi_channel *chan)
{
mutex_enter(chan_mtx(chan));
/*
* Shut down the completion thread.
*/
chan->chan_tflags |= SCSIPI_CHANT_SHUTDOWN;
cv_broadcast(chan_cv_complete(chan));
/*
* Now wait for the thread to exit.
*/
while (chan->chan_thread != NULL)
cv_wait(chan_cv_thread(chan), chan_mtx(chan));
mutex_exit(chan_mtx(chan));
}
static uint32_t
scsipi_chan_periph_hash(uint64_t t, uint64_t l)
{
uint32_t hash;
hash = hash32_buf(&t, sizeof(t), HASH32_BUF_INIT);
hash = hash32_buf(&l, sizeof(l), hash);
return hash & SCSIPI_CHAN_PERIPH_HASHMASK;
}
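#if 0
/*
 * Illustrative sketch (not kernel code): the target and lun are hashed
 * as two buffers chained through one 32-bit hash, then masked down to
 * the bucket count.  A plain FNV-1a hash stands in for hash32_buf(),
 * and the 16-bucket mask is an assumed value for the example.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t
fnv32(const void *buf, size_t len, uint32_t hash)
{
	const unsigned char *p = buf;

	while (len-- > 0) {
		hash ^= *p++;
		hash *= 16777619u;
	}
	return hash;
}

int
main(void)
{
	uint64_t target = 3, lun = 0;
	uint32_t hash;

	hash = fnv32(&target, sizeof(target), 2166136261u);
	hash = fnv32(&lun, sizeof(lun), hash);
	printf("bucket %u\n", hash & 15);	/* assumed 16-bucket mask */
	return 0;
}
#endif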
/*
* scsipi_insert_periph:
*
* Insert a periph into the channel.
*/
void
scsipi_insert_periph(struct scsipi_channel *chan, struct scsipi_periph *periph)
{
uint32_t hash;
hash = scsipi_chan_periph_hash(periph->periph_target,
periph->periph_lun);
mutex_enter(chan_mtx(chan));
LIST_INSERT_HEAD(&chan->chan_periphtab[hash], periph, periph_hash);
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_remove_periph:
*
* Remove a periph from the channel.
*/
void
scsipi_remove_periph(struct scsipi_channel *chan,
struct scsipi_periph *periph)
{
LIST_REMOVE(periph, periph_hash);
}
/*
* scsipi_lookup_periph:
*
* Lookup a periph on the specified channel.
*/
static struct scsipi_periph *
scsipi_lookup_periph_internal(struct scsipi_channel *chan, int target, int lun, bool lock)
{
struct scsipi_periph *periph;
uint32_t hash;
if (target >= chan->chan_ntargets ||
lun >= chan->chan_nluns)
return NULL;
hash = scsipi_chan_periph_hash(target, lun);
if (lock)
mutex_enter(chan_mtx(chan));
LIST_FOREACH(periph, &chan->chan_periphtab[hash], periph_hash) {
if (periph->periph_target == target &&
periph->periph_lun == lun)
break;
}
if (lock)
mutex_exit(chan_mtx(chan));
return periph;
}
struct scsipi_periph *
scsipi_lookup_periph_locked(struct scsipi_channel *chan, int target, int lun)
{
return scsipi_lookup_periph_internal(chan, target, lun, false);
}
struct scsipi_periph *
scsipi_lookup_periph(struct scsipi_channel *chan, int target, int lun)
{
return scsipi_lookup_periph_internal(chan, target, lun, true);
}
/*
* scsipi_get_resource:
*
* Allocate a single xfer `resource' from the channel.
*
* NOTE: Must be called with channel lock held
*/
static int
scsipi_get_resource(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
if (chan->chan_flags & SCSIPI_CHAN_OPENINGS) {
if (chan->chan_openings > 0) {
chan->chan_openings--;
return 1;
}
return 0;
}
if (adapt->adapt_openings > 0) {
adapt->adapt_openings--;
return 1;
}
return 0;
}
/*
* scsipi_grow_resources:
*
* Attempt to grow resources for a channel. If this succeeds,
* we allocate one for our caller.
*
* NOTE: Must be called with channel lock held
*/
static inline int
scsipi_grow_resources(struct scsipi_channel *chan)
{
if (chan->chan_flags & SCSIPI_CHAN_CANGROW) {
if ((chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
mutex_exit(chan_mtx(chan));
scsipi_adapter_request(chan,
ADAPTER_REQ_GROW_RESOURCES, NULL);
mutex_enter(chan_mtx(chan));
return scsipi_get_resource(chan);
}
/*
* Ask the channel thread to do it; it will have to thaw the
* queue.
*/
scsipi_channel_freeze_locked(chan, 1);
chan->chan_tflags |= SCSIPI_CHANT_GROWRES;
cv_broadcast(chan_cv_complete(chan));
return 0;
}
return 0;
}
/*
* scsipi_put_resource:
*
* Free a single xfer `resource' to the channel.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_put_resource(struct scsipi_channel *chan)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
if (chan->chan_flags & SCSIPI_CHAN_OPENINGS)
chan->chan_openings++;
else
adapt->adapt_openings++;
}
/*
* scsipi_get_tag:
*
* Get a tag ID for the specified xfer.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_get_tag(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int bit, tag;
u_int word;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
bit = 0; /* XXX gcc */
for (word = 0; word < PERIPH_NTAGWORDS; word++) {
bit = ffs(periph->periph_freetags[word]);
if (bit != 0)
break;
}
#ifdef DIAGNOSTIC
if (word == PERIPH_NTAGWORDS) {
scsipi_printaddr(periph);
printf("no free tags\n");
panic("scsipi_get_tag");
}
#endif
bit -= 1;
periph->periph_freetags[word] &= ~(1U << bit);
tag = (word << 5) | bit;
/* XXX Should eventually disallow this completely. */
if (tag >= periph->periph_openings) {
scsipi_printaddr(periph);
printf("WARNING: tag %d greater than available openings %d\n",
tag, periph->periph_openings);
}
xs->xs_tag_id = tag;
SDT_PROBE3(scsi, base, tag, get,
xs, xs->xs_tag_id, xs->xs_tag_type);
}
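#if 0
/*
 * Illustrative sketch (not kernel code) of the tag bookkeeping above:
 * each 32-bit word of the free-tag bitmap covers 32 tag ids, ffs()
 * finds a set (free) bit, and the tag id is rebuilt as word*32 + bit.
 * Freeing reverses the split, as scsipi_put_tag() does.
 */
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define NTAGWORDS 2

int
main(void)
{
	unsigned freetags[NTAGWORDS] = { 0, 0xffffffffu };	/* word 0 exhausted */
	unsigned word, bit, tag;

	for (word = 0; word < NTAGWORDS; word++) {
		bit = ffs(freetags[word]);
		if (bit != 0)
			break;
	}
	bit -= 1;
	freetags[word] &= ~(1U << bit);			/* allocate */
	tag = (word << 5) | bit;
	printf("allocated tag %u\n", tag);		/* 32 */
	freetags[tag >> 5] |= 1U << (tag & 0x1f);	/* free it again */
	return 0;
}
#endif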
/*
* scsipi_put_tag:
*
* Put the tag ID for the specified xfer back into the pool.
*
* NOTE: Must be called with channel lock held
*/
static void
scsipi_put_tag(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int word, bit;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
SDT_PROBE3(scsi, base, tag, put,
xs, xs->xs_tag_id, xs->xs_tag_type);
word = xs->xs_tag_id >> 5;
bit = xs->xs_tag_id & 0x1f;
periph->periph_freetags[word] |= (1U << bit);
}
/*
* scsipi_get_xs:
*
* Allocate an xfer descriptor and associate it with the
* specified peripheral. If the peripheral has no more
* available command openings, we either block waiting for
* one to become available, or fail.
*
* When this routine is called with the channel lock held
* the flags must include XS_CTL_NOSLEEP.
*/
struct scsipi_xfer *
scsipi_get_xs(struct scsipi_periph *periph, int flags)
{
struct scsipi_xfer *xs;
bool lock = (flags & XS_CTL_NOSLEEP) == 0;
SC_DEBUG(periph, SCSIPI_DB3, ("scsipi_get_xs\n"));
KASSERT(!cold);
#ifdef DIAGNOSTIC
/*
* URGENT commands can never be ASYNC.
*/
if ((flags & (XS_CTL_URGENT|XS_CTL_ASYNC)) ==
(XS_CTL_URGENT|XS_CTL_ASYNC)) {
scsipi_printaddr(periph);
printf("URGENT and ASYNC\n");
panic("scsipi_get_xs");
}
#endif
/*
* Wait for a command opening to become available. Rules:
*
* - All xfers must wait for an available opening.
* Exception: URGENT xfers can proceed when
* active == openings, because we use the opening
* of the command we're recovering for.
*
* - If the periph has sense pending, only URGENT & REQSENSE
* xfers may proceed.
*
* - If the periph is recovering, only URGENT xfers may
* proceed.
*
* - If the periph is currently executing a recovery
* command, URGENT commands must block, because only
* one recovery command can execute at a time.
*/
if (lock)
mutex_enter(chan_mtx(periph->periph_channel));
for (;;) {
if (flags & XS_CTL_URGENT) {
if (periph->periph_active > periph->periph_openings)
goto wait_for_opening;
if (periph->periph_flags & PERIPH_SENSE) {
if ((flags & XS_CTL_REQSENSE) == 0)
goto wait_for_opening;
} else {
if ((periph->periph_flags &
PERIPH_RECOVERY_ACTIVE) != 0)
goto wait_for_opening;
periph->periph_flags |= PERIPH_RECOVERY_ACTIVE;
}
break;
}
if (periph->periph_active >= periph->periph_openings ||
(periph->periph_flags & PERIPH_RECOVERING) != 0)
goto wait_for_opening;
periph->periph_active++;
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
break;
wait_for_opening:
if (flags & XS_CTL_NOSLEEP) {
KASSERT(!lock);
return NULL;
}
KASSERT(lock);
SC_DEBUG(periph, SCSIPI_DB3, ("sleeping\n"));
periph->periph_flags |= PERIPH_WAITING;
cv_wait(periph_cv_periph(periph),
chan_mtx(periph->periph_channel));
}
if (lock)
mutex_exit(chan_mtx(periph->periph_channel));
SC_DEBUG(periph, SCSIPI_DB3, ("calling pool_get\n"));
xs = pool_get(&scsipi_xfer_pool,
((flags & XS_CTL_NOSLEEP) != 0 ? PR_NOWAIT : PR_WAITOK));
if (xs == NULL) {
if (lock)
mutex_enter(chan_mtx(periph->periph_channel));
if (flags & XS_CTL_URGENT) {
if ((flags & XS_CTL_REQSENSE) == 0)
periph->periph_flags &= ~PERIPH_RECOVERY_ACTIVE;
} else
periph->periph_active--;
if (lock)
mutex_exit(chan_mtx(periph->periph_channel));
scsipi_printaddr(periph);
printf("unable to allocate %sscsipi_xfer\n",
(flags & XS_CTL_URGENT) ? "URGENT " : "");
}
SC_DEBUG(periph, SCSIPI_DB3, ("returning\n"));
if (xs != NULL) {
memset(xs, 0, sizeof(*xs));
callout_init(&xs->xs_callout, 0);
xs->xs_periph = periph;
xs->xs_control = flags;
xs->xs_status = 0;
if ((flags & XS_CTL_NOSLEEP) == 0)
mutex_enter(chan_mtx(periph->periph_channel));
TAILQ_INSERT_TAIL(&periph->periph_xferq, xs, device_q);
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
if ((flags & XS_CTL_NOSLEEP) == 0)
mutex_exit(chan_mtx(periph->periph_channel));
}
return xs;
}
/*
* scsipi_put_xs:
*
* Release an xfer descriptor, decreasing the outstanding command
* count for the peripheral. If there is a thread waiting for
* an opening, wake it up. If not, kick any queued I/O the
* peripheral may have.
*
* NOTE: Must be called with channel lock held
*/
void
scsipi_put_xs(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int flags = xs->xs_control;
SDT_PROBE1(scsi, base, xfer, free, xs);
SC_DEBUG(periph, SCSIPI_DB3, ("scsipi_free_xs\n"));
KASSERT(mutex_owned(chan_mtx(periph->periph_channel)));
TAILQ_REMOVE(&periph->periph_xferq, xs, device_q);
callout_destroy(&xs->xs_callout);
pool_put(&scsipi_xfer_pool, xs);
#ifdef DIAGNOSTIC
if ((periph->periph_flags & PERIPH_RECOVERY_ACTIVE) != 0 &&
periph->periph_active == 0) {
scsipi_printaddr(periph);
printf("recovery without a command to recovery for\n");
panic("scsipi_put_xs");
}
#endif
if (flags & XS_CTL_URGENT) {
if ((flags & XS_CTL_REQSENSE) == 0)
periph->periph_flags &= ~PERIPH_RECOVERY_ACTIVE;
} else
periph->periph_active--;
if (periph->periph_active == 0 &&
(periph->periph_flags & PERIPH_WAITDRAIN) != 0) {
periph->periph_flags &= ~PERIPH_WAITDRAIN;
cv_broadcast(periph_cv_active(periph));
}
if (periph->periph_flags & PERIPH_WAITING) {
periph->periph_flags &= ~PERIPH_WAITING;
cv_broadcast(periph_cv_periph(periph));
} else {
if (periph->periph_switch->psw_start != NULL &&
device_is_active(periph->periph_dev)) {
SC_DEBUG(periph, SCSIPI_DB2,
("calling private start()\n"));
(*periph->periph_switch->psw_start)(periph);
}
}
}
/*
* scsipi_channel_freeze:
*
* Freeze a channel's xfer queue.
*/
void
scsipi_channel_freeze(struct scsipi_channel *chan, int count)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
chan->chan_qfreeze += count;
if (lock)
mutex_exit(chan_mtx(chan));
}
static void
scsipi_channel_freeze_locked(struct scsipi_channel *chan, int count)
{
chan->chan_qfreeze += count;
}
/*
* scsipi_channel_thaw:
*
* Thaw a channel's xfer queue.
*/
void
scsipi_channel_thaw(struct scsipi_channel *chan, int count)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
chan->chan_qfreeze -= count;
/*
* Don't let the freeze count go negative.
*
* Presumably the adapter driver could keep track of this,
* but it might just be easier to do this here so as to allow
* multiple callers, including those outside the adapter driver.
*/
if (chan->chan_qfreeze < 0) {
chan->chan_qfreeze = 0;
}
if (lock)
mutex_exit(chan_mtx(chan));
/*
* Don't kick the queue until the channel is running.
*/
if (!lock)
return;
/*
* Kick the channel's queue here. Note, we may be running in
* interrupt context (softclock or HBA's interrupt), so the adapter
* driver had better not sleep.
*/
if (chan->chan_qfreeze == 0)
scsipi_run_queue(chan);
}
/*
* scsipi_channel_timed_thaw:
*
* Thaw a channel after some time has expired. This will also
* run the channel's queue if the freeze count has reached 0.
*/
void
scsipi_channel_timed_thaw(void *arg)
{
struct scsipi_channel *chan = arg;
scsipi_channel_thaw(chan, 1);
}
/*
* scsipi_periph_freeze:
*
* Freeze a device's xfer queue.
*/
void
scsipi_periph_freeze_locked(struct scsipi_periph *periph, int count)
{
periph->periph_qfreeze += count;
}
/*
* scsipi_periph_thaw:
*
* Thaw a device's xfer queue.
*/
void
scsipi_periph_thaw_locked(struct scsipi_periph *periph, int count)
{
periph->periph_qfreeze -= count;
#ifdef DIAGNOSTIC
if (periph->periph_qfreeze < 0) {
static const char pc[] = "periph freeze count < 0";
scsipi_printaddr(periph);
printf("%s\n", pc);
panic(pc);
}
#endif
if (periph->periph_qfreeze == 0 &&
(periph->periph_flags & PERIPH_WAITING) != 0)
cv_broadcast(periph_cv_periph(periph));
}
void
scsipi_periph_freeze(struct scsipi_periph *periph, int count)
{
mutex_enter(chan_mtx(periph->periph_channel));
scsipi_periph_freeze_locked(periph, count);
mutex_exit(chan_mtx(periph->periph_channel));
}
void
scsipi_periph_thaw(struct scsipi_periph *periph, int count)
{
mutex_enter(chan_mtx(periph->periph_channel));
scsipi_periph_thaw_locked(periph, count);
mutex_exit(chan_mtx(periph->periph_channel));
}
/*
* scsipi_periph_timed_thaw:
*
* Thaw a device after some time has expired.
*/
void
scsipi_periph_timed_thaw(void *arg)
{
struct scsipi_periph *periph = arg;
struct scsipi_channel *chan = periph->periph_channel;
callout_stop(&periph->periph_callout);
mutex_enter(chan_mtx(chan));
scsipi_periph_thaw_locked(periph, 1);
if ((periph->periph_channel->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/*
* Kick the channel's queue here. Note, we're running in
* interrupt context (softclock), so the adapter driver
* had better not sleep.
*/
mutex_exit(chan_mtx(chan));
scsipi_run_queue(periph->periph_channel);
} else {
/*
* Tell the completion thread to kick the channel's queue here.
*/
periph->periph_channel->chan_tflags |= SCSIPI_CHANT_KICK;
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
}
}
/*
* scsipi_wait_drain:
*
* Wait for a periph's pending xfers to drain.
*/
void
scsipi_wait_drain(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
mutex_enter(chan_mtx(chan));
while (periph->periph_active != 0) {
periph->periph_flags |= PERIPH_WAITDRAIN;
cv_wait(periph_cv_active(periph), chan_mtx(chan));
}
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_kill_pending:
*
* Kill off all pending xfers for a periph.
*
* NOTE: Must be called with channel lock held
*/
void
scsipi_kill_pending(struct scsipi_periph *periph)
{
struct scsipi_channel *chan = periph->periph_channel;
(*chan->chan_bustype->bustype_kill_pending)(periph);
while (periph->periph_active != 0) {
periph->periph_flags |= PERIPH_WAITDRAIN;
cv_wait(periph_cv_active(periph), chan_mtx(chan));
}
}
/*
* scsipi_print_cdb:
* prints a command descriptor block (for debugging purposes, error messages,
* SCSIVERBOSE, ...)
*/
void
scsipi_print_cdb(struct scsipi_generic *cmd)
{
int i, j;
printf("0x%02x", cmd->opcode);
switch (CDB_GROUPID(cmd->opcode)) {
case CDB_GROUPID_0:
j = CDB_GROUP0;
break;
case CDB_GROUPID_1:
j = CDB_GROUP1;
break;
case CDB_GROUPID_2:
j = CDB_GROUP2;
break;
case CDB_GROUPID_3:
j = CDB_GROUP3;
break;
case CDB_GROUPID_4:
j = CDB_GROUP4;
break;
case CDB_GROUPID_5:
j = CDB_GROUP5;
break;
case CDB_GROUPID_6:
j = CDB_GROUP6;
break;
case CDB_GROUPID_7:
j = CDB_GROUP7;
break;
default:
j = 0;
}
if (j == 0)
j = sizeof (cmd->bytes);
for (i = 0; i < j-1; i++) /* already done the opcode */
printf(" %02x", cmd->bytes[i]);
}
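#if 0
/*
 * Illustrative sketch (not kernel code): the CDB group is the top
 * three bits of the opcode, and each group implies a CDB length.  The
 * lengths below follow the usual SCSI convention (group 0 = 6 bytes,
 * 1 and 2 = 10, 4 = 16, 5 = 12); groups 3, 6 and 7 are
 * reserved/vendor-specific and are shown as 0, meaning "dump the whole
 * buffer", as the function above does.
 */
#include <stdio.h>

int
main(void)
{
	static const int cdblen[8] = { 6, 10, 10, 0, 16, 12, 0, 0 };
	unsigned char opcode = 0x28;	/* READ(10) */

	printf("opcode 0x%02x -> group %d, %d-byte CDB\n",
	    opcode, opcode >> 5, cdblen[opcode >> 5]);
	return 0;
}
#endif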
/*
* scsipi_interpret_sense:
*
* Look at the returned sense and act on the error, determining
* the unix error number to pass back. (0 = report no error)
*
* NOTE: If we return ERESTART, we are expected to have
* thawed the device!
*
* THIS IS THE DEFAULT ERROR HANDLER FOR SCSI DEVICES.
*/
int
scsipi_interpret_sense(struct scsipi_xfer *xs)
{
struct scsi_sense_data *sense;
struct scsipi_periph *periph = xs->xs_periph;
u_int8_t key;
int error;
u_int32_t info;
static const char *error_mes[] = {
"soft error (corrected)",
"not ready", "medium error",
"non-media hardware failure", "illegal request",
"unit attention", "readonly device",
"no data found", "vendor unique",
"copy aborted", "command aborted",
"search returned equal", "volume overflow",
"verify miscompare", "unknown error key"
};
sense = &xs->sense.scsi_sense;
#ifdef SCSIPI_DEBUG
if (periph->periph_flags & SCSIPI_DB1) {
int count, len;
scsipi_printaddr(periph);
printf(" sense debug information:\n");
printf("\tcode 0x%x valid %d\n",
SSD_RCODE(sense->response_code),
sense->response_code & SSD_RCODE_VALID ? 1 : 0);
printf("\tseg 0x%x key 0x%x ili 0x%x eom 0x%x fmark 0x%x\n",
sense->segment,
SSD_SENSE_KEY(sense->flags),
sense->flags & SSD_ILI ? 1 : 0,
sense->flags & SSD_EOM ? 1 : 0,
sense->flags & SSD_FILEMARK ? 1 : 0);
printf("\ninfo: 0x%x 0x%x 0x%x 0x%x followed by %d "
"extra bytes\n",
sense->info[0],
sense->info[1],
sense->info[2],
sense->info[3],
sense->extra_len);
len = SSD_ADD_BYTES_LIM(sense);
printf("\textra (up to %d bytes): ", len);
for (count = 0; count < len; count++)
printf("0x%x ", sense->csi[count]);
printf("\n");
}
#endif
/*
* If the periph has its own error handler, call it first.
* If it returns a legit error value, return that, otherwise
* it wants us to continue with normal error processing.
*/
if (periph->periph_switch->psw_error != NULL) {
SC_DEBUG(periph, SCSIPI_DB2,
("calling private err_handler()\n"));
error = (*periph->periph_switch->psw_error)(xs);
if (error != EJUSTRETURN)
return error;
}
/* otherwise use the default */
switch (SSD_RCODE(sense->response_code)) {
/*
* Old SCSI-1 and SASI devices respond with
* codes other than 70.
*/
case 0x00: /* no error (command completed OK) */
return 0;
case 0x04: /* drive not ready after it was selected */
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control & XS_CTL_IGNORE_NOT_READY) != 0)
return 0;
/* XXX - display some sort of error here? */
return EIO;
case 0x20: /* invalid command */
if ((xs->xs_control &
XS_CTL_IGNORE_ILLEGAL_REQUEST) != 0)
return 0;
return EINVAL;
case 0x25: /* invalid LUN (Adaptec ACB-4000) */
return EACCES;
/*
* If it's code 70, use the extended stuff and
* interpret the key
*/
case 0x71: /* delayed error */
scsipi_printaddr(periph);
key = SSD_SENSE_KEY(sense->flags);
printf(" DEFERRED ERROR, key = 0x%x\n", key);
/* FALLTHROUGH */
case 0x70:
if ((sense->response_code & SSD_RCODE_VALID) != 0)
info = _4btol(sense->info);
else
info = 0;
key = SSD_SENSE_KEY(sense->flags);
switch (key) {
case SKEY_NO_SENSE:
case SKEY_RECOVERED_ERROR:
if (xs->resid == xs->datalen && xs->datalen) {
/*
* Why is this here?
*/
xs->resid = 0; /* not short read */
}
error = 0;
break;
case SKEY_EQUAL:
error = 0;
break;
case SKEY_NOT_READY:
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control & XS_CTL_IGNORE_NOT_READY) != 0)
return 0;
if (sense->asc == 0x3A) {
error = ENODEV; /* Medium not present */
if (xs->xs_control & XS_CTL_SILENT_NODEV)
return error;
} else
error = EIO;
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return error;
break;
case SKEY_ILLEGAL_REQUEST:
if ((xs->xs_control &
XS_CTL_IGNORE_ILLEGAL_REQUEST) != 0)
return 0;
/*
* Handle the case where a device reports
* Logical Unit Not Supported during discovery.
*/
if ((xs->xs_control & XS_CTL_DISCOVERY) != 0 &&
sense->asc == 0x25 &&
sense->ascq == 0x00)
return EINVAL;
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return EIO;
error = EINVAL;
break;
case SKEY_UNIT_ATTENTION:
if (sense->asc == 0x29 &&
sense->ascq == 0x00) {
/* device or bus reset */
return ERESTART;
}
if ((periph->periph_flags & PERIPH_REMOVABLE) != 0)
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
if ((xs->xs_control &
XS_CTL_IGNORE_MEDIA_CHANGE) != 0 ||
/* XXX Should reupload any transient state. */
(periph->periph_flags &
PERIPH_REMOVABLE) == 0) {
return ERESTART;
}
if ((xs->xs_control & XS_CTL_SILENT) != 0)
return EIO;
error = EIO;
break;
case SKEY_DATA_PROTECT:
error = EROFS;
break;
case SKEY_BLANK_CHECK:
error = 0;
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
break;
case SKEY_VOLUME_OVERFLOW:
error = ENOSPC;
break;
default:
error = EIO;
break;
}
/* Print verbose decode if appropriate and possible */
if ((key == 0) ||
((xs->xs_control & XS_CTL_SILENT) != 0) ||
(scsipi_print_sense(xs, 0) != 0))
return error;
/* Print brief(er) sense information */
scsipi_printaddr(periph);
printf("%s", error_mes[key - 1]);
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
switch (key) {
case SKEY_NOT_READY:
case SKEY_ILLEGAL_REQUEST:
case SKEY_UNIT_ATTENTION:
case SKEY_DATA_PROTECT:
break;
case SKEY_BLANK_CHECK:
printf(", requested size: %d (decimal)",
info);
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries)
printf(", retrying");
printf(", cmd 0x%x, info 0x%x",
xs->cmd->opcode, info);
break;
default:
printf(", info = %d (decimal)", info);
}
}
if (sense->extra_len != 0) {
int n;
printf(", data =");
for (n = 0; n < sense->extra_len; n++)
printf(" %02x",
sense->csi[n]);
}
printf("\n");
return error;
/*
* Some other code, just report it
*/
default:
#if defined(SCSIDEBUG) || defined(DEBUG)
{
static const char *uc = "undecodable sense error";
int i;
u_int8_t *cptr = (u_int8_t *) sense;
scsipi_printaddr(periph);
if (xs->cmd == &xs->cmdstore) {
printf("%s for opcode 0x%x, data=",
uc, xs->cmdstore.opcode);
} else {
printf("%s, data=", uc);
}
for (i = 0; i < sizeof(*sense); i++)
printf(" 0x%02x", *(cptr++) & 0xff);
printf("\n");
}
#else
scsipi_printaddr(periph);
printf("Sense Error Code 0x%x",
SSD_RCODE(sense->response_code));
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
struct scsi_sense_data_unextended *usense =
(struct scsi_sense_data_unextended *)sense;
printf(" at block no. %d (decimal)",
_3btol(usense->block));
}
printf("\n");
#endif
return EIO;
}
}
/*
* scsipi_test_unit_ready:
*
* Issue a `test unit ready' request.
*/
int
scsipi_test_unit_ready(struct scsipi_periph *periph, int flags)
{
struct scsi_test_unit_ready cmd;
int retries;
/* some ATAPI drives don't support TEST UNIT READY. Sigh */
if (periph->periph_quirks & PQUIRK_NOTUR)
return 0;
if (flags & XS_CTL_DISCOVERY)
retries = 0;
else
retries = SCSIPIRETRIES;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_TEST_UNIT_READY;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
retries, 10000, NULL, flags);
}
static const struct scsipi_inquiry3_pattern {
const char vendor[8];
const char product[16];
const char revision[4];
} scsipi_inquiry3_quirk[] = {
{ "ES-6600 ", "", "" },
};
static int
scsipi_inquiry3_ok(const struct scsipi_inquiry_data *ib)
{
for (size_t i = 0; i < __arraycount(scsipi_inquiry3_quirk); i++) {
const struct scsipi_inquiry3_pattern *q =
&scsipi_inquiry3_quirk[i];
#define MATCH(field) \
(q->field[0] ? memcmp(ib->field, q->field, sizeof(ib->field)) == 0 : 1)
if (MATCH(vendor) && MATCH(product) && MATCH(revision))
return 0;
}
return 1;
}
/*
* scsipi_inquire:
*
* Ask the device about itself.
*/
int
scsipi_inquire(struct scsipi_periph *periph, struct scsipi_inquiry_data *inqbuf,
int flags)
{
struct scsipi_inquiry cmd;
int error;
int retries;
if (flags & XS_CTL_DISCOVERY)
retries = 0;
else
retries = SCSIPIRETRIES;
/*
* If we request more data than the device can provide, it SHOULD just
* return a short response. However, some devices error with an
* ILLEGAL REQUEST sense code, and yet others have even more special
* failure modes (such as the GL641USB flash adapter, which goes loony
* and sends corrupted CRCs). To work around this, and to bring our
* behavior more in line with other OSes, we do a shorter inquiry,
* covering all the SCSI-2 information, first, and then request more
* data iff the "additional length" field indicates there is more.
* - mycroft, 2003/10/16
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = INQUIRY;
cmd.length = SCSIPI_INQUIRY_LENGTH_SCSI2;
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)inqbuf, SCSIPI_INQUIRY_LENGTH_SCSI2, retries,
10000, NULL, flags | XS_CTL_DATA_IN);
if (!error &&
inqbuf->additional_length > SCSIPI_INQUIRY_LENGTH_SCSI2 - 4) {
if (scsipi_inquiry3_ok(inqbuf)) {
#if 0
printf("inquire: addlen=%d, retrying\n", inqbuf->additional_length);
#endif
cmd.length = SCSIPI_INQUIRY_LENGTH_SCSI3;
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)inqbuf, SCSIPI_INQUIRY_LENGTH_SCSI3, retries,
10000, NULL, flags | XS_CTL_DATA_IN);
#if 0
printf("inquire: error=%d\n", error);
#endif
}
}
#ifdef SCSI_OLD_NOINQUIRY
/*
* Kludge for the Adaptec ACB-4000 SCSI->MFM translator.
* This board doesn't support the INQUIRY command at all.
*/
if (error == EINVAL || error == EACCES) {
/*
* Conjure up an INQUIRY response.
*/
inqbuf->device = (error == EINVAL ?
SID_QUAL_LU_PRESENT :
SID_QUAL_LU_NOTPRESENT) | T_DIRECT;
inqbuf->dev_qual2 = 0;
inqbuf->version = 0;
inqbuf->response_format = SID_FORMAT_SCSI1;
inqbuf->additional_length = SCSIPI_INQUIRY_LENGTH_SCSI2 - 4;
inqbuf->flags1 = inqbuf->flags2 = inqbuf->flags3 = 0;
memcpy(inqbuf->vendor, "ADAPTEC ACB-4000 ", 28);
error = 0;
}
/*
* Kludge for the Emulex MT-02 SCSI->QIC translator.
* This board gives an empty response to an INQUIRY command.
*/
else if (error == 0 &&
inqbuf->device == (SID_QUAL_LU_PRESENT | T_DIRECT) &&
inqbuf->dev_qual2 == 0 &&
inqbuf->version == 0 &&
inqbuf->response_format == SID_FORMAT_SCSI1) {
/*
* Fill out the INQUIRY response.
*/
inqbuf->device = (SID_QUAL_LU_PRESENT | T_SEQUENTIAL);
inqbuf->dev_qual2 = SID_REMOVABLE;
inqbuf->additional_length = SCSIPI_INQUIRY_LENGTH_SCSI2 - 4;
inqbuf->flags1 = inqbuf->flags2 = inqbuf->flags3 = 0;
memcpy(inqbuf->vendor, "EMULEX MT-02 QIC ", 28);
}
#endif /* SCSI_OLD_NOINQUIRY */
return error;
}
/*
* scsipi_prevent:
*
* Prevent or allow the user to remove the media
*/
int
scsipi_prevent(struct scsipi_periph *periph, int type, int flags)
{
struct scsi_prevent_allow_medium_removal cmd;
if (periph->periph_quirks & PQUIRK_NODOORLOCK)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_PREVENT_ALLOW_MEDIUM_REMOVAL;
cmd.how = type;
return (scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, 5000, NULL, flags));
}
/*
* scsipi_start:
*
* Send a START UNIT.
*/
int
scsipi_start(struct scsipi_periph *periph, int type, int flags)
{
struct scsipi_start_stop cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = START_STOP;
cmd.byte2 = 0x00;
cmd.how = type;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd), 0, 0,
SCSIPIRETRIES, (type & SSS_START) ? 60000 : 10000, NULL, flags);
}
/*
* scsipi_mode_sense, scsipi_mode_sense_big:
* get a sense page from a device
*/
int
scsipi_mode_sense(struct scsipi_periph *periph, int byte2, int page,
struct scsi_mode_parameter_header_6 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_sense_6 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SENSE_6;
cmd.byte2 = byte2;
cmd.page = page;
cmd.length = len & 0xff;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_IN);
}
int
scsipi_mode_sense_big(struct scsipi_periph *periph, int byte2, int page,
struct scsi_mode_parameter_header_10 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_sense_10 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SENSE_10;
cmd.byte2 = byte2;
cmd.page = page;
_lto2b(len, cmd.length);
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_IN);
}
int
scsipi_mode_select(struct scsipi_periph *periph, int byte2,
struct scsi_mode_parameter_header_6 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_select_6 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SELECT_6;
cmd.byte2 = byte2;
cmd.length = len & 0xff;
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_OUT);
}
int
scsipi_mode_select_big(struct scsipi_periph *periph, int byte2,
struct scsi_mode_parameter_header_10 *data, int len, int flags, int retries,
int timeout)
{
struct scsi_mode_select_10 cmd;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MODE_SELECT_10;
cmd.byte2 = byte2;
_lto2b(len, cmd.length);
return scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, retries, timeout, NULL, flags | XS_CTL_DATA_OUT);
}
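#if 0
/*
 * Illustrative sketch (not kernel code) of the big-endian length
 * encoding used above: the 6-byte MODE commands carry a one-byte
 * allocation length, while the 10-byte variants store it as two bytes,
 * most significant first, which is what _lto2b() does.
 */
#include <stdio.h>

static void
lto2b(unsigned long val, unsigned char *bytes)
{
	bytes[0] = (val >> 8) & 0xff;	/* MSB first */
	bytes[1] = val & 0xff;
}

int
main(void)
{
	unsigned char len2[2];

	lto2b(0x1234, len2);
	printf("0x1234 -> %02x %02x\n", len2[0], len2[1]);	/* 12 34 */
	return 0;
}
#endif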
/*
* scsipi_get_opcodeinfo:
*
* Query the device for supported commands and their timeouts,
* building a timeout lookup table if timeout information is available.
*/
void
scsipi_get_opcodeinfo(struct scsipi_periph *periph)
{
u_int8_t *data;
int len = 16*1024;
int rc;
struct scsi_repsuppopcode cmd;
/* refrain from asking for supported opcodes */
if (periph->periph_quirks & PQUIRK_NOREPSUPPOPC ||
periph->periph_type == T_PROCESSOR || /* spec. */
periph->periph_type == T_CDROM) /* spec. */
return;
scsipi_free_opcodeinfo(periph);
/*
* query REPORT SUPPORTED OPERATION CODES
* if OK
* enumerate all codes
* if timeout exists insert maximum into opcode table
*/
data = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_MAINTENANCE_IN;
cmd.svcaction = RSOC_REPORT_SUPPORTED_OPCODES;
cmd.repoption = RSOC_RCTD|RSOC_ALL;
_lto4b(len, cmd.alloclen);
rc = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)data, len, 0, 1000, NULL,
XS_CTL_DATA_IN|XS_CTL_SILENT);
if (rc == 0) {
int count;
int dlen = _4btol(data);
u_int8_t *c = data + 4;
SC_DEBUG(periph, SCSIPI_DB3,
("supported opcode timeout-values loaded\n"));
SC_DEBUG(periph, SCSIPI_DB3,
("CMD LEN SA spec nom. time cmd timeout\n"));
struct scsipi_opcodes *tot = malloc(sizeof(struct scsipi_opcodes),
M_DEVBUF, M_WAITOK|M_ZERO);
count = 0;
while (tot != NULL &&
dlen >= (int)sizeof(struct scsi_repsupopcode_all_commands_descriptor)) {
struct scsi_repsupopcode_all_commands_descriptor *acd
= (struct scsi_repsupopcode_all_commands_descriptor *)c;
#ifdef SCSIPI_DEBUG
int cdblen = _2btol((const u_int8_t *)&acd->cdblen);
#endif
dlen -= sizeof(struct scsi_repsupopcode_all_commands_descriptor);
c += sizeof(struct scsi_repsupopcode_all_commands_descriptor);
SC_DEBUG(periph, SCSIPI_DB3,
("0x%02x(%2d) ", acd->opcode, cdblen));
tot->opcode_info[acd->opcode].ti_flags = SCSIPI_TI_VALID;
if (acd->flags & RSOC_ACD_SERVACTV) {
SC_DEBUGN(periph, SCSIPI_DB3,
("0x%02x%02x ",
acd->serviceaction[0],
acd->serviceaction[1]));
} else {
SC_DEBUGN(periph, SCSIPI_DB3, (" "));
}
if (acd->flags & RSOC_ACD_CTDP
&& dlen >= (int)sizeof(struct scsi_repsupopcode_timeouts_descriptor)) {
struct scsi_repsupopcode_timeouts_descriptor *td
= (struct scsi_repsupopcode_timeouts_descriptor *)c;
long nomto = _4btol(td->nom_process_timeout);
long cmdto = _4btol(td->cmd_process_timeout);
long t = (cmdto > nomto) ? cmdto : nomto;
dlen -= sizeof(struct scsi_repsupopcode_timeouts_descriptor);
c += sizeof(struct scsi_repsupopcode_timeouts_descriptor);
SC_DEBUGN(periph, SCSIPI_DB3,
("0x%02x %10ld %10ld",
td->cmd_specific,
nomto, cmdto));
if (t > tot->opcode_info[acd->opcode].ti_timeout) {
tot->opcode_info[acd->opcode].ti_timeout = t;
++count;
}
}
SC_DEBUGN(periph, SCSIPI_DB3,("\n"));
}
if (count > 0) {
periph->periph_opcs = tot;
} else {
free(tot, M_DEVBUF);
SC_DEBUG(periph, SCSIPI_DB3,
("no usable timeout values available\n"));
}
} else {
SC_DEBUG(periph, SCSIPI_DB3,
("SCSI_MAINTENANCE_IN"
"[RSOC_REPORT_SUPPORTED_OPCODES] failed error=%d"
" - no device provided timeout "
"values available\n", rc));
}
free(data, M_DEVBUF);
}
/*
* scsipi_update_timeouts:
* Override the timeout value if device- or config-provided
* timeouts are available.
*/
static void
scsipi_update_timeouts(struct scsipi_xfer *xs)
{
struct scsipi_opcodes *opcs;
u_int8_t cmd;
int timeout;
struct scsipi_opinfo *oi;
if (xs->timeout <= 0) {
return;
}
opcs = xs->xs_periph->periph_opcs;
if (opcs == NULL) {
return;
}
cmd = xs->cmd->opcode;
oi = &opcs->opcode_info[cmd];
timeout = 1000 * (int)oi->ti_timeout;
if (timeout > xs->timeout && timeout < 86400000) {
/*
* pick up device configured timeouts if they
* are longer than the requested ones but less
* than a day
*/
#ifdef SCSIPI_DEBUG
if ((oi->ti_flags & SCSIPI_TI_LOGGED) == 0) {
SC_DEBUG(xs->xs_periph, SCSIPI_DB3,
("Overriding command 0x%02x "
"timeout of %d with %d ms\n",
cmd, xs->timeout, timeout));
oi->ti_flags |= SCSIPI_TI_LOGGED;
}
#endif
xs->timeout = timeout;
}
}
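/*
 * Editorial worked example: if REPORT SUPPORTED OPERATION CODES gave a
 * command timeout of 30 for some opcode, scsipi_get_opcodeinfo() stores
 * ti_timeout = 30 for it (treated as seconds, given the conversion
 * above), and an xfer of that opcode issued with xs->timeout = 10000
 * (10 s) is bumped here:
 *
 *	timeout = 1000 * 30;			30000 ms
 *	30000 > 10000 && 30000 < 86400000	-> xs->timeout = 30000
 */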
/*
* scsipi_free_opcodeinfo:
*
* free the opcode information table
*/
void
scsipi_free_opcodeinfo(struct scsipi_periph *periph)
{
if (periph->periph_opcs != NULL) {
free(periph->periph_opcs, M_DEVBUF);
}
periph->periph_opcs = NULL;
}
/*
* scsipi_done:
*
* This routine is called by an adapter's interrupt handler when
* an xfer is completed.
*/
void
scsipi_done(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int freezecnt;
SC_DEBUG(periph, SCSIPI_DB2, ("scsipi_done\n"));
#ifdef SCSIPI_DEBUG
if (periph->periph_dbflags & SCSIPI_DB1)
show_scsipi_cmd(xs);
#endif
mutex_enter(chan_mtx(chan));
SDT_PROBE1(scsi, base, xfer, done, xs);
/*
* The resource this command was using is now free.
*/
if (xs->xs_status & XS_STS_DONE) {
/* XXX in certain circumstances, such as a device
* being detached, an xs that has already been
* scsipi_done()'d by the main thread will be done'd
* again by scsibusdetach(). Putting the xs on the
* chan_complete queue causes list corruption and
* everyone dies. This prevents that, but perhaps
* there should be better coordination somewhere such
* that this won't ever happen (and can be turned into
* a KASSERT()).
*/
SDT_PROBE1(scsi, base, xfer, redone, xs);
mutex_exit(chan_mtx(chan));
goto out;
}
scsipi_put_resource(chan);
xs->xs_periph->periph_sent--;
/*
* If the command was tagged, free the tag.
*/
if (XS_CTL_TAGTYPE(xs) != 0)
scsipi_put_tag(xs);
else
periph->periph_flags &= ~PERIPH_UNTAG;
/* Mark the command as `done'. */
xs->xs_status |= XS_STS_DONE;
#ifdef DIAGNOSTIC
if ((xs->xs_control & (XS_CTL_ASYNC|XS_CTL_POLL)) ==
(XS_CTL_ASYNC|XS_CTL_POLL))
panic("scsipi_done: ASYNC and POLL");
#endif
/*
* If the xfer had an error of any sort, freeze the
* periph's queue. Freeze it again if we were requested
* to do so in the xfer.
*/
freezecnt = 0;
if (xs->error != XS_NOERROR)
freezecnt++;
if (xs->xs_control & XS_CTL_FREEZE_PERIPH)
freezecnt++;
if (freezecnt != 0)
scsipi_periph_freeze_locked(periph, freezecnt);
/*
* record the xfer with a pending sense, in case a SCSI reset is
* received before the thread is woken up.
*/
if (xs->error == XS_BUSY && xs->status == SCSI_CHECK) {
periph->periph_flags |= PERIPH_SENSE;
periph->periph_xscheck = xs;
}
/*
* If this was an xfer that was not to complete asynchronously,
* let the requesting thread perform error checking/handling
* in its context.
*/
if ((xs->xs_control & XS_CTL_ASYNC) == 0) {
/*
* If it's a polling job, just return, to unwind the
* call graph. We don't need to restart the queue,
* because polling jobs are treated specially, and
* are really only used during crash dumps anyway
* (XXX or during boot-time autoconfiguration of
* ATAPI devices).
*/
if (xs->xs_control & XS_CTL_POLL) {
mutex_exit(chan_mtx(chan));
return;
}
cv_broadcast(xs_cv(xs));
mutex_exit(chan_mtx(chan));
goto out;
}
/*
* Catch the extremely common case of I/O completing
* without error; no use in taking a context switch
* if we can handle it in interrupt context.
*/
if (xs->error == XS_NOERROR) {
mutex_exit(chan_mtx(chan));
(void) scsipi_complete(xs);
goto out;
}
/*
* There is an error on this xfer. Put it on the channel's
* completion queue, and wake up the completion thread.
*/
TAILQ_INSERT_TAIL(&chan->chan_complete, xs, channel_q);
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
out:
/*
* If there are more xfers on the channel's queue, attempt to
* run them.
*/
scsipi_run_queue(chan);
}
/*
* scsipi_complete:
*
* Completion of a scsipi_xfer. This is the guts of scsipi_done().
*
* NOTE: This routine MUST be called with valid thread context
* except for the case where the following two conditions are
* true:
*
* xs->error == XS_NOERROR
* XS_CTL_ASYNC is set in xs->xs_control
*
* The semantics of this routine can be tricky, so here is an
* explanation:
*
* 0 Xfer completed successfully.
*
* ERESTART Xfer had an error, but was restarted.
*
* anything else Xfer had an error, return value is Unix
* errno.
*
* If the return value is anything but ERESTART:
*
* - If XS_CTL_ASYNC is set, `xs' has been freed back to
* the pool.
* - If there is a buf associated with the xfer,
* it has been biodone()'d.
*/
static int
scsipi_complete(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int error;
SDT_PROBE1(scsi, base, xfer, complete, xs);
#ifdef DIAGNOSTIC
if ((xs->xs_control & XS_CTL_ASYNC) != 0 && xs->bp == NULL)
panic("scsipi_complete: XS_CTL_ASYNC but no buf");
#endif
/*
* If command terminated with a CHECK CONDITION, we need to issue a
* REQUEST_SENSE command. Once the REQUEST_SENSE has been processed
* we'll have the real status.
* Must be processed with channel lock held to avoid missing
* a SCSI bus reset for this command.
*/
mutex_enter(chan_mtx(chan));
if (xs->error == XS_BUSY && xs->status == SCSI_CHECK) {
/* request sense for a request sense? */
if (xs->xs_control & XS_CTL_REQSENSE) {
scsipi_printaddr(periph);
printf("request sense for a request sense ?\n");
/* XXX maybe we should reset the device? */
/* we've been frozen because xs->error != XS_NOERROR */
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
if (xs->resid < xs->datalen) {
printf("we read %d bytes of sense anyway:\n",
xs->datalen - xs->resid);
scsipi_print_sense_data((void *)xs->data, 0);
}
return EINVAL;
}
mutex_exit(chan_mtx(chan)); // XXX allows other commands to queue or run
scsipi_request_sense(xs);
} else
mutex_exit(chan_mtx(chan));
/*
* If it's a user-level request, bypass all usual completion
* processing and let the user work it out.
*/
if ((xs->xs_control & XS_CTL_USERCMD) != 0) {
SC_DEBUG(periph, SCSIPI_DB3, ("calling user done()\n"));
mutex_enter(chan_mtx(chan));
if (xs->error != XS_NOERROR)
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
scsipi_user_done(xs);
SC_DEBUG(periph, SCSIPI_DB3, ("returned from user done()\n "));
return 0;
}
switch (xs->error) {
case XS_NOERROR:
error = 0;
break;
case XS_SENSE:
case XS_SHORTSENSE:
error = (*chan->chan_bustype->bustype_interpret_sense)(xs);
break;
case XS_RESOURCE_SHORTAGE:
/*
* XXX Should freeze channel's queue.
*/
scsipi_printaddr(periph);
printf("adapter resource shortage\n");
/* FALLTHROUGH */
case XS_BUSY:
if (xs->error == XS_BUSY && xs->status == SCSI_QUEUE_FULL) {
struct scsipi_max_openings mo;
/*
* We set the openings to active - 1, assuming that
* the command that got us here is the first one that
* can't fit into the device's queue. If that's not
* the case, I guess we'll find out soon enough.
*/
mo.mo_target = periph->periph_target;
mo.mo_lun = periph->periph_lun;
if (periph->periph_active < periph->periph_openings)
mo.mo_openings = periph->periph_active - 1;
else
mo.mo_openings = periph->periph_openings - 1;
#ifdef DIAGNOSTIC
if (mo.mo_openings < 0) {
scsipi_printaddr(periph);
printf("QUEUE FULL resulted in < 0 openings\n");
panic("scsipi_done");
}
#endif
if (mo.mo_openings == 0) {
scsipi_printaddr(periph);
printf("QUEUE FULL resulted in 0 openings\n");
mo.mo_openings = 1;
}
scsipi_async_event(chan, ASYNC_EVENT_MAX_OPENINGS, &mo);
error = ERESTART;
} else if (xs->xs_retries != 0) {
xs->xs_retries--;
/*
* Wait one second, and try again.
*/
mutex_enter(chan_mtx(chan));
if ((xs->xs_control & XS_CTL_POLL) ||
(chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/* XXX: quite extreme */
kpause("xsbusy", false, hz, chan_mtx(chan));
} else if (!callout_pending(&periph->periph_callout)) {
scsipi_periph_freeze_locked(periph, 1);
callout_reset(&periph->periph_callout,
hz, scsipi_periph_timed_thaw, periph);
}
mutex_exit(chan_mtx(chan));
error = ERESTART;
} else
error = EBUSY;
break;
case XS_REQUEUE:
error = ERESTART;
break;
case XS_SELTIMEOUT:
case XS_TIMEOUT:
/*
* If the device hasn't gone away, honor retry counts.
*
* Note that if we're in the middle of probing it,
* it won't be found because it isn't here yet, so
* we won't honor the retry count in that case.
*/
if (scsipi_lookup_periph(chan, periph->periph_target,
periph->periph_lun) && xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
break;
case XS_RESET:
if (xs->xs_control & XS_CTL_REQSENSE) {
/*
* request sense interrupted by reset: signal it
* with EINTR return code.
*/
error = EINTR;
} else {
if (xs->xs_retries != 0) {
xs->xs_retries--;
error = ERESTART;
} else
error = EIO;
}
break;
case XS_DRIVER_STUFFUP:
scsipi_printaddr(periph);
printf("generic HBA error\n");
error = EIO;
break;
default:
scsipi_printaddr(periph);
printf("invalid return code from adapter: %d\n", xs->error);
error = EIO;
break;
}
mutex_enter(chan_mtx(chan));
if (error == ERESTART) {
SDT_PROBE1(scsi, base, xfer, restart, xs);
/*
* If we get here, the periph has been thawed and frozen
* again if we had to issue recovery commands. Alternatively,
* it may have been frozen again and in a timed thaw. In
* any case, we thaw the periph once we re-enqueue the
* command. Once the periph is fully thawed, it will begin
* operation again.
*/
xs->error = XS_NOERROR;
xs->status = SCSI_OK;
xs->xs_status &= ~XS_STS_DONE;
xs->xs_requeuecnt++;
error = scsipi_enqueue(xs);
if (error == 0) {
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
return ERESTART;
}
}
/*
* scsipi_done() freezes the queue if not XS_NOERROR.
* Thaw it here.
*/
if (xs->error != XS_NOERROR)
scsipi_periph_thaw_locked(periph, 1);
mutex_exit(chan_mtx(chan));
if (periph->periph_switch->psw_done)
periph->periph_switch->psw_done(xs, error);
mutex_enter(chan_mtx(chan));
if (xs->xs_control & XS_CTL_ASYNC)
scsipi_put_xs(xs);
mutex_exit(chan_mtx(chan));
return error;
}
/*
* Issue a request sense for the given scsipi_xfer. Called when the xfer
* returns with a CHECK_CONDITION status. Must be called in valid thread
* context.
*/
static void
scsipi_request_sense(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
int flags, error;
struct scsi_request_sense cmd;
periph->periph_flags |= PERIPH_SENSE;
/* if command was polling, request sense will too */
flags = xs->xs_control & XS_CTL_POLL;
/* Polling commands can't sleep */
if (flags)
flags |= XS_CTL_NOSLEEP;
flags |= XS_CTL_REQSENSE | XS_CTL_URGENT | XS_CTL_DATA_IN |
XS_CTL_THAW_PERIPH | XS_CTL_FREEZE_PERIPH;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SCSI_REQUEST_SENSE;
cmd.length = sizeof(struct scsi_sense_data);
error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
(void *)&xs->sense.scsi_sense, sizeof(struct scsi_sense_data),
0, 1000, NULL, flags);
periph->periph_flags &= ~PERIPH_SENSE;
periph->periph_xscheck = NULL;
switch (error) {
case 0:
/* we have a valid sense */
xs->error = XS_SENSE;
return;
case EINTR:
/* REQUEST_SENSE interrupted by bus reset. */
xs->error = XS_RESET;
return;
case EIO:
/* request sense couldn't be performed */
/*
* XXX this isn't quite right but we don't have anything
* better for now
*/
xs->error = XS_DRIVER_STUFFUP;
return;
default:
/* Notify that request sense failed. */
xs->error = XS_DRIVER_STUFFUP;
scsipi_printaddr(periph);
printf("request sense failed with error %d\n", error);
return;
}
}
/*
* scsipi_enqueue:
*
* Enqueue an xfer on a channel.
*/
static int
scsipi_enqueue(struct scsipi_xfer *xs)
{
struct scsipi_channel *chan = xs->xs_periph->periph_channel;
struct scsipi_xfer *qxs;
SDT_PROBE1(scsi, base, xfer, enqueue, xs);
/*
* If the xfer is to be polled, and there are already jobs on
* the queue, we can't proceed.
*/
KASSERT(mutex_owned(chan_mtx(chan)));
if ((xs->xs_control & XS_CTL_POLL) != 0 &&
TAILQ_FIRST(&chan->chan_queue) != NULL) {
xs->error = XS_DRIVER_STUFFUP;
return EAGAIN;
}
/*
* If we have an URGENT xfer, it's an error recovery command
* and it should just go on the head of the channel's queue.
*/
if (xs->xs_control & XS_CTL_URGENT) {
TAILQ_INSERT_HEAD(&chan->chan_queue, xs, channel_q);
goto out;
}
/*
* If this xfer has already been on the queue before, we
* need to reinsert it in the correct order. That order is:
*
* Immediately before the first xfer for this periph
* with a requeuecnt less than xs->xs_requeuecnt.
*
* Failing that, at the end of the queue. (We'll end up
* there naturally.)
*/
if (xs->xs_requeuecnt != 0) {
	for (qxs = TAILQ_FIRST(&chan->chan_queue); qxs != NULL;
qxs = TAILQ_NEXT(qxs, channel_q)) {
if (qxs->xs_periph == xs->xs_periph &&
qxs->xs_requeuecnt < xs->xs_requeuecnt)
break;
}
if (qxs != NULL) {
TAILQ_INSERT_AFTER(&chan->chan_queue, qxs, xs,
channel_q);
goto out;
}
}
TAILQ_INSERT_TAIL(&chan->chan_queue, xs, channel_q);
out:
if (xs->xs_control & XS_CTL_THAW_PERIPH)
	scsipi_periph_thaw_locked(xs->xs_periph, 1);
return 0;
}
/*
* scsipi_run_queue:
*
* Start as many xfers as possible running on the channel.
*/
static void
scsipi_run_queue(struct scsipi_channel *chan)
{
struct scsipi_xfer *xs;
struct scsipi_periph *periph;
SDT_PROBE1(scsi, base, queue, batch__start, chan);
for (;;) {
mutex_enter(chan_mtx(chan));
/*
* If the channel is frozen, we can't do any work right
* now.
*/
if (chan->chan_qfreeze != 0) {
mutex_exit(chan_mtx(chan));
break;
}
/*
* Look for work to do, and make sure we can do it.
*/
for (xs = TAILQ_FIRST(&chan->chan_queue); xs != NULL;
xs = TAILQ_NEXT(xs, channel_q)) {
periph = xs->xs_periph;
if ((periph->periph_sent >= periph->periph_openings) ||
    periph->periph_qfreeze != 0 ||
(periph->periph_flags & PERIPH_UNTAG) != 0)
continue;
if ((periph->periph_flags & (PERIPH_RECOVERING | PERIPH_SENSE)) != 0 &&
(xs->xs_control & XS_CTL_URGENT) == 0)
continue;
/*
* We can issue this xfer!
*/
goto got_one;
}
/*
* Can't find any work to do right now.
*/
mutex_exit(chan_mtx(chan));
break;
got_one:
/*
* Have an xfer to run. Allocate a resource from
* the adapter to run it. If we can't allocate that
* resource, we don't dequeue the xfer.
*/
if (scsipi_get_resource(chan) == 0) {
/*
* Adapter is out of resources. If the adapter
* supports it, attempt to grow them.
*/
if (scsipi_grow_resources(chan) == 0) {
/*
* Wasn't able to grow resources,
* nothing more we can do.
*/
if (xs->xs_control & XS_CTL_POLL) {
	scsipi_printaddr(xs->xs_periph);
printf("polling command but no "
"adapter resources");
/* We'll panic shortly... */
}
mutex_exit(chan_mtx(chan));
/*
* XXX: We should be able to note that resources are
* XXX: needed here!
*/
break;
}
/*
* scsipi_grow_resources() allocated the resource
* for us.
*/
}
/*
* We have a resource to run this xfer, do it!
*/
TAILQ_REMOVE(&chan->chan_queue, xs, channel_q);
/*
* If the command is to be tagged, allocate a tag ID
* for it.
*/
if (XS_CTL_TAGTYPE(xs) != 0)
scsipi_get_tag(xs);
else
periph->periph_flags |= PERIPH_UNTAG;
periph->periph_sent++;
mutex_exit(chan_mtx(chan));
SDT_PROBE2(scsi, base, queue, run, chan, xs);
scsipi_adapter_request(chan, ADAPTER_REQ_RUN_XFER, xs);
}
SDT_PROBE1(scsi, base, queue, batch__done, chan);
}
/*
* scsipi_execute_xs:
*
* Begin execution of an xfer, waiting for it to complete, if necessary.
*/
int
scsipi_execute_xs(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsipi_channel *chan = periph->periph_channel;
int oasync, async, poll, error;
KASSERT(!cold);
scsipi_update_timeouts(xs);
(chan->chan_bustype->bustype_cmd)(xs);
xs->xs_status &= ~XS_STS_DONE;
xs->error = XS_NOERROR;
xs->resid = xs->datalen;
xs->status = SCSI_OK;
SDT_PROBE1(scsi, base, xfer, execute, xs);
#ifdef SCSIPI_DEBUG
if (xs->xs_periph->periph_dbflags & SCSIPI_DB3) {
printf("scsipi_execute_xs: ");
show_scsipi_xs(xs);
printf("\n");
}
#endif
/*
* Deal with command tagging:
*
* - If the device's current operating mode doesn't
* include tagged queueing, clear the tag mask.
*
* - If the device's current operating mode *does*
* include tagged queueing, set the tag_type in
* the xfer to the appropriate byte for the tag
* message.
*/
if ((PERIPH_XFER_MODE(periph) & PERIPH_CAP_TQING) == 0 ||
(xs->xs_control & XS_CTL_REQSENSE)) {
xs->xs_control &= ~XS_CTL_TAGMASK;
xs->xs_tag_type = 0;
} else {
/*
* If the request doesn't specify a tag, give Head
* tags to URGENT operations and Simple tags to
* everything else.
*/
if (XS_CTL_TAGTYPE(xs) == 0) {
	if (xs->xs_control & XS_CTL_URGENT)
xs->xs_control |= XS_CTL_HEAD_TAG;
else
xs->xs_control |= XS_CTL_SIMPLE_TAG;
}
switch (XS_CTL_TAGTYPE(xs)) {
case XS_CTL_ORDERED_TAG:
xs->xs_tag_type = MSG_ORDERED_Q_TAG;
break;
case XS_CTL_SIMPLE_TAG:
xs->xs_tag_type = MSG_SIMPLE_Q_TAG;
break;
case XS_CTL_HEAD_TAG:
xs->xs_tag_type = MSG_HEAD_OF_Q_TAG;
break;
default:
scsipi_printaddr(periph);
printf("invalid tag mask 0x%08x\n",
XS_CTL_TAGTYPE(xs));
panic("scsipi_execute_xs");
}
}
/* If the adapter wants us to poll, poll. */
if (chan->chan_adapter->adapt_flags & SCSIPI_ADAPT_POLL_ONLY)
	xs->xs_control |= XS_CTL_POLL;
/*
* If we don't yet have a completion thread, or we are to poll for
* completion, clear the ASYNC flag.
*/
oasync = (xs->xs_control & XS_CTL_ASYNC);
if (chan->chan_thread == NULL || (xs->xs_control & XS_CTL_POLL) != 0)
xs->xs_control &= ~XS_CTL_ASYNC;
async = (xs->xs_control & XS_CTL_ASYNC);
poll = (xs->xs_control & XS_CTL_POLL);
#ifdef DIAGNOSTIC
if (oasync != 0 && xs->bp == NULL)
	panic("scsipi_execute_xs: XS_CTL_ASYNC but no buf");
#endif
/*
* Enqueue the transfer. If we're not polling for completion, this
* should ALWAYS return `no error'.
*/
error = scsipi_enqueue(xs);
if (error) {
if (poll == 0) {
scsipi_printaddr(periph);
printf("not polling, but enqueue failed with %d\n",
error);
panic("scsipi_execute_xs");
}
scsipi_printaddr(periph);
printf("should have flushed queue?\n");
goto free_xs;
}
mutex_exit(chan_mtx(chan));
restarted:
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
/*
* The xfer is enqueued, and possibly running. If it's to be
* completed asynchronously, just return now.
*/
if (async)
return 0;
/*
* Not an asynchronous command; wait for it to complete.
*/
while ((xs->xs_status & XS_STS_DONE) == 0) {
if (poll) {
scsipi_printaddr(periph);
printf("polling command not done\n");
panic("scsipi_execute_xs");
}
cv_wait(xs_cv(xs), chan_mtx(chan));
}
/*
* Command is complete. scsipi_done() has awakened us to perform
* the error handling.
*/
mutex_exit(chan_mtx(chan));
error = scsipi_complete(xs);
if (error == ERESTART)
goto restarted;
/*
* If it was meant to run async and we cleared async ourselves,
* don't return an error here. It has already been handled.
*/
if (oasync)
error = 0;
/*
* Command completed successfully or fatal error occurred. Fall
* into....
*/
mutex_enter(chan_mtx(chan));
free_xs:
scsipi_put_xs(xs);
mutex_exit(chan_mtx(chan));
/*
* Kick the queue, keep it running in case it stopped for some
* reason.
*/
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
return error;
}
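/*
 * Editorial sketch: the helpers earlier in this file (scsipi_prevent(),
 * scsipi_start(), scsipi_mode_sense(), ...) all funnel through
 * scsipi_command(), which ends up in scsipi_execute_xs().  A minimal
 * synchronous command looks like this (struct scsi_test_unit_ready and
 * SCSI_TEST_UNIT_READY are assumed from the scsipi headers):
 *
 *	struct scsi_test_unit_ready cmd;
 *
 *	memset(&cmd, 0, sizeof(cmd));
 *	cmd.opcode = SCSI_TEST_UNIT_READY;
 *	error = scsipi_command(periph, (void *)&cmd, sizeof(cmd),
 *	    NULL, 0, SCSIPIRETRIES, 10000, NULL, 0);
 */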
/*
* scsipi_completion_thread:
*
* This is the completion thread. We wait for errors on
* asynchronous xfers, and perform the error handling
* function, restarting the command, if necessary.
*/
static void
scsipi_completion_thread(void *arg)
{
struct scsipi_channel *chan = arg;
struct scsipi_xfer *xs;
if (chan->chan_init_cb)
(*chan->chan_init_cb)(chan, chan->chan_init_cb_arg);
mutex_enter(chan_mtx(chan));
chan->chan_flags |= SCSIPI_CHAN_TACTIVE;
for (;;) {
xs = TAILQ_FIRST(&chan->chan_complete);
if (xs == NULL && chan->chan_tflags == 0) {
/* nothing to do; wait */
cv_wait(chan_cv_complete(chan), chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_CALLBACK) {
/* call chan_callback from thread context */
chan->chan_tflags &= ~SCSIPI_CHANT_CALLBACK;
chan->chan_callback(chan, chan->chan_callback_arg);
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_GROWRES) {
/* attempt to get more openings for this channel */
chan->chan_tflags &= ~SCSIPI_CHANT_GROWRES;
mutex_exit(chan_mtx(chan));
scsipi_adapter_request(chan,
ADAPTER_REQ_GROW_RESOURCES, NULL);
scsipi_channel_thaw(chan, 1);
if (chan->chan_tflags & SCSIPI_CHANT_GROWRES)
kpause("scsizzz", FALSE, hz/10, NULL);
mutex_enter(chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_KICK) {
/* explicitly run the queues for this channel */
chan->chan_tflags &= ~SCSIPI_CHANT_KICK;
mutex_exit(chan_mtx(chan));
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
continue;
}
if (chan->chan_tflags & SCSIPI_CHANT_SHUTDOWN) {
break;
}
if (xs) {
TAILQ_REMOVE(&chan->chan_complete, xs, channel_q);
mutex_exit(chan_mtx(chan));
/*
* Have an xfer with an error; process it.
*/
(void) scsipi_complete(xs);
/*
* Kick the queue; keep it running if it was stopped
* for some reason.
*/
scsipi_run_queue(chan);
mutex_enter(chan_mtx(chan));
}
}
chan->chan_thread = NULL;
/* In case parent is waiting for us to exit. */
cv_broadcast(chan_cv_thread(chan));
mutex_exit(chan_mtx(chan));
kthread_exit(0);
}
/*
* scsipi_thread_call_callback:
*
* request to call a callback from the completion thread
*/
int
scsipi_thread_call_callback(struct scsipi_channel *chan,
void (*callback)(struct scsipi_channel *, void *), void *arg)
{
mutex_enter(chan_mtx(chan));
if ((chan->chan_flags & SCSIPI_CHAN_TACTIVE) == 0) {
/* kernel thread doesn't exist yet */
mutex_exit(chan_mtx(chan));
return ESRCH;
}
if (chan->chan_tflags & SCSIPI_CHANT_CALLBACK) {
mutex_exit(chan_mtx(chan));
return EBUSY;
}
scsipi_channel_freeze(chan, 1);
chan->chan_callback = callback;
chan->chan_callback_arg = arg;
chan->chan_tflags |= SCSIPI_CHANT_CALLBACK;
cv_broadcast(chan_cv_complete(chan));
mutex_exit(chan_mtx(chan));
return 0;
}
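/*
 * Editorial usage sketch ("mydrv_detach_cb" is a hypothetical driver
 * callback): a driver that needs work done in the completion thread's
 * context can request it like this:
 *
 *	error = scsipi_thread_call_callback(chan, mydrv_detach_cb, sc);
 *	if (error)
 *		... handle ESRCH (no thread yet) or EBUSY (a callback
 *		    is already pending) ...
 *
 * Note that the channel is frozen by one count before the request is
 * queued; the completion thread itself does not thaw it, so the thaw is
 * presumably left to the callback path.
 */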
/*
* scsipi_async_event:
*
* Handle an asynchronous event from an adapter.
*/
void
scsipi_async_event(struct scsipi_channel *chan, scsipi_async_event_t event,
void *arg)
{
bool lock = chan_running(chan) > 0;
if (lock)
mutex_enter(chan_mtx(chan));
switch (event) {
case ASYNC_EVENT_MAX_OPENINGS:
scsipi_async_event_max_openings(chan,
(struct scsipi_max_openings *)arg);
break;
case ASYNC_EVENT_XFER_MODE:
if (chan->chan_bustype->bustype_async_event_xfer_mode) {
chan->chan_bustype->bustype_async_event_xfer_mode(
chan, arg);
}
break;
case ASYNC_EVENT_RESET:
scsipi_async_event_channel_reset(chan);
break;
}
if (lock)
mutex_exit(chan_mtx(chan));
}
/*
* scsipi_async_event_max_openings:
*
* Update the maximum number of outstanding commands a
* device may have.
*/
static void
scsipi_async_event_max_openings(struct scsipi_channel *chan,
struct scsipi_max_openings *mo)
{
struct scsipi_periph *periph;
int minlun, maxlun;
if (mo->mo_lun == -1) {
/*
* Wildcarded; apply it to all LUNs.
*/
minlun = 0;
maxlun = chan->chan_nluns - 1;
} else
minlun = maxlun = mo->mo_lun;
/* XXX This could really suck with a large LUN space. */
for (; minlun <= maxlun; minlun++) {
periph = scsipi_lookup_periph_locked(chan, mo->mo_target, minlun);
if (periph == NULL)
continue;
if (mo->mo_openings < periph->periph_openings)
periph->periph_openings = mo->mo_openings;
else if (mo->mo_openings > periph->periph_openings &&
(periph->periph_flags & PERIPH_GROW_OPENINGS) != 0)
periph->periph_openings = mo->mo_openings;
}
}
/*
* scsipi_set_xfer_mode:
*
* Set the xfer mode for the specified I_T Nexus.
*/
void
scsipi_set_xfer_mode(struct scsipi_channel *chan, int target, int immed)
{
struct scsipi_xfer_mode xm;
struct scsipi_periph *itperiph;
int lun;
/*
* Go to the minimal xfer mode.
*/
xm.xm_target = target;
xm.xm_mode = 0;
xm.xm_period = 0; /* ignored */
xm.xm_offset = 0; /* ignored */
/*
* Find the first LUN we know about on this I_T Nexus.
*/
for (itperiph = NULL, lun = 0; lun < chan->chan_nluns; lun++) {
itperiph = scsipi_lookup_periph(chan, target, lun);
if (itperiph != NULL)
break;
}
if (itperiph != NULL) {
xm.xm_mode = itperiph->periph_cap;
/*
* Now issue the request to the adapter.
*/
scsipi_adapter_request(chan, ADAPTER_REQ_SET_XFER_MODE, &xm);
/*
* If we want this to happen immediately, issue a dummy
* command, since most adapters can't really negotiate unless
* they're executing a job.
*/
if (immed != 0) {
(void) scsipi_test_unit_ready(itperiph,
XS_CTL_DISCOVERY | XS_CTL_IGNORE_ILLEGAL_REQUEST |
XS_CTL_IGNORE_NOT_READY |
XS_CTL_IGNORE_MEDIA_CHANGE);
}
}
}
/*
* scsipi_async_event_channel_reset:
*
* handle a SCSI bus reset
* called with the channel lock held
*/
static void
scsipi_async_event_channel_reset(struct scsipi_channel *chan)
{
struct scsipi_xfer *xs, *xs_next;
struct scsipi_periph *periph;
int target, lun;
/*
* Channel has been reset. Also mark as reset any pending REQUEST_SENSE
* commands, since their sense data is no longer available.
* We can't call scsipi_done() from here, as the command has not been
* sent to the adapter yet (that would corrupt the accounting).
*/
for (xs = TAILQ_FIRST(&chan->chan_queue); xs != NULL; xs = xs_next) {
xs_next = TAILQ_NEXT(xs, channel_q);
if (xs->xs_control & XS_CTL_REQSENSE) {
TAILQ_REMOVE(&chan->chan_queue, xs, channel_q);
xs->error = XS_RESET;
if ((xs->xs_control & XS_CTL_ASYNC) != 0)
TAILQ_INSERT_TAIL(&chan->chan_complete, xs,
channel_q);
}
}
cv_broadcast(chan_cv_complete(chan));
/* Catch xs with pending sense which may not have a REQSENSE xs yet */
for (target = 0; target < chan->chan_ntargets; target++) {
if (target == chan->chan_id)
continue;
for (lun = 0; lun < chan->chan_nluns; lun++) {
periph = scsipi_lookup_periph_locked(chan, target, lun);
if (periph) {
xs = periph->periph_xscheck;
if (xs)
xs->error = XS_RESET;
}
}
}
}
/*
* scsipi_target_detach:
*
* detach all periphs associated with an I_T nexus;
* must be called from valid thread context
*/
int
scsipi_target_detach(struct scsipi_channel *chan, int target, int lun,
int flags)
{
struct scsipi_periph *periph;
device_t tdev;
int ctarget, mintarget, maxtarget;
int clun, minlun, maxlun;
int error = 0;
if (target == -1) {
mintarget = 0;
maxtarget = chan->chan_ntargets;
} else {
if (target == chan->chan_id)
return EINVAL;
if (target < 0 || target >= chan->chan_ntargets)
return EINVAL;
mintarget = target;
maxtarget = target + 1;
}
if (lun == -1) {
minlun = 0;
maxlun = chan->chan_nluns;
} else {
if (lun < 0 || lun >= chan->chan_nluns)
return EINVAL;
minlun = lun;
maxlun = lun + 1;
}
/* for config_detach */
KERNEL_LOCK(1, curlwp);
mutex_enter(chan_mtx(chan));
for (ctarget = mintarget; ctarget < maxtarget; ctarget++) {
if (ctarget == chan->chan_id)
continue;
for (clun = minlun; clun < maxlun; clun++) {
periph = scsipi_lookup_periph_locked(chan, ctarget, clun);
if (periph == NULL)
continue;
tdev = periph->periph_dev;
mutex_exit(chan_mtx(chan));
error = config_detach(tdev, flags);
if (error)
goto out;
mutex_enter(chan_mtx(chan));
KASSERT(scsipi_lookup_periph_locked(chan, ctarget, clun) == NULL);
}
}
mutex_exit(chan_mtx(chan));
out:
KERNEL_UNLOCK_ONE(curlwp);
return error;
}
/*
* scsipi_adapter_addref:
*
* Add a reference to the adapter pointed to by the provided
* link, enabling the adapter if necessary.
*/
int
scsipi_adapter_addref(struct scsipi_adapter *adapt)
{
int error = 0;
if (atomic_inc_uint_nv(&adapt->adapt_refcnt) == 1
&& adapt->adapt_enable != NULL) {
scsipi_adapter_lock(adapt);
error = scsipi_adapter_enable(adapt, 1);
scsipi_adapter_unlock(adapt);
if (error)
atomic_dec_uint(&adapt->adapt_refcnt);
}
return error;
}
/*
* scsipi_adapter_delref:
*
* Delete a reference to the adapter pointed to by the provided
* link, disabling the adapter if possible.
*/
void
scsipi_adapter_delref(struct scsipi_adapter *adapt)
{
membar_release();
if (atomic_dec_uint_nv(&adapt->adapt_refcnt) == 0
&& adapt->adapt_enable != NULL) {
membar_acquire();
scsipi_adapter_lock(adapt);
(void) scsipi_adapter_enable(adapt, 0);
scsipi_adapter_unlock(adapt);
}
}
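/*
 * Editorial pairing sketch: code that needs the adapter enabled for the
 * duration of an operation brackets it with the two calls above:
 *
 *	if ((error = scsipi_adapter_addref(adapt)) != 0)
 *		return error;
 *	... issue requests ...
 *	scsipi_adapter_delref(adapt);
 */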
static struct scsipi_syncparam {
int ss_factor;
int ss_period; /* ns * 100 */
} scsipi_syncparams[] = {
{ 0x08, 625 }, /* FAST-160 (Ultra320) */
{ 0x09, 1250 }, /* FAST-80 (Ultra160) */
{ 0x0a, 2500 }, /* FAST-40 40MHz (Ultra2) */
{ 0x0b, 3030 }, /* FAST-40 33MHz (Ultra2) */
{ 0x0c, 5000 }, /* FAST-20 (Ultra) */
};
static const int scsipi_nsyncparams =
sizeof(scsipi_syncparams) / sizeof(scsipi_syncparams[0]);
int
scsipi_sync_period_to_factor(int period /* ns * 100 */)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (period <= scsipi_syncparams[i].ss_period)
return scsipi_syncparams[i].ss_factor;
}
return (period / 100) / 4;
}
int
scsipi_sync_factor_to_period(int factor)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (factor == scsipi_syncparams[i].ss_factor)
return scsipi_syncparams[i].ss_period;
}
return (factor * 4) * 100;
}
int
scsipi_sync_factor_to_freq(int factor)
{
int i;
for (i = 0; i < scsipi_nsyncparams; i++) {
if (factor == scsipi_syncparams[i].ss_factor)
return 100000000 / scsipi_syncparams[i].ss_period;
}
return 10000000 / ((factor * 4) * 10);
}
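/*
 * Editorial worked example: the table entry { 0x0c, 5000 } describes
 * FAST-20, i.e. a 50.00 ns period (ss_period is in units of ns * 100).
 * Hence:
 *
 *	scsipi_sync_period_to_factor(5000) == 0x0c
 *	scsipi_sync_factor_to_period(0x0c) == 5000	(50 ns)
 *	scsipi_sync_factor_to_freq(0x0c)   == 20000	(kHz, i.e. 20 MHz)
 *
 * Factors not in the table fall back to the linear SCSI-2 rule
 * period_ns = factor * 4, e.g. factor 50 -> 200 ns -> 5000 kHz.
 */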
static inline void
scsipi_adapter_lock(struct scsipi_adapter *adapt)
{
if ((adapt->adapt_flags & SCSIPI_ADAPT_MPSAFE) == 0)
	KERNEL_LOCK(1, NULL);
}
static inline void
scsipi_adapter_unlock(struct scsipi_adapter *adapt)
{
if ((adapt->adapt_flags & SCSIPI_ADAPT_MPSAFE) == 0)
	KERNEL_UNLOCK_ONE(NULL);
}
void
scsipi_adapter_minphys(struct scsipi_channel *chan, struct buf *bp)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
scsipi_adapter_lock(adapt);
(adapt->adapt_minphys)(bp);
scsipi_adapter_unlock(adapt);
}
void
scsipi_adapter_request(struct scsipi_channel *chan,
scsipi_adapter_req_t req, void *arg)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
scsipi_adapter_lock(adapt);
SDT_PROBE3(scsi, base, adapter, request__start, chan, req, arg);
(adapt->adapt_request)(chan, req, arg);
SDT_PROBE3(scsi, base, adapter, request__done, chan, req, arg);
scsipi_adapter_unlock(adapt);
}
int
scsipi_adapter_ioctl(struct scsipi_channel *chan, u_long cmd,
void *data, int flag, struct proc *p)
{
struct scsipi_adapter *adapt = chan->chan_adapter;
int error;
if (adapt->adapt_ioctl == NULL)
return ENOTTY;
scsipi_adapter_lock(adapt);
error = (adapt->adapt_ioctl)(chan, cmd, data, flag, p);
scsipi_adapter_unlock(adapt);
return error;
}
int
scsipi_adapter_enable(struct scsipi_adapter *adapt, int enable)
{
int error;
scsipi_adapter_lock(adapt);
error = (adapt->adapt_enable)(adapt->adapt_dev, enable);
scsipi_adapter_unlock(adapt);
return error;
}
#ifdef SCSIPI_DEBUG
/*
* Given a scsipi_xfer, dump the request, in all its glory
*/
void
show_scsipi_xs(struct scsipi_xfer *xs)
{
printf("xs(%p): ", xs);
printf("xs_control(0x%08x)", xs->xs_control);
printf("xs_status(0x%08x)", xs->xs_status);
printf("periph(%p)", xs->xs_periph);
printf("retr(0x%x)", xs->xs_retries);
printf("timo(0x%x)", xs->timeout);
printf("cmd(%p)", xs->cmd);
printf("len(0x%x)", xs->cmdlen);
printf("data(%p)", xs->data);
printf("len(0x%x)", xs->datalen);
printf("res(0x%x)", xs->resid);
printf("err(0x%x)", xs->error);
printf("bp(%p)", xs->bp);
show_scsipi_cmd(xs);
}
void
show_scsipi_cmd(struct scsipi_xfer *xs)
{
u_char *b = (u_char *) xs->cmd;
int i = 0;
scsipi_printaddr(xs->xs_periph);
printf(" command: ");
if ((xs->xs_control & XS_CTL_RESET) == 0) {
while (i < xs->cmdlen) {
if (i)
printf(",");
printf("0x%x", b[i++]);
}
printf("-[%d bytes]\n", xs->datalen);
if (xs->datalen)
show_mem(xs->data, uimin(64, xs->datalen));
} else
printf("-RESET-\n");
}
void
show_mem(u_char *address, int num)
{
int x;
printf("------------------------------");
for (x = 0; x < num; x++) {
if ((x % 16) == 0)
printf("\n%03d: ", x);
printf("%02x ", *address++);
}
printf("\n------------------------------\n");
}
#endif /* SCSIPI_DEBUG */
/* $NetBSD: nd6.c,v 1.282 2024/04/11 07:34:37 knakahara Exp $ */
/* $KAME: nd6.c,v 1.279 2002/06/08 11:16:51 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nd6.c,v 1.282 2024/04/11 07:34:37 knakahara Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_net_mpsafe.h"
#endif
#include "bridge.h"
#include "carp.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kmem.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/cprng.h>
#include <sys/workqueue.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_llatbl.h>
#include <net/if_types.h>
#include <net/nd.h>
#include <net/route.h>
#include <net/if_ether.h>
#include <net/if_arc.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/in6_ifattach.h>
#include <netinet/icmp6.h>
#include <netinet6/icmp6_private.h>
#include <compat/netinet6/in6_var.h>
#include <compat/netinet6/nd6.h>
#define ND6_SLOWTIMER_INTERVAL (60 * 60) /* 1 hour */
#define ND6_RECALC_REACHTM_INTERVAL (60 * 120) /* 2 hours */
/* timer values */
int nd6_prune = 1; /* walk list every second */
int nd6_useloopback = 1; /* use loopback interface for local traffic */
/* preventing too many loops in ND option parsing */
int nd6_maxndopt = 10; /* max # of ND options allowed */
#ifdef ND6_DEBUG
int nd6_debug = 1;
#else
int nd6_debug = 0;
#endif
krwlock_t nd6_lock __cacheline_aligned;
int nd6_recalc_reachtm_interval = ND6_RECALC_REACHTM_INTERVAL;
static void nd6_slowtimo(void *);
static void nd6_free(struct llentry *, int);
static bool nd6_nud_enabled(struct ifnet *);
static unsigned int nd6_llinfo_reachable(struct ifnet *);
static unsigned int nd6_llinfo_retrans(struct ifnet *);
static union l3addr *nd6_llinfo_holdsrc(struct llentry *, union l3addr *);
static void nd6_llinfo_output(struct ifnet *, const union l3addr *,
const union l3addr *, const uint8_t *, const union l3addr *);
static void nd6_llinfo_missed(struct ifnet *, const union l3addr *,
int16_t, struct mbuf *);
static void nd6_timer(void *);
static void nd6_timer_work(struct work *, void *);
static struct nd_opt_hdr *nd6_option(union nd_opts *);
static callout_t nd6_slowtimo_ch;
static callout_t nd6_timer_ch;
static struct workqueue *nd6_timer_wq;
static struct work nd6_timer_wk;
struct nd_domain nd6_nd_domain = {
.nd_family = AF_INET6,
.nd_delay = 5, /* delay first probe time: 5 seconds */
.nd_mmaxtries = 3, /* maximum multicast queries */
.nd_umaxtries = 3, /* maximum unicast queries */
.nd_retransmultiple = BACKOFF_MULTIPLE,
.nd_maxretrans = MAX_RETRANS_TIMER,
.nd_maxnudhint = 0, /* max # of subsequent upper layer hints */
.nd_maxqueuelen = 1, /* max # of packets in unresolved ND entries */
.nd_nud_enabled = nd6_nud_enabled,
.nd_reachable = nd6_llinfo_reachable,
.nd_retrans = nd6_llinfo_retrans,
.nd_holdsrc = nd6_llinfo_holdsrc,
.nd_output = nd6_llinfo_output,
.nd_missed = nd6_llinfo_missed,
.nd_free = nd6_free,
};
MALLOC_DEFINE(M_IP6NDP, "NDP", "IPv6 Neighbour Discovery");
void
nd6_init(void)
{
int error;
nd_attach_domain(&nd6_nd_domain);
nd6_nbr_init();
rw_init(&nd6_lock);
callout_init(&nd6_slowtimo_ch, CALLOUT_MPSAFE);
callout_init(&nd6_timer_ch, CALLOUT_MPSAFE);
error = workqueue_create(&nd6_timer_wq, "nd6_timer",
nd6_timer_work, NULL, PRI_SOFTNET, IPL_SOFTNET, WQ_MPSAFE);
if (error)
panic("%s: workqueue_create failed (%d)\n", __func__, error);
/* start timer */
callout_reset(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
nd6_slowtimo, NULL);
callout_reset(&nd6_timer_ch, hz, nd6_timer, NULL);
}
struct nd_kifinfo *
nd6_ifattach(struct ifnet *ifp)
{
struct nd_kifinfo *nd;
nd = kmem_zalloc(sizeof(*nd), KM_SLEEP);
nd->chlim = IPV6_DEFHLIM;
nd->basereachable = REACHABLE_TIME;
nd->reachable = ND_COMPUTE_RTIME(nd->basereachable);
nd->retrans = RETRANS_TIMER;
nd->flags = ND6_IFF_PERFORMNUD;
/* A loopback interface always has ND6_IFF_AUTO_LINKLOCAL.
* A bridge interface should not have ND6_IFF_AUTO_LINKLOCAL
* because one of its members should. */
if ((ip6_auto_linklocal && ifp->if_type != IFT_BRIDGE) ||
(ifp->if_flags & IFF_LOOPBACK))
nd->flags |= ND6_IFF_AUTO_LINKLOCAL;
return nd;
}
void
nd6_ifdetach(struct ifnet *ifp, struct in6_ifextra *ext)
{
/* Ensure all IPv6 addresses are purged before calling nd6_purge */
if_purgeaddrs(ifp, AF_INET6, in6_purgeaddr);
nd6_purge(ifp, ext);
kmem_free(ext->nd_ifinfo, sizeof(struct nd_kifinfo));
}
void
nd6_option_init(void *opt, int icmp6len, union nd_opts *ndopts)
{
memset(ndopts, 0, sizeof(*ndopts));
ndopts->nd_opts_search = (struct nd_opt_hdr *)opt;
ndopts->nd_opts_last
= (struct nd_opt_hdr *)(((u_char *)opt) + icmp6len);
if (icmp6len == 0) {
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
}
/*
* Take one ND option.
*/
static struct nd_opt_hdr *
nd6_option(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int olen;
KASSERT(ndopts != NULL);
KASSERT(ndopts->nd_opts_last != NULL);
if (ndopts->nd_opts_search == NULL)
return NULL;
if (ndopts->nd_opts_done)
return NULL;
nd_opt = ndopts->nd_opts_search;
/* make sure nd_opt_len is inside the buffer */
if ((void *)&nd_opt->nd_opt_len >= (void *)ndopts->nd_opts_last) {
memset(ndopts, 0, sizeof(*ndopts));
return NULL;
}
olen = nd_opt->nd_opt_len << 3;
if (olen == 0) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
memset(ndopts, 0, sizeof(*ndopts));
return NULL;
}
ndopts->nd_opts_search = (struct nd_opt_hdr *)((char *)nd_opt + olen);
if (ndopts->nd_opts_search > ndopts->nd_opts_last) {
/* option overruns the end of buffer, invalid */
memset(ndopts, 0, sizeof(*ndopts));
return NULL;
} else if (ndopts->nd_opts_search == ndopts->nd_opts_last) {
/* reached the end of options chain */
ndopts->nd_opts_done = 1;
ndopts->nd_opts_search = NULL;
}
return nd_opt;
}
/*
* Parse multiple ND options.
* This function is much easier to use for ND routines that do not need
* multiple options of the same type.
*/
int
nd6_options(union nd_opts *ndopts)
{
struct nd_opt_hdr *nd_opt;
int i = 0;
KASSERT(ndopts != NULL);
KASSERT(ndopts->nd_opts_last != NULL);
if (ndopts->nd_opts_search == NULL)
return 0;
while (1) {
nd_opt = nd6_option(ndopts);
if (nd_opt == NULL && ndopts->nd_opts_last == NULL) {
/*
* Message validation requires that all included
* options have a length that is greater than zero.
*/
ICMP6_STATINC(ICMP6_STAT_ND_BADOPT);
memset(ndopts, 0, sizeof(*ndopts));
return -1;
}
if (nd_opt == NULL)
goto skip1;
switch (nd_opt->nd_opt_type) {
case ND_OPT_SOURCE_LINKADDR:
case ND_OPT_TARGET_LINKADDR:
case ND_OPT_MTU:
case ND_OPT_REDIRECTED_HEADER:
case ND_OPT_NONCE:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) {
nd6log(LOG_INFO,
"duplicated ND6 option found (type=%d)\n",
nd_opt->nd_opt_type);
/* XXX bark? */
} else {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
break;
case ND_OPT_PREFIX_INFORMATION:
if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) {
ndopts->nd_opt_array[nd_opt->nd_opt_type]
= nd_opt;
}
ndopts->nd_opts_pi_end =
(struct nd_opt_prefix_info *)nd_opt;
break;
default:
/*
* Unknown options must be silently ignored,
* to accommodate future extension to the protocol.
*/
nd6log(LOG_DEBUG,
"nd6_options: unsupported option %d - "
"option ignored\n", nd_opt->nd_opt_type);
}
skip1:
i++;
if (i > nd6_maxndopt) {
ICMP6_STATINC(ICMP6_STAT_ND_TOOMANYOPT);
nd6log(LOG_INFO, "too many loop in nd opt\n");
break;
}
if (ndopts->nd_opts_done)
break;
}
return 0;
}
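/*
 * Editorial caller sketch ("icmp6len" is assumed to be the length of
 * the option block only, with the fixed ICMPv6 header already
 * subtracted):
 *
 *	union nd_opts ndopts;
 *
 *	nd6_option_init(opt, icmp6len, &ndopts);
 *	if (nd6_options(&ndopts) < 0)
 *		return;		(malformed options, already counted)
 *	if (ndopts.nd_opt_array[ND_OPT_SOURCE_LINKADDR] != NULL)
 *		... use the source link-layer address option ...
 */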
/*
* Gets source address of the first packet in hold queue
* and stores it in @src.
* Returns pointer to @src (if hold queue is not empty) or NULL.
*/
static struct in6_addr *
nd6_llinfo_get_holdsrc(struct llentry *ln, struct in6_addr *src)
{
struct ip6_hdr *hip6;
if (ln == NULL || ln->ln_hold == NULL)
return NULL;
/*
* assuming every packet in ln_hold has the same IP header
*/
hip6 = mtod(ln->ln_hold, struct ip6_hdr *);
/* XXX pullup? */
if (sizeof(*hip6) < ln->ln_hold->m_len)
*src = hip6->ip6_src;
else
src = NULL;
return src;
}
static union l3addr *
nd6_llinfo_holdsrc(struct llentry *ln, union l3addr *src)
{
if (nd6_llinfo_get_holdsrc(ln, &src->addr6) == NULL)
return NULL;
return src;
}
static void
nd6_llinfo_output(struct ifnet *ifp, const union l3addr *daddr,
const union l3addr *taddr, __unused const uint8_t *tlladdr,
const union l3addr *hsrc)
{
nd6_ns_output(ifp,
daddr != NULL ? &daddr->addr6 : NULL,
taddr != NULL ? &taddr->addr6 : NULL,
hsrc != NULL ? &hsrc->addr6 : NULL, NULL);
}
static bool
nd6_nud_enabled(struct ifnet *ifp)
{
struct nd_kifinfo *ndi = ND_IFINFO(ifp);
return ndi->flags & ND6_IFF_PERFORMNUD;
}
static unsigned int
nd6_llinfo_reachable(struct ifnet *ifp)
{
struct nd_kifinfo *ndi = ND_IFINFO(ifp);
return ndi->reachable;
}
static unsigned int
nd6_llinfo_retrans(struct ifnet *ifp)
{
struct nd_kifinfo *ndi = ND_IFINFO(ifp);
return ndi->retrans;
}
static void
nd6_llinfo_missed(struct ifnet *ifp, const union l3addr *taddr,
int16_t type, struct mbuf *m)
{
struct in6_addr mdaddr6 = zeroin6_addr;
struct sockaddr_in6 dsin6, tsin6;
struct sockaddr *sa;
if (m != NULL) {
if (type == ND_LLINFO_PROBE) {
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
/* XXX pullup? */
if (sizeof(*ip6) < m->m_len)
mdaddr6 = ip6->ip6_src;
m_freem(m);
} else
icmp6_error2(m, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADDR, 0, ifp, &mdaddr6);
}
if (!IN6_IS_ADDR_UNSPECIFIED(&mdaddr6)) {
sockaddr_in6_init(&dsin6, &mdaddr6, 0, 0, 0);
sa = sin6tosa(&dsin6);
} else
sa = NULL;
sockaddr_in6_init(&tsin6, &taddr->addr6, 0, 0, 0);
rt_clonedmsg(RTM_MISS, sa, sin6tosa(&tsin6), NULL, ifp);
}
/*
* ND6 timer routine to expire IPv6 interface addresses
*/
static void
nd6_timer_work(struct work *wk, void *arg)
{
struct in6_ifaddr *ia6, *nia6;
int s, bound;
struct psref psref;
callout_reset(&nd6_timer_ch, nd6_prune * hz,
nd6_timer, NULL);
SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
/* expire interface addresses */
bound = curlwp_bind();
s = pserialize_read_enter();
for (ia6 = IN6_ADDRLIST_READER_FIRST(); ia6; ia6 = nia6) {
nia6 = IN6_ADDRLIST_READER_NEXT(ia6);
ia6_acquire(ia6, &psref);
pserialize_read_exit(s);
/* check address lifetime */
if (IFA6_IS_INVALID(ia6)) {
struct ifnet *ifp;
ifp = ia6->ia_ifa.ifa_ifp;
IFNET_LOCK(ifp);
/*
* Need to take the lock first to prevent if_detach
* from running in6_purgeaddr concurrently.
*/
if (!if_is_deactivated(ifp)) {
ia6_release(ia6, &psref);
in6_purgeaddr(&ia6->ia_ifa);
} else {
/*
* ifp is being destroyed, ia6 will be destroyed
* by if_detach.
*/
ia6_release(ia6, &psref);
}
ia6 = NULL;
IFNET_UNLOCK(ifp);
} else if (IFA6_IS_DEPRECATED(ia6)) {
int oldflags = ia6->ia6_flags;
if ((oldflags & IN6_IFF_DEPRECATED) == 0) {
ia6->ia6_flags |= IN6_IFF_DEPRECATED;
rt_addrmsg(RTM_NEWADDR, (struct ifaddr *)ia6);
}
} else {
/*
* A new RA might have made a deprecated address
* preferred.
*/
if (ia6->ia6_flags & IN6_IFF_DEPRECATED) {
ia6->ia6_flags &= ~IN6_IFF_DEPRECATED;
rt_addrmsg(RTM_NEWADDR, (struct ifaddr *)ia6);
}
}
s = pserialize_read_enter();
ia6_release(ia6, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
static void
nd6_timer(void *ignored_arg)
{
workqueue_enqueue(nd6_timer_wq, &nd6_timer_wk, NULL);
}
/*
* Nuke neighbor cache/prefix/default router management table, right before
* ifp goes away.
*/
void
nd6_purge(struct ifnet *ifp, struct in6_ifextra *ext)
{
/*
* During detach, the ND info might already be removed, in which
* case it is explicitly passed as an argument.
* Otherwise get it from ifp->if_afdata.
*/
if (ext == NULL)
ext = ifp->if_afdata[AF_INET6];
if (ext == NULL)
return;
/*
* We may not need to nuke the neighbor cache entries here
* because the neighbor cache is kept in if_afdata[AF_INET6].
* nd6_purge() is invoked by in6_ifdetach() which is called
* from if_detach() where everything gets purged. However
* in6_ifdetach is directly called from vlan(4), so we still
* need to purge entries here.
*/
if (ext->lltable != NULL)
lltable_purge_entries(ext->lltable);
}
struct llentry *
nd6_lookup(const struct in6_addr *addr6, const struct ifnet *ifp, bool wlock)
{
struct sockaddr_in6 sin6;
struct llentry *ln;
sockaddr_in6_init(&sin6, addr6, 0, 0, 0);
IF_AFDATA_RLOCK(ifp);
ln = lla_lookup(LLTABLE6(ifp), wlock ? LLE_EXCLUSIVE : 0,
sin6tosa(&sin6));
IF_AFDATA_RUNLOCK(ifp);
return ln;
}
struct llentry *
nd6_create(const struct in6_addr *addr6, const struct ifnet *ifp)
{
struct sockaddr_in6 sin6;
struct llentry *ln;
struct rtentry *rt;
sockaddr_in6_init(&sin6, addr6, 0, 0, 0);
rt = rtalloc1(sin6tosa(&sin6), 0);
IF_AFDATA_WLOCK(ifp);
ln = lla_create(LLTABLE6(ifp), LLE_EXCLUSIVE, sin6tosa(&sin6), rt);
IF_AFDATA_WUNLOCK(ifp);
if (rt != NULL)
rt_unref(rt);
if (ln != NULL)
ln->ln_state = ND_LLINFO_NOSTATE;
return ln;
}
/*
* Test whether a given IPv6 address is a neighbor or not, ignoring
* the actual neighbor cache. The neighbor cache is ignored in order
* to not reenter the routing code from within itself.
*/
static int
nd6_is_new_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
{
struct ifaddr *dstaddr;
int s;
/*
* A link-local address is always a neighbor.
* XXX: a link does not necessarily specify a single interface.
*/
if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
struct sockaddr_in6 sin6_copy;
u_int32_t zone;
/*
* We need sin6_copy since sa6_recoverscope() may modify the
* content (XXX).
*/
sin6_copy = *addr;
if (sa6_recoverscope(&sin6_copy))
return 0; /* XXX: should be impossible */
if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
return 0;
if (sin6_copy.sin6_scope_id == zone)
return 1;
else
return 0;
}
/*
* If the address is assigned on the node of the other side of
* a p2p interface, the address should be a neighbor.
*/
s = pserialize_read_enter();
dstaddr = ifa_ifwithdstaddr(sin6tocsa(addr));
if (dstaddr != NULL) {
if (dstaddr->ifa_ifp == ifp) {
pserialize_read_exit(s);
return 1;
}
}
pserialize_read_exit(s);
return 0;
}
/*
* Detect if a given IPv6 address identifies a neighbor on a given link.
* XXX: should take care of the destination of a p2p link?
*/
int
nd6_is_addr_neighbor(const struct sockaddr_in6 *addr, struct ifnet *ifp)
{
struct llentry *ln;
struct rtentry *rt;
/*
* A link-local address is always a neighbor.
* XXX: a link does not necessarily specify a single interface.
*/
if (IN6_IS_ADDR_LINKLOCAL(&addr->sin6_addr)) {
struct sockaddr_in6 sin6_copy;
u_int32_t zone;
/*
* We need sin6_copy since sa6_recoverscope() may modify the
* content (XXX).
*/
sin6_copy = *addr;
if (sa6_recoverscope(&sin6_copy))
return 0; /* XXX: should be impossible */
if (in6_setscope(&sin6_copy.sin6_addr, ifp, &zone))
return 0;
if (sin6_copy.sin6_scope_id == zone)
return 1;
else
return 0;
}
if (nd6_is_new_addr_neighbor(addr, ifp))
return 1;
/*
* Even if the address matches none of our addresses, it might be
* in the neighbor cache or a connected route.
*/
ln = nd6_lookup(&addr->sin6_addr, ifp, false);
if (ln != NULL) {
LLE_RUNLOCK(ln);
return 1;
}
rt = rtalloc1(sin6tocsa(addr), 0);
if (rt == NULL)
return 0;
if ((rt->rt_flags & RTF_CONNECTED) && (rt->rt_ifp == ifp
#if NBRIDGE > 0
|| rt->rt_ifp->if_bridge == ifp->if_bridge
#endif
#if NCARP > 0
|| (ifp->if_type == IFT_CARP && rt->rt_ifp == ifp->if_carpdev) ||
(rt->rt_ifp->if_type == IFT_CARP && rt->rt_ifp->if_carpdev == ifp)||
(ifp->if_type == IFT_CARP && rt->rt_ifp->if_type == IFT_CARP &&
rt->rt_ifp->if_carpdev == ifp->if_carpdev)
#endif
)) {
rt_unref(rt);
return 1;
}
rt_unref(rt);
return 0;
}
/*
* Free an nd6 llinfo entry.
* Since the function would cause significant changes in the kernel, DO NOT
* make it global, unless you have a strong reason for the change, and are sure
* that the change is safe.
*/
static void
nd6_free(struct llentry *ln, int gc)
{
struct ifnet *ifp;
KASSERT(ln != NULL);
LLE_WLOCK_ASSERT(ln);
/*
* If the reason for the deletion is just garbage collection,
* and the neighbor is an active router, do not delete it.
* Instead, reset the GC timer using the router's lifetime.
* XXX: the check for ln_state should be redundant,
* but we intentionally keep it just in case.
*/
if (!ip6_forwarding && ln->ln_router &&
ln->ln_state == ND_LLINFO_STALE && gc)
{
nd_set_timer(ln, ND_TIMER_EXPIRE);
LLE_WUNLOCK(ln);
return;
}
ifp = ln->lle_tbl->llt_ifp;
if (ln->la_flags & LLE_VALID || gc) {
struct sockaddr_in6 sin6;
const char *lladdr;
sockaddr_in6_init(&sin6, &ln->r_l3addr.addr6, 0, 0, 0);
lladdr = ln->la_flags & LLE_VALID ?
(const char *)&ln->ll_addr : NULL;
rt_clonedmsg(RTM_DELETE, NULL, sin6tosa(&sin6), lladdr, ifp);
}
/*
* Safe to unlock. We still hold an extra reference and will not
* free(9) in llentry_free() if someone else holds one as well.
*/
LLE_WUNLOCK(ln);
IF_AFDATA_LOCK(ifp);
LLE_WLOCK(ln);
lltable_free_entry(LLTABLE6(ifp), ln);
IF_AFDATA_UNLOCK(ifp);
}
/*
* Upper-layer reachability hint for Neighbor Unreachability Detection.
*
* XXX cost-effective methods?
*/
void
nd6_nud_hint(struct rtentry *rt)
{
struct llentry *ln;
struct ifnet *ifp;
if (rt == NULL)
return;
ifp = rt->rt_ifp;
ln = nd6_lookup(&(satocsin6(rt_getkey(rt)))->sin6_addr, ifp, true);
nd_nud_hint(ln);
}
struct gc_args {
int gc_entries;
const struct in6_addr *skip_in6;
};
static int
nd6_purge_entry(struct lltable *llt, struct llentry *ln, void *farg)
{
struct gc_args *args = farg;
int *n = &args->gc_entries;
const struct in6_addr *skip_in6 = args->skip_in6;
if (*n <= 0)
return 0;
if (ND_IS_LLINFO_PERMANENT(ln))
return 0;
if (IN6_ARE_ADDR_EQUAL(&ln->r_l3addr.addr6, skip_in6))
return 0;
LLE_WLOCK(ln);
if (ln->ln_state > ND_LLINFO_INCOMPLETE)
ln->ln_state = ND_LLINFO_STALE;
else
ln->ln_state = ND_LLINFO_PURGE;
nd_set_timer(ln, ND_TIMER_IMMEDIATE);
LLE_WUNLOCK(ln);
(*n)--;
return 0;
}
static void
nd6_gc_neighbors(struct lltable *llt, const struct in6_addr *in6)
{
if (ip6_neighborgcthresh >= 0 &&
lltable_get_entry_count(llt) >= ip6_neighborgcthresh) {
struct gc_args gc_args = {10, in6};
/*
* XXX entries that are "less recently used" should be
* freed first.
*/
lltable_foreach_lle(llt, nd6_purge_entry, &gc_args);
}
}
void
nd6_rtrequest(int req, struct rtentry *rt, const struct rt_addrinfo *info)
{
struct sockaddr *gate = rt->rt_gateway;
struct ifnet *ifp = rt->rt_ifp;
uint8_t namelen = strlen(ifp->if_xname), addrlen = ifp->if_addrlen;
struct ifaddr *ifa;
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
if (req == RTM_LLINFO_UPD) {
int rc;
struct in6_addr *in6;
struct in6_addr in6_all;
int anycast;
if ((ifa = info->rti_ifa) == NULL)
return;
in6 = &ifatoia6(ifa)->ia_addr.sin6_addr;
anycast = ifatoia6(ifa)->ia6_flags & IN6_IFF_ANYCAST;
in6_all = in6addr_linklocal_allnodes;
if ((rc = in6_setscope(&in6_all, ifa->ifa_ifp, NULL)) != 0) {
log(LOG_ERR, "%s: failed to set scope %s "
"(errno=%d)\n", __func__, if_name(ifp), rc);
return;
}
/* XXX don't set Override for proxy addresses */
nd6_na_output(ifa->ifa_ifp, &in6_all, in6,
(anycast ? 0 : ND_NA_FLAG_OVERRIDE)
#if 0
| (ip6_forwarding ? ND_NA_FLAG_ROUTER : 0)
#endif
, 1, NULL);
return;
}
if ((rt->rt_flags & RTF_GATEWAY) != 0) {
if (req != RTM_ADD)
return;
/*
* Link layers with particular MTU limitations.
*/
switch(ifp->if_type) {
#if NARCNET > 0
case IFT_ARCNET:
if (rt->rt_rmx.rmx_mtu > ARC_PHDS_MAXMTU) /* RFC2497 */
rt->rt_rmx.rmx_mtu = ARC_PHDS_MAXMTU;
break;
#endif
}
return;
}
if (nd6_need_cache(ifp) == 0 && (rt->rt_flags & RTF_HOST) == 0) {
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
/*
* This is probably an interface direct route for a link
* which does not need neighbor caches (e.g. fe80::%lo0/64).
* We do not need special treatment below for such a route.
* Moreover, the RTF_LLINFO flag which would be set below
* would annoy the ndp(8) command.
*/
return;
}
switch (req) {
case RTM_ADD: {
struct psref psref;
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
/*
* There is no backward compatibility :)
*
* if ((rt->rt_flags & RTF_HOST) == 0 &&
* SIN(rt_mask(rt))->sin_addr.s_addr != 0xffffffff)
* rt->rt_flags |= RTF_CLONING;
*/
/* XXX should move to route.c? */
if (rt->rt_flags & (RTF_CONNECTED | RTF_LOCAL)) {
union {
struct sockaddr sa;
struct sockaddr_dl sdl;
struct sockaddr_storage ss;
} u;
/*
* Case 1: This route should come from a route to
* interface (RTF_CLONING case) or the route should be
* treated as on-link but is currently not
* (RTF_LLINFO && ln == NULL case).
*/
if (sockaddr_dl_init(&u.sdl, sizeof(u.ss),
ifp->if_index, ifp->if_type,
NULL, namelen, NULL, addrlen) == NULL) {
printf("%s.%d: sockaddr_dl_init(, %zu, ) "
"failed on %s\n", __func__, __LINE__,
sizeof(u.ss), if_name(ifp));
}
rt_setgate(rt, &u.sa);
gate = rt->rt_gateway;
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
if (gate == NULL) {
log(LOG_ERR,
"%s: rt_setgate failed on %s\n", __func__,
if_name(ifp));
break;
}
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
if ((rt->rt_flags & RTF_CONNECTED) != 0)
break;
}
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
/*
* In IPv4 code, we try to announce a new RTF_ANNOUNCE entry here.
* We don't do that here since llinfo is not ready yet.
*
* There are also a couple of other things to be discussed:
* - unsolicited NA code needs improvement beforehand
* - RFC2461 says we MAY send multicast unsolicited NA
* (7.2.6 paragraph 4), however, it also says that we
* SHOULD provide a mechanism to prevent multicast NA storm.
* we don't have anything like it right now.
* note that the mechanism needs a mutual agreement
* between proxies, which means that we need to implement
* a new protocol, or a new kludge.
* - from RFC2461 6.2.4, host MUST NOT send an unsolicited NA.
* we need to check ip6forwarding before sending it.
* (or should we allow proxy ND configuration only for
* routers? there's no mention about proxy ND from hosts)
*/
#if 0
/* XXX it does not work */
if (rt->rt_flags & RTF_ANNOUNCE)
nd6_na_output(ifp,
&satocsin6(rt_getkey(rt))->sin6_addr,
&satocsin6(rt_getkey(rt))->sin6_addr,
ip6_forwarding ? ND_NA_FLAG_ROUTER : 0,
1, NULL);
#endif
if ((ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) == 0) {
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
/*
* Address resolution isn't necessary for a point to
* point link, so we can skip this test for a p2p link.
*/
if (gate->sa_family != AF_LINK ||
gate->sa_len <
sockaddr_dl_measure(namelen, addrlen)) {
log(LOG_DEBUG,
"nd6_rtrequest: bad gateway value: %s\n",
if_name(ifp));
break;
}
satosdl(gate)->sdl_type = ifp->if_type;
satosdl(gate)->sdl_index = ifp->if_index;
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
}
RT_DPRINTF("rt_getkey(rt) = %p\n", rt_getkey(rt));
/*
* When called from rt_ifa_addlocal, we cannot depend on that
* the address (rt_getkey(rt)) exists in the address list of the
* interface. So check RTF_LOCAL instead.
*/
if (rt->rt_flags & RTF_LOCAL) {
if (nd6_useloopback)
rt->rt_ifp = lo0ifp; /* XXX */
break;
}
/*
* check if rt_getkey(rt) is an address assigned
* to the interface.
*/
ifa = (struct ifaddr *)in6ifa_ifpwithaddr_psref(ifp,
&satocsin6(rt_getkey(rt))->sin6_addr, &psref);
if (ifa != NULL) {
if (nd6_useloopback) {
rt->rt_ifp = lo0ifp; /* XXX */
/*
* Make sure rt_ifa is equal to the ifaddr
* corresponding to the address.
* We need this because when we refer
* rt_ifa->ia6_flags in ip6_input, we assume
* that the rt_ifa points to the address instead
* of the loopback address.
*/
if (!ISSET(info->rti_flags, RTF_DONTCHANGEIFA)
&& ifa != rt->rt_ifa)
rt_replace_ifa(rt, ifa);
}
} else if (rt->rt_flags & RTF_ANNOUNCE) {
/* join solicited node multicast for proxy ND */
if (ifp->if_flags & IFF_MULTICAST) {
struct in6_addr llsol;
int error;
llsol = satocsin6(rt_getkey(rt))->sin6_addr;
llsol.s6_addr32[0] = htonl(0xff020000);
llsol.s6_addr32[1] = 0;
llsol.s6_addr32[2] = htonl(1);
llsol.s6_addr8[12] = 0xff;
if (in6_setscope(&llsol, ifp, NULL))
goto out;
if (!in6_addmulti(&llsol, ifp, &error, 0)) {
char ip6buf[INET6_ADDRSTRLEN];
nd6log(LOG_ERR, "%s: failed to join "
"%s (errno=%d)\n", if_name(ifp),
IN6_PRINT(ip6buf, &llsol), error);
}
}
}
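/*
 * Illustration of the solicited-node mapping built above: the top
 * 104 bits of a copy of the target address are overwritten and its low
 * 24 bits are kept (RFC 4291, 2.7.1), giving ff02::1:ffXX:XXXX.  For
 * example:
 *
 *	target address:       2001:db8::abcd:1234
 *	solicited-node group: ff02::1:ffcd:1234 (scope set by in6_setscope)
 */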
out:
ifa_release(ifa, &psref);
/*
* If we have too many cache entries, initiate immediate
* purging for some entries.
*/
if (rt->rt_ifp != NULL)
nd6_gc_neighbors(LLTABLE6(rt->rt_ifp), NULL);
break;
}
case RTM_DELETE:
/* leave from solicited node multicast for proxy ND */
if ((rt->rt_flags & RTF_ANNOUNCE) != 0 &&
(ifp->if_flags & IFF_MULTICAST) != 0) {
struct in6_addr llsol;
llsol = satocsin6(rt_getkey(rt))->sin6_addr;
llsol.s6_addr32[0] = htonl(0xff020000);
llsol.s6_addr32[1] = 0;
llsol.s6_addr32[2] = htonl(1);
llsol.s6_addr8[12] = 0xff;
if (in6_setscope(&llsol, ifp, NULL) == 0)
in6_lookup_and_delete_multi(&llsol, ifp);
}
break;
}
}
static void
nd6_setifflags(struct ifnet *ifp, uint32_t flags)
{
struct nd_kifinfo *ndi = ND_IFINFO(ifp);
struct ifaddr *ifa;
struct in6_ifaddr *ia;
int s;
if (ndi->flags & ND6_IFF_IFDISABLED && !(flags & ND6_IFF_IFDISABLED)) {
/*
* If the interface is marked as ND6_IFF_IFDISABLED and
* has a link-local address with IN6_IFF_DUPLICATED,
* do not clear ND6_IFF_IFDISABLED.
* See RFC 4862, section 5.4.5.
*/
bool duplicated_linklocal = false;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if ((ia->ia6_flags & IN6_IFF_DUPLICATED) && IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia)))
{
duplicated_linklocal = true;
break;
}
}
pserialize_read_exit(s);
if (duplicated_linklocal) {
flags |= ND6_IFF_IFDISABLED;
log(LOG_ERR, "%s: Cannot enable an interface"
" with a link-local address marked"
" duplicate.\n", if_name(ifp));
} else {
ndi->flags &= ~ND6_IFF_IFDISABLED;
if (ifp->if_flags & IFF_UP)
in6_if_up(ifp);
}
} else if (!(ndi->flags & ND6_IFF_IFDISABLED) &&
(flags & ND6_IFF_IFDISABLED))
{
struct psref psref;
int bound = curlwp_bind();
/* Mark all IPv6 addresses as tentative. */
ndi->flags |= ND6_IFF_IFDISABLED;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ifa_acquire(ifa, &psref);
pserialize_read_exit(s);
nd6_dad_stop(ifa);
ia = (struct in6_ifaddr *)ifa;
ia->ia6_flags |= IN6_IFF_TENTATIVE;
s = pserialize_read_enter();
ifa_release(ifa, &psref);
}
pserialize_read_exit(s);
curlwp_bindx(bound);
}
if (flags & ND6_IFF_AUTO_LINKLOCAL) {
if (!(ndi->flags & ND6_IFF_AUTO_LINKLOCAL)) {
/* auto_linklocal 0->1 transition */
ndi->flags |= ND6_IFF_AUTO_LINKLOCAL;
in6_ifattach(ifp, NULL);
} else if (!(flags & ND6_IFF_IFDISABLED) &&
ifp->if_flags & IFF_UP)
{
/*
* When the IF already has
* ND6_IFF_AUTO_LINKLOCAL, no link-local
* address is assigned, and IFF_UP, try to
* assign one.
*/
bool haslinklocal = false;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family != AF_INET6)
continue;
ia = (struct in6_ifaddr *)ifa;
if (IN6_IS_ADDR_LINKLOCAL(IA6_IN6(ia))) {
haslinklocal = true;
break;
}
}
pserialize_read_exit(s);
if (!haslinklocal)
in6_ifattach(ifp, NULL);
}
}
ndi->flags = flags;
}
int
nd6_ioctl(u_long cmd, void *data, struct ifnet *ifp)
{
#ifdef OSIOCGIFINFO_IN6_90
struct in6_ndireq90 *ondi = (struct in6_ndireq90 *)data;
struct in6_ndifreq90 *ndif = (struct in6_ndifreq90 *)data;
#define OND ondi->ndi
#endif
struct in6_ndireq *ndi = (struct in6_ndireq *)data;
struct in6_nbrinfo *nbi = (struct in6_nbrinfo *)data;
struct nd_kifinfo *ifndi = ND_IFINFO(ifp);
int error = 0;
#define ND ndi->ndi
switch (cmd) {
#ifdef OSIOCSRTRFLUSH_IN6
case OSIOCGDRLST_IN6: /* FALLTHROUGH */
case OSIOCGPRLST_IN6: /* FALLTHROUGH */
case OSIOCSNDFLUSH_IN6: /* FALLTHROUGH */
case OSIOCSPFXFLUSH_IN6: /* FALLTHROUGH */
case OSIOCSRTRFLUSH_IN6: /* FALLTHROUGH */
break;
case OSIOCGDEFIFACE_IN6:
ndif->ifindex = 0;
break;
case OSIOCSDEFIFACE_IN6:
error = ENOTSUP;
break;
#endif
#ifdef OSIOCGIFINFO_IN6
case OSIOCGIFINFO_IN6: /* FALLTHROUGH */
#endif
#ifdef OSIOCGIFINFO_IN6_90
case OSIOCGIFINFO_IN6_90:
memset(&OND, 0, sizeof(OND));
OND.initialized = 1;
OND.chlim = ifndi->chlim;
OND.basereachable = ifndi->basereachable;
OND.retrans = ifndi->retrans;
OND.flags = ifndi->flags;
break;
case OSIOCSIFINFO_IN6_90:
/* Allow userland to set Neighbor Unreachability Detection
* timers. */
if (OND.chlim != 0)
ifndi->chlim = OND.chlim;
if (OND.basereachable != 0 &&
OND.basereachable != ifndi->basereachable) {
ifndi->basereachable = OND.basereachable;
ifndi->reachable = ND_COMPUTE_RTIME(OND.basereachable);
}
if (OND.retrans != 0)
ifndi->retrans = OND.retrans;
/* Retain the old behaviour .... */
/* FALLTHROUGH */
case OSIOCSIFINFO_FLAGS_90:
nd6_setifflags(ifp, OND.flags);
break;
#undef OND
#endif
case SIOCGIFINFO_IN6:
ND.chlim = ifndi->chlim;
ND.basereachable = ifndi->basereachable;
ND.retrans = ifndi->retrans;
ND.flags = ifndi->flags;
break;
case SIOCSIFINFO_IN6:
/* Allow userland to set Neighbor Unreachability Detection
* timers. */
if (ND.chlim != 0)
ifndi->chlim = ND.chlim;
if (ND.basereachable != 0 &&
ND.basereachable != ifndi->basereachable) {
ifndi->basereachable = ND.basereachable;
ifndi->reachable = ND_COMPUTE_RTIME(ND.basereachable);
}
if (ND.retrans != 0)
ifndi->retrans = ND.retrans;
break;
case SIOCSIFINFO_FLAGS:
nd6_setifflags(ifp, ND.flags);
break;
#undef ND
case SIOCGNBRINFO_IN6:
{
struct llentry *ln;
struct in6_addr nb_addr = nbi->addr; /* make local for safety */
if ((error = in6_setscope(&nb_addr, ifp, NULL)) != 0)
return error;
ln = nd6_lookup(&nb_addr, ifp, false);
if (ln == NULL) {
error = EINVAL;
break;
}
nbi->state = ln->ln_state;
nbi->asked = ln->ln_asked;
nbi->isrouter = ln->ln_router;
nbi->expire = ln->ln_expire ? time_mono_to_wall(ln->ln_expire) : 0;
LLE_RUNLOCK(ln);
break;
}
}
return error;
}
void
nd6_llinfo_release_pkts(struct llentry *ln, struct ifnet *ifp)
{
struct mbuf *m_hold, *m_hold_next;
struct sockaddr_in6 sin6;
LLE_WLOCK_ASSERT(ln);
sockaddr_in6_init(&sin6, &ln->r_l3addr.addr6, 0, 0, 0);
m_hold = ln->la_hold, ln->la_hold = NULL, ln->la_numheld = 0;
LLE_ADDREF(ln);
LLE_WUNLOCK(ln);
for (; m_hold != NULL; m_hold = m_hold_next) {
m_hold_next = m_hold->m_nextpkt;
m_hold->m_nextpkt = NULL;
/*
* we assume ifp is not a p2p interface here,
* so just pass the same ifp as both the 1st
* and 2nd arguments.
*/
ip6_if_output(ifp, ifp, m_hold, &sin6, NULL);
}
LLE_WLOCK(ln);
LLE_REMREF(ln);
}
/*
* Create neighbor cache entry and cache link-layer address,
* on reception of inbound ND6 packets. (RS/RA/NS/redirect)
*/
void
nd6_cache_lladdr(
struct ifnet *ifp,
struct in6_addr *from,
char *lladdr,
int lladdrlen,
int type, /* ICMP6 type */
int code /* type dependent information */
)
{
struct llentry *ln = NULL;
int is_newentry;
int do_update;
int olladdr;
int llchange;
int newstate = 0;
KASSERT(ifp != NULL);
KASSERT(from != NULL);
/* nothing must be updated for unspecified address */
if (IN6_IS_ADDR_UNSPECIFIED(from))
return;
/*
* Validation about ifp->if_addrlen and lladdrlen must be done in
* the caller.
*
* XXX If the link does not have a link-layer address, what should
* we do? (ifp->if_addrlen == 0)
* The spec says nothing in the sections for RA, RS and NA. There's a
* small description of it in the NS section (RFC 2461 7.2.3).
*/
ln = nd6_lookup(from, ifp, true);
if (ln == NULL) {
#if 0
/* nothing must be done if there's no lladdr */
if (!lladdr || !lladdrlen)
return NULL;
#endif
ln = nd6_create(from, ifp);
is_newentry = 1;
} else {
/* do nothing if static ndp is set */
if (ln->la_flags & LLE_STATIC) {
LLE_WUNLOCK(ln);
return;
}
is_newentry = 0;
}
if (ln == NULL)
return;
olladdr = (ln->la_flags & LLE_VALID) ? 1 : 0;
if (olladdr && lladdr) {
llchange = memcmp(lladdr, &ln->ll_addr, ifp->if_addrlen);
} else
llchange = 0;
/*
* newentry olladdr lladdr llchange (*=record)
* 0 n n -- (1)
* 0 y n -- (2)
* 0 n y -- (3) * STALE
* 0 y y n (4) *
* 0 y y y (5) * STALE
* 1 -- n -- (6) NOSTATE(= PASSIVE)
* 1 -- y -- (7) * STALE
*/
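/*
 * Worked example of the table above: an existing, non-static entry that
 * already holds a valid link-layer address (olladdr = y) receives an NS
 * carrying a different lladdr (lladdr = y, llchange = y).  That is row
 * (5): the new lladdr is recorded below, do_update becomes 1 and the
 * entry is pushed to ND_LLINFO_STALE so reachability is re-confirmed
 * before further use.
 */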
if (lladdr) { /* (3-5) and (7) */
/*
* Record source link-layer address
* XXX is it dependent to ifp->if_type?
*/
memcpy(&ln->ll_addr, lladdr, ifp->if_addrlen);
ln->la_flags |= LLE_VALID;
}
if (!is_newentry) {
if ((!olladdr && lladdr) || /* (3) */
(olladdr && lladdr && llchange)) { /* (5) */
do_update = 1;
newstate = ND_LLINFO_STALE;
} else /* (1-2,4) */
do_update = 0;
} else {
do_update = 1;
if (lladdr == NULL) /* (6) */
newstate = ND_LLINFO_NOSTATE;
else /* (7) */
newstate = ND_LLINFO_STALE;
}
if (do_update) {
/*
* Update the state of the neighbor cache.
*/
ln->ln_state = newstate;
if (ln->ln_state == ND_LLINFO_STALE) {
/*
* XXX: since nd6_output() below will cause
* state transition to DELAY and reset the timer,
* we must set the timer now, although it is actually
* meaningless.
*/
nd_set_timer(ln, ND_TIMER_GC);
nd6_llinfo_release_pkts(ln, ifp);
} else if (ln->ln_state == ND_LLINFO_INCOMPLETE) {
/* probe right away */
nd_set_timer(ln, ND_TIMER_IMMEDIATE);
}
}
/*
* ICMP6 type dependent behavior.
*
* NS: clear IsRouter if new entry
* RS: clear IsRouter
* RA: set IsRouter if there's lladdr
* redir: clear IsRouter if new entry
*
* RA case, (1):
* The spec says that we must set IsRouter in the following cases:
* - If lladdr exist, set IsRouter. This means (1-5).
* - If it is old entry (!newentry), set IsRouter. This means (7).
* So, based on the spec, in (1-5) and (7) cases we must set IsRouter.
* A question arises for case (1). Case (1) has no lladdr in the
* neighbor cache; this is similar to (6).
* This case is rare but we figured that we MUST NOT set IsRouter.
*
* newentry olladdr lladdr llchange NS RS RA redir
* D R
* 0 n n -- (1) c ? s
* 0 y n -- (2) c s s
* 0 n y -- (3) c s s
* 0 y y n (4) c s s
* 0 y y y (5) c s s
* 1 -- n -- (6) c c c s
* 1 -- y -- (7) c c s c s
*
* (c=clear s=set)
*/
switch (type & 0xff) {
case ND_NEIGHBOR_SOLICIT:
/*
* New entry must have is_router flag cleared.
*/
if (is_newentry) /* (6-7) */
ln->ln_router = 0;
break;
case ND_REDIRECT:
/*
* If the icmp is a redirect to a better router, always set the
* is_router flag. Otherwise, if the entry is newly created,
* clear the flag. [RFC 2461, sec 8.3]
*/
if (code == ND_REDIRECT_ROUTER)
ln->ln_router = 1;
else if (is_newentry) /* (6-7) */
ln->ln_router = 0;
break;
case ND_ROUTER_SOLICIT:
/*
* is_router flag must always be cleared.
*/
ln->ln_router = 0;
break;
case ND_ROUTER_ADVERT:
/*
* Mark an entry with lladdr as a router.
*/
if ((!is_newentry && (olladdr || lladdr)) || /* (2-5) */
(is_newentry && lladdr)) { /* (7) */
ln->ln_router = 1;
}
break;
}
if (do_update && lladdr != NULL) {
struct sockaddr_in6 sin6;
sockaddr_in6_init(&sin6, from, 0, 0, 0);
rt_clonedmsg(is_newentry ? RTM_ADD : RTM_CHANGE,
NULL, sin6tosa(&sin6), lladdr, ifp);
}
if (ln != NULL)
LLE_WUNLOCK(ln);
/*
* If we have too many cache entries, initiate immediate
* purging for some entries.
*/
if (is_newentry)
nd6_gc_neighbors(LLTABLE6(ifp), &ln->r_l3addr.addr6);
}
static void
nd6_slowtimo(void *ignored_arg)
{
struct nd_kifinfo *ndi;
struct ifnet *ifp;
struct psref psref;
int s;
SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
callout_reset(&nd6_slowtimo_ch, ND6_SLOWTIMER_INTERVAL * hz,
nd6_slowtimo, NULL);
s = pserialize_read_enter();
IFNET_READER_FOREACH(ifp) {
ndi = ND_IFINFO(ifp);
if (ndi->basereachable && /* already initialized */
(ndi->recalctm -= ND6_SLOWTIMER_INTERVAL) <= 0) {
if_acquire(ifp, &psref);
pserialize_read_exit(s);
/*
* Since reachable time rarely changes by router
* advertisements, we SHOULD insure that a new random
* value gets recomputed at least once every few hours.
* (RFC 2461, 6.3.4)
*/
ndi->recalctm = nd6_recalc_reachtm_interval;
ndi->reachable = ND_COMPUTE_RTIME(ndi->basereachable);
s = pserialize_read_enter();
if_release(ifp, &psref);
}
}
pserialize_read_exit(s);
SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
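/*
 * For reference, RFC 2461 6.3.2 defines ReachableTime as a uniformly
 * distributed random value between MIN_RANDOM_FACTOR (0.5) and
 * MAX_RANDOM_FACTOR (1.5) times BaseReachableTime, which is what
 * ND_COMPUTE_RTIME() is assumed to implement here.  For example, with
 * the default BaseReachableTime of 30000 ms the value recomputed above
 * lands somewhere in [15000 ms, 45000 ms].
 */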
/*
* Return 0 if a neighbor cache is found. Return EWOULDBLOCK if a cache is not
* found and trying to resolve a neighbor; in this case the mbuf is queued in
* the list. Otherwise return errno after freeing the mbuf.
*/
int
nd6_resolve(struct ifnet *ifp, const struct rtentry *rt, struct mbuf *m,
const struct sockaddr *_dst, uint8_t *lldst, size_t dstsize)
{
struct llentry *ln = NULL;
bool created = false;
const struct sockaddr_in6 *dst = satocsin6(_dst);
int error;
struct nd_kifinfo *ndi = ND_IFINFO(ifp);
/* discard the packet if IPv6 operation is disabled on the interface */
if (ndi->flags & ND6_IFF_IFDISABLED) {
m_freem(m);
return ENETDOWN; /* better error? */
}
/*
* Address resolution or Neighbor Unreachability Detection
* for the next hop.
* At this point, the destination of the packet must be a unicast
* or an anycast address(i.e. not a multicast).
*/
/* Look up the neighbor cache for the nexthop */
ln = nd6_lookup(&dst->sin6_addr, ifp, false);
if (ln != NULL && (ln->la_flags & LLE_VALID) != 0 &&
ln->ln_state == ND_LLINFO_REACHABLE) {
/* Fast path */
memcpy(lldst, &ln->ll_addr, MIN(dstsize, ifp->if_addrlen));
LLE_RUNLOCK(ln);
return 0;
}
if (ln != NULL)
LLE_RUNLOCK(ln);
/* Slow path */
ln = nd6_lookup(&dst->sin6_addr, ifp, true);
if (ln == NULL && nd6_is_addr_neighbor(dst, ifp)) {
/*
* Since nd6_is_addr_neighbor() internally calls nd6_lookup(),
* the condition below is not very efficient. But we believe
* it is tolerable, because this should be a rare case.
*/
ln = nd6_create(&dst->sin6_addr, ifp);
if (ln == NULL) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"%s: can't allocate llinfo for %s "
"(ln=%p, rt=%p)\n", __func__,
IN6_PRINT(ip6buf, &dst->sin6_addr), ln, rt);
m_freem(m);
return ENOBUFS;
}
created = true;
}
if (ln == NULL) {
m_freem(m);
return ENETDOWN; /* better error? */
}
error = nd_resolve(ln, rt, m, lldst, dstsize);
if (created)
nd6_gc_neighbors(LLTABLE6(ifp), &dst->sin6_addr);
return error;
}
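/*
 * Sketch of a hypothetical caller (illustrative only; the buffer name
 * and its size are assumptions):
 *
 *	uint8_t lldst[ETHER_ADDR_LEN];
 *	int error;
 *
 *	error = nd6_resolve(ifp, rt, m, dst, lldst, sizeof(lldst));
 *	if (error == EWOULDBLOCK)
 *		return 0;	(mbuf queued on the llentry, sent on NA)
 *	if (error != 0)
 *		return error;	(mbuf already freed by nd6_resolve)
 *	lldst now holds the destination link-layer address.
 */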
int
nd6_need_cache(struct ifnet *ifp)
{
/*
* XXX: we currently do not make neighbor cache on any interface
* other than ARCnet, Ethernet, and GIF.
*
* RFC2893 says:
* - unidirectional tunnels need no ND
*/
switch (ifp->if_type) {
case IFT_ARCNET:
case IFT_ETHER:
case IFT_IEEE1394:
case IFT_CARP:
case IFT_GIF: /* XXX need more cases? */
case IFT_IPSEC:
case IFT_PPP:
case IFT_TUNNEL:
return 1;
default:
return 0;
}
}
int
nd6_sysctl(
int name,
void *oldp, /* syscall arg, need copyout */
size_t *oldlenp,
void *newp, /* syscall arg, need copyin */
size_t newlen
)
{
int error;
if (newp)
return EPERM;
switch (name) {
/* call the nd6 compat_90 hook to validate the nd6-related names */
case OICMPV6CTL_ND6_DRLIST: /* FALLTHROUGH */
case OICMPV6CTL_ND6_PRLIST:
MODULE_HOOK_CALL(net_inet6_nd_90_hook, (name), ENOPROTOOPT,
error);
if (error == 0)
*oldlenp = 0;
return error;
case ICMPV6CTL_ND6_MAXQLEN:
return 0;
default:
return ENOPROTOOPT;
}
}
/* $NetBSD: st.c,v 1.243 2022/02/23 21:54:41 andvar Exp $ */
/*-
* Copyright (c) 1998, 2004 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Originally written by Julian Elischer (julian@tfs.com)
* for TRW Financial Systems for use under the MACH(2.5) operating system.
*
* TRW Financial Systems, in accordance with their agreement with Carnegie
* Mellon University, makes this software available to CMU to distribute
* or use in any manner that they see fit as long as this message is kept with
* the software. For this reason TFS also grants any other persons or
* organisations permission to use or modify this software.
*
* TFS supplies this software to be publicly redistributed
* on the understanding that TFS is not responsible for the correct
* functioning of this software in any circumstances.
*
* Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992
* major changes by Julian Elischer (julian@jules.dialix.oz.au) May 1993
*
* A lot of rewhacking done by mjacob (mjacob@nas.nasa.gov).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: st.c,v 1.243 2022/02/23 21:54:41 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_scsi.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/malloc.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/proc.h>
#include <sys/mtio.h>
#include <sys/device.h>
#include <sys/conf.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/iostat.h>
#include <sys/sysctl.h>
#include <dev/scsipi/scsi_spc.h>
#include <dev/scsipi/scsipi_all.h>
#include <dev/scsipi/scsi_all.h>
#include <dev/scsipi/scsi_tape.h>
#include <dev/scsipi/scsipiconf.h>
#include <dev/scsipi/scsipi_base.h>
#include <dev/scsipi/stvar.h>
/* Defines for device specific stuff */
#define DEF_FIXED_BSIZE 512
#define STMODE(z) ( minor(z) & 0x03)
#define STDSTY(z) ((minor(z) >> 2) & 0x03)
#define STUNIT(z) ((minor(z) >> 4) )
#define STNMINOR 16
#define NORMAL_MODE 0
#define NOREW_MODE 1
#define EJECT_MODE 2
#define CTRL_MODE 3
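/*
 * Example of the minor-number decoding above: minor 7 gives
 *
 *	STMODE(7) = 7 & 0x03        = 3  -> CTRL_MODE
 *	STDSTY(7) = (7 >> 2) & 0x03 = 1  -> density/quirk sub-mode 1
 *	STUNIT(7) = 7 >> 4          = 0  -> unit 0
 *
 * so each unit owns STNMINOR (16) consecutive minor numbers.
 */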
#ifndef ST_MOUNT_DELAY
#define ST_MOUNT_DELAY 0
#endif
static dev_type_open(stopen);
static dev_type_close(stclose);
static dev_type_read(stread);
static dev_type_write(stwrite);
static dev_type_ioctl(stioctl);
static dev_type_strategy(ststrategy);
static dev_type_dump(stdump);
const struct bdevsw st_bdevsw = {
.d_open = stopen,
.d_close = stclose,
.d_strategy = ststrategy,
.d_ioctl = stioctl,
.d_dump = stdump,
.d_psize = nosize,
.d_discard = nodiscard,
.d_flag = D_TAPE | D_MPSAFE
};
const struct cdevsw st_cdevsw = {
.d_open = stopen,
.d_close = stclose,
.d_read = stread,
.d_write = stwrite,
.d_ioctl = stioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_TAPE | D_MPSAFE
};
/*
* Define various devices that we know mis-behave in some way,
* and note how they are bad, so we can correct for them
*/
static const struct st_quirk_inquiry_pattern st_quirk_patterns[] = {
{{T_SEQUENTIAL, T_REMOV,
" ", " ", " "}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_24}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 0, HALFINCH_1600}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 0, HALFINCH_6250} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 3600 ", ""}, {0, 12, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 3800 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " SLR5 4/8GB ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 1024, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
/*
* lacking a manual for the 4200, it's not clear what the
* specific density codes should be- the device is a 2.5GB
* capable QIC drive, those density codes aren't readily
* available. The 'default' will just have to do.
*/
{{T_SEQUENTIAL, T_REMOV,
"TANDBERG", " TDC 4200 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
/*
* At least -005 and -007 need this. I'll assume they all do unless I
* hear otherwise. - mycroft, 31MAR1994
*/
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 2525 25462", ""}, {0, 0, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{ST_Q_SENSE_HELP, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
/*
* One user reports that this works for his tape drive. It probably
* needs more work. - mycroft, 09APR1994
*/
{{T_SEQUENTIAL, T_REMOV,
"SANKYO ", "CP525 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ANRITSU ", "DMT780 ", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 150 21247", ""}, {ST_Q_ERASE_NOIMM, 12, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{0, 0, QIC_150}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"ARCHIVE ", "VIPER 150 21531", ""}, {ST_Q_ERASE_NOIMM, 12, {
{ST_Q_SENSE_HELP, 0, 0}, /* minor 0-3 */
{0, 0, QIC_150}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5099ES SCSI", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_11}, /* minor 4-7 */
{0, 0, QIC_24}, /* minor 8-11 */
{0, 0, QIC_24} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5150ES SCSI", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{0, 0, QIC_24}, /* minor 4-7 */
{0, 0, QIC_120}, /* minor 8-11 */
{0, 0, QIC_150} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WANGTEK ", "5525ES SCSI REV7", ""}, {0, 0, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_BLKSIZE, 0, QIC_525}, /* minor 4-7 */
{0, 0, QIC_150}, /* minor 8-11 */
{0, 0, QIC_120} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"WangDAT ", "Model 1300 ", ""}, {0, 0, {
{0, 0, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, DDS}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 1024, DDS}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 0, DDS} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"EXABYTE ", "EXB-8200 ", "263H"}, {0, 5, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"STK", "9490", ""},
{ST_Q_FORCE_BLKSIZE, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"STK", "SD-3", ""},
{ST_Q_FORCE_BLKSIZE, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"IBM", "03590", ""}, {ST_Q_IGNORE_LOADS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"HP ", "T4000s ", ""}, {ST_Q_UNIMODAL, 0, {
{0, 0, QIC_3095}, /* minor 0-3 */
{0, 0, QIC_3095}, /* minor 4-7 */
{0, 0, QIC_3095}, /* minor 8-11 */
{0, 0, QIC_3095}, /* minor 12-15 */
}}},
#if 0
{{T_SEQUENTIAL, T_REMOV,
"EXABYTE ", "EXB-8200 ", ""}, {0, 12, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
#endif
{{T_SEQUENTIAL, T_REMOV,
"TEAC ", "MT-2ST/N50 ", ""}, {ST_Q_IGNORE_LOADS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"OnStream", "ADR50 Drive", ""}, {ST_Q_UNIMODAL, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"OnStream DI-30", "", "1.0"}, {ST_Q_NOFILEMARKS, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"NCR H621", "0-STD-03-46F880 ", ""}, {ST_Q_NOPREVENT, 0, {
{0, 0, 0}, /* minor 0-3 */
{0, 0, 0}, /* minor 4-7 */
{0, 0, 0}, /* minor 8-11 */
{0, 0, 0} /* minor 12-15 */
}}},
{{T_SEQUENTIAL, T_REMOV,
"Seagate STT3401A", "hp0atxa", ""}, {0, 0, {
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 0-3 */
{ST_Q_FORCE_BLKSIZE, 1024, 0}, /* minor 4-7 */
{ST_Q_FORCE_BLKSIZE, 512, 0}, /* minor 8-11 */
{ST_Q_FORCE_BLKSIZE, 512, 0} /* minor 12-15 */
}}},
};
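/*
 * Each entry above supplies one drive-wide quirk/page_0_size pair plus
 * four per-sub-mode records selected at mount time by STDSTY(minor).
 * For example, opening the "TANDBERG TDC 3600" through a minor in the
 * 4-7 range picks its second record, {ST_Q_FORCE_BLKSIZE, 0, QIC_525},
 * i.e. a forced variable (0) block size and density code QIC_525.
 */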
#define NOEJECT 0
#define EJECT 1
static void st_identify_drive(struct st_softc *,
struct scsipi_inquiry_pattern *);
static void st_loadquirks(struct st_softc *);
static int st_mount_tape(dev_t, int);
static void st_unmount(struct st_softc *, boolean);
static int st_decide_mode(struct st_softc *, boolean);
static void ststart(struct scsipi_periph *);
static int ststart1(struct scsipi_periph *, struct buf *, int *);
static void strestart(void *);
static void stdone(struct scsipi_xfer *, int);
static int st_read(struct st_softc *, char *, int, int);
static int st_space(struct st_softc *, int, u_int, int);
static int st_write_filemarks(struct st_softc *, int, int);
static int st_check_eod(struct st_softc *, boolean, int *, int);
static int st_load(struct st_softc *, u_int, int);
static int st_rewind(struct st_softc *, u_int, int);
static int st_interpret_sense(struct scsipi_xfer *);
static int st_touch_tape(struct st_softc *);
static int st_erase(struct st_softc *, int full, int flags);
static void st_updatefilepos(struct st_softc *);
static int st_rdpos(struct st_softc *, int, uint32_t *);
static int st_setpos(struct st_softc *, int, uint32_t *);
static const struct scsipi_periphsw st_switch = {
st_interpret_sense,
ststart,
NULL,
stdone
};
#if defined(ST_ENABLE_EARLYWARN)
#define ST_INIT_FLAGS ST_EARLYWARN
#else
#define ST_INIT_FLAGS 0
#endif
/*
* The routine called by the low level scsi routine when it discovers
* a device suitable for this driver.
*/
void
stattach(device_t parent, device_t self, void *aux)
{
struct st_softc *st = device_private(self);
struct scsipibus_attach_args *sa = aux;
struct scsipi_periph *periph = sa->sa_periph;
SC_DEBUG(periph, SCSIPI_DB2, ("stattach: "));
st->sc_dev = self;
/* Store information needed to contact our base driver */
st->sc_periph = periph;
periph->periph_dev = st->sc_dev;
periph->periph_switch = &st_switch;
/* Set initial flags */
st->flags = ST_INIT_FLAGS;
/* Set up the buf queues for this device */
bufq_alloc(&st->buf_queue, "fcfs", 0);
bufq_alloc(&st->buf_defer, "fcfs", 0);
callout_init(&st->sc_callout, 0);
mutex_init(&st->sc_iolock, MUTEX_DEFAULT, IPL_VM);
/*
* Check if the drive is a known criminal and take
* any steps needed to bring it into line.
*/
st_identify_drive(st, &sa->sa_inqbuf);
aprint_naive("\n");
aprint_normal("\n");
/* Use the subdriver to request information regarding the drive. */
aprint_normal_dev(self, "%s", st->quirkdata ? "quirks apply, " : "");
if (scsipi_test_unit_ready(periph,
XS_CTL_DISCOVERY | XS_CTL_SILENT | XS_CTL_IGNORE_MEDIA_CHANGE) ||
st->ops(st, ST_OPS_MODESENSE,
XS_CTL_DISCOVERY | XS_CTL_SILENT | XS_CTL_IGNORE_MEDIA_CHANGE))
aprint_normal("drive empty\n");
else {
aprint_normal("density code %d, ", st->media_density);
if (st->media_blksize > 0)
aprint_normal("%d-byte", st->media_blksize);
else
aprint_normal("variable");
aprint_normal(" blocks, write-%s\n",
(st->flags & ST_READONLY) ? "protected" : "enabled");
}
st->stats = iostat_alloc(IOSTAT_TAPE, parent,
device_xname(st->sc_dev));
rnd_attach_source(&st->rnd_source, device_xname(st->sc_dev),
RND_TYPE_TAPE, RND_FLAG_DEFAULT);
}
int
stdetach(device_t self, int flags)
{
struct st_softc *st = device_private(self);
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_channel *chan = periph->periph_channel;
int bmaj, cmaj, mn;
/* locate the major number */
bmaj = bdevsw_lookup_major(&st_bdevsw);
cmaj = cdevsw_lookup_major(&st_cdevsw);
/* kill any pending restart */
callout_halt(&st->sc_callout, NULL);
mutex_enter(chan_mtx(chan));
/* Kill off any queued buffers. */
bufq_drain(st->buf_defer);
bufq_drain(st->buf_queue);
/* Kill off any pending commands. */
scsipi_kill_pending(st->sc_periph);
mutex_exit(chan_mtx(chan));
bufq_free(st->buf_defer);
bufq_free(st->buf_queue);
mutex_destroy(&st->sc_iolock);
/* Nuke the vnodes for any open instances */
mn = STUNIT(device_unit(self));
vdevgone(bmaj, mn, mn+STNMINOR-1, VBLK);
vdevgone(cmaj, mn, mn+STNMINOR-1, VCHR);
iostat_free(st->stats);
/* Unhook the entropy source. */
rnd_detach_source(&st->rnd_source);
return 0;
}
/*
* Use the inquiry routine in 'scsi_base' to get drive info so we can
* further tailor our behaviour.
*/
static void
st_identify_drive(struct st_softc *st, struct scsipi_inquiry_pattern *inqbuf)
{
const struct st_quirk_inquiry_pattern *finger;
int priority;
finger = scsipi_inqmatch(inqbuf,
st_quirk_patterns,
sizeof(st_quirk_patterns) / sizeof(st_quirk_patterns[0]),
sizeof(st_quirk_patterns[0]), &priority);
if (priority != 0) {
st->quirkdata = &finger->quirkdata;
st->drive_quirks = finger->quirkdata.quirks;
st->quirks = finger->quirkdata.quirks; /* start value */
st->page_0_size = finger->quirkdata.page_0_size;
KASSERT(st->page_0_size <= MAX_PAGE_0_SIZE);
st_loadquirks(st);
}
}
/*
* Initialise the subdevices to the default (QUIRK) state.
* This will remove any setting made by the system operator or previous
* operations.
*/
static void
st_loadquirks(struct st_softc *st)
{
const struct modes *mode;
struct modes *mode2;
int i;
mode = st->quirkdata->modes;
mode2 = st->modes;
for (i = 0; i < 4; i++) {
memset(mode2, 0, sizeof(struct modes));
st->modeflags[i] &= ~(BLKSIZE_SET_BY_QUIRK |
DENSITY_SET_BY_QUIRK | BLKSIZE_SET_BY_USER |
DENSITY_SET_BY_USER);
if ((mode->quirks | st->drive_quirks) & ST_Q_FORCE_BLKSIZE) {
mode2->blksize = mode->blksize;
st->modeflags[i] |= BLKSIZE_SET_BY_QUIRK;
}
if (mode->density) {
mode2->density = mode->density;
st->modeflags[i] |= DENSITY_SET_BY_QUIRK;
}
mode2->quirks |= mode->quirks;
mode++;
mode2++;
}
}
/* open the device. */
static int
stopen(dev_t dev, int flags, int mode, struct lwp *l)
{
u_int stmode, dsty;
int error, sflags, unit, tries, ntries;
struct st_softc *st;
struct scsipi_periph *periph;
struct scsipi_adapter *adapt;
unit = STUNIT(dev);
st = device_lookup_private(&st_cd, unit);
if (st == NULL)
return ENXIO;
stmode = STMODE(dev);
dsty = STDSTY(dev);
periph = st->sc_periph;
adapt = periph->periph_channel->chan_adapter;
SC_DEBUG(periph, SCSIPI_DB1,
("open: dev=0x%"PRIx64" (unit %d (of %d))\n", dev, unit,
st_cd.cd_ndevs));
/* Only allow one at a time */
if (periph->periph_flags & PERIPH_OPEN) {
aprint_error_dev(st->sc_dev, "already open\n");
return EBUSY;
}
if ((error = scsipi_adapter_addref(adapt)) != 0)
return error;
/* clear any latched errors. */
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
/*
* Catch any unit attention errors. Be silent about this
* unless we're already mounted. We ignore media change
* if we're in control mode or not mounted yet.
*/
if ((st->flags & ST_MOUNTED) == 0 || stmode == CTRL_MODE) {
#ifdef SCSIDEBUG
sflags = XS_CTL_IGNORE_MEDIA_CHANGE;
#else
sflags = XS_CTL_SILENT|XS_CTL_IGNORE_MEDIA_CHANGE;
#endif
} else
sflags = 0;
/*
* If we're already mounted or we aren't configured for
* a mount delay, only try a test unit ready once. Otherwise,
* try up to ST_MOUNT_DELAY times with a rest interval of
* one second between each try.
*/
if ((st->flags & ST_MOUNTED) || ST_MOUNT_DELAY == 0)
ntries = 1;
else
ntries = ST_MOUNT_DELAY;
for (error = tries = 0; tries < ntries; tries++) {
int slpintr, oflags;
/*
* If we had no error, or we're opening the control mode
* device, we jump out right away.
*/
error = scsipi_test_unit_ready(periph, sflags);
if (error == 0 || stmode == CTRL_MODE)
break;
/*
* We had an error.
*
* If we're already mounted or we aren't configured for
* a mount delay, or the error isn't a NOT READY error,
* skip to the error exit now.
*/
if ((st->flags & ST_MOUNTED) || ST_MOUNT_DELAY == 0 ||
(st->mt_key != SKEY_NOT_READY)) {
device_printf(st->sc_dev,
"mount error (sense key=%d) - "
"terminating mount session\n",
st->mt_key);
/*
* the following should not trigger unless
* something serious happened while the device
* was open (PREVENT MEDIUM REMOVAL in effect)
*/
if (st->flags & ST_WRITTEN &&
st->mt_key == SKEY_UNIT_ATTENTION) {
/*
* device / media state may have changed;
* refrain from writing missing file marks
* onto potentially newly inserted/formatted
* media (e. g. emergency EJECT/RESET/etc.)
*/
st->flags &= ~(ST_WRITTEN|ST_FM_WRITTEN);
device_printf(st->sc_dev,
"CAUTION: file marks/data may be missing"
" - ASC = 0x%02x, ASCQ = 0x%02x\n",
st->asc, st->ascq);
}
goto bad;
}
/* clear any latched errors. */
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
/*
* Fake that we have the device open so
* we block other apps from getting in.
*/
oflags = periph->periph_flags;
periph->periph_flags |= PERIPH_OPEN;
slpintr = kpause("stload", true, hz, NULL);
periph->periph_flags = oflags; /* restore flags */
if (slpintr != 0 && slpintr != EWOULDBLOCK) {
device_printf(st->sc_dev, "load interrupted\n");
goto bad;
}
}
/*
* If the mode is 3 (e.g. minor = 3,7,11,15) then the device has
* been opened to set defaults and perform other, usually non-I/O
* related, operations. In this case, do a quick check to see
* whether the unit actually had a tape loaded (this is inferred
* from whether or not we got a NOT READY for the above
* unit attention). If a tape is there, go do a mount sequence.
*/
if (stmode == CTRL_MODE && st->mt_key != SKEY_NO_SENSE &&
st->mt_key != SKEY_UNIT_ATTENTION) {
periph->periph_flags |= PERIPH_OPEN;
return 0;
}
/*
* If we get this far and had an error set, that means we failed
* to pass the 'test unit ready' test for the non-controlmode device,
* so we bounce the open.
*/
if (error)
return error;
/* Else, we're now committed to saying we're open. */
periph->periph_flags |= PERIPH_OPEN; /* unit attn are now errors */
/*
* If it's a different mode, or if the media has been
* invalidated, unmount the tape from the previous
* session but continue with open processing
*/
if (st->last_dsty != dsty ||
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)
st_unmount(st, NOEJECT);
/*
* If we are not mounted, then we should start a new
* mount session.
*/
if (!(st->flags & ST_MOUNTED)) {
if ((error = st_mount_tape(dev, flags)) != 0)
goto bad;
st->last_dsty = dsty;
}
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_PREVENT_DT,
XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY);
}
SC_DEBUG(periph, SCSIPI_DB2, ("open complete\n"));
return 0;
bad:
st_unmount(st, NOEJECT);
scsipi_adapter_delref(adapt);
periph->periph_flags &= ~PERIPH_OPEN;
return error;
}
static int
stclose(dev_t dev, int flags, int mode, struct lwp *l)
{
int stxx, error = 0;
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_adapter *adapt = periph->periph_channel->chan_adapter;
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("closing\n"));
/*
* Make sure that a tape opened in write-only mode will have
* file marks written on it when closed, even if not written to.
*
* This is for SUN compatibility. Actually, the Sun way of
* things is to:
*
* only write filemarks if there are fmks to be written and
* - open for write (possibly read/write)
* - the last operation was a write
* or:
* - opened for wronly
* - no data was written (including filemarks)
*/
stxx = st->flags & (ST_WRITTEN | ST_FM_WRITTEN);
if ((flags & FWRITE) != 0) {
int nm = 0;
#ifdef ST_SUNCOMPAT
/*
* on request only
* original compat code has not been working
* since ~1998
*/
if ((flags & O_ACCMODE) == FWRITE && (stxx == 0)) {
st->flags |= ST_WRITTEN;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("SUN compatibility: write FM(s) at close\n"));
}
#endif
error = st_check_eod(st, FALSE, &nm, 0);
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("wrote %d FM(s) at close error=%d\n", nm, error));
}
/* Allow robots to eject tape if needed. */
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_ALLOW,
XS_CTL_IGNORE_ILLEGAL_REQUEST | XS_CTL_IGNORE_NOT_READY);
}
switch (STMODE(dev)) {
case NORMAL_MODE:
st_unmount(st, NOEJECT);
break;
case NOREW_MODE:
case CTRL_MODE:
/*
* Leave mounted unless media seems to have been removed.
*
* Otherwise, if we're to terminate a tape with more than one
* filemark [ and because we're not rewinding here ], backspace
* one filemark so that later appends will see an unbroken
* sequence of:
*
* file - FMK - file - FMK ... file - FMK FMK (EOM)
*/
if ((periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) {
st_unmount(st, NOEJECT);
} else if (error == 0) {
/*
* ST_WRITTEN was preserved from above.
*
* All we need to know here is:
*
* Were we writing this tape and was the last
* operation a write?
*
* Are there supposed to be 2FM at EOD?
*
* If both statements are true, then we backspace
* one filemark.
*/
stxx &= ~ST_FM_WRITTEN;
stxx |= (st->flags & ST_2FM_AT_EOD);
if ((flags & FWRITE) != 0 &&
(stxx == (ST_2FM_AT_EOD|ST_WRITTEN))) {
error = st_space(st, -1, SP_FILEMARKS, 0);
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("st_space(-1) error=%d\n", error));
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("no backspacing - flags = 0x%x, stxx=0x%x, st->flags=0x%x\n", flags, stxx, st->flags));
}
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3, ("error %d from st_check_eod\n", error));
}
break;
case EJECT_MODE:
st_unmount(st, EJECT);
break;
}
KASSERTMSG((st->flags & ST_WRITTEN) == 0,
"pending ST_WRITTEN flag NOT cleared (flags=0x%x)", st->flags);
scsipi_wait_drain(periph);
scsipi_adapter_delref(adapt);
periph->periph_flags &= ~PERIPH_OPEN;
return error;
}
/*
* Start a new mount session.
* Copy in all the default parameters from the selected device mode
* and try to guess any that seem to be defaulted.
*/
static int
st_mount_tape(dev_t dev, int flags)
{
int unit;
u_int dsty;
struct st_softc *st;
struct scsipi_periph *periph;
int error = 0;
unit = STUNIT(dev);
dsty = STDSTY(dev);
st = device_lookup_private(&st_cd, unit);
periph = st->sc_periph;
if (st->flags & ST_MOUNTED)
return 0;
SC_DEBUG(periph, SCSIPI_DB1, ("mounting\n "));
st->flags |= ST_NEW_MOUNT;
st->quirks = st->drive_quirks | st->modes[dsty].quirks;
/*
* If the media is new, then make sure we give it a chance
* to do a 'load' instruction. (We assume it is new.)
*/
if ((error = st_load(st, LD_LOAD, XS_CTL_SILENT)) != 0)
return error;
/*
* Throw another dummy instruction to catch
* 'Unit attention' errors. Many drives give
* these after doing a Load instruction (with
* the MEDIUM MAY HAVE CHANGED asc/ascq).
*/
scsipi_test_unit_ready(periph, XS_CTL_SILENT); /* XXX */
/*
* Some devices can't tell you much until they have been
* asked to look at the media. This quirk does this.
*/
if (st->quirks & ST_Q_SENSE_HELP)
if ((error = st_touch_tape(st)) != 0)
return error;
/*
* Load the physical device parameters
* loads: blkmin, blkmax
*/
if ((error = st->ops(st, ST_OPS_RBL, 0)) != 0)
return error;
/*
* Load the media dependent parameters
* includes: media_blksize, media_density, numblks
* As we have a tape in, it should be reflected here.
* If not you may need the "quirk" above.
*/
if ((error = st->ops(st, ST_OPS_MODESENSE, 0)) != 0)
return error;
/*
* If we have gained a permanent density from somewhere,
* then use it in preference to the one supplied by
* default by the driver.
*/
if (st->modeflags[dsty] & (DENSITY_SET_BY_QUIRK | DENSITY_SET_BY_USER))
st->density = st->modes[dsty].density;
else
st->density = st->media_density;
/*
* If we have gained a permanent blocksize
* then use it in preference to the one supplied by
* default by the driver.
*/
st->flags &= ~ST_FIXEDBLOCKS;
if (st->modeflags[dsty] &
(BLKSIZE_SET_BY_QUIRK | BLKSIZE_SET_BY_USER)) {
st->blksize = st->modes[dsty].blksize;
if (st->blksize)
st->flags |= ST_FIXEDBLOCKS;
} else {
if ((error = st_decide_mode(st, FALSE)) != 0)
return error;
}
if ((error = st->ops(st, ST_OPS_MODESELECT, 0)) != 0) {
/* ATAPI will return ENODEV for this, and this may be OK */
if (error != ENODEV) {
aprint_error_dev(st->sc_dev,
"cannot set selected mode\n");
return error;
}
}
st->flags &= ~ST_NEW_MOUNT;
st->flags |= ST_MOUNTED;
periph->periph_flags |= PERIPH_MEDIA_LOADED; /* move earlier? */
st->blkno = st->fileno = (daddr_t) 0;
return 0;
}
/*
* End the present mount session.
* Rewind, and optionally eject the tape.
* Reset various flags to indicate that all new
* operations require another mount operation
*/
static void
st_unmount(struct st_softc *st, boolean eject)
{
struct scsipi_periph *periph = st->sc_periph;
int nmarks;
if ((st->flags & ST_MOUNTED) == 0)
return;
SC_DEBUG(periph, SCSIPI_DB1, ("unmounting\n"));
st_check_eod(st, FALSE, &nmarks, XS_CTL_IGNORE_NOT_READY);
st_rewind(st, 0, XS_CTL_IGNORE_NOT_READY);
/*
* Section 9.3.3 of the SCSI specs states that a device shall return
* the density value specified in the last successful MODE SELECT
* after an unload operation, in case it is not able to
* automatically determine the density of the new medium.
*
* So we instruct the device to use the default density, which will
* prevent the use of stale density values (in particular,
* in st_touch_tape()).
*/
st->density = 0;
if (st->ops(st, ST_OPS_MODESELECT, 0) != 0) {
aprint_error_dev(st->sc_dev,
"WARNING: cannot revert to default density\n");
}
if (eject) {
if (!(st->quirks & ST_Q_NOPREVENT)) {
scsipi_prevent(periph, SPAMR_ALLOW,
XS_CTL_IGNORE_ILLEGAL_REQUEST |
XS_CTL_IGNORE_NOT_READY);
}
st_load(st, LD_UNLOAD, XS_CTL_IGNORE_NOT_READY);
st->blkno = st->fileno = (daddr_t) -1;
} else {
st->blkno = st->fileno = (daddr_t) 0;
}
st->flags &= ~(ST_MOUNTED | ST_NEW_MOUNT);
periph->periph_flags &= ~PERIPH_MEDIA_LOADED;
}
/*
* Given all we know about the device, media, mode, 'quirks' and
* initial operation, make a decision as to how we should be set
* to run (regarding blocking and EOD marks)
*/
int
st_decide_mode(struct st_softc *st, boolean first_read)
{
SC_DEBUG(st->sc_periph, SCSIPI_DB2, ("starting block mode decision\n"));
/*
* If the drive can only handle fixed-length blocks and only at
* one size, perhaps we should just do that.
*/
if (st->blkmin && (st->blkmin == st->blkmax)) {
st->flags |= ST_FIXEDBLOCKS;
st->blksize = st->blkmin;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("blkmin == blkmax of %d\n", st->blkmin));
goto done;
}
/*
* If the tape density mandates (or even suggests) use of fixed
* or variable-length blocks, comply.
*/
switch (st->density) {
case HALFINCH_800:
case HALFINCH_1600:
case HALFINCH_6250:
case DDS:
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("density specified variable\n"));
goto done;
case QIC_11:
case QIC_24:
case QIC_120:
case QIC_150:
case QIC_525:
case QIC_1320:
case QIC_3095:
case QIC_3220:
st->flags |= ST_FIXEDBLOCKS;
if (st->media_blksize > 0)
st->blksize = st->media_blksize;
else
st->blksize = DEF_FIXED_BSIZE;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("density specified fixed\n"));
goto done;
}
/*
* If we're about to read the tape, perhaps we should choose
* fixed or variable-length blocks and block size according to
* what the drive found on the tape.
*/
if (first_read &&
(!(st->quirks & ST_Q_BLKSIZE) || (st->media_blksize == 0) ||
(st->media_blksize == DEF_FIXED_BSIZE) ||
(st->media_blksize == 1024))) {
if (st->media_blksize > 0)
st->flags |= ST_FIXEDBLOCKS;
else
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = st->media_blksize;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("Used media_blksize of %d\n", st->media_blksize));
goto done;
}
/*
* We're getting no hints from any direction. Choose variable-
* length blocks arbitrarily.
*/
st->flags &= ~ST_FIXEDBLOCKS;
st->blksize = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("Give up and default to variable mode\n"));
done:
/*
* Decide whether or not to write two file marks to signify end-
* of-data. Make the decision as a function of density. If
* the decision is not to use a second file mark, the SCSI BLANK
* CHECK condition code will be recognized as end-of-data when
* first read.
* (I think this should be a by-product of fixed/variable..julian)
*/
switch (st->density) {
/* case 8 mm: What is the SCSI density code for 8 mm, anyway? */
case QIC_11:
case QIC_24:
case QIC_120:
case QIC_150:
case QIC_525:
case QIC_1320:
case QIC_3095:
case QIC_3220:
st->flags &= ~ST_2FM_AT_EOD;
break;
default:
st->flags |= ST_2FM_AT_EOD;
}
return 0;
}
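/*
 * Worked example of the decision above: a drive reporting
 * blkmin == blkmax == 512 is forced to 512-byte fixed blocks right
 * away; otherwise a QIC density such as QIC_150 selects fixed blocks
 * (media_blksize, or DEF_FIXED_BSIZE if unknown), DDS and half-inch
 * densities select variable blocks, and only when no hint is available
 * does the driver fall back to variable-length mode.
 */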
/*
* Actually translate the requested transfer into
* one the physical driver can understand
* The transfer is described by a buf and will include
* only one physical transfer.
*/
static void
ststrategy(struct buf *bp)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(bp->b_dev));
struct scsipi_periph *periph = st->sc_periph;
struct scsipi_channel *chan = periph->periph_channel;
SC_DEBUG(periph, SCSIPI_DB1,
("ststrategy %d bytes @ blk %" PRId64 "\n", bp->b_bcount,
bp->b_blkno));
/* If it's a null transfer, return immediately */
if (bp->b_bcount == 0)
goto abort;
/* If offset is negative, error */
if (bp->b_blkno < 0) {
SC_DEBUG(periph, SCSIPI_DB3,
("EINVAL: ststrategy negative blockcount %" PRId64 "\n", bp->b_blkno));
bp->b_error = EINVAL;
goto abort;
}
/* Odd sized request on fixed drives are verboten */
if (st->flags & ST_FIXEDBLOCKS) {
if (bp->b_bcount % st->blksize) {
aprint_error_dev(st->sc_dev, "bad request, must be multiple of %d\n",
st->blksize);
bp->b_error = EIO;
goto abort;
}
}
/* as are out-of-range requests on variable drives. */
else if (bp->b_bcount < st->blkmin ||
(st->blkmax && bp->b_bcount > st->blkmax)) {
aprint_error_dev(st->sc_dev, "bad request, must be between %d and %d\n",
st->blkmin, st->blkmax);
bp->b_error = EIO;
goto abort;
}
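/*
 * Example of the size checks above: with a fixed 512-byte block size a
 * 1536-byte request passes (three whole blocks) while a 1000-byte
 * request is rejected with EIO; on a variable-length drive the request
 * only has to fall within the [blkmin, blkmax] limits reported by the
 * drive.
 */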
mutex_enter(chan_mtx(chan));
/*
* Place it in the queue of activities for this tape
* at the end (a bit silly because we only have one user..
* (but it could fork()))
*/
bufq_put(st->buf_queue, bp);
/*
* Tell the device to get going on the transfer if it's
* not doing anything, otherwise just wait for completion
* (All a bit silly if we're only allowing 1 open but..)
*/
ststart(periph);
mutex_exit(chan_mtx(chan));
return;
abort:
/*
* Reset the residue because we didn't do anything,
* and send the buffer back as done.
*/
bp->b_resid = bp->b_bcount;
biodone(bp);
return;
}
/*
* ststart looks to see if there is a buf waiting for the device
* and that the device is not already busy. If the device is busy,
* the request is deferred and retried on the next attempt.
* If both are true, ststart creates a scsi command to perform
* the transfer required.
*
* The transfer request will call scsipi_done on completion,
* which will in turn call this routine again so that the next
* queued transfer is performed. The bufs are queued by the
* strategy routine (ststrategy)
*
* This routine is also called after other non-queued requests
* have been made of the scsi driver, to ensure that the queue
* continues to be drained.
* ststart() is called with channel lock held
*/
static int
ststart1(struct scsipi_periph *periph, struct buf *bp, int *errnop)
{
struct st_softc *st = device_private(periph->periph_dev);
struct scsipi_channel *chan = periph->periph_channel;
struct scsi_rw_tape cmd;
struct scsipi_xfer *xs;
int flags, error, complete = 1;
SC_DEBUG(periph, SCSIPI_DB2, ("ststart1 "));
mutex_enter(chan_mtx(chan));
if (periph->periph_active >= periph->periph_openings) {
error = EAGAIN;
goto out;
}
/* if a special awaits, let it proceed first */
if (periph->periph_flags & PERIPH_WAITING) {
periph->periph_flags &= ~PERIPH_WAITING;
cv_broadcast(periph_cv_periph(periph));
error = EAGAIN;
goto out;
}
/*
* If the device has been unmounted by the user
* then throw away all requests until done.
*/
if (__predict_false((st->flags & ST_MOUNTED) == 0 ||
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)) {
error = EIO;
goto out;
}
/*
* only FIXEDBLOCK devices have pending I/O or space operations.
*/
if (st->flags & ST_FIXEDBLOCKS) {
/*
* If we are at a filemark but have not reported it yet
* then we should report it now
*/
if (st->flags & ST_AT_FILEMARK) {
if ((bp->b_flags & B_READ) == B_WRITE) {
/*
* Handling of ST_AT_FILEMARK in
* st_space will fill in the right file
* mark count.
* Back up over filemark
*/
if (st_space(st, 0, SP_FILEMARKS, 0)) {
error = EIO;
goto out;
}
} else {
error = 0;
st->flags &= ~ST_AT_FILEMARK;
goto out;
}
}
}
/*
* If we are at EOM but have not reported it
* yet then we should report it now.
*/
if (st->flags & (ST_EOM_PENDING|ST_EIO_PENDING)) {
error = 0;
if (st->flags & ST_EIO_PENDING)
error = EIO;
st->flags &= ~(ST_EOM_PENDING|ST_EIO_PENDING);
goto out;
}
/* Fill out the scsi command */
memset(&cmd, 0, sizeof(cmd));
flags = XS_CTL_NOSLEEP | XS_CTL_ASYNC;
if ((bp->b_flags & B_READ) == B_WRITE) {
cmd.opcode = WRITE;
st->flags &= ~ST_FM_WRITTEN;
flags |= XS_CTL_DATA_OUT;
} else {
cmd.opcode = READ;
flags |= XS_CTL_DATA_IN;
}
/*
* Handle "fixed-block-mode" tape drives by using the
* block count instead of the length.
*/
if (st->flags & ST_FIXEDBLOCKS) {
cmd.byte2 |= SRW_FIXED;
_lto3b(bp->b_bcount / st->blksize, cmd.len);
} else
_lto3b(bp->b_bcount, cmd.len);
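/*
 * In fixed block mode cmd.len carries a block count, otherwise a byte
 * count.  For example, a 65536-byte request on a drive mounted with
 * 512-byte fixed blocks puts 128 (blocks) in the 3-byte length field
 * with SRW_FIXED set; in variable mode it would be 65536 (bytes) with
 * SRW_FIXED clear.
 */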
/* Clear 'position updated' indicator */
st->flags &= ~ST_POSUPDATED;
/* go ask the adapter to do all this for us */
xs = scsipi_make_xs_locked(periph,
(struct scsipi_generic *)&cmd, sizeof(cmd),
(u_char *)bp->b_data, bp->b_bcount,
0, ST_IO_TIME, bp, flags);
if (__predict_false(xs == NULL)) {
/*
* out of memory. Keep this buffer in the queue, and
* retry later.
*/
callout_reset(&st->sc_callout, hz / 2, strestart,
periph);
error = EAGAIN;
goto out;
}
error = scsipi_execute_xs(xs);
/* with a scsipi_xfer preallocated, scsipi_command can't fail */
KASSERT(error == 0);
if (error == 0)
complete = 0;
out:
mutex_exit(chan_mtx(chan));
*errnop = error;
return complete;
}
static void
ststart(struct scsipi_periph *periph)
{
struct st_softc *st = device_private(periph->periph_dev);
struct scsipi_channel *chan = periph->periph_channel;
struct buf *bp;
int error, complete;
SC_DEBUG(periph, SCSIPI_DB2, ("ststart "));
mutex_exit(chan_mtx(chan));
mutex_enter(&st->sc_iolock);
while ((bp = bufq_get(st->buf_defer)) != NULL
|| (bp = bufq_get(st->buf_queue)) != NULL) {
iostat_busy(st->stats);
mutex_exit(&st->sc_iolock);
complete = ststart1(periph, bp, &error);
mutex_enter(&st->sc_iolock);
if (complete) {
iostat_unbusy(st->stats, 0,
((bp->b_flags & B_READ) == B_READ));
if (error == EAGAIN) {
bufq_put(st->buf_defer, bp);
break;
}
}
mutex_exit(&st->sc_iolock);
if (complete) {
bp->b_error = error;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
mutex_enter(&st->sc_iolock);
}
mutex_exit(&st->sc_iolock);
mutex_enter(chan_mtx(chan));
}
static void
strestart(void *v)
{
struct scsipi_periph *periph = (struct scsipi_periph *)v;
struct scsipi_channel *chan = periph->periph_channel;
mutex_enter(chan_mtx(chan));
ststart((struct scsipi_periph *)v);
mutex_exit(chan_mtx(chan));
}
static void
stdone(struct scsipi_xfer *xs, int error)
{
struct st_softc *st = device_private(xs->xs_periph->periph_dev);
struct buf *bp = xs->bp;
if (bp) {
bp->b_error = error;
bp->b_resid = xs->resid;
/*
* buggy device ? A SDLT320 can report an info
* field of 0x3de8000 on a Media Error/Write Error
* for this CDB: 0x0a 00 00 80 00 00
*/
if (bp->b_resid > bp->b_bcount || bp->b_resid < 0)
bp->b_resid = bp->b_bcount;
mutex_enter(&st->sc_iolock);
if ((bp->b_flags & B_READ) == B_WRITE)
st->flags |= ST_WRITTEN;
else
st->flags &= ~ST_WRITTEN;
iostat_unbusy(st->stats, bp->b_bcount,
((bp->b_flags & B_READ) == B_READ));
if ((st->flags & ST_POSUPDATED) == 0) {
if (error) {
st->fileno = st->blkno = -1;
} else if (st->blkno != -1) {
if (st->flags & ST_FIXEDBLOCKS)
st->blkno +=
(bp->b_bcount / st->blksize);
else
st->blkno++;
}
}
mutex_exit(&st->sc_iolock);
rnd_add_uint32(&st->rnd_source, bp->b_blkno);
biodone(bp);
}
}
static int
stread(dev_t dev, struct uio *uio, int iomode)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
int r = physio(ststrategy, NULL, dev, B_READ,
st->sc_periph->periph_channel->chan_adapter->adapt_minphys, uio);
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[stread: result=%d]\n", r));
return r;
}
static int
stwrite(dev_t dev, struct uio *uio, int iomode)
{
struct st_softc *st = device_lookup_private(&st_cd, STUNIT(dev));
int r = physio(ststrategy, NULL, dev, B_WRITE,
st->sc_periph->periph_channel->chan_adapter->adapt_minphys, uio);
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[stwrite: result=%d]\n", r));
return r;
}
/*
* Perform special action on behalf of the user;
* knows about the internals of this device
*/
static int
stioctl(dev_t dev, u_long cmd, void *arg, int flag, struct lwp *l)
{
int error = 0;
int unit;
int number, nmarks, dsty;
int flags;
struct st_softc *st;
int hold_blksize;
uint8_t hold_density;
struct mtop *mt = (struct mtop *) arg;
/* Find the device that the user is talking about */
flags = 0; /* give error messages, act on errors etc. */
unit = STUNIT(dev);
dsty = STDSTY(dev);
st = device_lookup_private(&st_cd, unit);
hold_blksize = st->blksize;
hold_density = st->density;
switch ((u_int)cmd) {
case MTIOCGET: {
struct mtget *g = (struct mtget *) arg;
/*
* (to get the current state of READONLY)
*/
error = st->ops(st, ST_OPS_MODESENSE, XS_CTL_SILENT);
if (error) {
/*
* Ignore the error if in control mode;
* this is mandated by st(4).
*/
if (STMODE(dev) != CTRL_MODE)
break;
error = 0;
}
SC_DEBUG(st->sc_periph, SCSIPI_DB1, ("[ioctl: get status]\n"));
memset(g, 0, sizeof(struct mtget));
g->mt_type = MT_ISAR; /* Ultrix compat *//*? */
g->mt_blksiz = st->blksize;
g->mt_density = st->density;
g->mt_mblksiz[0] = st->modes[0].blksize;
g->mt_mblksiz[1] = st->modes[1].blksize;
g->mt_mblksiz[2] = st->modes[2].blksize;
g->mt_mblksiz[3] = st->modes[3].blksize;
g->mt_mdensity[0] = st->modes[0].density;
g->mt_mdensity[1] = st->modes[1].density;
g->mt_mdensity[2] = st->modes[2].density;
g->mt_mdensity[3] = st->modes[3].density;
g->mt_fileno = st->fileno;
g->mt_blkno = st->blkno;
if (st->flags & ST_READONLY)
g->mt_dsreg |= MT_DS_RDONLY;
if (st->flags & ST_MOUNTED)
g->mt_dsreg |= MT_DS_MOUNTED;
g->mt_resid = st->mt_resid;
g->mt_erreg = st->mt_erreg;
/*
* clear latched errors.
*/
st->mt_resid = 0;
st->mt_erreg = 0;
st->asc = 0;
st->ascq = 0;
break;
}
case MTIOCTOP: {
SC_DEBUG(st->sc_periph, SCSIPI_DB1,
("[ioctl: op=0x%x count=0x%x]\n", mt->mt_op,
mt->mt_count));
/* compat: in U*x it is a short */
number = mt->mt_count;
switch ((short) (mt->mt_op)) {
case MTWEOF: /* write an end-of-file record */
error = st_write_filemarks(st, number, flags);
break;
case MTBSF: /* backward space file */
number = -number;
/* FALLTHROUGH */
case MTFSF: /* forward space file */
error = st_check_eod(st, FALSE, &nmarks, flags);
if (!error)
error = st_space(st, number - nmarks,
SP_FILEMARKS, flags);
break;
case MTBSR: /* backward space record */
number = -number;
/* FALLTHROUGH */
case MTFSR: /* forward space record */
error = st_check_eod(st, true, &nmarks, flags);
if (!error)
error = st_space(st, number, SP_BLKS, flags);
break;
case MTREW: /* rewind */
error = st_rewind(st, 0, flags);
break;
case MTOFFL: /* rewind and put the drive offline */
st_unmount(st, EJECT);
break;
case MTNOP: /* no operation, sets status only */
break;
case MTRETEN: /* retension the tape */
error = st_load(st, LD_RETENSION, flags);
if (!error)
error = st_load(st, LD_LOAD, flags);
break;
case MTEOM: /* forward space to end of media */
error = st_check_eod(st, FALSE, &nmarks, flags);
if (!error)
error = st_space(st, 1, SP_EOM, flags);
break;
case MTCACHE: /* enable controller cache */
st->flags &= ~ST_DONTBUFFER;
goto try_new_value;
case MTNOCACHE: /* disable controller cache */
st->flags |= ST_DONTBUFFER;
goto try_new_value;
case MTERASE: /* erase volume */
error = st_erase(st, number, flags);
break;
case MTSETBSIZ: /* Set block size for device */
#ifdef NOTYET
if (!(st->flags & ST_NEW_MOUNT)) {
uprintf("re-mount tape before changing "
"blocksize");
error = EINVAL;
break;
}
#endif
if (number == 0)
st->flags &= ~ST_FIXEDBLOCKS;
else {
if ((st->blkmin || st->blkmax) &&
(number < st->blkmin ||
number > st->blkmax)) {
error = EINVAL;
break;
}
st->flags |= ST_FIXEDBLOCKS;
}
st->blksize = number;
st->flags |= ST_BLOCK_SET; /*XXX */
goto try_new_value;
case MTSETDNSTY: /* Set density for device and mode */
/*
* Any number >= 0 and <= 0xff is legal. Numbers
* above 0x80 are 'vendor unique'.
*/
if (number < 0 || number > 255) {
error = EINVAL;
break;
} else
st->density = number;
goto try_new_value;
case MTCMPRESS:
error = st->ops(st, (number == 0) ?
ST_OPS_CMPRSS_OFF : ST_OPS_CMPRSS_ON,
XS_CTL_SILENT);
break;
case MTEWARN:
if (number)
st->flags |= ST_EARLYWARN;
else
st->flags &= ~ST_EARLYWARN;
break;
default:
error = EINVAL;
}
break;
}
case MTIOCIEOT:
case MTIOCEEOT:
break;
case MTIOCRDSPOS:
error = st_rdpos(st, 0, (uint32_t *)arg);
break;
case MTIOCRDHPOS:
error = st_rdpos(st, 1, (uint32_t *)arg);
break;
case MTIOCSLOCATE:
error = st_setpos(st, 0, (uint32_t *)arg);
break;
case MTIOCHLOCATE:
error = st_setpos(st, 1, (uint32_t *)arg);
break;
default:
error = scsipi_do_ioctl(st->sc_periph, dev, cmd, arg, flag, l);
break;
}
return error;
try_new_value:
/*
* Check that the mode being asked for is agreeable to the
* drive. If not, put it back the way it was.
*
* If in control mode, we can make (persistent) mode changes
* even if no medium is loaded (see st(4)).
*/
if ((STMODE(dev) != CTRL_MODE || (st->flags & ST_MOUNTED) != 0) &&
(error = st->ops(st, ST_OPS_MODESELECT, 0)) != 0) {
/* put it back as it was */
aprint_error_dev(st->sc_dev, "cannot set selected mode\n");
st->density = hold_density;
st->blksize = hold_blksize;
if (st->blksize)
st->flags |= ST_FIXEDBLOCKS;
else
st->flags &= ~ST_FIXEDBLOCKS;
return error;
}
/*
* As the drive liked it, if we are setting a new default,
* set it into the structures as such.
*
* The means for deciding this are not finalised yet, but
* if the device was opened in Control Mode, the values
* are persistent now across mounts.
*/
if (STMODE(dev) == CTRL_MODE) {
switch ((short) (mt->mt_op)) {
case MTSETBSIZ:
st->modes[dsty].blksize = st->blksize;
st->modeflags[dsty] |= BLKSIZE_SET_BY_USER;
break;
case MTSETDNSTY:
st->modes[dsty].density = st->density;
st->modeflags[dsty] |= DENSITY_SET_BY_USER;
break;
}
}
return 0;
}
/* Do a synchronous read. */
static int
st_read(struct st_softc *st, char *bf, int size, int flags)
{
struct scsi_rw_tape cmd;
/* If it's a null transfer, return immediately */
if (size == 0)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = READ;
if (st->flags & ST_FIXEDBLOCKS) {
cmd.byte2 |= SRW_FIXED;
_lto3b(size / (st->blksize ? st->blksize : DEF_FIXED_BSIZE),
cmd.len);
} else
_lto3b(size, cmd.len);
return scsipi_command(st->sc_periph,
(void *)&cmd, sizeof(cmd), (void *)bf, size, 0, ST_IO_TIME, NULL,
flags | XS_CTL_DATA_IN);
}
/* issue an erase command */
static int
st_erase(struct st_softc *st, int full, int flags)
{
int tmo;
struct scsi_erase cmd;
/*
* Full erase means set LONG bit in erase command, which asks
* the drive to erase the entire unit. Without this bit, we're
* asking the drive to write an erase gap.
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = ERASE;
if (full) {
cmd.byte2 = SE_LONG;
tmo = ST_SPC_TIME;
} else
tmo = ST_IO_TIME;
/*
* XXX We always do this asynchronously, for now, unless the device
* has the ST_Q_ERASE_NOIMM quirk. How long should we wait if we
* want to (eventually) do it synchronously?
*/
if ((st->quirks & ST_Q_ERASE_NOIMM) == 0)
cmd.byte2 |= SE_IMMED;
return scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, tmo, NULL, flags);
}
/* skip N blocks/filemarks/seq filemarks/eom */
static int
st_space(struct st_softc *st, int number, u_int what, int flags)
{
struct scsi_space cmd;
int error;
switch (what) {
case SP_BLKS:
if (st->flags & ST_PER_ACTION) {
if (number > 0) {
st->flags &= ~ST_PER_ACTION;
return EIO;
} else if (number < 0) {
if (st->flags & ST_AT_FILEMARK) {
/*
* Handling of ST_AT_FILEMARK
* in st_space will fill in the
* right file mark count.
*/
error = st_space(st, 0, SP_FILEMARKS,
flags);
if (error)
return error;
}
if (st->flags & ST_BLANK_READ) {
st->flags &= ~ST_BLANK_READ;
return EIO;
}
st->flags &= ~(ST_EIO_PENDING|ST_EOM_PENDING);
}
}
break;
case SP_FILEMARKS:
if (st->flags & ST_EIO_PENDING) {
if (number > 0) {
/* pretend we just discovered the error */
st->flags &= ~ST_EIO_PENDING;
return EIO;
} else if (number < 0) {
/* back away from the error */
st->flags &= ~ST_EIO_PENDING;
}
}
if (st->flags & ST_AT_FILEMARK) {
st->flags &= ~ST_AT_FILEMARK;
number--;
}
if ((st->flags & ST_BLANK_READ) && (number < 0)) {
/* back away from unwritten tape */
st->flags &= ~ST_BLANK_READ;
number++; /* XXX dubious */
}
break;
case SP_EOM:
if (st->flags & ST_EOM_PENDING) {
/* we're already there */
st->flags &= ~ST_EOM_PENDING;
return 0;
}
if (st->flags & ST_EIO_PENDING) {
/* pretend we just discovered the error */
st->flags &= ~ST_EIO_PENDING;
return EIO;
}
if (st->flags & ST_AT_FILEMARK)
st->flags &= ~ST_AT_FILEMARK;
break;
}
if (number == 0)
return 0;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = SPACE;
cmd.byte2 = what;
_lto3b(number, cmd.number);
st->flags &= ~ST_POSUPDATED;
st->last_ctl_resid = 0;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
0, ST_SPC_TIME, NULL, flags);
if (error == 0 && (st->flags & ST_POSUPDATED) == 0) {
number = number - st->last_ctl_resid;
if (what == SP_BLKS) {
if (st->blkno != -1)
st->blkno += number;
} else if (what == SP_FILEMARKS) {
if (st->fileno != -1) {
st->fileno += number;
if (number > 0)
st->blkno = 0;
else if (number < 0)
st->blkno = -1;
}
} else if (what == SP_EOM) {
st_updatefilepos(st);
}
}
return error;
}
/*
* write N filemarks
*/
static int
st_write_filemarks(struct st_softc *st, int number, int flags)
{
int error;
struct scsi_write_filemarks cmd;
/*
* It's hard to write a negative number of file marks.
* Don't try.
*/
if (number < 0) {
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("EINVAL: st_write_filemarks not writing %d file marks\n", number));
return EINVAL;
}
switch (number) {
case 0: /* really a command to sync the drive's buffers */
break;
case 1:
if (st->flags & ST_FM_WRITTEN) /* already have one down */
st->flags &= ~ST_WRITTEN;
else
st->flags |= ST_FM_WRITTEN;
st->flags &= ~ST_PER_ACTION;
break;
default:
st->flags &= ~(ST_PER_ACTION | ST_WRITTEN);
}
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = WRITE_FILEMARKS;
if (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(st->sc_periph)) ==
SCSIPI_BUSTYPE_ATAPI)
cmd.byte2 = SR_IMMED;
/*
* The ATAPI Onstream DI-30 doesn't support writing filemarks, but
* WRITE_FILEMARKS is still used to flush the buffer
*/
if ((st->quirks & ST_Q_NOFILEMARKS) == 0)
_lto3b(number, cmd.number);
/* XXX WE NEED TO BE ABLE TO GET A RESIDUAL XXX */
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
0, ST_IO_TIME * 4, NULL, flags);
if (error == 0 && st->fileno != -1)
st->fileno += number;
return error;
}
/*
* Make sure the right number of file marks is on tape if the
* tape has been written. If the position argument is true,
* leave the tape positioned where it was originally.
*
* nmarks returns the number of marks to skip (or, if position
* true, which were skipped) to get back to the original position.
*/
static int
st_check_eod(struct st_softc *st, boolean position, int *nmarks, int flags)
{
int error;
switch (st->flags & (ST_WRITTEN | ST_FM_WRITTEN | ST_2FM_AT_EOD)) {
default:
*nmarks = 0;
return 0;
case ST_WRITTEN:
case ST_WRITTEN | ST_FM_WRITTEN | ST_2FM_AT_EOD:
*nmarks = 1;
break;
case ST_WRITTEN | ST_2FM_AT_EOD:
*nmarks = 2;
}
error = st_write_filemarks(st, *nmarks, flags);
if (position && !error)
error = st_space(st, -*nmarks, SP_FILEMARKS, flags);
return error;
}
/* load/unload/retension */
static int
st_load(struct st_softc *st, u_int type, int flags)
{
int error;
struct scsi_load cmd;
if (type != LD_LOAD) {
int nmarks;
error = st_check_eod(st, FALSE, &nmarks, flags);
if (error) {
aprint_error_dev(st->sc_dev,
"failed to write closing filemarks at "
"unload, errno=%d\n", error);
return error;
}
}
if (st->quirks & ST_Q_IGNORE_LOADS) {
if (type == LD_LOAD)
/*
* If we ignore loads, at least we should try a rewind.
*/
return st_rewind(st, 0, flags);
/* otherwise, we should do what's asked of us */
}
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = LOAD;
if (SCSIPI_BUSTYPE_TYPE(scsipi_periph_bustype(st->sc_periph)) ==
SCSIPI_BUSTYPE_ATAPI)
cmd.byte2 = SR_IMMED;
cmd.how = type;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, ST_SPC_TIME, NULL, flags);
if (error) {
aprint_error_dev(st->sc_dev, "error %d in st_load (op %d)\n",
error, type);
}
return error;
}
/* Rewind the device */
static int
st_rewind(struct st_softc *st, u_int immediate, int flags)
{
struct scsi_rewind cmd;
int error;
int nmarks;
int timeout;
error = st_check_eod(st, FALSE, &nmarks, flags);
if (error) {
aprint_error_dev(st->sc_dev,
"failed to write closing filemarks at "
"rewind, errno=%d\n", error);
return error;
}
st->flags &= ~ST_PER_ACTION;
/* If requestor asked for immediate response, set a short timeout */
timeout = immediate ? ST_CTL_TIME : ST_SPC_TIME;
/* ATAPI tapes always need immediate to be set */
if (scsipi_periph_bustype(st->sc_periph) == SCSIPI_BUSTYPE_ATAPI)
immediate = SR_IMMED;
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = REWIND;
cmd.byte2 = immediate;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, timeout, NULL, flags);
if (error) {
aprint_error_dev(st->sc_dev, "error %d trying to rewind\n",
error);
/* lost position */
st->fileno = st->blkno = -1;
} else
st->fileno = st->blkno = 0;
return error;
}
static void
st_updatefilepos(struct st_softc *st)
{
int error;
uint8_t posdata[32];
struct scsi_tape_read_position cmd;
memset(&cmd, 0, sizeof(cmd));
memset(&posdata, 0, sizeof(posdata));
cmd.opcode = READ_POSITION;
cmd.byte1 = 6; /* service action: LONG FORM */
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd),
(void *)&posdata, sizeof(posdata), ST_RETRIES, ST_CTL_TIME, NULL,
XS_CTL_SILENT | XS_CTL_DATA_IN);
if (error == 0) {
#ifdef SCSIPI_DEBUG
if (st->sc_periph->periph_dbflags & SCSIPI_DB3) {
int hard;
printf("posdata: ");
for (hard = 0; hard < sizeof(posdata); hard++)
printf("%02x ", posdata[hard] & 0xff);
printf("\n");
}
#endif
if (posdata[0] & 0xC) { /* Block|Mark Position Unknown */
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos block/mark position unknown (0x%02x)\n",
posdata[0]));
} else {
st->fileno = _8btol(&posdata[16]);
st->blkno = 0;
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos file position %"PRId64"\n",
st->fileno));
return;
}
} else {
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("st_updatefilepos READ POSITION(LONG_FORM) failed (error=%d)\n",
error));
}
st->fileno = -1;
st->blkno = -1;
}
static int
st_rdpos(struct st_softc *st, int hard, uint32_t *blkptr)
{
int error;
uint8_t posdata[20];
struct scsi_tape_read_position cmd;
/*
* We try to flush any buffered writes here if we were writing
* and we're trying to get the hardware block position. It eats
* up performance substantially, but I'm wary of drive firmware.
*
* I think that *logical* block position is probably okay, but
* hardware block position might have to wait for data
* to hit media to be valid. Caveat Emptor.
*/
if (hard && (st->flags & ST_WRITTEN)) {
/* First flush any pending writes... */
error = st_write_filemarks(st, 0, XS_CTL_SILENT);
/*
* The latter case is for 'write protected' tapes
* which are too stupid to recognize a zero count
* for writing filemarks as a no-op.
*/
if (error != 0 && error != EACCES && error != EROFS)
return error;
}
memset(&cmd, 0, sizeof(cmd));
memset(&posdata, 0, sizeof(posdata));
cmd.opcode = READ_POSITION;
if (hard)
cmd.byte1 = 1;
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd),
(void *)&posdata, sizeof(posdata), ST_RETRIES, ST_CTL_TIME, NULL,
XS_CTL_SILENT | XS_CTL_DATA_IN);
if (error == 0) {
#if 0
printf("posdata:");
for (hard = 0; hard < sizeof(posdata); hard++)
printf("%02x ", posdata[hard] & 0xff);
printf("\n");
#endif
if (posdata[0] & 0x4) { /* Block Position Unknown */
SC_DEBUG(st->sc_periph, SCSIPI_DB3,
("EINVAL: strdpos block position unknown\n"));
error = EINVAL;
}
else
*blkptr = _4btol(&posdata[4]);
}
return error;
}
static int
st_setpos(struct st_softc *st, int hard, uint32_t *blkptr)
{
int error;
struct scsi_tape_locate cmd;
/*
* We used to try and flush any buffered writes here.
* Now we push this onto user applications to either
* flush the pending writes themselves (via a zero count
* WRITE FILEMARKS command) or they can trust their tape
* drive to do this correctly for them.
*
* There are very ugly performance limitations otherwise.
*/
memset(&cmd, 0, sizeof(cmd));
cmd.opcode = LOCATE;
if (hard)
cmd.byte2 = 1 << 2;
_lto4b(*blkptr, cmd.blkaddr);
error = scsipi_command(st->sc_periph, (void *)&cmd, sizeof(cmd), 0, 0,
ST_RETRIES, ST_SPC_TIME, NULL, 0);
/*
* Note that the file and block number positions are now unknown (if
* these things ever start being maintained in this driver).
*/
st->fileno = st->blkno = -1;
return error;
}
/*
* Look at the returned sense and act on the error to determine
* the unix error number to pass back: 0 reports no error,
* ERESTART retries the operation, and EJUSTRETURN lets the
* generic sense-processing code continue.
*/
static int
st_interpret_sense(struct scsipi_xfer *xs)
{
struct scsipi_periph *periph = xs->xs_periph;
struct scsi_sense_data *sense = &xs->sense.scsi_sense;
struct buf *bp = xs->bp;
struct st_softc *st = device_private(periph->periph_dev);
int retval = EJUSTRETURN;
int doprint = ((xs->xs_control & XS_CTL_SILENT) == 0);
uint8_t key;
int32_t info;
/*
* If it isn't an extended or extended/deferred error, let
* the generic code handle it.
*/
if (SSD_RCODE(sense->response_code) != SSD_RCODE_CURRENT &&
SSD_RCODE(sense->response_code) != SSD_RCODE_DEFERRED)
return retval;
if (sense->response_code & SSD_RCODE_VALID)
info = _4btol(sense->info);
else
info = (st->flags & ST_FIXEDBLOCKS) ?
xs->datalen / st->blksize : xs->datalen;
key = SSD_SENSE_KEY(sense->flags);
st->mt_erreg = key;
st->asc = sense->asc;
st->ascq = sense->ascq;
st->mt_resid = (short) info;
if (key == SKEY_NOT_READY && st->asc == 0x4 && st->ascq == 0x1) {
/* Not Ready, Logical Unit Is in Process Of Becoming Ready */
if (!callout_pending(&periph->periph_callout))
scsipi_periph_freeze(periph, 1);
callout_reset(&periph->periph_callout,
hz, scsipi_periph_timed_thaw, periph);
return ERESTART;
}
/* If the device is not open yet, let generic handle */
if ((periph->periph_flags & PERIPH_OPEN) == 0)
return retval;
xs->resid = info;
if (st->flags & ST_FIXEDBLOCKS) {
if (bp) {
xs->resid *= st->blksize;
st->last_io_resid = xs->resid;
} else
st->last_ctl_resid = xs->resid;
if (key == SKEY_VOLUME_OVERFLOW) {
st->flags |= ST_EIO_PENDING;
if (bp)
bp->b_resid = xs->resid;
} else if (sense->flags & SSD_EOM) {
if ((st->flags & ST_EARLYWARN) == 0)
st->flags |= ST_EIO_PENDING;
st->flags |= ST_EOM_PENDING;
if (bp) {
#if 0
bp->b_resid = xs->resid;
#else
/*
* Grotesque as it seems, the few times
* I've actually seen a non-zero resid,
* the tape drive actually lied and had
* written all the data!
*/
bp->b_resid = 0;
#endif
}
}
if (sense->flags & SSD_FILEMARK) {
st->flags |= ST_AT_FILEMARK;
if (bp)
bp->b_resid = xs->resid;
if (st->fileno != (daddr_t) -1) {
st->fileno++;
st->blkno = 0;
st->flags |= ST_POSUPDATED;
}
}
if (sense->flags & SSD_ILI) {
st->flags |= ST_EIO_PENDING;
if (bp)
bp->b_resid = xs->resid;
if (sense->response_code & SSD_RCODE_VALID &&
(xs->xs_control & XS_CTL_SILENT) == 0)
aprint_error_dev(st->sc_dev,
"block wrong size, %d blocks residual\n",
info);
/*
* This quirk code helps the drive read
* the first tape block, regardless of
* format. That is required for these
* drives to return proper MODE SENSE
* information.
*/
if ((st->quirks & ST_Q_SENSE_HELP) &&
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0)
st->blksize -= 512;
else if ((st->flags & ST_POSUPDATED) == 0) {
if (st->blkno != (daddr_t) -1) {
st->blkno +=
(xs->datalen / st->blksize);
st->flags |= ST_POSUPDATED;
}
}
}
/*
* If data wanted and no data was transferred, do it immediately
*/
if (xs->datalen && xs->resid >= xs->datalen) {
if (st->flags & ST_EIO_PENDING)
return EIO;
if (st->flags & ST_AT_FILEMARK) {
if (bp)
bp->b_resid = xs->resid;
return 0;
}
}
} else { /* must be variable mode */
if (bp)
st->last_io_resid = xs->resid;
else
st->last_ctl_resid = xs->resid;
if (sense->flags & SSD_EOM) {
/*
* The current semantics of this
* driver requires EOM detection
* to return EIO unless early
* warning detection is enabled
* for variable mode (this is always
* on for fixed block mode).
*/
if (st->flags & ST_EARLYWARN) {
st->flags |= ST_EOM_PENDING;
retval = 0;
} else {
retval = EIO;
/*
* If we return an error we can't claim to
* have transferred all data.
*/
if (xs->resid == 0)
xs->resid = xs->datalen;
}
/*
* If it's an unadorned EOM detection,
* suppress printing an error.
*/
if (key == SKEY_NO_SENSE) {
doprint = 0;
}
} else if (sense->flags & SSD_FILEMARK) {
retval = 0;
if (st->fileno != (daddr_t) -1) {
st->fileno++;
st->blkno = 0;
st->flags |= ST_POSUPDATED;
}
} else if (sense->flags & SSD_ILI) {
if (info < 0) {
/*
* The tape record was bigger than the read
* we issued.
*/
if ((xs->xs_control & XS_CTL_SILENT) == 0) {
aprint_error_dev(st->sc_dev,
"%d-byte tape record too big"
" for %d-byte user buffer\n",
xs->datalen - info, xs->datalen);
}
retval = EIO;
} else {
retval = 0;
if (st->blkno != (daddr_t) -1) {
st->blkno++;
st->flags |= ST_POSUPDATED;
}
}
}
if (bp)
bp->b_resid = xs->resid;
}
#ifndef SCSIPI_DEBUG
if (retval == 0 && key == SKEY_NO_SENSE)
doprint = 0;
#endif
if (key == SKEY_BLANK_CHECK) {
/*
* This quirk code helps the drive read the
* first tape block, regardless of format. That
* is required for these drives to return proper
* MODE SENSE information.
*/
if ((st->quirks & ST_Q_SENSE_HELP) &&
(periph->periph_flags & PERIPH_MEDIA_LOADED) == 0) {
/* still starting */
st->blksize -= 512;
} else if (!(st->flags & (ST_2FM_AT_EOD | ST_BLANK_READ))) {
st->flags |= ST_BLANK_READ;
xs->resid = xs->datalen;
if (bp) {
bp->b_resid = xs->resid;
/* return an EOF */
}
retval = 0;
/* lost position */
st->fileno = st->blkno = -1;
}
}
/*
* If generic sense processing will continue, we should not
* print sense info here.
*/
if (retval == EJUSTRETURN)
doprint = 0;
if (doprint) {
/* Print verbose sense info if possible */
if (scsipi_print_sense(xs, 0) != 0)
return retval;
/* Print less-verbose sense info */
scsipi_printaddr(periph);
printf("Sense Key 0x%02x", key);
if ((sense->response_code & SSD_RCODE_VALID) != 0) {
switch (key) {
case SKEY_NOT_READY:
case SKEY_ILLEGAL_REQUEST:
case SKEY_UNIT_ATTENTION:
case SKEY_DATA_PROTECT:
break;
case SKEY_VOLUME_OVERFLOW:
case SKEY_BLANK_CHECK:
printf(", requested size: %d (decimal)", info);
break;
case SKEY_ABORTED_COMMAND:
if (xs->xs_retries)
printf(", retrying");
printf(", cmd 0x%x, info 0x%x",
xs->cmd->opcode, info);
break;
default:
printf(", info = %d (decimal)", info);
}
}
if (sense->extra_len != 0) {
int n;
printf(", data =");
for (n = 0; n < sense->extra_len; n++)
printf(" %02x", sense->csi[n]);
}
printf("\n");
}
return retval;
}
/*
* The quirk here is that the drive returns some value to st_mode_sense
* incorrectly until the tape has actually passed by the head.
*
* The method is to set the drive to large fixed-block state (user-specified
* density and 1024-byte blocks), then read and rewind to get it to sense the
* tape. If that doesn't work, try 512-byte fixed blocks. If that doesn't
* work, as a last resort, try variable-length blocks. The result will be
* the ability to do an accurate st_mode_sense.
*
* We know we can do a rewind because we just did a load, which implies rewind.
* Rewind seems preferable to space backward if we have a virgin tape.
*
* The rest of the code for this quirk is in ILI processing and BLANK CHECK
* error processing, both part of st_interpret_sense.
*/
static int
st_touch_tape(struct st_softc *st)
{
char *bf;
int readsize;
int error;
bf = malloc(1024, M_TEMP, M_WAITOK);
if ((error = st->ops(st, ST_OPS_MODESENSE, 0)) != 0)
goto bad;
/*
* If the block size is already known from the
* sense data, use it. Else start probing at 1024.
*/
if (st->media_blksize > 0)
st->blksize = st->media_blksize;
else
st->blksize = 1024;
do {
switch (st->blksize) {
case 512:
case 1024:
readsize = st->blksize;
st->flags |= ST_FIXEDBLOCKS;
break;
default:
readsize = 1;
st->flags &= ~ST_FIXEDBLOCKS;
}
if ((error = st->ops(st, ST_OPS_MODESELECT, XS_CTL_SILENT))
!= 0) {
/*
* The device did not agree with the proposed
* block size. If we exhausted our options,
* return failure, else try another.
*/
if (readsize == 1)
goto bad;
st->blksize -= 512;
continue;
}
st_read(st, bf, readsize, XS_CTL_SILENT); /* XXX */
if ((error = st_rewind(st, 0, 0)) != 0) {
bad:
free(bf, M_TEMP);
return error;
}
} while (readsize != 1 && readsize > st->blksize);
free(bf, M_TEMP);
return 0;
}
static int
stdump(dev_t dev, daddr_t blkno, void *va, size_t size)
{
/* Not implemented. */
return ENXIO;
}
/*
* Send a filled-out parameter structure to the drive to
* set it into the desired modes etc.
*/
int
st_mode_select(struct st_softc *st, int flags)
{
u_int select_len;
struct select {
struct scsi_mode_parameter_header_6 header;
struct scsi_general_block_descriptor blk_desc;
u_char sense_data[MAX_PAGE_0_SIZE];
} select;
struct scsipi_periph *periph = st->sc_periph;
select_len = sizeof(select.header) + sizeof(select.blk_desc) +
st->page_0_size;
/*
* This quirk deals with drives that have only one valid mode
* and think this gives them license to reject all mode selects,
* even if the selected mode is the one that is supported.
*/
if (st->quirks & ST_Q_UNIMODAL) {
SC_DEBUG(periph, SCSIPI_DB3,
("not setting density 0x%x blksize 0x%x\n",
st->density, st->blksize));
return 0;
}
/* Set up for a mode select */
memset(&select, 0, sizeof(select));
select.header.blk_desc_len = sizeof(struct
scsi_general_block_descriptor);
select.header.dev_spec &= ~SMH_DSP_BUFF_MODE;
select.blk_desc.density = st->density;
if (st->flags & ST_DONTBUFFER)
select.header.dev_spec |= SMH_DSP_BUFF_MODE_OFF;
else
select.header.dev_spec |= SMH_DSP_BUFF_MODE_ON;
if (st->flags & ST_FIXEDBLOCKS)
_lto3b(st->blksize, select.blk_desc.blklen);
if (st->page_0_size)
memcpy(select.sense_data, st->sense_data, st->page_0_size);
/* do the command */
return scsipi_mode_select(periph, 0, &select.header, select_len,
flags, ST_RETRIES, ST_CTL_TIME);
}
/* $NetBSD: kern_module_vfs.c,v 1.18 2021/06/29 22:40:53 dholland Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel module file system interaction.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_module_vfs.c,v 1.18 2021/06/29 22:40:53 dholland Exp $");
#define _MODULE_INTERNAL
#include <sys/param.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/kobj.h>
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <prop/proplib.h>
static int module_load_plist_vfs(const char *, const bool,
prop_dictionary_t *);
void
module_load_vfs_init(void)
{
module_load_vfs_vec = module_load_vfs;
aprint_normal("kern.module.path=%s\n", module_base);
}
int
module_load_vfs(const char *name, int flags, bool autoload,
module_t *mod, prop_dictionary_t *filedictp)
{
char *path;
bool nochroot;
int error;
prop_bool_t noload;
prop_dictionary_t moduledict;
nochroot = false;
error = 0;
path = NULL;
moduledict = NULL;
if (filedictp)
*filedictp = NULL;
path = PNBUF_GET();
if (!autoload) {
if (strchr(name, '/') != NULL) {
nochroot = false;
snprintf(path, MAXPATHLEN, "%s", name);
module_print("Loading module from %s", path);
error = kobj_load_vfs(&mod->mod_kobj, path, nochroot);
} else
error = ENOENT;
}
if (autoload || (error == ENOENT)) {
if (strchr(name, '/') == NULL) {
nochroot = true;
snprintf(path, MAXPATHLEN, "%s/%s/%s.kmod",
module_base, name, name);
module_print("Loading module from %s", path);
error = kobj_load_vfs(&mod->mod_kobj, path, nochroot);
} else
error = ENOENT;
}
if (error != 0) {
PNBUF_PUT(path);
module_print("Cannot %sload kernel object `%s'"
" error=%d", autoload ? "auto" : "", name, error);
return error;
}
/*
* Load and process <module>.plist if it exists.
*/
if ((!ISSET(flags, MODCTL_NO_PROP) && filedictp) || autoload) {
error = module_load_plist_vfs(path, nochroot, &moduledict);
if (error != 0) {
module_print("plist load returned error %d for `%s'",
error, path);
if (error != ENOENT)
goto fail;
} else if (autoload) {
noload = prop_dictionary_get(moduledict, "noautoload");
if (noload != NULL && prop_bool_true(noload)) {
module_error("autoloading is disallowed for %s",
path);
prop_object_release(moduledict);
error = EPERM;
goto fail;
}
}
if (error == 0) { /* can get here if error == ENOENT */
if (!ISSET(flags, MODCTL_NO_PROP) && filedictp)
*filedictp = moduledict;
else
prop_object_release(moduledict);
}
}
PNBUF_PUT(path);
return 0;
fail:
kobj_unload(mod->mod_kobj);
PNBUF_PUT(path);
return error;
}
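/*
* Example (illustrative only): with the default module_base, typically
* something like "/stand/<arch>/<osrelease>/modules", a request to load
* "ffs" is resolved above to
*
*	<module_base>/ffs/ffs.kmod	(nochroot = true)
*
* whereas a name containing a '/', e.g. "/tmp/example.kmod", is used
* verbatim (nochroot = false) and is only accepted when not autoloading.
*/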
/*
* module_load_plist_vfs:
*
* Load a plist located in the file system into memory.
*/
static int
module_load_plist_vfs(const char *modpath, const bool nochroot,
prop_dictionary_t *filedictp)
{
struct pathbuf *pb;
struct vnode *vp;
struct stat sb;
void *base;
char *proppath;
const size_t plistsize = 8192;
size_t resid;
int error, pathlen;
KASSERT(filedictp != NULL);
base = NULL;
proppath = PNBUF_GET();
strlcpy(proppath, modpath, MAXPATHLEN);
pathlen = strlen(proppath);
if ((pathlen >= 6) && (strcmp(&proppath[pathlen - 5], ".kmod") == 0)) {
strcpy(&proppath[pathlen - 5], ".plist");
} else if (pathlen < MAXPATHLEN - 6) {
strcat(proppath, ".plist");
} else {
error = ENOENT;
goto out1;
}
/* XXX this makes an unnecessary extra copy of the path */
pb = pathbuf_create(proppath);
if (pb == NULL) {
error = ENOMEM;
goto out1;
}
module_print("Loading plist from %s", proppath);
error = vn_open(NULL, pb, (nochroot ? NOCHROOT : 0), FREAD, 0,
&vp, NULL, NULL);
if (error != 0) {
goto out2;
}
error = vn_stat(vp, &sb);
if (error != 0) {
goto out3;
}
if (sb.st_size >= (plistsize - 1)) { /* leave space for term \0 */
error = EFBIG;
goto out3;
}
base = kmem_alloc(plistsize, KM_SLEEP);
error = vn_rdwr(UIO_READ, vp, base, sb.st_size, 0,
UIO_SYSSPACE, IO_NODELOCKED, curlwp->l_cred, &resid, curlwp);
*((uint8_t *)base + sb.st_size) = '\0';
if (error == 0 && resid != 0) {
error = EFBIG;
}
if (error != 0) {
kmem_free(base, plistsize);
base = NULL;
goto out3;
}
*filedictp = prop_dictionary_internalize(base);
if (*filedictp == NULL) {
error = EINVAL;
}
kmem_free(base, plistsize);
base = NULL;
KASSERT(error == 0);
out3:
VOP_UNLOCK(vp);
vn_close(vp, FREAD, kauth_cred_get());
out2:
pathbuf_destroy(pb);
out1:
PNBUF_PUT(proppath);
return error;
}
/* $NetBSD: vfs_init.c,v 1.64 2023/09/23 18:21:11 ad Exp $ */
/*-
* Copyright (c) 1998, 2000, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed
* to Berkeley by John Heidemann of the UCLA Ficus project.
*
* Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_init.c 8.5 (Berkeley) 5/11/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_init.c,v 1.64 2023/09/23 18:21:11 ad Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/dirhash.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/ucred.h>
#include <sys/vnode.h>
#include <sys/vnode_impl.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/fifofs/fifo.h>
#include <miscfs/specfs/specdev.h>
/*
* Sigh, such primitive tools are these...
*/
#if 0
#define DODEBUG(A) A
#else
#define DODEBUG(A)
#endif
SDT_PROVIDER_DEFINE(vfs);
/*
* These vnodeopv_descs are listed here because they are not
* associated with any particular file system, and thus cannot
* be initialized by vfs_attach().
*/
const struct vnodeopv_desc * const vfs_special_vnodeopv_descs[] = {
&dead_vnodeop_opv_desc,
&fifo_vnodeop_opv_desc,
&spec_vnodeop_opv_desc,
NULL,
};
struct vfs_list_head vfs_list = /* vfs list */
LIST_HEAD_INITIALIZER(vfs_list);
static kauth_listener_t mount_listener;
/*
* This code doesn't work if the defn is **vnodop_defns with cc.
* The problem is that the compiler sometimes puts in an
* extra level of indirection for arrays. It's an interesting
* "feature" of C.
*/
typedef int (*PFI)(void *);
/*
* A miscellaneous routine.
* A generic "default" routine that just returns an error.
*/
/*ARGSUSED*/
int
vn_default_error(void *v)
{
return (EOPNOTSUPP);
}
static struct sysctllog *vfs_sysctllog;
/*
* Top level filesystem related information gathering.
*/
static void
sysctl_vfs_setup(void)
{
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "generic",
SYSCTL_DESCR("Non-specific vfs related information"),
NULL, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "fstypes",
SYSCTL_DESCR("List of file systems present"),
sysctl_vfs_generic_fstypes, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "magiclinks",
SYSCTL_DESCR("Whether \"magic\" symlinks are expanded"),
NULL, 0, &vfs_magiclinks, 0,
CTL_VFS, VFS_GENERIC, VFS_MAGICLINKS, CTL_EOL);
sysctl_createv(&vfs_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "timestamp_precision",
SYSCTL_DESCR("File timestamp precision"),
NULL, 0, &vfs_timestamp_precision, 0,
CTL_VFS, VFS_GENERIC, VFS_TIMESTAMP_PRECISION,
CTL_EOL);
}
/*
* vfs_init.c
*
* Allocate and fill in operations vectors.
*
* An undocumented feature of this approach to defining operations is that
* there can be multiple entries in vfs_opv_descs for the same operations
* vector. This allows third parties to extend the set of operations
* supported by another layer in a binary compatible way. For example,
* assume that NFS needed to be modified to support Ficus. NFS has an entry
* (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by
* default. Ficus could add another entry (ficus_nfs_vnodeop_decl_extensions)
* listing those new operations Ficus adds to NFS, all without modifying the
* NFS code. (Of course, the OTW NFS protocol still needs to be munged, but
* that is a(whole)nother story.) This is a feature.
*/
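/*
* Illustrative sketch (not compiled): how a hypothetical "ficus" layer could
* extend an existing NFS vector by supplying a second vnodeopv_desc that
* points at the same operations vector. The ficus names below are made up;
* only the structure usage mirrors the real vfs_opv_descs mechanism.
*/
#if 0
extern int (**nfs_vnodeop_p)(void *);	/* the shared operations vector */
static int ficus_nfs_whiteout(void *);	/* hypothetical implementation */

static const struct vnodeopv_entry_desc ficus_nfs_vnodeop_entries[] = {
{ &vop_whiteout_desc, ficus_nfs_whiteout },
{ NULL, NULL }
};

const struct vnodeopv_desc ficus_nfs_vnodeop_opv_desc = {
&nfs_vnodeop_p, ficus_nfs_vnodeop_entries
};

/*
* Listing this desc in vfs_opv_descs alongside the base NFS desc makes
* vfs_opv_init() fill the same vector twice: first with the NFS entries,
* then with the ficus additions, without touching the NFS sources.
*/
#endif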
/*
* Init the vector, if it needs it.
* Also handle backwards compatibility.
*/
static void
vfs_opv_init_explicit(const struct vnodeopv_desc *vfs_opv_desc)
{
int (**opv_desc_vector)(void *);
const struct vnodeopv_entry_desc *opve_descp;
opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p);
for (opve_descp = vfs_opv_desc->opv_desc_ops;
opve_descp->opve_op;
opve_descp++) {
/*
* Sanity check: is this operation listed
* in the list of operations? We check this
* by seeing if its offset is zero. Since
* the default routine should always be listed
* first, it should be the only one with a zero
* offset. Any other operation with a zero
* offset is probably not listed in
* vfs_op_descs, and so is probably an error.
*
* A panic here means the layer programmer
* has committed the all-too common bug
* of adding a new operation to the layer's
* list of vnode operations but
* not adding the operation to the system-wide
* list of supported operations.
*/
if (opve_descp->opve_op->vdesc_offset == 0 &&
opve_descp->opve_op->vdesc_offset != VOFFSET(vop_default)) {
printf("operation %s not listed in %s.\n",
opve_descp->opve_op->vdesc_name, "vfs_op_descs");
panic ("vfs_opv_init: bad operation");
}
/*
* Fill in this entry.
*/
opv_desc_vector[opve_descp->opve_op->vdesc_offset] =
opve_descp->opve_impl;
}
}
static void
vfs_opv_init_default(const struct vnodeopv_desc *vfs_opv_desc)
{
int j;
int (**opv_desc_vector)(void *);
opv_desc_vector = *(vfs_opv_desc->opv_desc_vector_p);
/*
* Force every operations vector to have a default routine.
*/
if (opv_desc_vector[VOFFSET(vop_default)] == NULL)
panic("vfs_opv_init: operation vector without default routine.");
for (j = 0; j < VNODE_OPS_COUNT; j++)
if (opv_desc_vector[j] == NULL)
opv_desc_vector[j] =
opv_desc_vector[VOFFSET(vop_default)];
}
void
vfs_opv_init(const struct vnodeopv_desc * const *vopvdpp)
{
int (**opv_desc_vector)(void *);
int i;
/*
* Allocate the vectors.
*/
for (i = 0; vopvdpp[i] != NULL; i++) {
opv_desc_vector =
kmem_alloc(VNODE_OPS_COUNT * sizeof(PFI), KM_SLEEP);
memset(opv_desc_vector, 0, VNODE_OPS_COUNT * sizeof(PFI));
*(vopvdpp[i]->opv_desc_vector_p) = opv_desc_vector;
DODEBUG(printf("vector at %p allocated\n",
opv_desc_vector));
}
/*
* ...and fill them in.
*/
for (i = 0; vopvdpp[i] != NULL; i++)
vfs_opv_init_explicit(vopvdpp[i]);
/*
* Finally, go back and replace unfilled routines
* with their default.
*/
for (i = 0; vopvdpp[i] != NULL; i++)
vfs_opv_init_default(vopvdpp[i]);
}
void
vfs_opv_free(const struct vnodeopv_desc * const *vopvdpp)
{
int i;
/*
* Free the vectors allocated in vfs_opv_init().
*/
for (i = 0; vopvdpp[i] != NULL; i++) {
kmem_free(*(vopvdpp[i]->opv_desc_vector_p),
VNODE_OPS_COUNT * sizeof(PFI));
*(vopvdpp[i]->opv_desc_vector_p) = NULL;
}
}
#ifdef DEBUG
static void
vfs_op_check(void)
{
int i;
DODEBUG(printf("Vnode_interface_init.\n"));
/*
* Check offset of each op.
*/
for (i = 0; vfs_op_descs[i]; i++) {
if (vfs_op_descs[i]->vdesc_offset != i)
panic("vfs_op_check: vfs_op_desc[] offset mismatch");
}
if (i != VNODE_OPS_COUNT) {
panic("vfs_op_check: vnode ops count mismatch (%d != %d)",
i, VNODE_OPS_COUNT);
}
DODEBUG(printf ("vfs_opv_numops=%d\n", VNODE_OPS_COUNT));
}
#endif /* DEBUG */
/*
* Common routine to check if an unprivileged mount is allowed.
*
* We export just this part (i.e., without the access control) so that if a
* secmodel wants to implement finer grained user mounts it can do so without
* copying too much code. More elaborate policies (i.e., specific users allowed
* to also create devices and/or introduce set-id binaries, or export
* file-systems) will require a different implementation.
*
* This routine is intended to be called from listener context, and as such
* does not take credentials as an argument.
*/
int
usermount_common_policy(struct mount *mp, u_long flags)
{
/* No exporting if unprivileged. */
if (flags & MNT_EXPORTED)
return EPERM;
/* Must have 'nosuid' and 'nodev'. */
if ((flags & MNT_NODEV) == 0 || (flags & MNT_NOSUID) == 0)
return EPERM;
/* Retain 'noexec'. */
if ((mp->mnt_flag & MNT_NOEXEC) && (flags & MNT_NOEXEC) == 0)
return EPERM;
return 0;
}
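/*
* Illustrative sketch (not compiled): roughly how a secmodel listener might
* reuse usermount_common_policy() when deciding whether to allow an
* unprivileged mount. The argument unpacking is an assumption modelled on
* mount_listener_cb() below; a real secmodel must follow the kauth(9)
* conventions for KAUTH_SYSTEM_MOUNT.
*/
#if 0
static int
example_secmodel_system_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
int result = KAUTH_RESULT_DEFER;
enum kauth_system_req req = (enum kauth_system_req)(uintptr_t)arg0;

if (action == KAUTH_SYSTEM_MOUNT && req == KAUTH_REQ_SYSTEM_MOUNT_NEW) {
vnode_t *vp = arg1;			/* mount point vnode */
u_long flags = (u_long)(uintptr_t)arg2;	/* MNT_* flags */

/* Defer unless the generic unprivileged-mount rules pass. */
if (usermount_common_policy(vp->v_mount, flags) == 0)
result = KAUTH_RESULT_ALLOW;
}
return result;
}
#endif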
static int
mount_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_system_req req;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
if (action != KAUTH_SYSTEM_MOUNT)
return result;
if (req == KAUTH_REQ_SYSTEM_MOUNT_GET)
result = KAUTH_RESULT_ALLOW;
else if (req == KAUTH_REQ_SYSTEM_MOUNT_DEVICE) {
vnode_t *devvp = arg2;
accmode_t accmode = (accmode_t)(unsigned long)arg3;
int error;
error = VOP_ACCESS(devvp, accmode, cred);
if (!error)
result = KAUTH_RESULT_ALLOW;
}
return result;
}
/*
* Initialize the vnode structures and initialize each file system type.
*/
void
vfsinit(void)
{
/*
* Attach sysctl nodes
*/
sysctl_vfs_setup();
/*
* Initialize the vnode table
*/
vntblinit();
/*
* Initialize the vnode name cache
*/
nchinit();
#ifdef DEBUG
/*
* Check the list of vnode operations.
*/
vfs_op_check();
#endif
/*
* Initialize the special vnode operations.
*/
vfs_opv_init(vfs_special_vnodeopv_descs);
/*
* Initialise generic dirhash.
*/
dirhash_init();
/*
* Initialise VFS hooks.
*/
vfs_hooks_init();
mount_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
mount_listener_cb, NULL);
/*
* Establish each file system which was statically
* included in the kernel.
*/
module_init_class(MODULE_CLASS_VFS);
/*
* Initialize EVFILT_FS for kqueue.
*/
vfs_evfilt_fs_init();
}
/*
* Drop a reference to a file system type.
*/
void
vfs_delref(struct vfsops *vfs)
{
mutex_enter(&vfs_list_lock);
vfs->vfs_refcount--;
mutex_exit(&vfs_list_lock);
}
/*
* Establish a file system and initialize it.
*/
int
vfs_attach(struct vfsops *vfs)
{
struct vfsops *v;
int error = 0;
mutex_enter(&vfs_list_lock);
/*
* Make sure this file system doesn't already exist.
*/
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (strcmp(vfs->vfs_name, v->vfs_name) == 0) {
error = EEXIST;
goto out;
}
}
/*
* Initialize the vnode operations for this file system.
*/
vfs_opv_init(vfs->vfs_opv_descs);
/*
* Now initialize the file system itself.
*/
(*vfs->vfs_init)();
/*
* ...and link it into the kernel's list.
*/
LIST_INSERT_HEAD(&vfs_list, vfs, vfs_list);
/*
* Sanity: make sure the reference count is 0.
*/
vfs->vfs_refcount = 0;
out:
mutex_exit(&vfs_list_lock);
return (error);
}
/*
* Remove a file system from the kernel.
*/
int
vfs_detach(struct vfsops *vfs)
{
struct vfsops *v;
int error = 0;
mutex_enter(&vfs_list_lock);
/*
* Make sure no one is using the filesystem.
*/
if (vfs->vfs_refcount != 0) {
error = EBUSY;
goto out;
}
/*
* ...and remove it from the kernel's list.
*/
LIST_FOREACH(v, &vfs_list, vfs_list) {
if (v == vfs) {
LIST_REMOVE(v, vfs_list);
break;
}
}
if (v == NULL) {
error = ESRCH;
goto out;
}
/*
* Now run the file system-specific cleanups.
*/
(*vfs->vfs_done)();
/*
* Free the vnode operations vector.
*/
vfs_opv_free(vfs->vfs_opv_descs);
out:
mutex_exit(&vfs_list_lock);
return (error);
}
void
vfs_reinit(void)
{
struct vfsops *vfs;
mutex_enter(&vfs_list_lock);
LIST_FOREACH(vfs, &vfs_list, vfs_list) {
if (vfs->vfs_reinit) {
vfs->vfs_refcount++;
mutex_exit(&vfs_list_lock);
(*vfs->vfs_reinit)();
mutex_enter(&vfs_list_lock);
vfs->vfs_refcount--;
}
}
mutex_exit(&vfs_list_lock);
}
/* $NetBSD: procfs_subr.c,v 1.117 2024/01/17 10:20:12 hannken Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1994 Christopher G. Demetriou. All rights reserved.
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_subr.c 8.6 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_subr.c,v 1.117 2024/01/17 10:20:12 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/fstrans.h>
#include <sys/vnode.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <miscfs/procfs/procfs.h>
/*
* Allocate a pfsnode/vnode pair. The vnode is referenced.
* The pid, type, and file descriptor uniquely identify a pfsnode.
*/
int
procfs_allocvp(struct mount *mp, struct vnode **vpp, pid_t pid,
pfstype type, int fd)
{
struct pfskey key;
memset(&key, 0, sizeof(key));
key.pk_type = type;
key.pk_pid = pid;
key.pk_fd = fd;
return vcache_get(mp, &key, sizeof(key), vpp);
}
int
procfs_rw(void *v)
{
struct vop_read_args *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *curl;
struct lwp *l;
struct pfsnode *pfs = VTOPFS(vp);
struct proc *p;
int error;
if (uio->uio_offset < 0)
return EINVAL;
if ((error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH)) != 0)
return error;
curl = curlwp;
/*
* Do not allow init to be modified while in secure mode; it
* could be duped into changing the security level.
*/
#define M2K(m) ((m) == UIO_READ ? KAUTH_REQ_PROCESS_PROCFS_READ : \
KAUTH_REQ_PROCESS_PROCFS_WRITE)
mutex_enter(p->p_lock);
error = kauth_authorize_process(curl->l_cred, KAUTH_PROCESS_PROCFS,
p, pfs, KAUTH_ARG(M2K(uio->uio_rw)), NULL);
mutex_exit(p->p_lock);
if (error) {
procfs_proc_unlock(p);
return (error);
}
#undef M2K
mutex_enter(p->p_lock);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (l->l_stat != LSZOMB)
break;
}
/* Process is exiting if it has no LWPs or all LWPs are LSZOMB */
if (l == NULL) {
mutex_exit(p->p_lock);
procfs_proc_unlock(p);
return ESRCH;
}
lwp_addref(l);
mutex_exit(p->p_lock);
switch (pfs->pfs_type) {
case PFSnote:
case PFSnotepg:
error = procfs_donote(curl, p, pfs, uio);
break;
case PFSregs:
error = procfs_doregs(curl, l, pfs, uio);
break;
case PFSfpregs:
error = procfs_dofpregs(curl, l, pfs, uio);
break;
case PFSstatus:
error = procfs_dostatus(curl, l, pfs, uio);
break;
case PFSstat:
error = procfs_do_pid_stat(curl, l, pfs, uio);
break;
case PFSlimit:
error = procfs_dolimit(curl, p, pfs, uio);
break;
case PFSmap:
error = procfs_domap(curl, p, pfs, uio, 0);
break;
case PFSmaps:
error = procfs_domap(curl, p, pfs, uio, 1);
break;
case PFSmem:
error = procfs_domem(curl, l, pfs, uio);
break;
case PFScmdline:
error = procfs_doprocargs(curl, p, pfs, uio, KERN_PROC_ARGV);
break;
case PFSenviron:
error = procfs_doprocargs(curl, p, pfs, uio, KERN_PROC_ENV);
break;
case PFSmeminfo:
error = procfs_domeminfo(curl, p, pfs, uio);
break;
case PFSdevices:
error = procfs_dodevices(curl, p, pfs, uio);
break;
case PFScpuinfo:
error = procfs_docpuinfo(curl, p, pfs, uio);
break;
case PFScpustat:
error = procfs_docpustat(curl, p, pfs, uio);
break;
case PFSloadavg:
error = procfs_doloadavg(curl, p, pfs, uio);
break;
case PFSstatm:
error = procfs_do_pid_statm(curl, l, pfs, uio);
break;
case PFSfd:
error = procfs_dofd(curl, p, pfs, uio);
break;
case PFSuptime:
error = procfs_douptime(curl, p, pfs, uio);
break;
case PFSmounts:
error = procfs_domounts(curl, p, pfs, uio);
break;
case PFSemul:
error = procfs_doemul(curl, p, pfs, uio);
break;
case PFSversion:
error = procfs_doversion(curl, p, pfs, uio);
break;
case PFSauxv:
error = procfs_doauxv(curl, p, pfs, uio);
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
error = procfs_machdep_rw(curl, l, pfs, uio);
break;
#endif
default:
error = EOPNOTSUPP;
break;
}
/*
* Release the references that we acquired earlier.
*/
lwp_delref(l);
procfs_proc_unlock(p);
return (error);
}
/*
* Get a string from userland into (bf). Strip a trailing
* nl character (to allow easy access from the shell).
* The buffer should be *buflenp + 1 chars long. vfs_getuserstr
* will automatically add a nul char at the end.
*
* Returns 0 on success or one of the following errors:
*
* EINVAL: file offset is non-zero.
* EMSGSIZE: message is longer than kernel buffer
* EFAULT: user i/o buffer is not addressable
*/
int
vfs_getuserstr(struct uio *uio, char *bf, int *buflenp)
{
size_t xlen;
int error;
if (uio->uio_offset != 0)
return (EINVAL);
xlen = *buflenp;
/* must be able to read the whole string in one go */
if (xlen < uio->uio_resid)
return (EMSGSIZE);
xlen = uio->uio_resid;
if ((error = uiomove(bf, xlen, uio)) != 0)
return (error);
/* allow multiple writes without seeks */
uio->uio_offset = 0;
/* cleanup string and remove trailing newline */
bf[xlen] = '\0';
xlen = strlen(bf);
if (xlen > 0 && bf[xlen-1] == '\n')
bf[--xlen] = '\0';
*buflenp = xlen;
return (0);
}
const vfs_namemap_t *
vfs_findname(const vfs_namemap_t *nm, const char *bf, int buflen)
{
for (; nm->nm_name; nm++)
if (memcmp(bf, nm->nm_name, buflen+1) == 0)
return (nm);
return (0);
}
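/*
* Illustrative sketch (not compiled): how a write handler for a small
* control file could combine vfs_getuserstr() and vfs_findname(). The
* command table and handler name are made up for the example.
*/
#if 0
static const vfs_namemap_t example_cmds[] = {
{ "attach" },
{ "detach" },
{ NULL }
};

static int
example_docmd(struct uio *uio)
{
char bf[16];
int buflen = sizeof(bf) - 1;	/* leave room for the terminating nul */
const vfs_namemap_t *nm;
int error;

/* Copy the string in, strip a trailing newline, nul-terminate it. */
error = vfs_getuserstr(uio, bf, &buflen);
if (error != 0)
return error;

/* Look the command up by name; buflen is the stripped length. */
nm = vfs_findname(example_cmds, bf, buflen);
if (nm == NULL)
return EOPNOTSUPP;

/* ... act on the matched entry ... */
return 0;
}
#endif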
bool
procfs_use_linux_compat(struct mount *mp)
{
const int flags = VFSTOPROC(mp)->pmnt_flags;
return (flags & PROCFSMNT_LINUXCOMPAT) ? true : false;
}
struct proc *
procfs_proc_find(struct mount *mp, pid_t pid)
{
KASSERT(mutex_owned(&proc_lock));
return procfs_use_linux_compat(mp) ?
proc_find_lwpid(pid) : proc_find(pid);
}
int
procfs_proc_lock(struct mount *mp, int pid, struct proc **bunghole,
int notfound)
{
struct proc *tp;
int error = 0;
mutex_enter(&proc_lock);
if (pid == 0)
tp = &proc0;
else if ((tp = procfs_proc_find(mp, pid)) == NULL)
error = notfound;
if (tp != NULL && !rw_tryenter(&tp->p_reflock, RW_READER))
error = EBUSY;
mutex_exit(&proc_lock);
*bunghole = tp;
return error;
}
void
procfs_proc_unlock(struct proc *p)
{
rw_exit(&p->p_reflock);
}
int
procfs_doemul(struct lwp *curl, struct proc *p,
struct pfsnode *pfs, struct uio *uio)
{
const char *ename = p->p_emul->e_name;
return uiomove_frombuf(__UNCONST(ename), strlen(ename), uio);
}
/* $NetBSD: bt_proto.c,v 1.17 2023/08/07 13:31:54 riastradh Exp $ */
/*-
* Copyright (c) 2005 Iain Hibbert.
* Copyright (c) 2006 Itronix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Itronix Inc. may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bt_proto.c,v 1.17 2023/08/07 13:31:54 riastradh Exp $");
#include <sys/param.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/module.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <net/route.h>
#include <netbt/bluetooth.h>
#include <netbt/hci.h>
#include <netbt/l2cap.h>
#include <netbt/rfcomm.h>
#include <netbt/sco.h>
DOMAIN_DEFINE(btdomain); /* forward declare and add to link set */
static void bt_init(void);
PR_WRAP_CTLOUTPUT(hci_ctloutput)
PR_WRAP_CTLOUTPUT(sco_ctloutput)
PR_WRAP_CTLOUTPUT(l2cap_ctloutput)
PR_WRAP_CTLOUTPUT(rfcomm_ctloutput)
#define hci_ctloutput hci_ctloutput_wrapper
#define sco_ctloutput sco_ctloutput_wrapper
#define l2cap_ctloutput l2cap_ctloutput_wrapper
#define rfcomm_ctloutput rfcomm_ctloutput_wrapper
static const struct protosw btsw[] = {
{ /* raw HCI commands */
.pr_type = SOCK_RAW,
.pr_domain = &btdomain,
.pr_protocol = BTPROTO_HCI,
.pr_flags = (PR_ADDR | PR_ATOMIC),
.pr_init = hci_init,
.pr_ctloutput = hci_ctloutput,
.pr_usrreqs = &hci_usrreqs,
},
{ /* HCI SCO data (audio) */
.pr_type = SOCK_SEQPACKET,
.pr_domain = &btdomain,
.pr_protocol = BTPROTO_SCO,
.pr_flags = (PR_CONNREQUIRED | PR_ATOMIC | PR_LISTEN),
.pr_ctloutput = sco_ctloutput,
.pr_usrreqs = &sco_usrreqs,
},
{ /* L2CAP Connection Oriented */
.pr_type = SOCK_SEQPACKET,
.pr_domain = &btdomain,
.pr_protocol = BTPROTO_L2CAP,
.pr_flags = (PR_CONNREQUIRED | PR_ATOMIC | PR_LISTEN),
.pr_ctloutput = l2cap_ctloutput,
.pr_usrreqs = &l2cap_usrreqs,
.pr_init = l2cap_init,
},
{ /* RFCOMM */
.pr_type = SOCK_STREAM,
.pr_domain = &btdomain,
.pr_protocol = BTPROTO_RFCOMM,
.pr_flags = (PR_CONNREQUIRED | PR_LISTEN | PR_WANTRCVD),
.pr_ctloutput = rfcomm_ctloutput,
.pr_usrreqs = &rfcomm_usrreqs,
.pr_init = rfcomm_init,
},
};
struct domain btdomain = {
.dom_family = AF_BLUETOOTH,
.dom_name = "bluetooth",
.dom_init = bt_init,
.dom_protosw = btsw,
.dom_protoswNPROTOSW = &btsw[__arraycount(btsw)],
};
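/*
 * Illustrative sketch (not part of the original source): the protosw
 * entries above are selected by the (family, type, protocol) triple of
 * a socket(2) call.  This is a userland sketch shown only to map the
 * triples onto the btsw[] entries; it is not kernel code and would need
 * <sys/socket.h> and <bluetooth.h> in a real program.
 */
#if 0
static void
example_bt_sockets(void)
{
	int hci    = socket(AF_BLUETOOTH, SOCK_RAW,       BTPROTO_HCI);
	int sco    = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_SCO);
	int l2cap  = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP);
	int rfcomm = socket(AF_BLUETOOTH, SOCK_STREAM,    BTPROTO_RFCOMM);
}
#endif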
kmutex_t *bt_lock;
static void
bt_init(void)
{
}
MODULE(MODULE_CLASS_DRIVER, netbt, NULL);
static int
netbt_modcmd(modcmd_t cmd, void *aux)
{
switch (cmd) {
case MODULE_CMD_INIT:
bt_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
return 0;
case MODULE_CMD_FINI:
return EBUSY; /* XXX */
default:
return ENOTTY;
}
}
/*-
* Copyright (c) 2013-2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This material is based upon work partially supported by The
* NetBSD Foundation under a contract with Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* NPF configuration loading mechanism.
*
* The main operations on the configuration are the following:
* 1) Read access, primarily from the npf_packet_handler() function.
* 2) Write access on a particular set, mainly rule or table updates.
* 3) Deletion of the configuration after the reload operation.
*
* Synchronization
*
* For the (1) case, EBR is used to allow concurrent access to
* the configuration set (ruleset, etc). It guarantees that the
* configuration will not be destroyed while accessing it.
*
* For the cases (2) and (3), mutual exclusion (npf_t::config_lock)
* is used with, when necessary, the writer-side barrier of EBR.
*/
#ifdef _KERNEL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_conf.c,v 1.18 2022/02/13 19:20:11 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/mutex.h>
#endif
#include "npf_impl.h"
#include "npf_conn.h"
void
npf_config_init(npf_t *npf)
{
npf_config_t *nc;
mutex_init(&npf->config_lock, MUTEX_DEFAULT, IPL_SOFTNET);
nc = npf_config_create();
/*
* Load an empty configuration.
*/
nc->ruleset = npf_ruleset_create(0);
nc->nat_ruleset = npf_ruleset_create(0);
nc->rule_procs = npf_rprocset_create();
nc->tableset = npf_tableset_create(0);
nc->default_pass = true;
npf_config_load(npf, nc, NULL, true);
KASSERT(npf->config != NULL);
}
npf_config_t *
npf_config_create(void)
{
return kmem_zalloc(sizeof(npf_config_t), KM_SLEEP);
}
void
npf_config_destroy(npf_config_t *nc)
{
/*
* Note: the rulesets must be destroyed first, in order to drop
* any references to the tableset.
*/
if (nc->ruleset) {
npf_ruleset_destroy(nc->ruleset);
}
if (nc->nat_ruleset) {
npf_ruleset_destroy(nc->nat_ruleset);
}
if (nc->rule_procs) {
npf_rprocset_destroy(nc->rule_procs);
}
if (nc->tableset) {
npf_tableset_destroy(nc->tableset);
}
kmem_free(nc, sizeof(npf_config_t));
}
void
npf_config_fini(npf_t *npf)
{
npf_conndb_t *cd = npf_conndb_create();
/* Flush the connections. */
mutex_enter(&npf->config_lock);
npf_conn_tracking(npf, false);
npf_ebr_full_sync(npf->ebr);
npf_conn_load(npf, cd, false);
npf_ifmap_flush(npf);
mutex_exit(&npf->config_lock);
npf_config_destroy(npf->config);
mutex_destroy(&npf->config_lock);
}
/*
* npf_config_load: the main routine performing configuration load.
* Performs the necessary synchronization and destroys the old config.
*/
void
npf_config_load(npf_t *npf, npf_config_t *nc, npf_conndb_t *conns, bool flush)
{
const bool load = conns != NULL;
npf_config_t *onc;
nc->default_pass = flush;
/*
* Acquire the lock and perform the first phase:
* - Scan and use existing dynamic tables, reload only static.
* - Scan and use matching NAT policies to preserve the connections.
*/
mutex_enter(&npf->config_lock);
if ((onc = atomic_load_relaxed(&npf->config)) != NULL) {
npf_ruleset_reload(npf, nc->ruleset, onc->ruleset, load);
npf_tableset_reload(npf, nc->tableset, onc->tableset);
npf_ruleset_reload(npf, nc->nat_ruleset, onc->nat_ruleset, load);
}
/*
* Set the new config and release the lock.
*/
atomic_store_release(&npf->config, nc);
if (onc == NULL) {
/* Initial load, done. */
npf_ifmap_flush(npf);
npf_conn_load(npf, conns, !flush);
mutex_exit(&npf->config_lock);
goto done;
}
/*
* If we are going to flush the connections or load the new ones,
* then disable the connection tracking for the grace period.
*/
if (flush || conns) {
npf_conn_tracking(npf, false);
}
/* Synchronise: drain all references. */
npf_ebr_full_sync(npf->ebr);
if (flush) {
npf_portmap_flush(npf->portmap);
npf_ifmap_flush(npf);
}
/*
* G/C the existing connections and, if passed, load the new ones.
* If not flushing - enable the connection tracking.
*/
npf_conn_load(npf, conns, !flush);
mutex_exit(&npf->config_lock);
/* Finally, it is safe to destroy the old config. */
npf_config_destroy(onc);
done:
/* Sync all interface address tables (can be done asynchronously). */
npf_ifaddr_syncall(npf);
}
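/*
 * Illustrative sketch (not part of the original source): a minimal
 * "flush to an empty configuration" reload, mirroring what
 * npf_config_init() does above.  A real reload would populate the
 * rulesets and tableset before calling npf_config_load().
 */
#if 0
static void
example_config_flush(npf_t *npf)
{
	npf_config_t *nc = npf_config_create();

	nc->ruleset = npf_ruleset_create(0);
	nc->nat_ruleset = npf_ruleset_create(0);
	nc->rule_procs = npf_rprocset_create();
	nc->tableset = npf_tableset_create(0);

	/* flush == true: existing connections are G/C'd, default-pass set */
	npf_config_load(npf, nc, NULL, true);
}
#endif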
/*
* Writer-side exclusive locking.
*/
npf_config_t *
npf_config_enter(npf_t *npf)
{
mutex_enter(&npf->config_lock);
return npf->config;
}
void
npf_config_exit(npf_t *npf)
{
mutex_exit(&npf->config_lock);
}
bool
npf_config_locked_p(npf_t *npf)
{
return mutex_owned(&npf->config_lock);
}
void
npf_config_sync(npf_t *npf)
{
KASSERT(npf_config_locked_p(npf));
npf_ebr_full_sync(npf->ebr);
}
/*
* Reader-side synchronization routines.
*/
int
npf_config_read_enter(npf_t *npf)
{
/* Note: issues an acquire fence. */
return npf_ebr_enter(npf->ebr);
}
void
npf_config_read_exit(npf_t *npf, int s)
{
/* Note: issues a release fence. */
npf_ebr_exit(npf->ebr, s);
}
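/*
 * Illustrative sketch (not part of the original source): the reader-side
 * pattern used from the packet path.  Everything referenced here is
 * defined in this file; the "inspect" step is a placeholder.
 */
#if 0
static void
example_config_reader(npf_t *npf)
{
	npf_ruleset_t *rlset;
	int s;

	s = npf_config_read_enter(npf);		/* EBR critical section */
	rlset = npf_config_ruleset(npf);
	/* ... inspect rlset; the config cannot be destroyed while here ... */
	npf_config_read_exit(npf, s);
}
#endif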
/*
* Accessors.
*/
npf_ruleset_t *
npf_config_ruleset(npf_t *npf)
{
npf_config_t *config = atomic_load_consume(&npf->config);
KASSERT(npf_config_locked_p(npf) || npf_ebr_incrit_p(npf->ebr));
return config->ruleset;
}
npf_ruleset_t *
npf_config_natset(npf_t *npf)
{
npf_config_t *config = atomic_load_consume(&npf->config);
KASSERT(npf_config_locked_p(npf) || npf_ebr_incrit_p(npf->ebr));
return config->nat_ruleset;
}
npf_tableset_t *
npf_config_tableset(npf_t *npf)
{
npf_config_t *config = atomic_load_consume(&npf->config);
KASSERT(npf_config_locked_p(npf) || npf_ebr_incrit_p(npf->ebr));
return config->tableset;
}
bool
npf_default_pass(npf_t *npf)
{
npf_config_t *config = atomic_load_consume(&npf->config);
KASSERT(npf_config_locked_p(npf) || npf_ebr_incrit_p(npf->ebr));
return config->default_pass;
}
/* $NetBSD: in6_ifattach.c,v 1.122 2024/04/11 07:34:37 knakahara Exp $ */
/* $KAME: in6_ifattach.c,v 1.124 2001/07/18 08:32:51 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_ifattach.c,v 1.122 2024/04/11 07:34:37 knakahara Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/md5.h>
#include <sys/socketvar.h>
#include <sys/cprng.h>
#include <net/if.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/scope6_var.h>
int ip6_auto_linklocal = 1; /* enable by default */
#if 0
static int get_hostid_ifid(struct ifnet *, struct in6_addr *);
#endif
static int get_ifid(struct ifnet *, struct ifnet *, struct in6_addr *);
static int in6_ifattach_linklocal(struct ifnet *, struct ifnet *);
static int in6_ifattach_loopback(struct ifnet *);
#define EUI64_GBIT 0x01
#define EUI64_UBIT 0x02
#define EUI64_TO_IFID(in6) do {(in6)->s6_addr[8] ^= EUI64_UBIT; } while (/*CONSTCOND*/ 0)
#define EUI64_GROUP(in6) ((in6)->s6_addr[8] & EUI64_GBIT)
#define EUI64_INDIVIDUAL(in6) (!EUI64_GROUP(in6))
#define EUI64_LOCAL(in6) ((in6)->s6_addr[8] & EUI64_UBIT)
#define EUI64_UNIVERSAL(in6) (!EUI64_LOCAL(in6))
#define IFID_LOCAL(in6) (!EUI64_LOCAL(in6))
#define IFID_UNIVERSAL(in6) (!EUI64_UNIVERSAL(in6))
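/*
 * Illustrative sketch (not part of the original source): how a 48-bit
 * MAC address becomes a modified EUI-64 interface identifier, as done
 * for the IFT_ETHER case in in6_get_hw_ifid() below.  The sample MAC
 * address is arbitrary.
 */
#if 0
static void
example_mac_to_ifid(struct in6_addr *in6)
{
	static const uint8_t mac[6] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	/* insert 0xff,0xfe in the middle: 00:11:22:ff:fe:33:44:55 */
	in6->s6_addr[8]  = mac[0];
	in6->s6_addr[9]  = mac[1];
	in6->s6_addr[10] = mac[2];
	in6->s6_addr[11] = 0xff;
	in6->s6_addr[12] = 0xfe;
	in6->s6_addr[13] = mac[3];
	in6->s6_addr[14] = mac[4];
	in6->s6_addr[15] = mac[5];

	/* flip the u/l bit: the identifier becomes 02:11:22:ff:fe:33:44:55 */
	EUI64_TO_IFID(in6);
}
#endif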
#if 0
/*
* Generate a last-resort interface identifier from hostid.
* works only for certain architectures (like sparc).
* also, using hostid itself may constitute a privacy threat, much worse
* than MAC addresses (hostids are used for software licensing).
* maybe we should use MD5(hostid) instead.
*
* in6 - upper 64bits are preserved
*/
static int
get_hostid_ifid(struct ifnet *ifp, struct in6_addr *in6)
{
int off, len;
static const uint8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static const uint8_t allone[8] =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
if (!hostid)
return -1;
/* get up to 8 bytes from the hostid field */
len = (sizeof(hostid) > 8) ? 8 : sizeof(hostid);
off = sizeof(*in6) - len;
memcpy(&in6->s6_addr[off], &hostid, len);
/* make sure we do not return anything bogus */
	if (memcmp(&in6->s6_addr[8], allzero, sizeof(allzero)) == 0)
		return -1;
	if (memcmp(&in6->s6_addr[8], allone, sizeof(allone)) == 0)
		return -1;
/* make sure to set "u" bit to local, and "g" bit to individual. */
in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */
in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */
/* convert EUI64 into IPv6 interface identifier */
EUI64_TO_IFID(in6);
return 0;
}
#endif
/*
* Generate a last-resort interface identifier, when the machine has no
* IEEE802/EUI64 address sources.
* The goal here is to get an interface identifier that is
* (1) random enough and (2) does not change across reboot.
* We currently use MD5(hostname) for it.
*/
static int
get_rand_ifid(struct in6_addr *in6) /* upper 64bits are preserved */
{
MD5_CTX ctxt;
u_int8_t digest[16];
#if 0
/* we need at least several letters as seed for ifid */
if (hostnamelen < 3)
return -1;
#endif
/* generate 8 bytes of pseudo-random value. */
memset(&ctxt, 0, sizeof(ctxt));
MD5Init(&ctxt);
MD5Update(&ctxt, (u_char *)hostname, hostnamelen);
MD5Final(digest, &ctxt);
/* assumes sizeof(digest) > sizeof(ifid) */
memcpy(&in6->s6_addr[8], digest, 8);
/* make sure to set "u" bit to local, and "g" bit to individual. */
in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */
in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */
/* convert EUI64 into IPv6 interface identifier */
EUI64_TO_IFID(in6);
return 0;
}
/*
* Get interface identifier for the specified interface.
*
* in6 - upper 64bits are preserved
*/
int
in6_get_hw_ifid(struct ifnet *ifp, struct in6_addr *in6)
{
struct ifaddr *ifa;
const struct sockaddr_dl *sdl = NULL;
const char *addr = NULL; /* XXX gcc 4.8 -Werror=maybe-uninitialized */
size_t addrlen = 0; /* XXX gcc 4.8 -Werror=maybe-uninitialized */
static u_int8_t allzero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
static u_int8_t allone[8] =
{ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
int s;
s = pserialize_read_enter();
IFADDR_READER_FOREACH(ifa, ifp) {
const struct sockaddr_dl *tsdl;
if (ifa->ifa_addr->sa_family != AF_LINK)
continue;
tsdl = satocsdl(ifa->ifa_addr);
if (tsdl == NULL || tsdl->sdl_alen == 0)
continue;
if (sdl == NULL || ifa == ifp->if_dl || ifa == ifp->if_hwdl) {
sdl = tsdl;
addr = CLLADDR(sdl);
addrlen = sdl->sdl_alen;
}
if (ifa == ifp->if_hwdl)
break;
}
pserialize_read_exit(s);
if (sdl == NULL)
return -1;
switch (ifp->if_type) {
case IFT_IEEE1394:
case IFT_IEEE80211:
/* IEEE1394 uses 16byte length address starting with EUI64 */
if (addrlen > 8)
addrlen = 8;
break;
default:
break;
}
/* get EUI64 */
switch (ifp->if_type) {
/* IEEE802/EUI64 cases - what others? */
case IFT_ETHER:
case IFT_ATM:
case IFT_IEEE1394:
case IFT_IEEE80211:
/* look at IEEE802/EUI64 only */
if (addrlen != 8 && addrlen != 6)
return -1;
/*
* check for invalid MAC address - on bsdi, we see it a lot
* since wildboar configures all-zero MAC on pccard before
* card insertion.
*/
if (memcmp(addr, allzero, addrlen) == 0)
return -1;
if (memcmp(addr, allone, addrlen) == 0)
return -1;
/* make EUI64 address */
if (addrlen == 8)
memcpy(&in6->s6_addr[8], addr, 8);
else if (addrlen == 6) {
in6->s6_addr[8] = addr[0];
in6->s6_addr[9] = addr[1];
in6->s6_addr[10] = addr[2];
in6->s6_addr[11] = 0xff;
in6->s6_addr[12] = 0xfe;
in6->s6_addr[13] = addr[3];
in6->s6_addr[14] = addr[4];
in6->s6_addr[15] = addr[5];
}
break;
case IFT_ARCNET:
if (addrlen != 1)
return -1;
if (!addr[0])
return -1;
memset(&in6->s6_addr[8], 0, 8);
in6->s6_addr[15] = addr[0];
/*
* due to insufficient bitwidth, we mark it local.
*/
in6->s6_addr[8] &= ~EUI64_GBIT; /* g bit to "individual" */
in6->s6_addr[8] |= EUI64_UBIT; /* u bit to "local" */
break;
case IFT_GIF:
case IFT_IPSEC:
#ifdef IFT_STF
case IFT_STF:
#endif
/*
* RFC2893 says: "SHOULD use IPv4 address as ifid source".
* however, IPv4 address is not very suitable as unique
* identifier source (can be renumbered).
* we don't do this.
*/
return -1;
default:
return -1;
}
/* sanity check: g bit must not indicate "group" */
if (EUI64_GROUP(in6))
return -1;
/* convert EUI64 into IPv6 interface identifier */
EUI64_TO_IFID(in6);
/*
* sanity check: ifid must not be all zero, avoid conflict with
* subnet router anycast
*/
if ((in6->s6_addr[8] & ~(EUI64_GBIT | EUI64_UBIT)) == 0x00 &&
memcmp(&in6->s6_addr[9], allzero, 7) == 0) {
return -1;
}
return 0;
}
/*
* Get interface identifier for the specified interface. If it is not
* available on ifp0, borrow interface identifier from other information
* sources.
*
* altifp - secondary EUI64 source
*/
static int
get_ifid(struct ifnet *ifp0, struct ifnet *altifp,
struct in6_addr *in6)
{
struct ifnet *ifp;
int s;
/* first, try to get it from the interface itself */
if (in6_get_hw_ifid(ifp0, in6) == 0) {
nd6log(LOG_DEBUG, "%s: got interface identifier from itself\n",
if_name(ifp0));
goto success;
}
/* try secondary EUI64 source. this basically is for ATM PVC */
	if (altifp && in6_get_hw_ifid(altifp, in6) == 0) {
		nd6log(LOG_DEBUG, "%s: got interface identifier from %s\n",
if_name(ifp0), if_name(altifp));
goto success;
}
/* next, try to get it from some other hardware interface */
s = pserialize_read_enter();
	IFNET_READER_FOREACH(ifp) {
		if (ifp == ifp0)
continue;
if (in6_get_hw_ifid(ifp, in6) != 0)
continue;
/*
* to borrow ifid from other interface, ifid needs to be
* globally unique
*/
		if (IFID_UNIVERSAL(in6)) {
			nd6log(LOG_DEBUG,
"%s: borrow interface identifier from %s\n",
if_name(ifp0), if_name(ifp));
pserialize_read_exit(s);
goto success;
}
}
pserialize_read_exit(s);
#if 0
/* get from hostid - only for certain architectures */
if (get_hostid_ifid(ifp, in6) == 0) {
nd6log(LOG_DEBUG,
"%s: interface identifier generated by hostid\n",
if_name(ifp0));
goto success;
}
#endif
/* last resort: get from random number source */
if (get_rand_ifid(in6) == 0) {
nd6log(LOG_DEBUG,
"%s: interface identifier generated by random number\n",
if_name(ifp0));
goto success;
}
printf("%s: failed to get interface identifier\n", if_name(ifp0));
return -1;
success:
nd6log(LOG_INFO, "%s: ifid: %02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
if_name(ifp0), in6->s6_addr[8], in6->s6_addr[9], in6->s6_addr[10],
in6->s6_addr[11], in6->s6_addr[12], in6->s6_addr[13],
in6->s6_addr[14], in6->s6_addr[15]);
return 0;
}
/*
* altifp - secondary EUI64 source
*/
static int
in6_ifattach_linklocal(struct ifnet *ifp, struct ifnet *altifp)
{
struct in6_aliasreq ifra;
int error;
/*
* configure link-local address.
*/
memset(&ifra, 0, sizeof(ifra));
/*
 * in6_update_ifa() does not use ifra_name, but we set it anyway for
 * safety's sake.
*/
strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name));
ifra.ifra_addr.sin6_family = AF_INET6;
ifra.ifra_addr.sin6_len = sizeof(struct sockaddr_in6);
ifra.ifra_addr.sin6_addr.s6_addr32[0] = htonl(0xfe800000);
ifra.ifra_addr.sin6_addr.s6_addr32[1] = 0;
if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
ifra.ifra_addr.sin6_addr.s6_addr32[2] = 0;
ifra.ifra_addr.sin6_addr.s6_addr32[3] = htonl(1);
} else {
if (get_ifid(ifp, altifp, &ifra.ifra_addr.sin6_addr) != 0) {
nd6log(LOG_ERR,
"%s: no ifid available\n", if_name(ifp));
return -1;
}
}
if (in6_setscope(&ifra.ifra_addr.sin6_addr, ifp, NULL))
return -1;
sockaddr_in6_init(&ifra.ifra_prefixmask, &in6mask64, 0, 0, 0);
/* link-local addresses should NEVER expire. */
ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
/*
* Now call in6_update_ifa() to do a bunch of procedures to configure
* a link-local address. We can set the 3rd argument to NULL, because
* we know there's no other link-local address on the interface
* and therefore we are adding one (instead of updating one).
*/
if ((error = in6_update_ifa(ifp, &ifra, IN6_IFAUPDATE_DADDELAY)) != 0) {
/*
* XXX: When the interface does not support IPv6, this call
* would fail in the SIOCINITIFADDR ioctl. I believe the
* notification is rather confusing in this case, so just
* suppress it. (jinmei@kame.net 20010130)
*/
		if (error != EAFNOSUPPORT)
			nd6log(LOG_NOTICE,
"failed to configure a link-local address on %s "
"(errno=%d)\n",
if_name(ifp), error);
return -1;
}
return 0;
}
/*
* ifp - must be IFT_LOOP
*/
static int
in6_ifattach_loopback(struct ifnet *ifp)
{
struct in6_aliasreq ifra;
int error;
memset(&ifra, 0, sizeof(ifra));
/*
 * in6_update_ifa() does not use ifra_name, but we set it anyway for
 * safety's sake.
*/
strncpy(ifra.ifra_name, if_name(ifp), sizeof(ifra.ifra_name));
sockaddr_in6_init(&ifra.ifra_prefixmask, &in6mask128, 0, 0, 0);
/*
* Always initialize ia_dstaddr (= broadcast address) to loopback
* address. Follows IPv4 practice - see in_ifinit().
*/
sockaddr_in6_init(&ifra.ifra_dstaddr, &in6addr_loopback, 0, 0, 0);
sockaddr_in6_init(&ifra.ifra_addr, &in6addr_loopback, 0, 0, 0);
/* the loopback address should NEVER expire. */
ifra.ifra_lifetime.ia6t_vltime = ND6_INFINITE_LIFETIME;
ifra.ifra_lifetime.ia6t_pltime = ND6_INFINITE_LIFETIME;
/* we don't need to perform DAD on loopback interfaces. */
ifra.ifra_flags |= IN6_IFF_NODAD;
/*
* We are sure that this is a newly assigned address, so we can set
* NULL to the 3rd arg.
*/
if ((error = in6_update_ifa(ifp, &ifra, 0)) != 0) {
nd6log(LOG_ERR, "failed to configure "
"the loopback address on %s (errno=%d)\n",
if_name(ifp), error);
return -1;
}
return 0;
}
/*
* compute NI group address, based on the current hostname setting.
* see draft-ietf-ipngwg-icmp-name-lookup-* (04 and later).
*
* when ifp == NULL, the caller is responsible for filling scopeid.
*/
int
in6_nigroup(struct ifnet *ifp, const char *name, int namelen,
struct sockaddr_in6 *sa6)
{
const char *p;
u_int8_t *q;
MD5_CTX ctxt;
u_int8_t digest[16];
u_int8_t l;
u_int8_t n[64]; /* a single label must not exceed 63 chars */
if (!namelen || !name)
return -1;
p = name;
while (p && *p && *p != '.' && p - name < namelen)
p++;
if (p - name > sizeof(n) - 1)
return -1; /* label too long */
l = p - name;
strncpy((char *)n, name, l);
n[(int)l] = '\0';
for (q = n; *q; q++) {
if ('A' <= *q && *q <= 'Z')
*q = *q - 'A' + 'a';
}
/* generate 8 bytes of pseudo-random value. */
memset(&ctxt, 0, sizeof(ctxt));
MD5Init(&ctxt);
MD5Update(&ctxt, &l, sizeof(l));
MD5Update(&ctxt, n, l);
MD5Final(digest, &ctxt);
memset(sa6, 0, sizeof(*sa6));
sa6->sin6_family = AF_INET6;
sa6->sin6_len = sizeof(*sa6);
sa6->sin6_addr.s6_addr16[0] = htons(0xff02);
sa6->sin6_addr.s6_addr8[11] = 2;
memcpy(&sa6->sin6_addr.s6_addr32[3], digest,
sizeof(sa6->sin6_addr.s6_addr32[3]));
if (in6_setscope(&sa6->sin6_addr, ifp, NULL))
return -1; /* XXX: should not fail */
return 0;
}
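/*
 * Illustrative sketch (not part of the original source): computing the
 * NI group address for the first label of the current hostname on a
 * given interface, using in6_nigroup() above.
 */
#if 0
static int
example_nigroup(struct ifnet *ifp, struct sockaddr_in6 *sa6)
{
	return in6_nigroup(ifp, hostname, hostnamelen, sa6);
}
#endif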
/*
* XXX multiple loopback interface needs more care. for instance,
* nodelocal address needs to be configured onto only one of them.
* XXX multiple link-local address case
*
* altifp - secondary EUI64 source
*/
void
in6_ifattach(struct ifnet *ifp, struct ifnet *altifp)
{
struct in6_ifaddr *ia;
struct in6_addr in6;
KASSERT(IFNET_LOCKED(ifp));
/* some of the interfaces are inherently not IPv6 capable */
switch (ifp->if_type) {
case IFT_BRIDGE:
case IFT_L2TP:
case IFT_IEEE8023ADLAG:
#ifdef IFT_PFLOG
case IFT_PFLOG:
#endif
#ifdef IFT_PFSYNC
case IFT_PFSYNC:
#endif
ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL;
ND_IFINFO(ifp)->flags |= ND6_IFF_IFDISABLED;
return;
}
/*
* if link mtu is too small, don't try to configure IPv6.
* remember there could be some link-layer that has special
* fragmentation logic.
*/
if (ifp->if_mtu < IPV6_MMTU) {
nd6log(LOG_INFO, "%s has too small MTU, IPv6 not enabled\n",
if_name(ifp));
return;
}
/*
* quirks based on interface type
*/
switch (ifp->if_type) {
#ifdef IFT_STF
case IFT_STF:
/*
* 6to4 interface is a very special kind of beast.
* no multicast, no linklocal. RFC2529 specifies how to make
* linklocals for 6to4 interface, but there's no use and
* it is rather harmful to have one.
*/
ND_IFINFO(ifp)->flags &= ~ND6_IFF_AUTO_LINKLOCAL;
return;
#endif
case IFT_CARP:
return;
default:
break;
}
/*
* usually, we require multicast capability to the interface
*/
if ((ifp->if_flags & IFF_MULTICAST) == 0) {
nd6log(LOG_INFO,
"%s is not multicast capable, IPv6 not enabled\n",
if_name(ifp));
return;
}
/*
* assign loopback address for loopback interface.
* XXX multiple loopback interface case.
*/
if ((ifp->if_flags & IFF_LOOPBACK) != 0) {
in6 = in6addr_loopback;
/* These are safe and atomic thanks to IFNET_LOCK */
		if (in6ifa_ifpwithaddr(ifp, &in6) == NULL) {
			if (in6_ifattach_loopback(ifp) != 0)
return;
}
}
/*
* assign a link-local address, if there's none.
*/
if (!(ND_IFINFO(ifp)->flags & ND6_IFF_IFDISABLED) &&
ND_IFINFO(ifp)->flags & ND6_IFF_AUTO_LINKLOCAL) {
int bound = curlwp_bind();
struct psref psref;
ia = in6ifa_ifpforlinklocal_psref(ifp, 0, &psref);
if (ia == NULL && in6_ifattach_linklocal(ifp, altifp) != 0) {
printf("%s: cannot assign link-local address\n",
ifp->if_xname);
}
		ia6_release(ia, &psref);
		curlwp_bindx(bound);
}
}
/*
 * NOTE: in6_ifdetach() does not support loopback interfaces at this moment.
* We don't need this function in bsdi, because interfaces are never removed
* from the ifnet list in bsdi.
*/
void
in6_ifdetach(struct ifnet *ifp)
{
/* nuke any of IPv6 addresses we have */
if_purgeaddrs(ifp, AF_INET6, in6_purgeaddr);
in6_purge_multi(ifp);
/* remove ip6_mrouter stuff */
ip6_mrouter_detach(ifp);
/* remove neighbor management table */
nd6_purge(ifp, NULL);
}
/* $NetBSD: uvm_user.c,v 1.14 2011/02/02 15:13:34 chuck Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_user.c,v 1.1.2.1 1997/08/14 19:10:41 chuck Exp
*/
/*
* uvm_user.c: high level uvm_allocate/uvm_deallocate interface into vm.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_user.c,v 1.14 2011/02/02 15:13:34 chuck Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <uvm/uvm.h>
/*
* uvm_deallocate: deallocate memory (unmap)
*/
void
uvm_deallocate(struct vm_map *map, vaddr_t start, vsize_t size)
{

	if (size == 0)
return;
uvm_unmap(map, trunc_page(start), round_page(start + size));
}
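/*
 * Illustrative sketch (not part of the original source): the request is
 * rounded outward to page boundaries, so freeing a single byte drops the
 * whole page from the map.  The addresses assume 4 KiB pages and are
 * arbitrary.
 */
#if 0
static void
example_deallocate(struct vm_map *map)
{
	uvm_deallocate(map, (vaddr_t)0x1000 + 123, 1);
	/* equivalent to uvm_unmap(map, 0x1000, 0x2000) */
}
#endif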
/*-
* Copyright (c) 2019 Mindaugas Rasiukevicius <rmind at noxt eu>
* Copyright (c) 2013 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* NPF network interface handling.
*
* NPF uses its own interface IDs (npf-if-id). These IDs start from 1.
* Zero is reserved to indicate "no interface" case or an interface of
* no interest (i.e. not registered).
*
* This module provides an interface to primarily handle the following:
*
* - Bind a symbolic interface name to NPF interface ID.
* - Associate NPF interface ID when the network interface is attached.
*
* When NPF configuration is (re)loaded, each referenced network interface
* name is registered with a unique ID. If the network interface is already
* attached, then the ID is associated with it immediately; otherwise, IDs
* are associated/disassociated on interface events which are monitored
* using pfil(9) hooks.
*
* To avoid race conditions when an active NPF configuration is updated or
* interfaces are detached/attached, the interface names are never removed
* and therefore IDs are never re-assigned. The only point when interface
* names and IDs are cleared is when the configuration is flushed.
*
* A linear counter is used for IDs.
*/
#ifdef _KERNEL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_if.c,v 1.13 2020/05/30 14:16:56 rmind Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <net/if.h>
#endif
#include "npf_impl.h"
typedef struct npf_ifmap {
char ifname[IFNAMSIZ + 1];
} npf_ifmap_t;
#define NPF_IFMAP_NOID (0U)
#define NPF_IFMAP_SLOT2ID(npf, slot) ((npf)->ifmap_off + (slot) + 1)
#define NPF_IFMAP_ID2SLOT(npf, id) \
((id) - atomic_load_relaxed(&(npf)->ifmap_off) - 1)
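/*
 * Illustrative sketch (not part of the original source): the slot <-> ID
 * mapping, assuming ifmap_off == 0 (i.e. before any flush).  Slot 0 maps
 * to ID 1 because ID 0 is reserved as NPF_IFMAP_NOID; after a flush the
 * offset advances, so previously issued IDs no longer map to any slot.
 */
#if 0
static void
example_ifmap_ids(npf_t *npf)
{
	KASSERT(NPF_IFMAP_SLOT2ID(npf, 0) == 1);
	KASSERT(NPF_IFMAP_ID2SLOT(npf, 1) == 0);
}
#endif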
void
npf_ifmap_init(npf_t *npf, const npf_ifops_t *ifops)
{
const size_t nbytes = sizeof(npf_ifmap_t) * NPF_MAX_IFMAP;
KASSERT(ifops != NULL);
ifops->flush(npf, (void *)(uintptr_t)0);
mutex_init(&npf->ifmap_lock, MUTEX_DEFAULT, IPL_SOFTNET);
npf->ifmap = kmem_zalloc(nbytes, KM_SLEEP);
npf->ifmap_cnt = 0;
npf->ifmap_off = 0;
npf->ifops = ifops;
}
void
npf_ifmap_fini(npf_t *npf)
{
const size_t nbytes = sizeof(npf_ifmap_t) * NPF_MAX_IFMAP;
mutex_destroy(&npf->ifmap_lock);
kmem_free(npf->ifmap, nbytes);
}
static unsigned
npf_ifmap_lookup(npf_t *npf, const char *ifname)
{
	KASSERT(mutex_owned(&npf->ifmap_lock));

	for (unsigned i = 0; i < npf->ifmap_cnt; i++) {
npf_ifmap_t *ifmap = &npf->ifmap[i];
if (strcmp(ifmap->ifname, ifname) == 0) {
return NPF_IFMAP_SLOT2ID(npf, i);
}
}
return NPF_IFMAP_NOID;
}
/*
* npf_ifmap_register: register an interface name; return an assigned
* NPF network ID on success (non-zero).
*
* This routine is mostly called on NPF configuration (re)load for the
* interfaces names referenced by the rules.
*/
unsigned
npf_ifmap_register(npf_t *npf, const char *ifname)
{
npf_ifmap_t *ifmap;
unsigned id, i;
ifnet_t *ifp;
mutex_enter(&npf->ifmap_lock);
if ((id = npf_ifmap_lookup(npf, ifname)) != NPF_IFMAP_NOID) {
goto out;
}
if (npf->ifmap_cnt == NPF_MAX_IFMAP) {
printf("npf_ifmap_new: out of slots; bump NPF_MAX_IFMAP\n");
id = NPF_IFMAP_NOID;
goto out;
}
KASSERT(npf->ifmap_cnt < NPF_MAX_IFMAP);
/* Allocate a new slot and convert and assign an ID. */
i = npf->ifmap_cnt++;
ifmap = &npf->ifmap[i];
strlcpy(ifmap->ifname, ifname, IFNAMSIZ);
id = NPF_IFMAP_SLOT2ID(npf, i);
if ((ifp = npf->ifops->lookup(npf, ifname)) != NULL) {
npf->ifops->setmeta(npf, ifp, (void *)(uintptr_t)id);
}
out:
mutex_exit(&npf->ifmap_lock);
return id;
}
void
npf_ifmap_flush(npf_t *npf)
{
mutex_enter(&npf->ifmap_lock);
npf->ifops->flush(npf, (void *)(uintptr_t)NPF_IFMAP_NOID);
for (unsigned i = 0; i < npf->ifmap_cnt; i++) {
npf->ifmap[i].ifname[0] = '\0';
}
npf->ifmap_cnt = 0;
/*
* Reset the ID counter if reaching the overflow; this is not
* realistic, but we maintain correctness.
*/
if (npf->ifmap_off < (UINT_MAX - NPF_MAX_IFMAP)) {
npf->ifmap_off += NPF_MAX_IFMAP;
} else {
npf->ifmap_off = 0;
}
mutex_exit(&npf->ifmap_lock);
}
/*
* npf_ifmap_getid: get the ID for the given network interface.
*
* => This routine is typically called from the packet handler when
* matching whether the packet is on particular network interface.
*
* => This routine is lock-free; if the NPF configuration is flushed
* while the packet is in-flight, the ID will not match because we
* keep the IDs linear.
*/
unsigned
npf_ifmap_getid(npf_t *npf, const ifnet_t *ifp)
{
const unsigned id = (uintptr_t)npf->ifops->getmeta(npf, ifp);
return id;
}
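/*
 * Illustrative sketch (not part of the original source): matching the
 * interface a packet arrived on against the ID a rule recorded earlier
 * via npf_ifmap_register().  "rule_ifid" is a placeholder for that
 * stored value.
 */
#if 0
static bool
example_ifid_match(npf_t *npf, const ifnet_t *ifp, unsigned rule_ifid)
{
	/* IDs are linear, so a stale ID simply never matches. */
	return rule_ifid != NPF_IFMAP_NOID &&
	    npf_ifmap_getid(npf, ifp) == rule_ifid;
}
#endif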
/*
* npf_ifmap_copylogname: this function is toxic; it can return garbage
* as we don't lock, but it is only used temporarily and only for logging.
*/
void
npf_ifmap_copylogname(npf_t *npf, unsigned id, char *buf, size_t len)
{
const unsigned i = NPF_IFMAP_ID2SLOT(npf, id);
membar_consumer();
if (id != NPF_IFMAP_NOID && i < NPF_MAX_IFMAP) {
/*
* Lock-free access is safe as there is an extra byte
* with a permanent NUL terminator at the end.
*/
const npf_ifmap_t *ifmap = &npf->ifmap[i];
strlcpy(buf, ifmap->ifname, MIN(len, IFNAMSIZ));
} else {
strlcpy(buf, "???", len);
}
}
void
npf_ifmap_copyname(npf_t *npf, unsigned id, char *buf, size_t len)
{
mutex_enter(&npf->ifmap_lock);
npf_ifmap_copylogname(npf, id, buf, len);
mutex_exit(&npf->ifmap_lock);
}
__dso_public void
npfk_ifmap_attach(npf_t *npf, ifnet_t *ifp)
{
const npf_ifops_t *ifops = npf->ifops;
unsigned id;
mutex_enter(&npf->ifmap_lock);
id = npf_ifmap_lookup(npf, ifops->getname(npf, ifp));
ifops->setmeta(npf, ifp, (void *)(uintptr_t)id);
mutex_exit(&npf->ifmap_lock);
}
__dso_public void
npfk_ifmap_detach(npf_t *npf, ifnet_t *ifp)
{
/* Diagnostic. */
mutex_enter(&npf->ifmap_lock);
npf->ifops->setmeta(npf, ifp, (void *)(uintptr_t)NPF_IFMAP_NOID);
mutex_exit(&npf->ifmap_lock);
}
/* $NetBSD: kern_sysctl.c,v 1.270 2023/09/09 16:01:09 christos Exp $ */
/*-
* Copyright (c) 2003, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Brown.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sysctl.c 8.9 (Berkeley) 5/20/95
*/
/*
* sysctl system call.
*/
#define __COMPAT_SYSCTL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sysctl.c,v 1.270 2023/09/09 16:01:09 christos Exp $");
#ifdef _KERNEL_OPT
#include "opt_defcorename.h"
#endif
#include "ksyms.h"
#include <sys/param.h>
#include <sys/types.h>
#include <sys/buf.h>
#include <sys/cprng.h>
#include <sys/kauth.h>
#include <sys/ksyms.h>
#include <sys/ktrace.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/once.h>
#include <sys/rndsource.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <crypto/blake2/blake2s.h>
#define MAXDESCLEN 1024
MALLOC_DEFINE(M_SYSCTLNODE, "sysctlnode", "sysctl node structures");
MALLOC_DEFINE(M_SYSCTLDATA, "sysctldata", "misc sysctl data");
static int sysctl_mmap(SYSCTLFN_PROTO);
static int sysctl_alloc(struct sysctlnode *, int);
static int sysctl_realloc(struct sysctlnode *);
static int sysctl_cvt_in(struct lwp *, int *, const void *, size_t,
struct sysctlnode *);
static int sysctl_cvt_out(struct lwp *, int, const struct sysctlnode *,
void *, size_t, size_t *);
static int sysctl_log_add(struct sysctllog **, const struct sysctlnode *);
static int sysctl_log_realloc(struct sysctllog *);
typedef void sysctl_setup_func(struct sysctllog **);
#ifdef SYSCTL_DEBUG
#define DPRINTF(a) printf a
#else
#define DPRINTF(a)
#endif
struct sysctllog {
const struct sysctlnode *log_root;
int *log_num;
int log_size, log_left;
};
/*
* the "root" of the new sysctl tree
*/
struct sysctlnode sysctl_root = {
.sysctl_flags = SYSCTL_VERSION|
CTLFLAG_ROOT|CTLFLAG_READWRITE|
CTLTYPE_NODE,
.sysctl_num = 0,
.sysctl_size = sizeof(struct sysctlnode),
.sysctl_name = "(root)",
};
/*
* link set of functions that add nodes at boot time (see also
* sysctl_buildtree())
*/
__link_set_decl(sysctl_funcs, sysctl_setup_func);
/*
* The `sysctl_treelock' is intended to serialize access to the sysctl
* tree. XXX This has serious problems; allocating memory and
* copying data out with the lock held is insane.
*/
krwlock_t sysctl_treelock;
kmutex_t sysctl_file_marker_lock;
/*
* Attributes stored in the kernel.
*/
char hostname[MAXHOSTNAMELEN];
int hostnamelen;
char domainname[MAXHOSTNAMELEN];
int domainnamelen;
long hostid;
#ifndef DEFCORENAME
#define DEFCORENAME "%n.core"
#endif
char defcorename[MAXPATHLEN] = DEFCORENAME;
/*
* ********************************************************************
* Section 0: Some simple glue
* ********************************************************************
* By wrapping copyin(), copyout(), and copyinstr() like this, we can
* stop caring about who's calling us and simplify some code a bunch.
* ********************************************************************
*/
int
sysctl_copyin(struct lwp *l, const void *uaddr, void *kaddr, size_t len)
{
int error;
if (l != NULL) {
error = copyin(uaddr, kaddr, len);
ktrmibio(-1, UIO_WRITE, uaddr, len, error);
} else {
error = kcopy(uaddr, kaddr, len);
}
return error;
}
int
sysctl_copyout(struct lwp *l, const void *kaddr, void *uaddr, size_t len)
{
int error;
if (l != NULL) {
error = copyout(kaddr, uaddr, len);
ktrmibio(-1, UIO_READ, uaddr, len, error);
} else {
error = kcopy(kaddr, uaddr, len);
}
return error;
}
int
sysctl_copyinstr(struct lwp *l, const void *uaddr, void *kaddr,
size_t len, size_t *done)
{
int error;
if (l != NULL) {
error = copyinstr(uaddr, kaddr, len, done);
ktrmibio(-1, UIO_WRITE, uaddr, len, error);
} else {
error = copystr(uaddr, kaddr, len, done);
}
return error;
}
/*
* ********************************************************************
* Initialize sysctl subsystem.
* ********************************************************************
*/
void
sysctl_init(void)
{
sysctl_setup_func *const *sysctl_setup;
rw_init(&sysctl_treelock);
/*
* dynamic mib numbers start here
*/
sysctl_root.sysctl_num = CREATE_BASE;
sysctl_basenode_init();
__link_set_foreach(sysctl_setup, sysctl_funcs) {
(**sysctl_setup)(NULL);
}
mutex_init(&sysctl_file_marker_lock, MUTEX_DEFAULT, IPL_NONE);
}
/*
* Setting this means no more permanent nodes can be added,
* trees that claim to be readonly at the root now are, and if
* the main tree is readonly, *everything* is.
*
* Also starts up the PRNG used for the "random" sysctl: it's
* better to start it later than sooner.
*
* Call this at the end of kernel init.
*/
void
sysctl_finalize(void)
{
sysctl_root.sysctl_flags |= CTLFLAG_PERMANENT;
}
/*
* ********************************************************************
* The main native sysctl system call itself.
* ********************************************************************
*/
int
sys___sysctl(struct lwp *l, const struct sys___sysctl_args *uap, register_t *retval)
{
/* {
syscallarg(const int *) name;
syscallarg(u_int) namelen;
	syscallarg(void *) oldv;
	syscallarg(size_t *) oldlenp;
	syscallarg(const void *) newv;
syscallarg(size_t) newlen;
} */
int error, nerror, name[CTL_MAXNAME];
size_t oldlen, savelen, *oldlenp;
/*
* get oldlen
*/
oldlen = 0;
oldlenp = SCARG(uap, oldlenp);
if (oldlenp != NULL) {
error = copyin(oldlenp, &oldlen, sizeof(oldlen));
if (error)
return (error);
}
savelen = oldlen;
/*
* top-level sysctl names may or may not be non-terminal, but
* we don't care
*/
if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 1)
return (EINVAL);
error = copyin(SCARG(uap, name), &name,
SCARG(uap, namelen) * sizeof(int));
if (error)
return (error);
ktrmib(name, SCARG(uap, namelen));
sysctl_lock(SCARG(uap, newv) != NULL);
/*
* do sysctl work (NULL means main built-in default tree)
*/
error = sysctl_dispatch(&name[0], SCARG(uap, namelen),
SCARG(uap, oldv), &oldlen,
SCARG(uap, newv), SCARG(uap, newlen),
&name[0], l, NULL);
/*
* release the sysctl lock
*/
sysctl_unlock();
/*
* set caller's oldlen to new value even in the face of an
* error (if this gets an error and they didn't have one, they
* get this one)
*/
if (oldlenp) {
nerror = copyout(&oldlen, oldlenp, sizeof(oldlen));
if (error == 0)
error = nerror;
}
/*
* if the only problem is that we weren't given enough space,
* that's an ENOMEM error
*/
if (error == 0 && SCARG(uap, oldv) != NULL && savelen < oldlen)
error = ENOMEM;
return (error);
}
/*
* ********************************************************************
* Section 1: How the tree is used
* ********************************************************************
* Implementations of sysctl for emulations should typically need only
* these three functions in this order: lock the tree, dispatch
* request into it, unlock the tree.
* ********************************************************************
*/
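/*
 * Illustrative sketch (not part of the original source): the
 * lock/dispatch/unlock sequence described above, roughly as an
 * emulation's sysctl entry point might use it.  The argument layout
 * simply mirrors the sysctl_dispatch() call in sys___sysctl().
 */
#if 0
static int
example_emul_sysctl(struct lwp *l, const int *mib, u_int miblen,
    void *oldp, size_t *oldlenp, const void *newp, size_t newlen)
{
	int error;

	sysctl_lock(newp != NULL);	/* writer only if new data is given */
	error = sysctl_dispatch(mib, miblen, oldp, oldlenp,
	    newp, newlen, mib, l, NULL);	/* NULL: main built-in tree */
	sysctl_unlock();
	return error;
}
#endif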
void
sysctl_lock(bool write)
{
if (write) {
rw_enter(&sysctl_treelock, RW_WRITER);
curlwp->l_pflag |= LP_SYSCTLWRITE;
} else {
rw_enter(&sysctl_treelock, RW_READER);
curlwp->l_pflag &= ~LP_SYSCTLWRITE;
}
}
void
sysctl_relock(void)
{
if ((curlwp->l_pflag & LP_SYSCTLWRITE) != 0) {
rw_enter(&sysctl_treelock, RW_WRITER);
} else {
rw_enter(&sysctl_treelock, RW_READER);
}
}
/*
* ********************************************************************
* the main sysctl dispatch routine. scans the given tree and picks a
* function to call based on what it finds.
* ********************************************************************
*/
int
sysctl_dispatch(SYSCTLFN_ARGS)
{
int error;
sysctlfn fn;
int ni;
	KASSERT(rw_lock_held(&sysctl_treelock));

	if (rnode && SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
		printf("sysctl_dispatch: rnode %p wrong version\n", rnode);
error = EINVAL;
goto out;
}
fn = NULL;
error = sysctl_locate(l, name, namelen, &rnode, &ni);
if (rnode->sysctl_func != NULL) {
/*
* the node we ended up at has a function, so call it. it can
* hand off to query or create if it wants to.
*/
fn = rnode->sysctl_func;
} else if (error == 0) {
/*
* we found the node they were looking for, so do a lookup.
*/
fn = (sysctlfn)sysctl_lookup; /* XXX may write to rnode */
} else if (error == ENOENT && (ni + 1) == namelen && name[ni] < 0) {
/*
* prospective parent node found, but the terminal node was
* not. generic operations associate with the parent.
*/
switch (name[ni]) {
case CTL_QUERY:
fn = sysctl_query;
break;
case CTL_CREATE:
#if NKSYMS > 0
case CTL_CREATESYM:
#endif /* NKSYMS > 0 */
if (newp == NULL) {
error = EINVAL;
break;
}
KASSERT(rw_write_held(&sysctl_treelock));
fn = (sysctlfn)sysctl_create; /* we own the rnode */
break;
case CTL_DESTROY:
if (newp == NULL) {
error = EINVAL;
break;
}
KASSERT(rw_write_held(&sysctl_treelock));
fn = (sysctlfn)sysctl_destroy; /* we own the rnode */
break;
case CTL_MMAP:
fn = (sysctlfn)sysctl_mmap; /* we own the rnode */
break;
case CTL_DESCRIBE:
fn = sysctl_describe;
break;
default:
error = EOPNOTSUPP;
break;
}
}
/*
* after all of that, maybe we found someone who knows how to
* get us what we want?
*/
if (fn != NULL)
error = (*fn)(name + ni, namelen - ni, oldp, oldlenp,
newp, newlen, name, l, rnode);
else if (error == 0)
error = EOPNOTSUPP;
out:
return (error);
}
/*
* ********************************************************************
* Releases the tree lock.
* ********************************************************************
*/
void
sysctl_unlock(void)
{
rw_exit(&sysctl_treelock);
}
/*
* ********************************************************************
* Section 2: The main tree interfaces
* ********************************************************************
* This is how sysctl_dispatch() does its work, and you can too, by
* calling these routines from helpers (though typically only
* sysctl_lookup() will be used). The tree MUST BE LOCKED when these
* are called.
* ********************************************************************
*/
/*
* sysctl_locate -- Finds the node matching the given mib under the
* given tree (via rv). If no tree is given, we fall back to the
* native tree. The current process (via l) is used for access
* control on the tree (some nodes may be traversable only by root) and
* on return, nip will show how many numbers in the mib were consumed.
*/
int
sysctl_locate(struct lwp *l, const int *name, u_int namelen,
const struct sysctlnode **rnode, int *nip)
{
const struct sysctlnode *node, *pnode;
int tn, si, ni, error, alias;
KASSERT(rw_lock_held(&sysctl_treelock));
/*
* basic checks and setup
*/
	if (*rnode == NULL)
		*rnode = &sysctl_root;
	if (nip)
		*nip = 0;
	if (namelen == 0)
return (0);
/*
* search starts from "root"
*/
pnode = *rnode;
if (SYSCTL_VERS(pnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_locate: pnode %p wrong version\n", pnode);
return (EINVAL);
}
node = pnode->sysctl_child;
error = 0;
/*
* scan for node to which new node should be attached
*/
for (ni = 0; ni < namelen; ni++) {
/*
* walked off bottom of tree
*/
if (node == NULL) {
if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE)
error = ENOENT;
else
error = ENOTDIR;
break;
}
/*
* can anyone traverse this node or only root?
*/
if (l != NULL && (pnode->sysctl_flags & CTLFLAG_PRIVATE) &&
(error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_PRVT,
NULL, NULL, NULL)) != 0)
return (error);
/*
* find a child node with the right number
*/
tn = name[ni];
alias = 0;
si = 0;
/*
* Note: ANYNUMBER only matches positive integers.
* Since ANYNUMBER is only permitted on single-node
* sub-trees (eg proc), check before the loop and skip
* it if we can.
*/
if ((node[si].sysctl_flags & CTLFLAG_ANYNUMBER) && (tn >= 0))
goto foundit;
		for (; si < pnode->sysctl_clen; si++) {
			if (node[si].sysctl_num == tn) {
				if (node[si].sysctl_flags & CTLFLAG_ALIAS) {
					if (alias++ == 4)
						break;
					else {
						tn = node[si].sysctl_alias;
						si = -1;
					}
				} else
					goto foundit;
			}
		}
/*
* if we ran off the end, it obviously doesn't exist
*/
error = ENOENT;
break;
/*
* so far so good, move on down the line
*/
foundit:
pnode = &node[si];
		if (SYSCTL_TYPE(pnode->sysctl_flags) == CTLTYPE_NODE)
			node = node[si].sysctl_child;
else
node = NULL;
}
*rnode = pnode;
	if (nip)
		*nip = ni;
return (error);
}
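/*
 * Illustrative sketch (not part of the original source): resolving a
 * two-level mib against the native tree with sysctl_locate().  The mib
 * values are just an example; the tree lock must already be held.
 */
#if 0
static int
example_locate(struct lwp *l, const struct sysctlnode **nodep)
{
	static const int mib[2] = { CTL_KERN, KERN_OSTYPE };
	int ni;

	*nodep = NULL;		/* NULL means "start from sysctl_root" */
	return sysctl_locate(l, mib, __arraycount(mib), nodep, &ni);
}
#endif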
/*
* sysctl_query -- The auto-discovery engine. Copies out the structs
* describing nodes under the given node and handles overlay trees.
*/
int
sysctl_query(SYSCTLFN_ARGS)
{
int error, ni, elim, v;
size_t out, left, t;
const struct sysctlnode *enode, *onode;
struct sysctlnode qnode;
KASSERT(rw_lock_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_query: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (namelen != 1 || name[0] != CTL_QUERY)
return (EINVAL);
error = 0;
out = 0;
left = *oldlenp;
elim = 0;
enode = NULL;
/*
* translate the given request to a current node
*/
error = sysctl_cvt_in(l, &v, newp, newlen, &qnode);
if (error)
return (error);
/*
* if the request specifies a version, check it
*/
if (qnode.sysctl_ver != 0) {
enode = rnode;
if (qnode.sysctl_ver != enode->sysctl_ver &&
qnode.sysctl_ver != sysctl_rootof(enode)->sysctl_ver)
return (EINVAL);
}
/*
* process has overlay tree
*/
if (l && l->l_proc->p_emul->e_sysctlovly) {
enode = l->l_proc->p_emul->e_sysctlovly;
elim = (name - oname);
error = sysctl_locate(l, oname, elim, &enode, NULL);
if (error == 0) {
/* ah, found parent in overlay */
elim = enode->sysctl_clen;
enode = enode->sysctl_child;
} else {
error = 0;
elim = 0;
enode = NULL;
}
}
for (ni = 0; ni < rnode->sysctl_clen; ni++) {
onode = &rnode->sysctl_child[ni];
if (enode && enode->sysctl_num == onode->sysctl_num) {
if (SYSCTL_TYPE(enode->sysctl_flags) != CTLTYPE_NODE)
onode = enode;
if (--elim > 0)
enode++;
else
enode = NULL;
}
error = sysctl_cvt_out(l, v, onode, oldp, left, &t);
if (error)
return (error);
if (oldp != NULL)
oldp = (char*)oldp + t;
out += t;
left -= MIN(left, t);
}
/*
* overlay trees *MUST* be entirely consumed
*/
KASSERT(enode == NULL);
*oldlenp = out;
return (error);
}
/*
* sysctl_create -- Adds a node (the description of which is taken
* from newp) to the tree, returning a copy of it in the space pointed
* to by oldp. In the event that the requested slot is already taken
* (either by name or by number), the offending node is returned
* instead. Yes, this is complex, but we want to make sure everything
* is proper.
*/
#ifdef SYSCTL_DEBUG_CREATE
int _sysctl_create(SYSCTLFN_ARGS);
int
_sysctl_create(SYSCTLFN_ARGS)
#else
int
sysctl_create(SYSCTLFN_ARGS)
#endif
{
struct sysctlnode nnode, *node, *pnode;
int error, ni, at, nm, type, nsz, sz, flags, anum, v;
void *own;
KASSERT(rw_write_held(&sysctl_treelock));
error = 0;
own = NULL;
anum = -1;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_create: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (namelen != 1 || (name[namelen - 1] != CTL_CREATE
#if NKSYMS > 0
&& name[namelen - 1] != CTL_CREATESYM
#endif /* NKSYMS > 0 */
))
return (EINVAL);
/*
* processes can only add nodes at securelevel 0, must be
* root, and can't add nodes to a parent that's not writeable
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_ADD, NULL, NULL, NULL);
if (error)
return (error);
if (!(rnode->sysctl_flags & CTLFLAG_READWRITE))
#endif /* SYSCTL_DISALLOW_CREATE */
return (EPERM);
}
/*
* nothing can add a node if:
* we've finished initial set up of this tree and
* (the tree itself is not writeable or
* the entire sysctl system is not writeable)
*/
	if ((sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_PERMANENT) &&
	    (!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE)))
return (EPERM);
/*
 * it must be a "node", not an "int" or something
*/
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (rnode->sysctl_flags & CTLFLAG_ALIAS) {
printf("sysctl_create: attempt to add node to aliased "
"node %p\n", rnode);
return (EINVAL);
}
pnode = __UNCONST(rnode); /* we are adding children to this node */
if (newp == NULL)
return (EINVAL);
error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return (error);
/*
* nodes passed in don't *have* parents
*/
if (nnode.sysctl_parent != NULL)
return (EINVAL);
/*
* if we are indeed adding it, it should be a "good" name and
* number
*/
nm = nnode.sysctl_num;
#if NKSYMS > 0
if (nm == CTL_CREATESYM)
nm = CTL_CREATE;
#endif /* NKSYMS > 0 */
if (nm < 0 && nm != CTL_CREATE)
return (EINVAL);
/*
* the name can't start with a digit
*/
if (nnode.sysctl_name[0] >= '0' &&
nnode.sysctl_name[0] <= '9')
return (EINVAL);
/*
* the name must be only alphanumerics or - or _, longer than
* 0 bytes and less than SYSCTL_NAMELEN
*/
nsz = 0;
	while (nsz < SYSCTL_NAMELEN && nnode.sysctl_name[nsz] != '\0') {
		if ((nnode.sysctl_name[nsz] >= '0' &&
nnode.sysctl_name[nsz] <= '9') ||
(nnode.sysctl_name[nsz] >= 'A' &&
nnode.sysctl_name[nsz] <= 'Z') ||
(nnode.sysctl_name[nsz] >= 'a' &&
nnode.sysctl_name[nsz] <= 'z') ||
nnode.sysctl_name[nsz] == '-' ||
nnode.sysctl_name[nsz] == '_')
nsz++;
else
return (EINVAL);
}
if (nsz == 0 || nsz == SYSCTL_NAMELEN)
return (EINVAL);
/*
* various checks revolve around size vs type, etc
*/
type = SYSCTL_TYPE(nnode.sysctl_flags);
flags = SYSCTL_FLAGS(nnode.sysctl_flags);
sz = nnode.sysctl_size;
/*
* find out if there's a collision, and if so, let the caller
* know what they collided with
*/
node = pnode->sysctl_child;
at = 0;
	if (node) {
		if ((flags | node->sysctl_flags) & CTLFLAG_ANYNUMBER)
/* No siblings for a CTLFLAG_ANYNUMBER node */
return EINVAL;
		for (ni = 0; ni < pnode->sysctl_clen; ni++) {
			if (nm == node[ni].sysctl_num ||
strcmp(nnode.sysctl_name, node[ni].sysctl_name) == 0) {
/*
* ignore error here, since we
* are already fixed on EEXIST
*/
(void)sysctl_cvt_out(l, v, &node[ni], oldp,
*oldlenp, oldlenp);
return (EEXIST);
}
if (nm > node[ni].sysctl_num)
at++;
}
}
/*
* use sysctl_ver to add to the tree iff it hasn't changed
*/
if (nnode.sysctl_ver != 0) {
/*
* a specified value must match either the parent
* node's version or the root node's version
*/
if (nnode.sysctl_ver != sysctl_rootof(rnode)->sysctl_ver &&
nnode.sysctl_ver != rnode->sysctl_ver) {
return (EINVAL);
}
}
/*
* only the kernel can assign functions to entries
*/
if (l != NULL && nnode.sysctl_func != NULL)
return (EPERM);
/*
* only the kernel can create permanent entries, and only then
* before the kernel is finished setting itself up
*/
if (l != NULL && (flags & ~SYSCTL_USERFLAGS))
return (EPERM);
if ((flags & CTLFLAG_PERMANENT) &
(sysctl_root.sysctl_flags & CTLFLAG_PERMANENT))
return (EPERM);
if ((flags & (CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE)) ==
(CTLFLAG_OWNDATA | CTLFLAG_IMMEDIATE))
return (EINVAL);
if ((flags & CTLFLAG_IMMEDIATE) &&
type != CTLTYPE_INT && type != CTLTYPE_QUAD && type != CTLTYPE_BOOL)
return (EINVAL);
/*
* check size, or set it if unset and we can figure it out.
* kernel created nodes are allowed to have a function instead
* of a size (or a data pointer).
*/
switch (type) {
case CTLTYPE_NODE:
/*
* only *i* can assert the size of a node
*/
if (flags & CTLFLAG_ALIAS) {
anum = nnode.sysctl_alias;
if (anum < 0)
return (EINVAL);
nnode.sysctl_alias = 0;
}
if (sz != 0 || nnode.sysctl_data != NULL)
return (EINVAL);
if (nnode.sysctl_csize != 0 ||
nnode.sysctl_clen != 0 ||
nnode.sysctl_child != 0)
return (EINVAL);
if (flags & CTLFLAG_OWNDATA)
return (EINVAL);
sz = sizeof(struct sysctlnode);
break;
case CTLTYPE_INT:
/*
* since an int is an int, if the size is not given or
* is wrong, we can "int-uit" it.
*/
if (sz != 0 && sz != sizeof(int))
return (EINVAL);
sz = sizeof(int);
break;
case CTLTYPE_STRING:
/*
* strings are a little more tricky
*/
if (sz == 0) {
if (l == NULL) {
				if (nnode.sysctl_func == NULL) {
					if (nnode.sysctl_data == NULL)
return (EINVAL);
else
sz = strlen(nnode.sysctl_data) +
1;
}
} else if (nnode.sysctl_data == NULL &&
flags & CTLFLAG_OWNDATA) {
return (EINVAL);
} else {
char *vp, *e;
size_t s;
/*
* we want a rough idea of what the
* size is now
*/
vp = malloc(PAGE_SIZE, M_SYSCTLDATA, M_WAITOK);
if (vp == NULL)
return (ENOMEM);
e = nnode.sysctl_data;
do {
error = copyinstr(e, vp, PAGE_SIZE, &s);
if (error) {
if (error != ENAMETOOLONG) {
free(vp, M_SYSCTLDATA);
return (error);
}
e += PAGE_SIZE;
if ((e - 32 * PAGE_SIZE) >
(char*)nnode.sysctl_data) {
free(vp, M_SYSCTLDATA);
return (ERANGE);
}
}
} while (error != 0);
sz = s + (e - (char*)nnode.sysctl_data);
free(vp, M_SYSCTLDATA);
}
}
break;
case CTLTYPE_QUAD:
if (sz != 0 && sz != sizeof(u_quad_t))
return (EINVAL);
sz = sizeof(u_quad_t);
break;
case CTLTYPE_BOOL:
/*
		 * since a bool is a bool, if the size is not given or
* is wrong, we can "intuit" it.
*/
if (sz != 0 && sz != sizeof(bool))
return (EINVAL);
sz = sizeof(bool);
break;
case CTLTYPE_STRUCT:
		if (sz == 0) {
			if (l != NULL || nnode.sysctl_func == NULL)
return (EINVAL);
if (flags & CTLFLAG_OWNDATA)
return (EINVAL);
}
break;
default:
return (EINVAL);
}
/*
* at this point, if sz is zero, we *must* have a
* function to go with it and we can't own it.
*/
/*
* l ptr own
* 0 0 0 -> EINVAL (if no func)
* 0 0 1 -> own
* 0 1 0 -> kptr
* 0 1 1 -> kptr
* 1 0 0 -> EINVAL
* 1 0 1 -> own
* 1 1 0 -> kptr, no own (fault on lookup)
* 1 1 1 -> uptr, own
*/
if (type != CTLTYPE_NODE) {
if (sz != 0) {
if (flags & CTLFLAG_OWNDATA) {
own = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (own == NULL)
return ENOMEM;
if (nnode.sysctl_data == NULL)
memset(own, 0, sz);
else {
error = sysctl_copyin(l,
nnode.sysctl_data, own, sz);
					if (error != 0) {
						free(own, M_SYSCTLDATA);
return (error);
}
}
} else if ((nnode.sysctl_data != NULL) &&
!(flags & CTLFLAG_IMMEDIATE)) {
#if NKSYMS > 0
if (name[namelen - 1] == CTL_CREATESYM) {
char symname[128]; /* XXX enough? */
u_long symaddr;
size_t symlen;
error = sysctl_copyinstr(l,
nnode.sysctl_data, symname,
sizeof(symname), &symlen);
if (error)
return (error);
error = ksyms_getval(NULL, symname,
&symaddr, KSYMS_EXTERN);
if (error)
return (error); /* EINVAL? */
nnode.sysctl_data = (void*)symaddr;
}
#endif /* NKSYMS > 0 */
/*
* Ideally, we'd like to verify here
* that this address is acceptable,
* but...
*
* - it might be valid now, only to
* become invalid later
*
* - it might be invalid only for the
* moment and valid later
*
* - or something else.
*
* Since we can't get a good answer,
* we'll just accept the address as
* given, and fault on individual
* lookups.
*/
}
} else if (nnode.sysctl_func == NULL)
return (EINVAL);
}
/*
* a process can't assign a function to a node, and the kernel
* can't create a node that has no function or data.
* (XXX somewhat redundant check)
*/
	if (l != NULL || nnode.sysctl_func == NULL) {
		if (type != CTLTYPE_NODE &&
!(flags & CTLFLAG_IMMEDIATE) &&
nnode.sysctl_data == NULL &&
own == NULL)
return (EINVAL);
}
#ifdef SYSCTL_DISALLOW_KWRITE
/*
* a process can't create a writable node unless it refers to
* new data.
*/
if (l != NULL && own == NULL && type != CTLTYPE_NODE &&
(flags & CTLFLAG_READWRITE) != CTLFLAG_READONLY &&
!(flags & CTLFLAG_IMMEDIATE))
return (EPERM);
#endif /* SYSCTL_DISALLOW_KWRITE */
/*
* make sure there's somewhere to put the new stuff.
*/
if (pnode->sysctl_child == NULL) {
if (flags & CTLFLAG_ANYNUMBER)
error = sysctl_alloc(pnode, 1);
else
error = sysctl_alloc(pnode, 0);
if (error) {
			if (own != NULL)
				free(own, M_SYSCTLDATA);
return (error);
}
}
node = pnode->sysctl_child;
/*
* no collisions, so pick a good dynamic number if we need to.
*/
if (nm == CTL_CREATE) {
nm = ++sysctl_root.sysctl_num;
for (ni = 0; ni < pnode->sysctl_clen; ni++) {
if (nm == node[ni].sysctl_num) {
nm++;
ni = -1;
} else if (nm > node[ni].sysctl_num)
at = ni + 1;
}
}
/*
* oops...ran out of space
*/
	if (pnode->sysctl_clen == pnode->sysctl_csize) {
		error = sysctl_realloc(pnode);
if (error) {
			if (own != NULL)
				free(own, M_SYSCTLDATA);
return (error);
}
node = pnode->sysctl_child;
}
/*
* insert new node data
*/
if (at < pnode->sysctl_clen) {
int t;
/*
* move the nodes that should come after the new one
*/
memmove(&node[at + 1], &node[at],
(pnode->sysctl_clen - at) * sizeof(struct sysctlnode));
memset(&node[at], 0, sizeof(struct sysctlnode));
node[at].sysctl_parent = pnode;
/*
* and...reparent any children of any moved nodes
*/
		for (ni = at; ni <= pnode->sysctl_clen; ni++)
			if (node[ni].sysctl_child != NULL)
				for (t = 0; t < node[ni].sysctl_csize; t++)
node[ni].sysctl_child[t].sysctl_parent =
&node[ni];
}
node = &node[at];
pnode->sysctl_clen++;
strlcpy(node->sysctl_name, nnode.sysctl_name,
sizeof(node->sysctl_name));
node->sysctl_num = nm;
node->sysctl_size = sz;
node->sysctl_flags = SYSCTL_VERSION|type|flags; /* XXX other trees */
node->sysctl_csize = 0;
node->sysctl_clen = 0;
if (own) {
node->sysctl_data = own;
node->sysctl_flags |= CTLFLAG_OWNDATA;
} else if (flags & CTLFLAG_ALIAS) {
node->sysctl_alias = anum;
} else if (flags & CTLFLAG_IMMEDIATE) {
switch (type) {
case CTLTYPE_BOOL:
node->sysctl_bdata = nnode.sysctl_bdata;
break;
case CTLTYPE_INT:
node->sysctl_idata = nnode.sysctl_idata;
break;
case CTLTYPE_QUAD:
node->sysctl_qdata = nnode.sysctl_qdata;
break;
}
} else {
node->sysctl_data = nnode.sysctl_data;
node->sysctl_flags &= ~CTLFLAG_OWNDATA;
}
node->sysctl_func = nnode.sysctl_func;
node->sysctl_child = NULL;
/* node->sysctl_parent should already be done */
/*
* update "version" on path to "root"
*/
for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent)
;
pnode = node;
for (nm = rnode->sysctl_ver + 1; pnode != NULL;
pnode = pnode->sysctl_parent)
pnode->sysctl_ver = nm;
/* If this fails, the node is already added - the user won't know! */
error = sysctl_cvt_out(l, v, node, oldp, *oldlenp, oldlenp);
return (error);
}
/*
* ********************************************************************
* A wrapper around sysctl_create() that prints the thing we're trying
* to add.
* ********************************************************************
*/
#ifdef SYSCTL_DEBUG_CREATE
int
sysctl_create(SYSCTLFN_ARGS)
{
const struct sysctlnode *node;
int k, v, rc, ni, nl = namelen + (name - oname);
struct sysctlnode nnode;
if (newp == NULL)
return EINVAL;
int error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return error;
node = &nnode;
printf("namelen %d (", nl);
for (ni = 0; ni < nl - 1; ni++)
printf(" %d", oname[ni]);
printf(" %d )\t[%s]\tflags %08x (%08x %d %zu)\n",
k = node->sysctl_num,
node->sysctl_name,
node->sysctl_flags,
SYSCTL_FLAGS(node->sysctl_flags),
SYSCTL_TYPE(node->sysctl_flags),
node->sysctl_size);
node = rnode;
rc = _sysctl_create(SYSCTLFN_CALL(rnode));
printf("sysctl_create(");
for (ni = 0; ni < nl - 1; ni++)
printf(" %d", oname[ni]);
printf(" %d ) returned %d\n", k, rc);
return (rc);
}
#endif /* SYSCTL_DEBUG_CREATE */
/*
* sysctl_destroy -- Removes a node (as described by newp) from the
* given tree, returning (if successful) a copy of the dead node in
* oldp. Since we're removing stuff, there's not much to check.
*/
int
sysctl_destroy(SYSCTLFN_ARGS)
{
struct sysctlnode *node, *pnode, onode, nnode;
int ni, error, v;
KASSERT(rw_write_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_destroy: rnode %p wrong version\n", rnode);
return (EINVAL);
}
error = 0;
if (namelen != 1 || name[namelen - 1] != CTL_DESTROY)
return (EINVAL);
/*
* processes can only destroy nodes at securelevel 0, must be
* root, and can't remove nodes from a parent that's not
* writeable
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_DELETE, NULL, NULL, NULL);
if (error)
return (error);
if (!(rnode->sysctl_flags & CTLFLAG_READWRITE))
#endif /* SYSCTL_DISALLOW_CREATE */
return (EPERM);
}
/*
* nothing can remove a node if:
* the node is permanent (checked later) or
* the tree itself is not writeable or
* the entire sysctl system is not writeable
*
* note that we ignore whether setup is complete or not,
* because these rules always apply.
*/
if (!(sysctl_rootof(rnode)->sysctl_flags & CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))
return (EPERM);
if (newp == NULL)
return (EINVAL);
error = sysctl_cvt_in(l, &v, newp, newlen, &nnode);
if (error)
return (error);
memset(&onode, 0, sizeof(struct sysctlnode));
node = rnode->sysctl_child;
for (ni = 0; ni < rnode->sysctl_clen; ni++) {
if (nnode.sysctl_num == node[ni].sysctl_num) {
/*
* if name specified, must match
*/
if (nnode.sysctl_name[0] != '\0' &&
strcmp(nnode.sysctl_name, node[ni].sysctl_name))
continue;
/*
* if version specified, must match
*/
if (nnode.sysctl_ver != 0 &&
nnode.sysctl_ver != node[ni].sysctl_ver)
continue;
/*
* this must be the one
*/
break;
}
}
if (ni == rnode->sysctl_clen)
return (ENOENT);
node = &node[ni];
pnode = node->sysctl_parent;
/*
* if the kernel says permanent, it is, so there. nyah.
*/
if (SYSCTL_FLAGS(node->sysctl_flags) & CTLFLAG_PERMANENT)
return (EPERM);
/*
* can't delete non-empty nodes
*/
if (SYSCTL_TYPE(node->sysctl_flags) == CTLTYPE_NODE &&
node->sysctl_clen != 0)
return (ENOTEMPTY);
/*
* if the node "owns" data, release it now
*/
if (node->sysctl_flags & CTLFLAG_OWNDATA) {
if (node->sysctl_data != NULL)
free(node->sysctl_data, M_SYSCTLDATA);
node->sysctl_data = NULL;
}
if (node->sysctl_flags & CTLFLAG_OWNDESC) {
if (node->sysctl_desc != NULL)
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA);
node->sysctl_desc = NULL;
}
/*
* if the node to be removed is not the last one on the list,
* move the remaining nodes up, and reparent any grandchildren
*/
onode = *node;
if (ni < pnode->sysctl_clen - 1) {
int t;
memmove(&pnode->sysctl_child[ni], &pnode->sysctl_child[ni + 1],
(pnode->sysctl_clen - ni - 1) *
sizeof(struct sysctlnode));
for (; ni < pnode->sysctl_clen - 1; ni++)
if (SYSCTL_TYPE(pnode->sysctl_child[ni].sysctl_flags) ==
CTLTYPE_NODE)
for (t = 0;
t < pnode->sysctl_child[ni].sysctl_clen;
t++)
pnode->sysctl_child[ni].sysctl_child[t].
sysctl_parent =
&pnode->sysctl_child[ni];
ni = pnode->sysctl_clen - 1;
node = &pnode->sysctl_child[ni];
}
/*
* reset the space we just vacated
*/
memset(node, 0, sizeof(struct sysctlnode));
node->sysctl_parent = pnode;
pnode->sysctl_clen--;
/*
* if this parent just lost its last child, nuke the creche
*/
if (pnode->sysctl_clen == 0) {
free(pnode->sysctl_child, M_SYSCTLNODE);
pnode->sysctl_csize = 0;
pnode->sysctl_child = NULL;
}
/*
* update "version" on path to "root"
*/
for (; rnode->sysctl_parent != NULL; rnode = rnode->sysctl_parent)
;
for (ni = rnode->sysctl_ver + 1; pnode != NULL;
pnode = pnode->sysctl_parent)
pnode->sysctl_ver = ni;
error = sysctl_cvt_out(l, v, &onode, oldp, *oldlenp, oldlenp);
return (error);
}
/*
* sysctl_lookup -- Handles copyin/copyout of new and old values.
* Partial reads are globally allowed. Only root can write to things
* unless the node says otherwise.
*/
int
sysctl_lookup(SYSCTLFN_ARGS)
{
int error, rw;
size_t sz, len;
void *d;
KASSERT(rw_lock_held(&sysctl_treelock));
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("%s: rnode %p wrong version\n", __func__, rnode);
return EINVAL;
}
if (newlen == 0)
newp = NULL;
error = 0;
/*
* you can't "look up" a node. you can "query" it, but you
* can't "look it up".
*/
if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_NODE || namelen != 0) {
DPRINTF(("%s: can't lookup a node\n", __func__));
return EINVAL;
}
/*
* some nodes are private, so only root can look into them.
*/
if (l != NULL && (rnode->sysctl_flags & CTLFLAG_PRIVATE) &&
(error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)) != 0) {
DPRINTF(("%s: private node\n", __func__));
return error;
}
/*
* if a node wants to be writable according to different rules
* other than "only root can write to stuff unless a flag is
* set", then it needs its own function which should have been
* called and not us.
*/
if (l != NULL && newp != NULL && !(rnode->sysctl_flags & CTLFLAG_ANYWRITE) &&
(error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL, KAUTH_REQ_SYSTEM_SYSCTL_MODIFY, NULL, NULL,
NULL)) != 0) {
DPRINTF(("%s: can't modify\n", __func__));
return error;
}
/*
* is this node supposedly writable?
*/
rw = (rnode->sysctl_flags & CTLFLAG_READWRITE) ? 1 : 0;
/*
* it appears not to be writable at this time, so if someone
* tried to write to it, we must tell them to go away
*/
if (!rw && newp != NULL) {
DPRINTF(("%s: not writable\n", __func__));
return EPERM;
}
/*
* step one, copy out the stuff we have presently
*/
if (rnode->sysctl_flags & CTLFLAG_IMMEDIATE) {
/*
* note that we discard const here because we are
* modifying the contents of the node (which is okay
* because it's ours)
*
* It also doesn't matter which field of the union we pick.
*/
d = __UNCONST(&rnode->sysctl_qdata);
} else
d = rnode->sysctl_data;
if (SYSCTL_TYPE(rnode->sysctl_flags) == CTLTYPE_STRING)
sz = strlen(d) + 1; /* XXX@@@ possible fault here */
else
		sz = rnode->sysctl_size;
	if (oldp != NULL) {
		error = sysctl_copyout(l, d, oldp, MIN(sz, *oldlenp));
		if (error) {
DPRINTF(("%s: bad copyout %d\n", __func__, error));
return error;
}
}
*oldlenp = sz;
/*
* are we done?
*/
if (newp == NULL)
return 0;
/*
* hmm...not done. must now "copy in" new value. re-adjust
* sz to maximum value (strings are "weird").
*/
sz = rnode->sysctl_size;
switch (SYSCTL_TYPE(rnode->sysctl_flags)) {
case CTLTYPE_BOOL: {
bool tmp;
/*
* these data must be *exactly* the same size coming
* in. bool may only be true or false.
*/
if (newlen != sz) {
DPRINTF(("%s: bad size %zu != %zu\n", __func__, newlen,
sz));
return EINVAL;
}
		error = sysctl_copyin(l, newp, &tmp, sz);
		if (error)
break;
if (tmp != true && tmp != false) {
DPRINTF(("%s: tmp %d\n", __func__, tmp));
return EINVAL;
}
*(bool *)d = tmp;
break;
}
case CTLTYPE_INT:
case CTLTYPE_QUAD:
case CTLTYPE_STRUCT:
/*
* these data must be *exactly* the same size coming
* in.
*/
if (newlen != sz)
goto bad_size;
error = sysctl_copyin(l, newp, d, sz);
rnd_add_data(NULL, d, sz, 0);
break;
case CTLTYPE_STRING: {
/*
* strings, on the other hand, can be shorter, and we
* let userland be sloppy about the trailing nul.
*/
char *newbuf;
/*
* too much new string?
*/
if (newlen > sz)
goto bad_size;
/*
* temporary copy of new inbound string
*/
len = MIN(sz, newlen);
newbuf = malloc(len, M_SYSCTLDATA, M_WAITOK);
if (newbuf == NULL) {
DPRINTF(("%s: oomem %zu\n", __func__, len));
return ENOMEM;
}
error = sysctl_copyin(l, newp, newbuf, len);
if (error) {
free(newbuf, M_SYSCTLDATA);
DPRINTF(("%s: copyin %d\n", __func__, error));
return error;
}
/*
* did they NUL terminate it, or do we have space
* left to do it ourselves?
*/
if (newbuf[len - 1] != '\0' && len == sz) {
free(newbuf, M_SYSCTLDATA);
DPRINTF(("%s: string too long\n", __func__));
return EINVAL;
}
/*
* looks good, so pop it into place and zero the rest.
*/
		if (len > 0) {
			memcpy(d, newbuf, len);
rnd_add_data(NULL, d, len, 0);
}
		if (sz != len)
			memset((char*)d + len, 0, sz - len);
free(newbuf, M_SYSCTLDATA);
break;
}
default:
DPRINTF(("%s: bad type\n", __func__));
return EINVAL;
}
if (error) {
DPRINTF(("%s: copyin %d\n", __func__, error));
}
return error;
bad_size:
DPRINTF(("%s: bad size %zu > %zu\n", __func__, newlen, sz));
return EINVAL;
}
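/*
 * Illustrative sketch (guarded by #if 0, not part of the original
 * file): the usual way to build a per-node helper on top of
 * sysctl_lookup() is to copy the node, point sysctl_data at a local
 * variable, let sysctl_lookup() do the copyin/copyout, and then
 * validate the new value before committing it.  The "example_*"
 * names below are hypothetical.
 */
#if 0
static int example_value;

static int
example_helper(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int t, error;

	node = *rnode;
	t = example_value;
	node.sysctl_data = &t;
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;
	if (t < 0)		/* reject values this knob can't take */
		return EINVAL;
	example_value = t;
	return 0;
}
#endif	/* 0 */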
/*
* sysctl_mmap -- Dispatches sysctl mmap requests to those nodes that
* purport to handle it. This interface isn't fully fleshed out yet,
* unfortunately.
*/
static int
sysctl_mmap(SYSCTLFN_ARGS)
{
const struct sysctlnode *node;
struct sysctlnode nnode;
int error;
int sysctl_num;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_mmap: rnode %p wrong version\n", rnode);
return (EINVAL);
}
/*
* let's just pretend that didn't happen, m'kay?
*/
if (l == NULL)
return (EPERM);
/*
* is this a sysctlnode description of an mmap request?
*/
if (newp == NULL || newlen != sizeof(struct sysctlnode))
return (EINVAL);
error = sysctl_copyin(l, newp, &nnode, sizeof(nnode));
if (error)
return (error);
/*
* does the node they asked for exist?
*/
if (namelen != 1)
return (EOPNOTSUPP);
node = rnode;
sysctl_num = nnode.sysctl_num;
error = sysctl_locate(l, &sysctl_num, 1, &node, NULL);
if (error)
return (error);
/*
* does this node that we have found purport to handle mmap?
*/
if (node->sysctl_func == NULL ||
!(node->sysctl_flags & CTLFLAG_MMAP))
return (EOPNOTSUPP);
/*
* well...okay, they asked for it.
*/
return ((*node->sysctl_func)(SYSCTLFN_CALL(node)));
}
int
sysctl_describe(SYSCTLFN_ARGS)
{
struct sysctldesc *d;
void *bf;
size_t sz, left, tot;
int i, error, v = -1;
struct sysctlnode *node;
struct sysctlnode dnode;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_query: rnode %p wrong version\n", rnode);
return (EINVAL);
}
if (SYSCTL_TYPE(rnode->sysctl_flags) != CTLTYPE_NODE)
return (ENOTDIR);
if (namelen != 1 || name[0] != CTL_DESCRIBE)
return (EINVAL);
/*
* get ready...
*/
error = 0;
d = bf = malloc(MAXDESCLEN, M_TEMP, M_WAITOK);
if (bf == NULL)
return ENOMEM;
tot = 0;
node = rnode->sysctl_child;
left = *oldlenp;
/*
* no request -> all descriptions at this level
* request with desc unset -> just this node
* request with desc set -> set descr for this node
*/
if (newp != NULL) {
error = sysctl_cvt_in(l, &v, newp, newlen, &dnode);
if (error)
goto out;
if (dnode.sysctl_desc != NULL) {
/*
* processes cannot set descriptions above
* securelevel 0. and must be root. blah
* blah blah. a couple more checks are made
* once we find the node we want.
*/
if (l != NULL) {
#ifndef SYSCTL_DISALLOW_CREATE
error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_DESC, NULL,
NULL, NULL);
if (error)
goto out;
#else /* SYSCTL_DISALLOW_CREATE */
error = EPERM;
goto out;
#endif /* SYSCTL_DISALLOW_CREATE */
}
/*
* find node and try to set the description on it
*/
for (i = 0; i < rnode->sysctl_clen; i++)
if (node[i].sysctl_num == dnode.sysctl_num)
break;
if (i == rnode->sysctl_clen) {
error = ENOENT;
goto out;
}
node = &node[i];
/*
* did the caller specify a node version?
*/
if (dnode.sysctl_ver != 0 &&
dnode.sysctl_ver != node->sysctl_ver) {
error = EINVAL;
goto out;
}
/*
* okay...some rules:
* (1) if setup is done and the tree is
* read-only or the whole system is
* read-only
* (2) no one can set a description on a
* permanent node (it must be set when
* using createv)
* (3) processes cannot *change* a description
* (4) processes *can*, however, set a
* description on a read-only node so that
* one can be created and then described
* in two steps
* anything else come to mind?
*/
if ((sysctl_root.sysctl_flags & CTLFLAG_PERMANENT) &&
(!(sysctl_rootof(node)->sysctl_flags &
CTLFLAG_READWRITE) ||
!(sysctl_root.sysctl_flags & CTLFLAG_READWRITE))) {
error = EPERM;
goto out;
}
if (node->sysctl_flags & CTLFLAG_PERMANENT) {
error = EPERM;
goto out;
}
if (l != NULL && node->sysctl_desc != NULL) {
error = EPERM;
goto out;
}
/*
* right, let's go ahead. the first step is
* making the description into something the
* node can "own", if need be.
*/
if (l != NULL ||
dnode.sysctl_flags & CTLFLAG_OWNDESC) {
char *nd, *k;
k = malloc(MAXDESCLEN, M_TEMP, M_WAITOK);
if (k == NULL) {
error = ENOMEM;
goto out;
}
error = sysctl_copyinstr(l, dnode.sysctl_desc,
k, MAXDESCLEN, &sz);
if (error) {
free(k, M_TEMP);
goto out;
}
nd = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (nd == NULL) {
free(k, M_TEMP);
error = ENOMEM;
goto out;
}
memcpy(nd, k, sz);
dnode.sysctl_flags |= CTLFLAG_OWNDESC;
dnode.sysctl_desc = nd;
free(k, M_TEMP);
}
/*
* now "release" the old description and
* attach the new one. ta-da.
*/
if ((node->sysctl_flags & CTLFLAG_OWNDESC) &&
node->sysctl_desc != NULL)
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc), M_SYSCTLDATA);
node->sysctl_desc = dnode.sysctl_desc;
node->sysctl_flags |=
(dnode.sysctl_flags & CTLFLAG_OWNDESC);
/*
* now we "fall out" and into the loop which
* will copy the new description back out for
* those interested parties
*/
}
}
/*
* scan for one description or just retrieve all descriptions
*/
for (i = 0; i < rnode->sysctl_clen; i++) {
/*
* did they ask for the description of only one node?
*/
if (v != -1 && node[i].sysctl_num != dnode.sysctl_num)
continue;
/*
* don't describe "private" nodes to non-suser users
*/
if ((node[i].sysctl_flags & CTLFLAG_PRIVATE) && (l != NULL) &&
!(kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SYSCTL,
KAUTH_REQ_SYSTEM_SYSCTL_PRVT, NULL, NULL, NULL)))
continue;
/*
* is this description "valid"?
*/
memset(bf, 0, MAXDESCLEN);
if (node[i].sysctl_desc == NULL)
sz = 1;
else if (copystr(node[i].sysctl_desc, &d->descr_str[0],
MAXDESCLEN - sizeof(*d), &sz) != 0) {
/*
* erase possible partial description
*/
memset(bf, 0, MAXDESCLEN);
sz = 1;
}
/*
* we've got it, stuff it into the caller's buffer
*/
d->descr_num = node[i].sysctl_num;
d->descr_ver = node[i].sysctl_ver;
d->descr_len = sz; /* includes trailing nul */
sz = (char *)NEXT_DESCR(d) - (char *)d;
if (oldp != NULL && left >= sz) {
error = sysctl_copyout(l, d, oldp, sz);
if (error)
goto out;
left -= sz;
oldp = (void *)__sysc_desc_adv(oldp, d->descr_len);
}
tot += sz;
/*
* if we get this far with v not "unset", they asked
* for a specific node and we found it
*/
if (v != -1)
break;
}
/*
* did we find it after all?
*/
if (v != -1 && tot == 0)
error = ENOENT;
else
*oldlenp = tot;
out:
free(bf, M_TEMP);
return (error);
}
/*
* ********************************************************************
* Section 3: Create and destroy from inside the kernel
* ********************************************************************
* sysctl_createv() and sysctl_destroyv() are simpler-to-use
* interfaces for the kernel to fling new entries into the mib and rip
* them out later. In the case of sysctl_createv(), the returned copy
* of the node (see sysctl_create()) will be translated back into a
* pointer to the actual node.
*
* Note that sysctl_createv() will return 0 if the create request
* matches an existing node (ala mkdir -p), and that sysctl_destroyv()
* will return 0 if the node to be destroyed already does not exist
* (aka rm -f) or if it is a parent of other nodes.
*
* This allows two (or more) different subsystems to assert sub-tree
* existence before populating their own nodes, and to remove their
* own nodes without orphaning the others when they are done.
* ********************************************************************
*/
#undef sysctl_createv
int
sysctl_createv(struct sysctllog **log, int cflags,
const struct sysctlnode **rnode, const struct sysctlnode **cnode,
int flags, int type, const char *namep, const char *descr,
sysctlfn func, u_quad_t qv, void *newp, size_t newlen,
...)
{
va_list ap;
int error, ni, namelen, name[CTL_MAXNAME];
const struct sysctlnode *root, *pnode;
struct sysctlnode nnode, onode, *dnode;
size_t sz;
const struct sysctlnode *snode __diagused;
/*
* where are we putting this?
*/
	if (rnode != NULL && *rnode == NULL) {
		printf("sysctl_createv: rnode NULL\n");
return (EINVAL);
}
root = rnode ? *rnode : NULL;
	if (cnode != NULL)
		*cnode = NULL;
	if (cflags != 0)
return (EINVAL);
/*
* what is it?
*/
flags = SYSCTL_VERSION|SYSCTL_TYPE(type)|SYSCTL_FLAGS(flags);
if (log != NULL)
flags &= ~CTLFLAG_PERMANENT;
/*
* where do we put it?
*/
va_start(ap, newlen);
namelen = 0;
error = 0;
ni = -1;
do {
if (++ni == CTL_MAXNAME) {
error = ENAMETOOLONG;
break;
}
name[ni] = va_arg(ap, int);
/*
* sorry, this is not supported from here
*/
if (name[ni] == CTL_CREATESYM) {
error = EINVAL;
break;
}
} while (name[ni] != CTL_EOL && name[ni] != CTL_CREATE);
va_end(ap);
if (error)
return error;
namelen = ni + (name[ni] == CTL_CREATE ? 1 : 0);
/*
* what's it called
*/
if (strlcpy(nnode.sysctl_name, namep, sizeof(nnode.sysctl_name)) >=
sizeof(nnode.sysctl_name))
return (ENAMETOOLONG);
/*
* cons up the description of the new node
*/
nnode.sysctl_num = name[namelen - 1];
name[namelen - 1] = CTL_CREATE;
nnode.sysctl_size = newlen;
nnode.sysctl_flags = flags;
if (type == CTLTYPE_NODE) {
nnode.sysctl_csize = 0;
nnode.sysctl_clen = 0;
nnode.sysctl_child = NULL;
		if (flags & CTLFLAG_ALIAS)
			nnode.sysctl_alias = qv;
} else if (flags & CTLFLAG_IMMEDIATE) {
switch (type) {
case CTLTYPE_BOOL:
nnode.sysctl_bdata = qv;
break;
case CTLTYPE_INT:
nnode.sysctl_idata = qv;
break;
case CTLTYPE_QUAD:
nnode.sysctl_qdata = qv;
break;
default:
return (EINVAL);
}
} else {
nnode.sysctl_data = newp;
}
nnode.sysctl_func = func;
nnode.sysctl_parent = NULL;
nnode.sysctl_ver = 0;
/*
* initialize lock state -- we need locks if the main tree has
* been marked as complete, but since we could be called from
* either there, or from a device driver (say, at device
* insertion), or from a module (at module load time, say), we
* don't really want to "wait"...
*/
sysctl_lock(true);
/*
* locate the prospective parent of the new node, and if we
* find it, add the new node.
*/
sz = sizeof(onode);
pnode = root;
error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni);
if (error) {
/*
* XXX: If you are seeing this printf in early bringup
* stages, perhaps your setfault is not functioning and
* thus kcopy() is mis-behaving.
*/
printf("sysctl_createv: sysctl_locate(%s) returned %d\n",
nnode.sysctl_name, error);
sysctl_unlock();
return (error);
}
error = sysctl_create(&name[ni], namelen - ni, &onode, &sz,
&nnode, sizeof(nnode), &name[0], NULL,
pnode);
/*
* unfortunately the node we wanted to create is already
* there. if the node that's already there is a reasonable
* facsimile of the node we wanted to create, just pretend
* (for the caller's benefit) that we managed to create the
* node they wanted.
*/
if (error == EEXIST) {
/* name is the same as requested... */
if (strcmp(nnode.sysctl_name, onode.sysctl_name) == 0 &&
/* they want the same function... */
nnode.sysctl_func == onode.sysctl_func &&
/* number is the same as requested, or... */
(nnode.sysctl_num == onode.sysctl_num ||
/* they didn't pick a number... */
nnode.sysctl_num == CTL_CREATE)) {
/*
* collision here from trying to create
* something that already existed; let's give
* our customers a hand and tell them they got
* what they wanted.
*/
#ifdef SYSCTL_DEBUG_CREATE
printf("cleared\n");
#endif /* SYSCTL_DEBUG_CREATE */
error = 0;
}
}
if (error == 0 &&
(cnode != NULL || log != NULL || descr != NULL)) {
/*
* sysctl_create() gave us back a copy of the node,
* but we need to know where it actually is...
*/
pnode = root;
error = sysctl_locate(NULL, &name[0], namelen - 1, &pnode, &ni);
snode = pnode;
/*
* manual scan of last layer so that aliased nodes
* aren't followed.
*/
		if (error == 0) {
			for (ni = 0; ni < pnode->sysctl_clen; ni++)
if (pnode->sysctl_child[ni].sysctl_num ==
onode.sysctl_num)
break;
if (ni < pnode->sysctl_clen)
pnode = &pnode->sysctl_child[ni];
else
error = ENOENT;
}
/*
* not expecting an error here, but...
*/
if (error == 0) {
KASSERTMSG(pnode->sysctl_parent == snode,
"sysctl parent mis-match pnode %s, snode %s",
pnode->sysctl_name, snode->sysctl_name);
			if (log != NULL)
				sysctl_log_add(log, pnode);
			if (cnode != NULL)
*cnode = pnode;
if (descr != NULL) {
/*
* allow first caller to *set* a
* description actually to set it
*
* discard const here so we can attach
* the description
*/
dnode = __UNCONST(pnode);
if (pnode->sysctl_desc != NULL)
/* skip it...we've got one */;
else if (flags & CTLFLAG_OWNDESC) {
size_t l = strlen(descr) + 1;
char *d = malloc(l, M_SYSCTLDATA,
M_WAITOK);
					if (d != NULL) {
						memcpy(d, descr, l);
dnode->sysctl_desc = d;
dnode->sysctl_flags |=
CTLFLAG_OWNDESC;
}
} else
dnode->sysctl_desc = descr;
}
} else {
printf("sysctl_create succeeded but node not found?!\n");
/*
* confusing, but the create said it
* succeeded, so...
*/
error = 0;
}
}
/*
* now it should be safe to release the lock state. note that
* the pointer to the newly created node being passed back may
* not be "good" for very long.
*/
sysctl_unlock();
if (error != 0) {
printf("sysctl_createv: sysctl_create(%s) returned %d\n",
nnode.sysctl_name, error);
#if 0
if (error != ENOENT)
sysctl_dump(&onode);
#endif
}
return (error);
}
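/*
 * Illustrative sketch (guarded by #if 0, not part of the original
 * file): a typical in-kernel caller builds its subtree with
 * sysctl_createv(), logging the additions so they can later be undone
 * with sysctl_teardown().  The "example_*" names and the placement
 * under CTL_KERN are hypothetical.
 */
#if 0
static struct sysctllog *example_log;
static int example_value;

static void
example_sysctl_setup(void)
{
	const struct sysctlnode *node = NULL;

	/* create (or re-use) the parent node with a dynamic number */
	sysctl_createv(&example_log, 0, NULL, &node,
	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "example",
	    SYSCTL_DESCR("example subtree"),
	    NULL, 0, NULL, 0,
	    CTL_KERN, CTL_CREATE, CTL_EOL);
	if (node == NULL)
		return;

	/* hang a read-write integer leaf off it */
	sysctl_createv(&example_log, 0, &node, NULL,
	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "value",
	    SYSCTL_DESCR("an integer knob"),
	    NULL, 0, &example_value, 0,
	    CTL_CREATE, CTL_EOL);
}

static void
example_sysctl_fini(void)
{
	/* removes everything recorded in example_log */
	sysctl_teardown(&example_log);
}
#endif	/* 0 */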
int
sysctl_destroyv(struct sysctlnode *rnode, ...)
{
va_list ap;
int error, name[CTL_MAXNAME], namelen, ni;
const struct sysctlnode *pnode, *node;
struct sysctlnode dnode, *onode;
size_t sz;
va_start(ap, rnode);
namelen = 0;
ni = 0;
do {
if (ni == CTL_MAXNAME) {
va_end(ap);
return (ENAMETOOLONG);
}
name[ni] = va_arg(ap, int);
} while (name[ni++] != CTL_EOL);
namelen = ni - 1;
va_end(ap);
/*
* i can't imagine why we'd be destroying a node when the tree
* wasn't complete, but who knows?
*/
sysctl_lock(true);
/*
* where is it?
*/
node = rnode;
error = sysctl_locate(NULL, &name[0], namelen - 1, &node, &ni);
if (error) {
/* they want it gone and it's not there, so... */
sysctl_unlock();
return (error == ENOENT ? 0 : error);
}
/*
* set up the deletion
*/
pnode = node;
node = &dnode;
memset(&dnode, 0, sizeof(dnode));
dnode.sysctl_flags = SYSCTL_VERSION;
dnode.sysctl_num = name[namelen - 1];
/*
* we found it, now let's nuke it
*/
name[namelen - 1] = CTL_DESTROY;
sz = 0;
error = sysctl_destroy(&name[namelen - 1], 1, NULL, &sz,
node, sizeof(*node), &name[0], NULL,
pnode);
if (error == ENOTEMPTY) {
/*
* think of trying to delete "foo" when "foo.bar"
* (which someone else put there) is still in
* existence
*/
error = 0;
/*
* dunno who put the description there, but if this
* node can ever be removed, we need to make sure the
* string doesn't go out of context. that means we
* need to find the node that's still there (don't use
* sysctl_locate() because that follows aliasing).
*/
node = pnode->sysctl_child;
for (ni = 0; ni < pnode->sysctl_clen; ni++)
if (node[ni].sysctl_num == dnode.sysctl_num)
break;
node = (ni < pnode->sysctl_clen) ? &node[ni] : NULL;
/*
* if we found it, and this node has a description,
* and this node can be released, and it doesn't
* already own its own description...sigh. :)
*/
if (node != NULL && node->sysctl_desc != NULL &&
!(node->sysctl_flags & CTLFLAG_PERMANENT) &&
!(node->sysctl_flags & CTLFLAG_OWNDESC)) {
char *d;
sz = strlen(node->sysctl_desc) + 1;
d = malloc(sz, M_SYSCTLDATA, M_WAITOK);
if (d != NULL) {
/*
* discard const so that we can
* re-attach the description
*/
memcpy(d, node->sysctl_desc, sz);
onode = __UNCONST(node);
onode->sysctl_desc = d;
onode->sysctl_flags |= CTLFLAG_OWNDESC;
} else {
/*
* XXX drop the description? be
* afraid? don't care?
*/
}
}
}
sysctl_unlock();
return (error);
}
/*
* ********************************************************************
* Deletes an entire n-ary tree. Not recommended unless you know why
* you're doing it. Personally, I don't know why you'd even think
* about it.
* ********************************************************************
*/
void
sysctl_free(struct sysctlnode *rnode)
{
struct sysctlnode *node, *pnode;
rw_enter(&sysctl_treelock, RW_WRITER);
if (rnode == NULL)
rnode = &sysctl_root;
if (SYSCTL_VERS(rnode->sysctl_flags) != SYSCTL_VERSION) {
printf("sysctl_free: rnode %p wrong version\n", rnode);
rw_exit(&sysctl_treelock);
return;
}
pnode = rnode;
node = pnode->sysctl_child;
do {
while (node != NULL && pnode->sysctl_csize > 0) {
while (node <
&pnode->sysctl_child[pnode->sysctl_clen] &&
(SYSCTL_TYPE(node->sysctl_flags) !=
CTLTYPE_NODE ||
node->sysctl_csize == 0)) {
if (SYSCTL_FLAGS(node->sysctl_flags) &
CTLFLAG_OWNDATA) {
if (node->sysctl_data != NULL) {
free(node->sysctl_data,
M_SYSCTLDATA);
node->sysctl_data = NULL;
}
}
if (SYSCTL_FLAGS(node->sysctl_flags) &
CTLFLAG_OWNDESC) {
if (node->sysctl_desc != NULL) {
/*XXXUNCONST*/
free(__UNCONST(node->sysctl_desc),
M_SYSCTLDATA);
node->sysctl_desc = NULL;
}
}
node++;
}
if (node < &pnode->sysctl_child[pnode->sysctl_clen]) {
pnode = node;
node = node->sysctl_child;
} else
break;
}
if (pnode->sysctl_child != NULL)
free(pnode->sysctl_child, M_SYSCTLNODE);
pnode->sysctl_clen = 0;
pnode->sysctl_csize = 0;
pnode->sysctl_child = NULL;
node = pnode;
pnode = node->sysctl_parent;
} while (pnode != NULL && node != rnode);
rw_exit(&sysctl_treelock);
}
void
sysctl_log_print(const struct sysctllog *slog)
{
int i, len;
printf("root %p left %d size %d content", (const void *)slog->log_root,
slog->log_left, slog->log_size);
for (len = 0, i = slog->log_left; i < slog->log_size; i++) {
switch (len) {
case 0:
len = -1;
printf(" version %d", slog->log_num[i]);
break;
case -1:
len = -2;
printf(" type %d", slog->log_num[i]);
break;
case -2:
len = slog->log_num[i];
printf(" len %d:", slog->log_num[i]);
if (len <= 0)
len = -1;
break;
default:
len--;
printf(" %d", slog->log_num[i]);
break;
}
}
printf(" end\n");
}
int
sysctl_log_add(struct sysctllog **logp, const struct sysctlnode *node)
{
const int size0 = 16;
int name[CTL_MAXNAME], namelen, i;
const struct sysctlnode *pnode;
struct sysctllog *log;
if (node->sysctl_flags & CTLFLAG_PERMANENT)
return (0);
if (logp == NULL)
return (0);
if (*logp == NULL) {
log = malloc(sizeof(struct sysctllog),
M_SYSCTLDATA, M_WAITOK);
if (log == NULL) {
/* XXX print error message? */
return (-1);
}
log->log_num = malloc(size0 * sizeof(int),
M_SYSCTLDATA, M_WAITOK);
if (log->log_num == NULL) {
/* XXX print error message? */
free(log, M_SYSCTLDATA);
return (-1);
}
memset(log->log_num, 0, size0 * sizeof(int));
log->log_root = NULL;
log->log_size = size0;
log->log_left = size0;
*logp = log;
} else
log = *logp;
/*
* check that the root is proper. it's okay to record the
* address of the root of a tree. it's the only thing that's
* guaranteed not to shift around as nodes come and go.
*/
if (log->log_root == NULL)
		log->log_root = sysctl_rootof(node);
	else if (log->log_root != sysctl_rootof(node)) {
		printf("sysctl: log %p root mismatch (%p)\n",
		    log->log_root, sysctl_rootof(node));
return (-1);
}
/*
* we will copy out name in reverse order
*/
	for (pnode = node, namelen = 0;
	    pnode != NULL && !(pnode->sysctl_flags & CTLFLAG_ROOT);
pnode = pnode->sysctl_parent)
name[namelen++] = pnode->sysctl_num;
/*
* do we have space?
*/
	if (log->log_left < (namelen + 3))
		sysctl_log_realloc(log);
	if (log->log_left < (namelen + 3))
return (-1);
/*
* stuff name in, then namelen, then node type, and finally,
* the version for non-node nodes.
*/
for (i = 0; i < namelen && i < CTL_MAXNAME; i++)
log->log_num[--log->log_left] = name[i];
log->log_num[--log->log_left] = namelen;
log->log_num[--log->log_left] = SYSCTL_TYPE(node->sysctl_flags);
if (log->log_num[log->log_left] != CTLTYPE_NODE)
log->log_num[--log->log_left] = node->sysctl_ver;
else
log->log_num[--log->log_left] = 0;
return (0);
}
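/*
 * Record layout: entries are pushed onto log_num from the high end,
 * so reading forward from log_left each record is
 *
 *	version, type, namelen, name[0..namelen-1]
 *
 * with the name stored from the top of the tree down to the leaf;
 * sysctl_teardown() below consumes them in exactly that order.
 */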
void
sysctl_teardown(struct sysctllog **logp)
{
const struct sysctlnode *rnode;
struct sysctlnode node;
struct sysctllog *log;
uint namelen;
int *name, t, v, error, ni;
size_t sz;
if (logp == NULL || *logp == NULL)
return;
log = *logp;
rw_enter(&sysctl_treelock, RW_WRITER);
memset(&node, 0, sizeof(node));
while (log->log_left < log->log_size) {
KASSERT(log->log_left + 3 < log->log_size);
KASSERT(log->log_left + log->log_num[log->log_left + 2] <=
log->log_size);
v = log->log_num[log->log_left++];
t = log->log_num[log->log_left++];
namelen = log->log_num[log->log_left++];
name = &log->log_num[log->log_left];
node.sysctl_num = name[namelen - 1];
node.sysctl_flags = SYSCTL_VERSION|t;
node.sysctl_ver = v;
rnode = log->log_root;
error = sysctl_locate(NULL, &name[0], namelen, &rnode, &ni);
if (error == 0) {
name[namelen - 1] = CTL_DESTROY;
rnode = rnode->sysctl_parent;
sz = 0;
(void)sysctl_destroy(&name[namelen - 1], 1, NULL,
&sz, &node, sizeof(node),
&name[0], NULL, rnode);
}
log->log_left += namelen;
}
KASSERT(log->log_size == log->log_left);
free(log->log_num, M_SYSCTLDATA);
free(log, M_SYSCTLDATA);
*logp = NULL;
rw_exit(&sysctl_treelock);
}
/*
* ********************************************************************
* old_sysctl -- A routine to bridge old-style internal calls to the
* new infrastructure.
* ********************************************************************
*/
int
old_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen, struct lwp *l)
{
int error;
size_t oldlen = 0;
size_t savelen;
	if (oldlenp) {
		oldlen = *oldlenp;
}
savelen = oldlen;
sysctl_lock(newp != NULL);
error = sysctl_dispatch(name, namelen, oldp, &oldlen,
newp, newlen, name, l, NULL);
sysctl_unlock();
if (error == 0 && oldp != NULL && savelen < oldlen)
error = ENOMEM;
if (oldlenp) {
*oldlenp = oldlen;
}
return (error);
}
/*
* ********************************************************************
* Section 4: Generic helper routines
* ********************************************************************
* "helper" routines that can do more finely grained access control,
* construct structures from disparate information, create the
* appearance of more nodes and sub-trees, etc. for example, if
* CTL_PROC wanted a helper function, it could respond to a CTL_QUERY
* with a dynamically created list of nodes that represented the
* currently running processes at that instant.
* ********************************************************************
*/
/*
* first, a few generic helpers that provide:
*
* sysctl_needfunc() a readonly interface that emits a warning
* sysctl_notavail() returns EOPNOTSUPP (generic error)
* sysctl_null() an empty return buffer with no error
*/
int
sysctl_needfunc(SYSCTLFN_ARGS)
{
int error;
printf("!!SYSCTL_NEEDFUNC!!\n");
if (newp != NULL || namelen != 0)
return (EOPNOTSUPP);
error = 0;
if (oldp != NULL)
error = sysctl_copyout(l, rnode->sysctl_data, oldp,
MIN(rnode->sysctl_size, *oldlenp));
*oldlenp = rnode->sysctl_size;
return (error);
}
int
sysctl_notavail(SYSCTLFN_ARGS)
{
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
return (EOPNOTSUPP);
}
int
sysctl_null(SYSCTLFN_ARGS)
{
*oldlenp = 0;
return (0);
}
u_int
sysctl_map_flags(const u_int *map, u_int word)
{
u_int rv;
for (rv = 0; *map != 0; map += 2)
if ((word & map[0]) != 0)
rv |= map[1];
return rv;
}
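/*
 * Illustrative sketch (guarded by #if 0, not part of the original
 * file): sysctl_map_flags() walks a zero-terminated array of
 * {source-bit, result-bit} pairs.  A hypothetical caller exporting a
 * sanitized view of open-file flags might use a table like this:
 */
#if 0
static const u_int example_flagmap[] = {
	FREAD,	0x01,		/* report "readable" as bit 0 */
	FWRITE,	0x02,		/* report "writable" as bit 1 */
	0,	0		/* terminator */
};

/* u_int bits = sysctl_map_flags(example_flagmap, fp->f_flag); */
#endif	/* 0 */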
/*
* ********************************************************************
* Section 5: The machinery that makes it all go
* ********************************************************************
* Memory "manglement" routines. Not much to this, eh?
* ********************************************************************
*/
static int
sysctl_alloc(struct sysctlnode *p, int x)
{
int i;
struct sysctlnode *n;
assert(p->sysctl_child == NULL);
if (x == 1)
n = malloc(sizeof(struct sysctlnode),
M_SYSCTLNODE, M_WAITOK);
else
n = malloc(SYSCTL_DEFSIZE * sizeof(struct sysctlnode),
M_SYSCTLNODE, M_WAITOK);
if (n == NULL)
return (ENOMEM);
if (x == 1) {
memset(n, 0, sizeof(struct sysctlnode));
p->sysctl_csize = 1;
} else {
memset(n, 0, SYSCTL_DEFSIZE * sizeof(struct sysctlnode));
p->sysctl_csize = SYSCTL_DEFSIZE;
}
p->sysctl_clen = 0;
for (i = 0; i < p->sysctl_csize; i++)
n[i].sysctl_parent = p;
p->sysctl_child = n;
return (0);
}
static int
sysctl_realloc(struct sysctlnode *p)
{
int i, j, olen;
struct sysctlnode *n;
assert(p->sysctl_csize == p->sysctl_clen);
/*
* how many do we have...how many should we make?
*/
olen = p->sysctl_clen;
n = malloc(2 * olen * sizeof(struct sysctlnode), M_SYSCTLNODE,
M_WAITOK);
if (n == NULL)
return (ENOMEM);
/*
* move old children over...initialize new children
*/
memcpy(n, p->sysctl_child, olen * sizeof(struct sysctlnode));
memset(&n[olen], 0, olen * sizeof(struct sysctlnode));
p->sysctl_csize = 2 * olen;
/*
* reattach moved (and new) children to parent; if a moved
* child node has children, reattach the parent pointers of
* grandchildren
*/
for (i = 0; i < p->sysctl_csize; i++) {
n[i].sysctl_parent = p;
		if (n[i].sysctl_child != NULL) {
			for (j = 0; j < n[i].sysctl_csize; j++)
n[i].sysctl_child[j].sysctl_parent = &n[i];
}
}
/*
* get out with the old and in with the new
*/
free(p->sysctl_child, M_SYSCTLNODE);
p->sysctl_child = n;
return (0);
}
static int
sysctl_log_realloc(struct sysctllog *log)
{
int *n, s, d;
s = log->log_size * 2;
d = log->log_size;
n = malloc(s * sizeof(int), M_SYSCTLDATA, M_WAITOK);
if (n == NULL)
return (-1);
memset(n, 0, s * sizeof(int));
memcpy(&n[d], log->log_num, d * sizeof(int));
free(log->log_num, M_SYSCTLDATA);
log->log_num = n;
	if (d)
		log->log_left += d;
else
log->log_left = s;
log->log_size = s;
return (0);
}
/*
* ********************************************************************
* Section 6: Conversion between API versions wrt the sysctlnode
* ********************************************************************
*/
static int
sysctl_cvt_in(struct lwp *l, int *vp, const void *i, size_t sz,
struct sysctlnode *node)
{
int error, flags;
if (i == NULL || sz < sizeof(flags))
return (EINVAL);
	error = sysctl_copyin(l, i, &flags, sizeof(flags));
	if (error)
return (error);
#if (SYSCTL_VERSION != SYSCTL_VERS_1)
#error sysctl_cvt_in: no support for SYSCTL_VERSION
#endif /* (SYSCTL_VERSION != SYSCTL_VERS_1) */
if (sz == sizeof(*node) &&
SYSCTL_VERS(flags) == SYSCTL_VERSION) {
		error = sysctl_copyin(l, i, node, sizeof(*node));
		if (error)
return (error);
*vp = SYSCTL_VERSION;
return (0);
}
return (EINVAL);
}
static int
sysctl_cvt_out(struct lwp *l, int v, const struct sysctlnode *i,
void *ovp, size_t left, size_t *szp)
{
size_t sz = sizeof(*i);
const void *src = i;
int error;
switch (v) {
case SYSCTL_VERS_0:
return (EINVAL);
#if (SYSCTL_VERSION != SYSCTL_VERS_1)
#error sysctl_cvt_out: no support for SYSCTL_VERSION
#endif /* (SYSCTL_VERSION != SYSCTL_VERS_1) */
case SYSCTL_VERSION:
/* nothing more to do here */
break;
}
	if (ovp != NULL && left >= sz) {
		error = sysctl_copyout(l, src, ovp, sz);
		if (error)
return (error);
}
if (szp != NULL)
*szp = sz;
return (0);
}
static uint8_t address_key[32]; /* key used in address hashing */
static ONCE_DECL(random_inithook);
static int
random_address_init(void)
{
cprng_strong(kern_cprng, address_key, sizeof(address_key), 0);
return 0;
}
void
hash_value_ensure_initialized(void)
{
RUN_ONCE(&random_inithook, random_address_init);
}
void
hash_value(void *d, size_t ds, const void *s, size_t ss)
{
blake2s(d, ds, address_key, sizeof(address_key), s, ss);
}
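/*
 * Illustrative sketch (guarded by #if 0, not part of the original
 * file): code that exports kernel addresses can obscure them by
 * hashing through hash_value() once the key has been set up.  The
 * "cookie"/"kaddr" names are hypothetical.
 */
#if 0
	uint64_t cookie;
	const void *kaddr = &sysctl_root;	/* some kernel pointer */

	hash_value_ensure_initialized();
	hash_value(&cookie, sizeof(cookie), &kaddr, sizeof(kaddr));
	/* copy out "cookie" instead of the raw pointer */
#endif	/* 0 */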
/* $NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
*/
/*
* System calls on descriptors.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_descrip.c,v 1.48 2023/07/10 02:31:55 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/namei.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <uvm/uvm_readahead.h>
/*
* Duplicate a file descriptor.
*/
int
sys_dup(struct lwp *l, const struct sys_dup_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
} */
int error, newfd, oldfd;
file_t *fp;
oldfd = SCARG(uap, fd);
if ((fp = fd_getfile(oldfd)) == NULL) {
return EBADF;
}
error = fd_dup(fp, 0, &newfd, false);
fd_putfile(oldfd);
*retval = newfd;
return error;
}
/*
* Duplicate a file descriptor to a particular value.
*/
int
dodup(struct lwp *l, int from, int to, int flags, register_t *retval)
{
int error;
file_t *fp;
if ((fp = fd_getfile(from)) == NULL)
return EBADF;
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
fd_putfile(from);
if ((u_int)to >= curproc->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
(u_int)to >= maxfiles)
error = EBADF;
else if (from == to)
error = 0;
else
error = fd_dup2(fp, to, flags);
closef(fp);
*retval = to;
return error;
}
int
sys_dup3(struct lwp *l, const struct sys_dup3_args *uap, register_t *retval)
{
/* {
syscallarg(int) from;
syscallarg(int) to;
syscallarg(int) flags;
} */
return dodup(l, SCARG(uap, from), SCARG(uap, to), SCARG(uap, flags),
retval);
}
int
sys_dup2(struct lwp *l, const struct sys_dup2_args *uap, register_t *retval)
{
/* {
syscallarg(int) from;
syscallarg(int) to;
} */
return dodup(l, SCARG(uap, from), SCARG(uap, to), 0, retval);
}
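/*
 * Illustrative sketch (guarded by #if 0, not part of the original
 * file): from userland the dup-to-a-slot behaviour above is reached
 * via dup2(2)/dup3(2), e.g. redirecting standard output into an
 * already-open log file descriptor "logfd" (a hypothetical name;
 * assumes <unistd.h> and <err.h>):
 */
#if 0
	if (dup2(logfd, STDOUT_FILENO) == -1)
		err(EXIT_FAILURE, "dup2");
	close(logfd);		/* the original descriptor is no longer needed */
#endif	/* 0 */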
/*
* fcntl call which is being passed to the file's fs.
*/
static int
fcntl_forfs(int fd, file_t *fp, int cmd, void *arg)
{
int error;
u_int size;
void *data, *memp;
#define STK_PARAMS 128
char stkbuf[STK_PARAMS];
if ((fp->f_flag & (FREAD | FWRITE)) == 0)
return (EBADF);
/*
* Interpret high order word to find amount of data to be
* copied to/from the user's address space.
*/
size = (size_t)F_PARAM_LEN(cmd);
if (size > F_PARAM_MAX)
return (EINVAL);
memp = NULL;
	if (size > sizeof(stkbuf)) {
		memp = kmem_alloc(size, KM_SLEEP);
data = memp;
} else
data = stkbuf;
if (cmd & F_FSIN) {
if (size) {
error = copyin(arg, data, size);
			if (error) {
				if (memp)
kmem_free(memp, size);
return (error);
}
} else
*(void **)data = arg;
} else if ((cmd & F_FSOUT) != 0 && size != 0) {
/*
* Zero the buffer so the user always
* gets back something deterministic.
*/
		memset(data, 0, size);
	} else if (cmd & F_FSVOID)
		*(void **)data = arg;
error = (*fp->f_ops->fo_fcntl)(fp, cmd, data);
/*
* Copy any data to user, size was
* already set and checked above.
*/
	if (error == 0 && (cmd & F_FSOUT) && size)
		error = copyout(data, arg, size);
	if (memp)
kmem_free(memp, size);
return (error);
}
int
do_fcntl_lock(int fd, int cmd, struct flock *fl)
{
struct file *fp = NULL;
proc_t *p;
int (*fo_advlock)(struct file *, void *, int, struct flock *, int);
int error, flg;
if ((fp = fd_getfile(fd)) == NULL) {
error = EBADF;
goto out;
}
if ((fo_advlock = fp->f_ops->fo_advlock) == NULL) {
error = EINVAL;
goto out;
}
flg = F_POSIX;
p = curproc;
switch (cmd) {
case F_SETLKW:
flg |= F_WAIT;
/* Fall into F_SETLK */
/* FALLTHROUGH */
case F_SETLK:
switch (fl->l_type) {
case F_RDLCK:
if ((fp->f_flag & FREAD) == 0) {
error = EBADF;
break;
}
			if ((p->p_flag & PK_ADVLOCK) == 0) {
				mutex_enter(p->p_lock);
p->p_flag |= PK_ADVLOCK;
mutex_exit(p->p_lock);
}
error = (*fo_advlock)(fp, p, F_SETLK, fl, flg);
break;
case F_WRLCK:
if ((fp->f_flag & FWRITE) == 0) {
error = EBADF;
break;
}
			if ((p->p_flag & PK_ADVLOCK) == 0) {
				mutex_enter(p->p_lock);
p->p_flag |= PK_ADVLOCK;
mutex_exit(p->p_lock);
}
error = (*fo_advlock)(fp, p, F_SETLK, fl, flg);
break;
case F_UNLCK:
error = (*fo_advlock)(fp, p, F_UNLCK, fl, F_POSIX);
break;
default:
error = EINVAL;
break;
}
break;
case F_GETLK:
if (fl->l_type != F_RDLCK &&
fl->l_type != F_WRLCK &&
fl->l_type != F_UNLCK) {
error = EINVAL;
break;
}
error = (*fo_advlock)(fp, p, F_GETLK, fl, F_POSIX);
break;
default:
error = EINVAL;
break;
}
out: if (fp)
fd_putfile(fd);
return error;
}
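/*
 * Illustrative sketch (guarded by #if 0, not part of the original
 * file): the path above is reached from userland with fcntl(2), for
 * example taking a whole-file write lock and waiting for it (assumes
 * <fcntl.h>, <string.h> and <err.h>):
 */
#if 0
	struct flock fl;

	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;			/* zero length means "to end of file" */
	if (fcntl(fd, F_SETLKW, &fl) == -1)
		err(EXIT_FAILURE, "fcntl(F_SETLKW)");
#endif	/* 0 */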
/*
* The file control system call.
*/
int
sys_fcntl(struct lwp *l, const struct sys_fcntl_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) cmd;
syscallarg(void *) arg;
} */
int fd, i, tmp, error, cmd, newmin;
filedesc_t *fdp;
fdtab_t *dt;
file_t *fp;
char *kpath;
struct flock fl;
bool cloexec = false;
fd = SCARG(uap, fd);
cmd = SCARG(uap, cmd);
fdp = l->l_fd;
error = 0;
switch (cmd) {
case F_CLOSEM:
if (fd < 0)
return EBADF;
		while ((i = fdp->fd_lastfile) >= fd) {
			if (fd_getfile(i) == NULL) {
/* Another thread has updated. */
continue;
}
fd_close(i);
}
return 0;
case F_MAXFD:
*retval = fdp->fd_lastfile;
return 0;
case F_SETLKW:
case F_SETLK:
case F_GETLK:
error = copyin(SCARG(uap, arg), &fl, sizeof(fl));
if (error)
return error;
error = do_fcntl_lock(fd, cmd, &fl);
		if (cmd == F_GETLK && error == 0)
			error = copyout(&fl, SCARG(uap, arg), sizeof(fl));
return error;
default:
/* Handled below */
break;
}
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if ((cmd & F_FSCTL)) {
error = fcntl_forfs(fd, fp, cmd, SCARG(uap, arg));
fd_putfile(fd);
return error;
}
switch (cmd) {
case F_DUPFD_CLOEXEC:
cloexec = true;
/*FALLTHROUGH*/
case F_DUPFD:
newmin = (long)SCARG(uap, arg);
if ((u_int)newmin >=
l->l_proc->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
(u_int)newmin >= maxfiles) {
fd_putfile(fd);
return EINVAL;
}
error = fd_dup(fp, newmin, &i, cloexec);
*retval = i;
break;
case F_GETFD:
dt = atomic_load_consume(&fdp->fd_dt);
*retval = dt->dt_ff[fd]->ff_exclose;
break;
case F_SETFD:
fd_set_exclose(l, fd,
((long)SCARG(uap, arg) & FD_CLOEXEC) != 0);
break;
case F_GETNOSIGPIPE:
*retval = (fp->f_flag & FNOSIGPIPE) != 0;
break;
case F_SETNOSIGPIPE:
if (SCARG(uap, arg))
atomic_or_uint(&fp->f_flag, FNOSIGPIPE);
else
atomic_and_uint(&fp->f_flag, ~FNOSIGPIPE);
*retval = 0;
break;
case F_GETFL:
*retval = OFLAGS(fp->f_flag);
break;
case F_SETFL:
/* XXX not guaranteed to be atomic. */
tmp = FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS;
error = (*fp->f_ops->fo_fcntl)(fp, F_SETFL, &tmp);
if (error)
break;
i = tmp ^ fp->f_flag;
if (i & FNONBLOCK) {
int flgs = tmp & FNONBLOCK;
error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, &flgs);
if (error) {
(*fp->f_ops->fo_fcntl)(fp, F_SETFL,
&fp->f_flag);
break;
}
		}
		if (i & FASYNC) {
int flgs = tmp & FASYNC;
error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, &flgs);
if (error) {
				if (i & FNONBLOCK) {
					tmp = fp->f_flag & FNONBLOCK;
(void)(*fp->f_ops->fo_ioctl)(fp,
FIONBIO, &tmp);
}
(*fp->f_ops->fo_fcntl)(fp, F_SETFL,
&fp->f_flag);
break;
}
}
fp->f_flag = (fp->f_flag & ~FCNTLFLAGS) | tmp;
break;
case F_GETOWN:
error = (*fp->f_ops->fo_ioctl)(fp, FIOGETOWN, &tmp);
*retval = tmp;
break;
case F_SETOWN:
tmp = (int)(uintptr_t) SCARG(uap, arg);
error = (*fp->f_ops->fo_ioctl)(fp, FIOSETOWN, &tmp);
break;
case F_GETPATH:
kpath = PNBUF_GET();
/* vnodes need extra context, so are handled separately */
if (fp->f_type == DTYPE_VNODE)
error = vnode_to_path(kpath, MAXPATHLEN, fp->f_vnode,
l, l->l_proc);
else
			error = (*fp->f_ops->fo_fcntl)(fp, F_GETPATH, kpath);
		if (error == 0)
			error = copyoutstr(kpath, SCARG(uap, arg), MAXPATHLEN,
NULL);
PNBUF_PUT(kpath);
break;
case F_ADD_SEALS:
tmp = (int)(uintptr_t) SCARG(uap, arg);
error = (*fp->f_ops->fo_fcntl)(fp, F_ADD_SEALS, &tmp);
break;
case F_GET_SEALS:
error = (*fp->f_ops->fo_fcntl)(fp, F_GET_SEALS, &tmp);
*retval = tmp;
break;
default:
error = EINVAL;
}
fd_putfile(fd);
return (error);
}
/*
* Close a file descriptor.
*/
int
sys_close(struct lwp *l, const struct sys_close_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
} */
int error;
int fd = SCARG(uap, fd);
if (fd_getfile(fd) == NULL) {
return EBADF;
}
error = fd_close(fd);
if (error == ERESTART) {
#ifdef DIAGNOSTIC
printf("%s[%d]: close(%d) returned ERESTART\n",
l->l_proc->p_comm, (int)l->l_proc->p_pid, fd);
#endif
error = EINTR;
}
return error;
}
/*
* Return status information about a file descriptor.
* Common function for compat code.
*/
int
do_sys_fstat(int fd, struct stat *sb)
{
file_t *fp;
int error;
if ((fp = fd_getfile(fd)) == NULL) {
return EBADF;
}
error = (*fp->f_ops->fo_stat)(fp, sb);
fd_putfile(fd);
return error;
}
/*
* Return status information about a file descriptor.
*/
int
sys___fstat50(struct lwp *l, const struct sys___fstat50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(struct stat *) sb;
} */
struct stat sb;
int error;
	error = do_sys_fstat(SCARG(uap, fd), &sb);
	if (error == 0) {
		error = copyout(&sb, SCARG(uap, sb), sizeof(sb));
}
return error;
}
/*
* Return pathconf information about a file descriptor.
*/
int
sys_fpathconf(struct lwp *l, const struct sys_fpathconf_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) name;
} */
int fd, name, error;
file_t *fp;
fd = SCARG(uap, fd);
name = SCARG(uap, name);
error = 0;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if (fp->f_ops->fo_fpathconf == NULL)
error = EOPNOTSUPP;
else
error = (*fp->f_ops->fo_fpathconf)(fp, name, retval);
fd_putfile(fd);
return error;
}
/*
* Apply an advisory lock on a file descriptor.
*
* Just attempt to get a record lock of the requested type on
* the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
*/
/* ARGSUSED */
int
sys_flock(struct lwp *l, const struct sys_flock_args *uap, register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) how;
} */
int fd, how, error;
struct file *fp = NULL;
int (*fo_advlock)(struct file *, void *, int, struct flock *, int);
struct flock lf;
fd = SCARG(uap, fd);
how = SCARG(uap, how);
if ((fp = fd_getfile(fd)) == NULL) {
error = EBADF;
goto out;
}
if ((fo_advlock = fp->f_ops->fo_advlock) == NULL) {
KASSERT((atomic_load_relaxed(&fp->f_flag) & FHASLOCK) == 0);
error = EOPNOTSUPP;
goto out;
}
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
switch (how & ~LOCK_NB) {
case LOCK_UN:
lf.l_type = F_UNLCK;
atomic_and_uint(&fp->f_flag, ~FHASLOCK);
error = (*fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK);
goto out;
case LOCK_EX:
lf.l_type = F_WRLCK;
break;
case LOCK_SH:
lf.l_type = F_RDLCK;
break;
default:
error = EINVAL;
goto out;
}
atomic_or_uint(&fp->f_flag, FHASLOCK);
if (how & LOCK_NB) {
error = (*fo_advlock)(fp, fp, F_SETLK, &lf, F_FLOCK);
} else {
error = (*fo_advlock)(fp, fp, F_SETLK, &lf, F_FLOCK|F_WAIT);
}
 out:
	if (fp)
fd_putfile(fd);
return error;
}
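/*
 * Illustrative sketch (not part of the original file): the translation
 * that sys_flock() above performs from a flock(2) "how" argument to the
 * whole-file record lock handed to fo_advlock.  The helper name is
 * invented here purely for illustration.
 */
static int
example_flock_to_recordlock(int how, struct flock *lf)
{

	lf->l_whence = SEEK_SET;	/* relative to the start of the file */
	lf->l_start = 0;
	lf->l_len = 0;			/* zero length covers the whole file */

	switch (how & ~LOCK_NB) {
	case LOCK_SH:
		lf->l_type = F_RDLCK;	/* shared lock == read lock */
		break;
	case LOCK_EX:
		lf->l_type = F_WRLCK;	/* exclusive lock == write lock */
		break;
	case LOCK_UN:
		lf->l_type = F_UNLCK;
		break;
	default:
		return EINVAL;
	}
	return 0;
}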
int
do_posix_fadvise(int fd, off_t offset, off_t len, int advice)
{
file_t *fp;
int error;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if (fp->f_ops->fo_posix_fadvise == NULL) {
error = EOPNOTSUPP;
} else {
error = (*fp->f_ops->fo_posix_fadvise)(fp, offset, len,
advice);
}
fd_putfile(fd);
return error;
}
int
sys___posix_fadvise50(struct lwp *l,
const struct sys___posix_fadvise50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(int) pad;
syscallarg(off_t) offset;
syscallarg(off_t) len;
syscallarg(int) advice;
} */
*retval = do_posix_fadvise(SCARG(uap, fd), SCARG(uap, offset),
SCARG(uap, len), SCARG(uap, advice));
return 0;
}
int
sys_pipe(struct lwp *l, const void *v, register_t *retval)
{
int fd[2], error;
if ((error = pipe1(l, fd, 0)) != 0)
return error;
retval[0] = fd[0];
retval[1] = fd[1];
return 0;
}
int
sys_pipe2(struct lwp *l, const struct sys_pipe2_args *uap, register_t *retval)
{
/* {
syscallarg(int[2]) fildes;
syscallarg(int) flags;
} */
int fd[2], error;
if ((error = pipe1(l, fd, SCARG(uap, flags))) != 0)
return error;
if ((error = copyout(fd, SCARG(uap, fildes), sizeof(fd))) != 0)
return error;
retval[0] = 0;
return 0;
}
/* $NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lookup.c,v 1.234 2023/05/01 05:12:44 mlelstv Exp $");
#ifdef _KERNEL_OPT
#include "opt_magiclinks.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/syslimits.h>
#include <sys/time.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/vnode_impl.h>
#include <sys/fstrans.h>
#include <sys/mount.h>
#include <sys/errno.h>
#include <sys/filedesc.h>
#include <sys/hash.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/ktrace.h>
#include <sys/dirent.h>
#ifndef MAGICLINKS
#define MAGICLINKS 0
#endif
int vfs_magiclinks = MAGICLINKS;
__CTASSERT(MAXNAMLEN == NAME_MAX);
/*
* Substitute replacement text for 'magic' strings in symlinks.
* Returns 0 if successful, and returns non-zero if an error
* occurs. (Currently, the only possible error is running out
* of temporary pathname space.)
*
* Looks for "@<string>" and "@<string>/", where <string> is a
* recognized 'magic' string. Replaces the "@<string>" with the
* appropriate replacement text. (Note that in some cases the
* replacement text may have zero length.)
*
* This would have been table driven, but the variance in
* replacement strings (and replacement string lengths) made
* that impractical.
*/
#define VNL(x) \
(sizeof(x) - 1)
#define VO '{'
#define VC '}'
#define MATCH(str) \
((termchar == '/' && i + VNL(str) == *len) || \
(i + VNL(str) < *len && \
cp[i + VNL(str)] == termchar)) && \
!strncmp((str), &cp[i], VNL(str))
#define SUBSTITUTE(m, s, sl) \
if ((newlen + (sl)) >= MAXPATHLEN) \
return 1; \
i += VNL(m); \
if (termchar != '/') \
i++; \
(void)memcpy(&tmp[newlen], (s), (sl)); \
newlen += (sl); \
change = 1; \
termchar = '/';
static int
symlink_magic(struct proc *p, char *cp, size_t *len)
{
char *tmp;
size_t change, i, newlen, slen;
char termchar = '/';
char idtmp[11]; /* enough for 32 bit *unsigned* integer */
tmp = PNBUF_GET();
for (change = i = newlen = 0; i < *len; ) {
if (cp[i] != '@') {
tmp[newlen++] = cp[i++];
continue;
}
i++;
/* Check for @{var} syntax. */
if (cp[i] == VO) {
termchar = VC;
i++;
}
/*
* The following checks should be ordered according
* to frequency of use.
*/
if (MATCH("machine_arch")) {
slen = strlen(PROC_MACHINE_ARCH(p));
SUBSTITUTE("machine_arch", PROC_MACHINE_ARCH(p), slen); } else if (MATCH("machine")) {
slen = VNL(MACHINE);
SUBSTITUTE("machine", MACHINE, slen); } else if (MATCH("hostname")) { SUBSTITUTE("hostname", hostname, hostnamelen); } else if (MATCH("osrelease")) {
slen = strlen(osrelease);
SUBSTITUTE("osrelease", osrelease, slen); } else if (MATCH("emul")) {
slen = strlen(p->p_emul->e_name);
SUBSTITUTE("emul", p->p_emul->e_name, slen); } else if (MATCH("kernel_ident")) {
slen = strlen(kernel_ident);
SUBSTITUTE("kernel_ident", kernel_ident, slen); } else if (MATCH("domainname")) { SUBSTITUTE("domainname", domainname, domainnamelen); } else if (MATCH("ostype")) {
slen = strlen(ostype);
SUBSTITUTE("ostype", ostype, slen); } else if (MATCH("uid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_geteuid(kauth_cred_get()));
SUBSTITUTE("uid", idtmp, slen); } else if (MATCH("ruid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getuid(kauth_cred_get()));
SUBSTITUTE("ruid", idtmp, slen); } else if (MATCH("gid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getegid(kauth_cred_get()));
SUBSTITUTE("gid", idtmp, slen); } else if (MATCH("rgid")) {
slen = snprintf(idtmp, sizeof(idtmp), "%u",
kauth_cred_getgid(kauth_cred_get()));
SUBSTITUTE("rgid", idtmp, slen);
} else {
tmp[newlen++] = '@';
if (termchar == VC) tmp[newlen++] = VO;
}
}
	if (change) {
		(void)memcpy(cp, tmp, newlen);
*len = newlen;
}
PNBUF_PUT(tmp);
return 0;
}
#undef VNL
#undef VO
#undef VC
#undef MATCH
#undef SUBSTITUTE
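/*
 * Illustrative sketch (not part of the original file): how the magic
 * symlink substitution above is used.  With vfs_magiclinks enabled, a
 * link target such as "/usr/pkg/@machine_arch/lib" is rewritten in
 * place, so "@machine_arch" becomes PROC_MACHINE_ARCH(p) for the
 * process doing the lookup (the "@{machine_arch}" form works the same
 * way but may be followed by characters other than '/').  Note that
 * the rewritten buffer is not NUL-terminated; *len carries the new
 * length, exactly as the caller in namei_follow() expects.
 */
static void
example_symlink_magic_usage(struct proc *p)
{
	char *target;
	size_t len;

	target = PNBUF_GET();
	strlcpy(target, "/usr/pkg/@machine_arch/lib", MAXPATHLEN);
	len = strlen(target);

	/* Rewrites "target" in place; nonzero return means it overflowed. */
	if (symlink_magic(p, target, &len) == 0) {
		/* target[0..len-1] now holds e.g. "/usr/pkg/x86_64/lib". */
	}
	PNBUF_PUT(target);
}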
////////////////////////////////////////////////////////////
/*
* Determine the namei hash (for the namecache) for name.
* If *ep != NULL, hash from name to ep-1.
* If *ep == NULL, hash from name until the first NUL or '/', and
* return the location of this termination character in *ep.
*
* This function returns an equivalent hash to the MI hash32_strn().
* The latter isn't used because in the *ep == NULL case, determining
* the length of the string to the first NUL or `/' and then calling
* hash32_strn() involves unnecessary double-handling of the data.
*/
uint32_t
namei_hash(const char *name, const char **ep)
{
uint32_t hash;
hash = HASH32_STR_INIT;
if (*ep != NULL) {
for (; name < *ep; name++)
hash = hash * 33 + *(const uint8_t *)name;
} else {
for (; *name != '\0' && *name != '/'; name++)
hash = hash * 33 + *(const uint8_t *)name;
*ep = name;
}
return (hash + (hash >> 5));
}
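/*
 * Illustrative sketch (not part of the original file): the two calling
 * modes of namei_hash() described above.  In the first, *ep bounds the
 * string explicitly; in the second, *ep starts out NULL and comes back
 * pointing at the terminating NUL or '/', so the caller also learns the
 * component length.  The function name below is invented for this
 * example.
 */
static void
example_namei_hash_usage(void)
{
	const char *path = "usr/bin/ls";
	const char *ep;
	uint32_t h1, h2;

	/* Mode 1: hash exactly the first three characters ("usr"). */
	ep = path + 3;
	h1 = namei_hash(path, &ep);

	/* Mode 2: hash up to the first '/'; ep is set to that '/'. */
	ep = NULL;
	h2 = namei_hash(path, &ep);

	KASSERT(h1 == h2);
	KASSERT(ep == path + 3);
}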
////////////////////////////////////////////////////////////
/*
* Sealed abstraction for pathnames.
*
* System-call-layer level code that is going to call namei should
* first create a pathbuf and adjust all the bells and whistles on it
* as needed by context.
*/
struct pathbuf {
char *pb_path;
char *pb_pathcopy;
unsigned pb_pathcopyuses;
};
static struct pathbuf *
pathbuf_create_raw(void)
{
struct pathbuf *pb;
pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
pb->pb_path = PNBUF_GET();
if (pb->pb_path == NULL) {
kmem_free(pb, sizeof(*pb));
return NULL;
}
pb->pb_pathcopy = NULL;
pb->pb_pathcopyuses = 0;
return pb;
}
void
pathbuf_destroy(struct pathbuf *pb)
{
	KASSERT(pb->pb_pathcopyuses == 0);
	KASSERT(pb->pb_pathcopy == NULL);
PNBUF_PUT(pb->pb_path);
kmem_free(pb, sizeof(*pb));
}
struct pathbuf *
pathbuf_assimilate(char *pnbuf)
{
struct pathbuf *pb;
pb = kmem_alloc(sizeof(*pb), KM_SLEEP);
pb->pb_path = pnbuf;
pb->pb_pathcopy = NULL;
pb->pb_pathcopyuses = 0;
return pb;
}
struct pathbuf *
pathbuf_create(const char *path)
{
struct pathbuf *pb;
int error;
pb = pathbuf_create_raw();
if (pb == NULL) {
return NULL;
}
error = copystr(path, pb->pb_path, PATH_MAX, NULL);
	if (error != 0) {
		KASSERT(!"kernel path too long in pathbuf_create");
/* make sure it's null-terminated, just in case */
pb->pb_path[PATH_MAX-1] = '\0';
}
return pb;
}
int
pathbuf_copyin(const char *userpath, struct pathbuf **ret)
{
struct pathbuf *pb;
int error;
pb = pathbuf_create_raw();
if (pb == NULL) {
return ENOMEM;
}
error = copyinstr(userpath, pb->pb_path, PATH_MAX, NULL);
if (error) {
pathbuf_destroy(pb);
return error;
}
*ret = pb;
return 0;
}
/*
* XXX should not exist:
* 1. whether a pointer is kernel or user should be statically checkable.
* 2. copyin should be handled by the upper part of the syscall layer,
* not in here.
*/
int
pathbuf_maybe_copyin(const char *path, enum uio_seg seg, struct pathbuf **ret)
{
if (seg == UIO_USERSPACE) {
return pathbuf_copyin(path, ret);
} else {
*ret = pathbuf_create(path);
if (*ret == NULL) {
return ENOMEM;
}
return 0;
}
}
/*
* Get a copy of the path buffer as it currently exists. If this is
* called after namei starts the results may be arbitrary.
*/
void
pathbuf_copystring(const struct pathbuf *pb, char *buf, size_t maxlen)
{
strlcpy(buf, pb->pb_path, maxlen);
}
/*
* These two functions allow access to a saved copy of the original
* path string. The first copy should be gotten before namei is
* called. Each copy that is gotten should be put back.
*/
const char *
pathbuf_stringcopy_get(struct pathbuf *pb)
{
	if (pb->pb_pathcopyuses == 0) {
		pb->pb_pathcopy = PNBUF_GET();
strcpy(pb->pb_pathcopy, pb->pb_path);
}
pb->pb_pathcopyuses++;
return pb->pb_pathcopy;
}
void
pathbuf_stringcopy_put(struct pathbuf *pb, const char *str)
{
	KASSERT(str == pb->pb_pathcopy);
	KASSERT(pb->pb_pathcopyuses > 0);
pb->pb_pathcopyuses--;
	if (pb->pb_pathcopyuses == 0) {
		PNBUF_PUT(pb->pb_pathcopy);
pb->pb_pathcopy = NULL;
}
}
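/*
 * Illustrative sketch (not part of the original file): the intended
 * pathbuf life cycle for system-call-layer code, as described in the
 * comment above struct pathbuf, and mirroring nameiat_simple_user()
 * below.  The flag choice and the helper name are examples only.
 */
static int
example_pathbuf_usage(const char *userpath, struct vnode **vp_ret)
{
	struct pathbuf *pb;
	struct nameidata nd;
	int error;

	/* Copy the user path into a fresh pathbuf. */
	error = pathbuf_copyin(userpath, &pb);
	if (error)
		return error;

	/* Hand the pathbuf to namei via a nameidata. */
	NDINIT(&nd, LOOKUP, FOLLOW, pb);
	error = namei(&nd);
	if (error) {
		pathbuf_destroy(pb);
		return error;
	}

	/* namei() returns a referenced vnode in nd.ni_vp. */
	*vp_ret = nd.ni_vp;

	/* The pathbuf remains owned by the caller and must be destroyed. */
	pathbuf_destroy(pb);
	return 0;
}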
////////////////////////////////////////////////////////////
/*
* namei: convert a pathname into a pointer to a (maybe-locked) vnode,
* and maybe also its parent directory vnode, and assorted other guff.
* See namei(9) for the interface documentation.
*
*
* The FOLLOW flag is set when symbolic links are to be followed
* when they occur at the end of the name translation process.
* Symbolic links are always followed for all other pathname
* components other than the last.
*
* The segflg defines whether the name is to be copied from user
* space or kernel space.
*
* Overall outline of namei:
*
* copy in name
* get starting directory
* while (!done && !error) {
* call lookup to search path.
* if symbolic link, massage name in buffer and continue
* }
*/
/*
* Search a pathname.
* This is a very central and rather complicated routine.
*
* The pathname is pointed to by ni_ptr and is of length ni_pathlen.
* The starting directory is passed in. The pathname is descended
* until done, or a symbolic link is encountered. The variable ni_more
* is clear if the path is completed; it is set to one if a symbolic
* link needing interpretation is encountered.
*
* The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on
* whether the name is to be looked up, created, renamed, or deleted.
* When CREATE, RENAME, or DELETE is specified, information usable in
* creating, renaming, or deleting a directory entry may be calculated.
* If flag has LOCKPARENT or'ed into it, the parent directory is returned
* locked. Otherwise the parent directory is not returned. If the target
* of the pathname exists and LOCKLEAF is or'ed into the flag the target
* is returned locked, otherwise it is returned unlocked. When creating
* or renaming and LOCKPARENT is specified, the target may not be ".".
* When deleting and LOCKPARENT is specified, the target may be ".".
*
* Overall outline of lookup:
*
* dirloop:
* identify next component of name at ndp->ni_ptr
* handle degenerate case where name is null string
* if .. and crossing mount points and on mounted filesys, find parent
* call VOP_LOOKUP routine for next component name
* directory vnode returned in ni_dvp, locked.
* component vnode returned in ni_vp (if it exists), locked.
* if result vnode is mounted on and crossing mount points,
* find mounted on vnode
* if more components of name, do next level at dirloop
* return the answer in ni_vp, locked if LOCKLEAF set
* if LOCKPARENT set, return locked parent in ni_dvp
*/
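/*
 * Illustrative sketch (not part of the original file): how a caller
 * sets up a CREATE lookup with a locked parent, per the contract
 * described above.  On success with LOCKPARENT, ni_dvp is the locked,
 * referenced parent directory and ni_vp is NULL if the leaf does not
 * yet exist (the EJUSTRETURN case handled in lookup_once()).  Error
 * handling and the follow-up create operation are simplified away;
 * the function name is invented for this example.
 */
static int
example_create_lookup(struct pathbuf *pb, struct vnode **dvp_ret,
    struct vnode **vp_ret)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, CREATE, LOCKPARENT, pb);
	error = namei(&nd);
	if (error)
		return error;

	/* Locked and referenced because of LOCKPARENT; caller vput()s it. */
	*dvp_ret = nd.ni_dvp;

	/*
	 * NULL means "not found, ok to create"; non-NULL means the name
	 * already exists (referenced but not locked, since no LOCKLEAF).
	 */
	*vp_ret = nd.ni_vp;
	return 0;
}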
/*
* Internal state for a namei operation.
*
* cnp is always equal to &ndp->ni_cnp.
*/
struct namei_state {
struct nameidata *ndp;
struct componentname *cnp;
int docache; /* == 0 do not cache last component */
int rdonly; /* lookup read-only flag bit */
int slashes;
unsigned attempt_retry:1; /* true if error allows emul retry */
unsigned root_referenced:1; /* true if ndp->ni_rootdir and
ndp->ni_erootdir were referenced */
};
/*
* Initialize the namei working state.
*/
static void
namei_init(struct namei_state *state, struct nameidata *ndp)
{
state->ndp = ndp;
state->cnp = &ndp->ni_cnd;
state->docache = 0;
state->rdonly = 0;
state->slashes = 0;
state->root_referenced = 0;
	KASSERTMSG((state->cnp->cn_cred != NULL), "namei: bad cred/proc");
	KASSERTMSG(((state->cnp->cn_nameiop & (~OPMASK)) == 0),
"namei: nameiop contaminated with flags: %08"PRIx32,
state->cnp->cn_nameiop);
KASSERTMSG(((state->cnp->cn_flags & OPMASK) == 0),
"name: flags contaminated with nameiops: %08"PRIx32,
state->cnp->cn_flags);
/*
* The buffer for name translation shall be the one inside the
* pathbuf.
*/
state->ndp->ni_pnbuf = state->ndp->ni_pathbuf->pb_path;
}
/*
* Clean up the working namei state, leaving things ready for return
* from namei.
*/
static void
namei_cleanup(struct namei_state *state)
{
	KASSERT(state->cnp == &state->ndp->ni_cnd);

	if (state->root_referenced) {
		if (state->ndp->ni_rootdir != NULL)
			vrele(state->ndp->ni_rootdir);
		if (state->ndp->ni_erootdir != NULL)
			vrele(state->ndp->ni_erootdir);
}
}
//////////////////////////////
/*
* Get the directory context.
* Initializes the rootdir and erootdir state and returns a reference
* to the starting dir.
*/
static struct vnode *
namei_getstartdir(struct namei_state *state)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct cwdinfo *cwdi; /* pointer to cwd state */
struct lwp *self = curlwp; /* thread doing namei() */
struct vnode *rootdir, *erootdir, *curdir, *startdir;
	if (state->root_referenced) {
		if (state->ndp->ni_rootdir != NULL)
			vrele(state->ndp->ni_rootdir);
		if (state->ndp->ni_erootdir != NULL)
			vrele(state->ndp->ni_erootdir);
state->root_referenced = 0;
}
cwdi = self->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
/* root dir */
if (cwdi->cwdi_rdir == NULL || (cnp->cn_flags & NOCHROOT)) {
rootdir = rootvnode;
} else {
rootdir = cwdi->cwdi_rdir;
}
/* emulation root dir, if any */
if ((cnp->cn_flags & TRYEMULROOT) == 0) {
/* if we don't want it, don't fetch it */
erootdir = NULL;
} else if (cnp->cn_flags & EMULROOTSET) {
/* explicitly set emulroot; "/../" doesn't override this */
		erootdir = ndp->ni_erootdir;
	} else if (!strncmp(ndp->ni_pnbuf, "/../", 4)) {
/* explicit reference to real rootdir */
erootdir = NULL;
} else {
/* may be null */
erootdir = cwdi->cwdi_edir;
}
/* current dir */
curdir = cwdi->cwdi_cdir;
if (ndp->ni_pnbuf[0] != '/') {
if (ndp->ni_atdir != NULL) {
startdir = ndp->ni_atdir;
} else {
startdir = curdir;
}
erootdir = NULL;
} else if (cnp->cn_flags & TRYEMULROOT && erootdir != NULL) {
startdir = erootdir;
} else {
startdir = rootdir;
erootdir = NULL;
}
state->ndp->ni_rootdir = rootdir;
state->ndp->ni_erootdir = erootdir;
/*
* Get a reference to the start dir so we can safely unlock cwdi.
*
* Must hold references to rootdir and erootdir while we're running.
* A multithreaded process may chroot during namei.
*/
	if (startdir != NULL)
		vref(startdir);
	if (state->ndp->ni_rootdir != NULL)
		vref(state->ndp->ni_rootdir);
	if (state->ndp->ni_erootdir != NULL)
		vref(state->ndp->ni_erootdir);
state->root_referenced = 1;
rw_exit(&cwdi->cwdi_lock);
return startdir;
}
/*
* Get the directory context for the nfsd case, in parallel to
* getstartdir. Initializes the rootdir and erootdir state and
* returns a reference to the passed-in starting dir.
*/
static struct vnode *
namei_getstartdir_for_nfsd(struct namei_state *state)
{
KASSERT(state->ndp->ni_atdir != NULL);
/* always use the real root, and never set an emulation root */
if (rootvnode == NULL) {
return NULL;
}
state->ndp->ni_rootdir = rootvnode;
state->ndp->ni_erootdir = NULL;
vref(state->ndp->ni_atdir);
KASSERT(! state->root_referenced);
vref(state->ndp->ni_rootdir);
state->root_referenced = 1;
return state->ndp->ni_atdir;
}
/*
* Ktrace the namei operation.
*/
static void
namei_ktrace(struct namei_state *state)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct lwp *self = curlwp; /* thread doing namei() */
const char *emul_path;
if (ktrpoint(KTR_NAMEI)) {
if (ndp->ni_erootdir != NULL) {
/*
			 * To make any sense, the trace entry needs to have the
* text of the emulation path prepended.
* Usually we can get this from the current process,
* but when called from emul_find_interp() it is only
* in the exec_package - so we get it passed in ni_next
* (this is a hack).
*/
if (cnp->cn_flags & EMULROOTSET)
emul_path = ndp->ni_next;
else
				emul_path = self->l_proc->p_emul->e_path;
			ktrnamei2(emul_path, strlen(emul_path),
			    ndp->ni_pnbuf, ndp->ni_pathlen);
} else
ktrnamei(ndp->ni_pnbuf, ndp->ni_pathlen);
}
}
/*
* Start up namei. Find the root dir and cwd, establish the starting
* directory for lookup, and lock it. Also calls ktrace when
* appropriate.
*/
static int
namei_start(struct namei_state *state, int isnfsd,
struct vnode **startdir_ret)
{
struct nameidata *ndp = state->ndp;
struct vnode *startdir;
/* length includes null terminator (was originally from copyinstr) */
ndp->ni_pathlen = strlen(ndp->ni_pnbuf) + 1;
/*
* POSIX.1 requirement: "" is not a valid file name.
*/
if (ndp->ni_pathlen == 1) {
ndp->ni_erootdir = NULL;
return ENOENT;
}
ndp->ni_loopcnt = 0;
/* Get starting directory, set up root, and ktrace. */
if (isnfsd) {
startdir = namei_getstartdir_for_nfsd(state);
/* no ktrace */
} else {
		startdir = namei_getstartdir(state);
		namei_ktrace(state);
}
if (startdir == NULL) {
return ENOENT;
}
	/* NDAT may feed us a non-directory via namei_getstartdir */
if (startdir->v_type != VDIR) {
vrele(startdir);
return ENOTDIR;
}
*startdir_ret = startdir;
return 0;
}
/*
* Check for being at a symlink that we're going to follow.
*/
static inline int
namei_atsymlink(struct namei_state *state, struct vnode *foundobj)
{
return (foundobj->v_type == VLNK) &&
(state->cnp->cn_flags & (FOLLOW|REQUIREDIR));
}
/*
* Follow a symlink.
*
* Updates searchdir. inhibitmagic causes magic symlinks to not be
* interpreted; this is used by nfsd.
*
* Unlocks foundobj on success (ugh)
*/
static inline int
namei_follow(struct namei_state *state, int inhibitmagic,
struct vnode *searchdir, struct vnode *foundobj,
struct vnode **newsearchdir_ret)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct lwp *self = curlwp; /* thread doing namei() */
struct iovec aiov; /* uio for reading symbolic links */
struct uio auio;
char *cp; /* pointer into pathname argument */
size_t linklen;
int error;
if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
return ELOOP;
}
vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
if (foundobj->v_mount->mnt_flag & MNT_SYMPERM) {
error = VOP_ACCESS(foundobj, VEXEC, cnp->cn_cred);
		if (error != 0) {
			VOP_UNLOCK(foundobj);
return error;
}
}
/* FUTURE: fix this to not use a second buffer */
cp = PNBUF_GET();
aiov.iov_base = cp;
aiov.iov_len = MAXPATHLEN;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_offset = 0;
auio.uio_rw = UIO_READ;
auio.uio_resid = MAXPATHLEN;
UIO_SETUP_SYSSPACE(&auio);
error = VOP_READLINK(foundobj, &auio, cnp->cn_cred);
VOP_UNLOCK(foundobj);
if (error) {
PNBUF_PUT(cp);
return error;
}
linklen = MAXPATHLEN - auio.uio_resid;
if (linklen == 0) {
PNBUF_PUT(cp);
return ENOENT;
}
/*
* Do symlink substitution, if appropriate, and
* check length for potential overflow.
*
* Inhibit symlink substitution for nfsd.
* XXX: This is how it was before; is that a bug or a feature?
*/
if ((!inhibitmagic && vfs_magiclinks && symlink_magic(self->l_proc, cp, &linklen)) ||
(linklen + ndp->ni_pathlen >= MAXPATHLEN)) {
PNBUF_PUT(cp);
return ENAMETOOLONG;
}
if (ndp->ni_pathlen > 1) {
/* includes a null-terminator */
memcpy(cp + linklen, ndp->ni_next, ndp->ni_pathlen);
} else {
cp[linklen] = '\0';
}
ndp->ni_pathlen += linklen;
memcpy(ndp->ni_pnbuf, cp, ndp->ni_pathlen);
PNBUF_PUT(cp);
/* we're now starting from the beginning of the buffer again */
cnp->cn_nameptr = ndp->ni_pnbuf;
/*
* Check if root directory should replace current directory.
*/
if (ndp->ni_pnbuf[0] == '/') {
vrele(searchdir);
/* Keep absolute symbolic links inside emulation root */
searchdir = ndp->ni_erootdir;
if (searchdir == NULL ||
(ndp->ni_pnbuf[1] == '.'
&& ndp->ni_pnbuf[2] == '.' && ndp->ni_pnbuf[3] == '/')) {
ndp->ni_erootdir = NULL;
searchdir = ndp->ni_rootdir;
}
vref(searchdir);
while (cnp->cn_nameptr[0] == '/') {
cnp->cn_nameptr++;
ndp->ni_pathlen--;
}
}
*newsearchdir_ret = searchdir;
return 0;
}
//////////////////////////////
/*
* Inspect the leading path component and update the state accordingly.
*/
static int
lookup_parsepath(struct namei_state *state, struct vnode *searchdir)
{
const char *cp; /* pointer into pathname argument */
int error;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
KASSERT(cnp == &ndp->ni_cnd);
/*
* Search a new directory.
*
* The last component of the filename is left accessible via
* cnp->cn_nameptr for callers that need the name. Callers needing
* the name set the SAVENAME flag. When done, they assume
* responsibility for freeing the pathname buffer.
*
* At this point, our only vnode state is that the search dir
* is held.
*/
error = VOP_PARSEPATH(searchdir, cnp->cn_nameptr, &cnp->cn_namelen);
if (error) {
return error;
}
cp = cnp->cn_nameptr + cnp->cn_namelen;
if (cnp->cn_namelen > KERNEL_NAME_MAX) {
return ENAMETOOLONG;
}
#ifdef NAMEI_DIAGNOSTIC
{ char c = *cp;
*(char *)cp = '\0';
printf("{%s}: ", cnp->cn_nameptr);
*(char *)cp = c; }
#endif /* NAMEI_DIAGNOSTIC */
ndp->ni_pathlen -= cnp->cn_namelen;
ndp->ni_next = cp;
/*
* If this component is followed by a slash, then move the pointer to
* the next component forward, and remember that this component must be
* a directory.
*/
if (*cp == '/') {
do {
cp++;
		} while (*cp == '/');
		state->slashes = cp - ndp->ni_next;
ndp->ni_pathlen -= state->slashes;
ndp->ni_next = cp;
cnp->cn_flags |= REQUIREDIR;
} else {
state->slashes = 0;
cnp->cn_flags &= ~REQUIREDIR;
}
/*
* We do special processing on the last component, whether or not it's
* a directory. Cache all intervening lookups, but not the final one.
*/
if (*cp == '\0') {
if (state->docache)
cnp->cn_flags |= MAKEENTRY;
else
cnp->cn_flags &= ~MAKEENTRY;
cnp->cn_flags |= ISLASTCN;
} else {
cnp->cn_flags |= MAKEENTRY;
cnp->cn_flags &= ~ISLASTCN;
}
	if (cnp->cn_namelen == 2 &&
	    cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
		cnp->cn_flags |= ISDOTDOT;
else
cnp->cn_flags &= ~ISDOTDOT;
return 0;
}
/*
* Take care of crossing a mounted-on vnode. On error, foundobj_ret will be
* vrele'd, but searchdir is left alone.
*/
static int
lookup_crossmount(struct namei_state *state,
struct vnode **searchdir_ret,
struct vnode **foundobj_ret,
bool *searchdir_locked)
{
struct componentname *cnp = state->cnp;
struct vnode *foundobj, *vp;
struct vnode *searchdir;
struct mount *mp;
int error, lktype;
searchdir = *searchdir_ret;
foundobj = *foundobj_ret;
error = 0;
KASSERT((cnp->cn_flags & NOCROSSMOUNT) == 0);
/* First, unlock searchdir (oof). */
	if (*searchdir_locked) {
		KASSERT(searchdir != NULL);
lktype = VOP_ISLOCKED(searchdir);
VOP_UNLOCK(searchdir);
*searchdir_locked = false;
} else {
lktype = LK_NONE;
}
/*
* Do an unlocked check to see if the vnode has been mounted on; if
* so find the root of the mounted file system.
*/
while (foundobj->v_type == VDIR && (mp = foundobj->v_mountedhere) != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
/*
* Try the namecache first. If that doesn't work, do
* it the hard way.
*/
if (cache_lookup_mount(foundobj, &vp)) {
vrele(foundobj);
foundobj = vp;
} else {
/* First get the vnodes mount stable. */
while ((mp = foundobj->v_mountedhere) != NULL) {
fstrans_start(mp);
if (fstrans_held(mp) &&
mp == foundobj->v_mountedhere) {
break;
}
fstrans_done(mp);
}
if (mp == NULL) {
break;
}
/*
* Now get a reference on the root vnode.
* XXX Future - maybe allow only VDIR here.
*/
error = VFS_ROOT(mp, LK_NONE, &vp);
/*
* If successful, enter it into the cache while
* holding the mount busy (competing with unmount).
*/
if (error == 0) {
cache_enter_mount(foundobj, vp);
}
/* Finally, drop references to foundobj & mountpoint. */
vrele(foundobj);
fstrans_done(mp);
if (error) {
foundobj = NULL;
break;
}
foundobj = vp;
}
/*
* Avoid locking vnodes from two filesystems because
* it's prone to deadlock, e.g. when using puffs.
* Also, it isn't a good idea to propagate slowness of
* a filesystem up to the root directory. For now,
* only handle the common case, where foundobj is
* VDIR.
*
* In this case set searchdir to null to avoid using
* it again. It is not correct to set searchdir ==
* foundobj here as that will confuse the caller.
* (See PR 40740.)
*/
if (searchdir == NULL) {
/* already been here once; do nothing further */
		} else if (foundobj->v_type == VDIR) {
			vrele(searchdir);
*searchdir_ret = searchdir = NULL;
lktype = LK_NONE;
}
}
/* If searchdir is still around, re-lock it. */
	if (error == 0 && lktype != LK_NONE) {
		vn_lock(searchdir, lktype | LK_RETRY);
*searchdir_locked = true;
}
*foundobj_ret = foundobj;
return error;
}
/*
* Determine the desired locking mode for the directory of a lookup.
*/
static int
lookup_lktype(struct vnode *searchdir, struct componentname *cnp)
{
/*
* If the file system supports VOP_LOOKUP() with a shared lock, and
* we are not making any modifications (nameiop LOOKUP) or this is
* not the last component then get a shared lock. Where we can't do
* fast-forwarded lookups (for example with layered file systems)
* then this is the fallback for reducing lock contention.
*/
	if ((searchdir->v_mount->mnt_iflag & IMNT_SHRLOOKUP) != 0 &&
	    (cnp->cn_nameiop == LOOKUP || (cnp->cn_flags & ISLASTCN) == 0)) {
return LK_SHARED;
} else {
return LK_EXCLUSIVE;
}
}
/*
* Call VOP_LOOKUP for a single lookup; return a new search directory
* (used when crossing mountpoints up or searching union mounts down) and
* the found object, which for create operations may be NULL on success.
*
* Note that the new search directory may be null, which means the
* searchdir was unlocked and released. This happens in the common case
* when crossing a mount point downwards, in order to avoid coupling
* locks between different file system volumes. Importantly, this can
* happen even if the call fails. (XXX: this is gross and should be
* tidied somehow.)
*/
static int
lookup_once(struct namei_state *state,
struct vnode *searchdir,
struct vnode **newsearchdir_ret,
struct vnode **foundobj_ret,
bool *newsearchdir_locked_ret)
{
struct vnode *tmpvn; /* scratch vnode */
struct vnode *foundobj; /* result */
struct lwp *l = curlwp;
bool searchdir_locked = false;
int error, lktype;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
KASSERT(cnp == &ndp->ni_cnd);
*newsearchdir_ret = searchdir;
/*
* Handle "..": two special cases.
* 1. If at root directory (e.g. after chroot)
* or at absolute root directory
* then ignore it so can't get out.
* 1a. If at the root of the emulation filesystem go to the real
* root. So "/../<path>" is always absolute.
* 1b. If we have somehow gotten out of a jail, warn
* and also ignore it so we can't get farther out.
* 2. If this vnode is the root of a mounted
* filesystem, then replace it with the
* vnode which was mounted on so we take the
* .. in the other file system.
*/
if (cnp->cn_flags & ISDOTDOT) {
struct proc *p = l->l_proc;
for (;;) {
if (searchdir == ndp->ni_rootdir ||
searchdir == rootvnode) {
foundobj = searchdir;
vref(foundobj);
*foundobj_ret = foundobj;
				if (cnp->cn_flags & LOCKPARENT) {
					lktype = lookup_lktype(searchdir, cnp);
vn_lock(searchdir, lktype | LK_RETRY);
searchdir_locked = true;
}
error = 0;
goto done;
}
if (ndp->ni_rootdir != rootvnode) {
int retval;
retval = vn_isunder(searchdir, ndp->ni_rootdir, l);
if (!retval) {
/* Oops! We got out of jail! */
log(LOG_WARNING,
"chrooted pid %d uid %d (%s) "
"detected outside of its chroot\n",
p->p_pid, kauth_cred_geteuid(l->l_cred),
p->p_comm);
/* Put us at the jail root. */
vrele(searchdir);
searchdir = NULL;
foundobj = ndp->ni_rootdir;
vref(foundobj);
vref(foundobj);
*newsearchdir_ret = foundobj;
*foundobj_ret = foundobj;
error = 0;
goto done;
}
}
if ((searchdir->v_vflag & VV_ROOT) == 0 ||
(cnp->cn_flags & NOCROSSMOUNT))
break;
tmpvn = searchdir;
searchdir = searchdir->v_mount->mnt_vnodecovered;
vref(searchdir);
vrele(tmpvn);
*newsearchdir_ret = searchdir;
}
}
lktype = lookup_lktype(searchdir, cnp);
/*
* We now have a segment name to search for, and a directory to search.
* Our vnode state here is that "searchdir" is held.
*/
unionlookup:
foundobj = NULL;
	if (!searchdir_locked) {
		vn_lock(searchdir, lktype | LK_RETRY);
searchdir_locked = true;
}
error = VOP_LOOKUP(searchdir, &foundobj, cnp);
if (error != 0) {
KASSERTMSG((foundobj == NULL),
"leaf `%s' should be empty but is %p",
cnp->cn_nameptr, foundobj);
#ifdef NAMEI_DIAGNOSTIC
printf("not found\n");
#endif /* NAMEI_DIAGNOSTIC */
/*
* If ENOLCK, the file system needs us to retry the lookup
* with an exclusive lock. It's likely nothing was found in
* cache and/or modifications need to be made.
*/
		if (error == ENOLCK) {
			KASSERT(VOP_ISLOCKED(searchdir) == LK_SHARED);
			KASSERT(searchdir_locked);
			if (vn_lock(searchdir, LK_UPGRADE | LK_NOWAIT)) {
				VOP_UNLOCK(searchdir);
searchdir_locked = false;
}
lktype = LK_EXCLUSIVE;
goto unionlookup;
}
if ((error == ENOENT) && (searchdir->v_vflag & VV_ROOT) &&
(searchdir->v_mount->mnt_flag & MNT_UNION)) {
tmpvn = searchdir;
searchdir = searchdir->v_mount->mnt_vnodecovered;
vref(searchdir);
vput(tmpvn);
searchdir_locked = false;
*newsearchdir_ret = searchdir;
goto unionlookup;
}
if (error != EJUSTRETURN)
goto done;
/*
* If this was not the last component, or there were trailing
* slashes, and we are not going to create a directory,
* then the name must exist.
*/
if ((cnp->cn_flags & (REQUIREDIR | CREATEDIR)) == REQUIREDIR) {
error = ENOENT;
goto done;
}
/*
* If creating and at end of pathname, then can consider
* allowing file to be created.
*/
if (state->rdonly) {
error = EROFS;
goto done;
}
/*
* We return success and a NULL foundobj to indicate
* that the entry doesn't currently exist, leaving a
* pointer to the (normally, locked) directory vnode
* as searchdir.
*/
*foundobj_ret = NULL;
error = 0;
goto done;
}
#ifdef NAMEI_DIAGNOSTIC
printf("found\n");
#endif /* NAMEI_DIAGNOSTIC */
/* Unlock, unless the caller needs the parent locked. */
if (searchdir != NULL) {
		KASSERT(searchdir_locked);
		if ((cnp->cn_flags & (ISLASTCN | LOCKPARENT)) !=
(ISLASTCN | LOCKPARENT)) {
VOP_UNLOCK(searchdir);
searchdir_locked = false;
}
} else {
KASSERT(!searchdir_locked);
}
*foundobj_ret = foundobj;
error = 0;
done:
*newsearchdir_locked_ret = searchdir_locked;
return error;
}
/*
 * Parse out the first path name component that we need to consider.
*
* While doing this, attempt to use the name cache to fast-forward through
* as many "easy" to find components of the path as possible.
*
* We use the namecache's node locks to form a chain, and avoid as many
* vnode references and locks as possible. In the ideal case, only the
* final vnode will have its reference count adjusted and lock taken.
*/
static int
lookup_fastforward(struct namei_state *state, struct vnode **searchdir_ret,
struct vnode **foundobj_ret)
{
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
krwlock_t *plock;
struct vnode *foundobj, *searchdir;
int error, error2;
size_t oldpathlen;
const char *oldnameptr;
bool terminal;
/*
* Eat as many path name components as possible before giving up and
* letting lookup_once() handle it. Remember the starting point in
* case we can't get vnode references and need to roll back.
*/
plock = NULL;
searchdir = *searchdir_ret;
oldnameptr = cnp->cn_nameptr;
oldpathlen = ndp->ni_pathlen;
terminal = false;
for (;;) {
foundobj = NULL;
/*
* Get the next component name. There should be no slashes
* here, and we shouldn't have looped around if we were
* done.
*/
		KASSERT(cnp->cn_nameptr[0] != '/');
		KASSERT(cnp->cn_nameptr[0] != '\0');
		if ((error = lookup_parsepath(state, searchdir)) != 0) {
break;
}
/*
* Can't deal with DOTDOT lookups if NOCROSSMOUNT or the
* lookup is chrooted.
*/
		if ((cnp->cn_flags & ISDOTDOT) != 0) {
			if ((searchdir->v_vflag & VV_ROOT) != 0 &&
(cnp->cn_flags & NOCROSSMOUNT)) {
error = EOPNOTSUPP;
break;
}
if (ndp->ni_rootdir != rootvnode) {
error = EOPNOTSUPP;
break;
}
}
/*
* Can't deal with last component when modifying; this needs
* searchdir locked and VOP_LOOKUP() called (which can and
* does modify state, despite the name). NB: this case means
* terminal is never set true when LOCKPARENT.
*/
		if ((cnp->cn_flags & ISLASTCN) != 0) {
			if (cnp->cn_nameiop != LOOKUP ||
(cnp->cn_flags & LOCKPARENT) != 0) {
error = EOPNOTSUPP;
break;
}
}
/*
* Good, now look for it in cache. cache_lookup_linked()
* will fail if there's nothing there, or if there's no
* ownership info for the directory, or if the user doesn't
* have permission to look up files in this directory.
*/
if (!cache_lookup_linked(searchdir, cnp->cn_nameptr,
cnp->cn_namelen, &foundobj, &plock, cnp->cn_cred)) {
error = EOPNOTSUPP;
break;
}
		KASSERT(plock != NULL);
		KASSERT(rw_lock_held(plock));
/*
* Scored a hit. Negative is good too (ENOENT). If there's
* a '-o union' mount here, punt and let lookup_once() deal
* with it.
*/
if (foundobj == NULL) {
if ((searchdir->v_vflag & VV_ROOT) != 0 &&
(searchdir->v_mount->mnt_flag & MNT_UNION) != 0) {
error = EOPNOTSUPP;
} else {
error = ENOENT;
terminal = ((cnp->cn_flags & ISLASTCN) != 0);
}
break;
}
/*
* Stop and get a hold on the vnode if we've encountered
		 * something other than a directory.
*/
if (foundobj->v_type != VDIR) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
} else {
terminal = (foundobj->v_type != VLNK &&
(cnp->cn_flags & ISLASTCN) != 0);
}
break;
}
/*
* Try to cross mountpoints, bearing in mind that they can
* be stacked. If at any point we can't go further, stop
* and try to get a reference on the vnode. If we are able
* to get a ref then lookup_crossmount() will take care of
* it, otherwise we'll fall through to lookup_once().
*/
		if (foundobj->v_mountedhere != NULL) {
			while (foundobj->v_mountedhere != NULL &&
			    (cnp->cn_flags & NOCROSSMOUNT) == 0 &&
			    cache_cross_mount(&foundobj, &plock)) {
				KASSERT(foundobj != NULL);
				KASSERT(foundobj->v_type == VDIR);
}
if (foundobj->v_mountedhere != NULL) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
}
break;
} else {
searchdir = NULL;
}
}
/*
* Time to stop if we found the last component & traversed
* all mounts.
*/
if ((cnp->cn_flags & ISLASTCN) != 0) {
error = vcache_tryvget(foundobj);
if (error != 0) {
foundobj = NULL;
error = EOPNOTSUPP;
} else {
terminal = (foundobj->v_type != VLNK);
}
break;
}
/*
* Otherwise, we're still in business. Set the found VDIR
* vnode as the search dir for the next component and
* continue on to it.
*/
cnp->cn_nameptr = ndp->ni_next;
searchdir = foundobj;
}
if (terminal) {
/*
* If we exited the loop above having successfully located
* the last component with a zero error code, and it's not a
* symbolic link, then the parent directory is not needed.
* Release reference to the starting parent and make the
* terminal parent disappear into thin air.
*/
KASSERT(plock != NULL);
rw_exit(plock);
vrele(*searchdir_ret);
*searchdir_ret = NULL;
} else if (searchdir != *searchdir_ret) {
/*
* Otherwise we need to return the parent. If we ended up
* with a new search dir, ref it before dropping the
* namecache's lock. The lock prevents both searchdir and
* foundobj from disappearing. If we can't ref the new
* searchdir, we have a bit of a problem. Roll back the
* fastforward to the beginning and let lookup_once() take
* care of it.
*/
if (searchdir == NULL) {
/*
* It's possible for searchdir to be NULL in the
* case of a root vnode being reclaimed while
* trying to cross a mount.
*/
error2 = EOPNOTSUPP;
} else {
error2 = vcache_tryvget(searchdir);
}
KASSERT(plock != NULL);
rw_exit(plock);
if (__predict_true(error2 == 0)) {
/* Returning new searchdir, and maybe new foundobj. */
vrele(*searchdir_ret);
*searchdir_ret = searchdir;
} else {
/* Returning nothing. */
			if (foundobj != NULL) {
				vrele(foundobj);
foundobj = NULL;
}
cnp->cn_nameptr = oldnameptr;
ndp->ni_pathlen = oldpathlen;
error = lookup_parsepath(state, *searchdir_ret);
if (error == 0) {
error = EOPNOTSUPP;
}
}
} else if (plock != NULL) {
/* Drop any namecache lock still held. */
rw_exit(plock);
}
KASSERT(error == 0 ? foundobj != NULL : foundobj == NULL);
*foundobj_ret = foundobj;
return error;
}
//////////////////////////////
/*
* Do a complete path search from a single root directory.
* (This is called up to twice if TRYEMULROOT is in effect.)
*/
static int
namei_oneroot(struct namei_state *state,
int neverfollow, int inhibitmagic, int isnfsd)
{
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
struct vnode *searchdir, *foundobj;
bool searchdir_locked = false;
int error;
error = namei_start(state, isnfsd, &searchdir);
if (error) {
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
return error;
}
KASSERT(searchdir->v_type == VDIR);
/*
* Setup: break out flag bits into variables.
*/
state->docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
if (cnp->cn_nameiop == DELETE)
state->docache = 0;
state->rdonly = cnp->cn_flags & RDONLY;
/*
* Keep going until we run out of path components.
*/
cnp->cn_nameptr = ndp->ni_pnbuf;
/* drop leading slashes (already used them to choose startdir) */
while (cnp->cn_nameptr[0] == '/') {
cnp->cn_nameptr++;
ndp->ni_pathlen--;
}
/* was it just "/"? */
if (cnp->cn_nameptr[0] == '\0') {
foundobj = searchdir;
searchdir = NULL;
cnp->cn_flags |= ISLASTCN;
/* bleh */
goto skiploop;
}
for (;;) {
		KASSERT(searchdir != NULL);
		KASSERT(!searchdir_locked);
/*
		 * Parse out the first path name component that we need
		 * to consider. While doing this, attempt to use the name
* cache to fast-forward through as many "easy" to find
* components of the path as possible.
*/
error = lookup_fastforward(state, &searchdir, &foundobj);
/*
* If we didn't get a good answer from the namecache, then
* go directly to the file system.
*/
		if (error == EOPNOTSUPP) {
			error = lookup_once(state, searchdir, &searchdir,
&foundobj, &searchdir_locked);
}
/*
* If the vnode we found is mounted on, then cross the mount
* and get the root vnode in foundobj. If this encounters
* an error, it will dispose of foundobj, but searchdir is
* untouched.
*/
		if (error == 0 && foundobj != NULL &&
		    foundobj->v_type == VDIR &&
		    foundobj->v_mountedhere != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
error = lookup_crossmount(state, &searchdir,
&foundobj, &searchdir_locked);
}
if (error) {
if (searchdir != NULL) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
}
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
/*
* Note that if we're doing TRYEMULROOT we can
* retry with the normal root. Where this is
* currently set matches previous practice,
* but the previous practice didn't make much
* sense and somebody should sit down and
* figure out which cases should cause retry
* and which shouldn't. XXX.
*/
state->attempt_retry = 1;
return (error);
}
if (foundobj == NULL) {
/*
* Success with no object returned means we're
* creating something and it isn't already
* there. Break out of the main loop now so
* the code below doesn't have to test for
* foundobj == NULL.
*/
/* lookup_once can't have dropped the searchdir */
KASSERT(searchdir != NULL ||
(cnp->cn_flags & ISLASTCN) != 0);
break;
}
/*
* Check for symbolic link. If we've reached one,
* follow it, unless we aren't supposed to. Back up
* over any slashes that we skipped, as we will need
* them again.
*/
if (namei_atsymlink(state, foundobj)) {
/* Don't need searchdir locked any more. */
if (searchdir_locked) {
searchdir_locked = false;
VOP_UNLOCK(searchdir);
}
ndp->ni_pathlen += state->slashes;
ndp->ni_next -= state->slashes;
if (neverfollow) {
error = EINVAL;
} else if (searchdir == NULL) {
/*
* dholland 20160410: lookup_once only
* drops searchdir if it crossed a
* mount point. Therefore, if we get
* here it means we crossed a mount
* point to a mounted filesystem whose
* root vnode is a symlink. In theory
* we could continue at this point by
* using the pre-crossing searchdir
* (e.g. just take out an extra
* reference on it before calling
* lookup_once so we still have it),
* but this will make an ugly mess and
* it should never happen in practice
* as only badly broken filesystems
* have non-directory root vnodes. (I
* have seen this sort of thing with
* NFS occasionally but even then it
* means something's badly wrong.)
*/
error = ENOTDIR;
} else {
/*
* dholland 20110410: if we're at a
* union mount it might make sense to
* use the top of the union stack here
* rather than the layer we found the
* symlink in. (FUTURE)
*/
error = namei_follow(state, inhibitmagic,
searchdir, foundobj,
&searchdir);
}
if (error) {
				KASSERT(searchdir != foundobj);
				if (searchdir != NULL) {
					vrele(searchdir);
}
vrele(foundobj);
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
return error;
}
vrele(foundobj);
foundobj = NULL;
/*
* If we followed a symlink to `/' and there
* are no more components after the symlink,
* we're done with the loop and what we found
* is the searchdir.
*/
			if (cnp->cn_nameptr[0] == '\0') {
				KASSERT(searchdir != NULL);
foundobj = searchdir;
searchdir = NULL;
cnp->cn_flags |= ISLASTCN;
break;
}
continue;
}
/*
* Not a symbolic link.
*
* Check for directory, if the component was
* followed by a series of slashes.
*/
if ((foundobj->v_type != VDIR) &&
(cnp->cn_flags & REQUIREDIR)) {
KASSERT(foundobj != searchdir);
if (searchdir) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
} else {
KASSERT(!searchdir_locked);
}
vrele(foundobj);
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
return ENOTDIR;
}
/*
* Stop if we've reached the last component.
*/
if (cnp->cn_flags & ISLASTCN) {
break;
}
/*
* Continue with the next component.
*/
cnp->cn_nameptr = ndp->ni_next;
if (searchdir != NULL) {
if (searchdir_locked) {
searchdir_locked = false;
vput(searchdir);
} else {
vrele(searchdir);
}
}
searchdir = foundobj;
foundobj = NULL;
}
KASSERT((cnp->cn_flags & LOCKPARENT) == 0 || searchdir == NULL ||
VOP_ISLOCKED(searchdir) == LK_EXCLUSIVE);
skiploop:
	if (foundobj != NULL) {
		if (foundobj == ndp->ni_erootdir) {
/*
* We are about to return the emulation root.
* This isn't a good idea because code might
* repeatedly lookup ".." until the file
* matches that returned for "/" and loop
* forever. So convert it to the real root.
*/
if (searchdir != NULL) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = ndp->ni_rootdir;
vref(foundobj);
}
/*
* If the caller requested the parent node (i.e. it's
* a CREATE, DELETE, or RENAME), and we don't have one
* (because this is the root directory, or we crossed
* a mount point), then we must fail.
*
* 20210604 dholland when NONEXCLHACK is set (open
* with O_CREAT but not O_EXCL) skip this logic. Since
* we have a foundobj, open will not be creating, so
* it doesn't actually need or use the searchdir, so
* it's ok to return it even if it's on a different
* volume, and it's also ok to return NULL; by setting
* NONEXCLHACK the open code promises to cope with
* those cases correctly. (That is, it should do what
* it would do anyway, that is, just release the
* searchdir, except not crash if it's null.) This is
* needed because otherwise opening mountpoints with
* O_CREAT but not O_EXCL fails... which is a silly
* thing to do but ought to work. (This whole issue
* came to light because 3rd party code wanted to open
* certain procfs nodes with O_CREAT for some 3rd
* party reason, and it failed.)
*
* Note that NONEXCLHACK is properly a different
* nameiop (it is partway between LOOKUP and CREATE)
* but it was stuffed in as a flag instead to make the
* resulting patch less invasive for pullup. Blah.
*/
if (cnp->cn_nameiop != LOOKUP &&
(searchdir == NULL ||
searchdir->v_mount != foundobj->v_mount) &&
(cnp->cn_flags & NONEXCLHACK) == 0) {
if (searchdir) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = NULL;
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
switch (cnp->cn_nameiop) {
case CREATE:
return EEXIST;
case DELETE:
case RENAME:
return EBUSY;
default:
break;
}
panic("Invalid nameiop\n");
}
/*
* Disallow directory write attempts on read-only lookups.
* Prefers EEXIST over EROFS for the CREATE case.
*/
if (state->rdonly &&
(cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
if (searchdir) {
if (searchdir_locked) {
vput(searchdir);
searchdir_locked = false;
} else {
vrele(searchdir);
}
searchdir = NULL;
}
vrele(foundobj);
foundobj = NULL;
ndp->ni_dvp = NULL;
ndp->ni_vp = NULL;
state->attempt_retry = 1;
return EROFS;
}
/* Lock the leaf node if requested. */
if ((cnp->cn_flags & (LOCKLEAF | LOCKPARENT)) == LOCKPARENT &&
searchdir == foundobj) {
/*
* Note: if LOCKPARENT but not LOCKLEAF is
* set, and searchdir == foundobj, this code
* necessarily unlocks the parent as well as
* the leaf. That is, just because you specify
* LOCKPARENT doesn't mean you necessarily get
* a locked parent vnode. The code in
* vfs_syscalls.c, and possibly elsewhere,
* that uses this combination "knows" this, so
* it can't be safely changed. Feh. XXX
*/
KASSERT(searchdir_locked);
VOP_UNLOCK(searchdir);
searchdir_locked = false;
		} else if ((cnp->cn_flags & LOCKLEAF) != 0 &&
		    (searchdir != foundobj ||
(cnp->cn_flags & LOCKPARENT) == 0)) {
const int lktype = (cnp->cn_flags & LOCKSHARED) != 0 ?
LK_SHARED : LK_EXCLUSIVE;
vn_lock(foundobj, lktype | LK_RETRY);
}
}
/*
* Done.
*/
/*
* If LOCKPARENT is not set, the parent directory isn't returned.
*/
	if ((cnp->cn_flags & LOCKPARENT) == 0 && searchdir != NULL) {
		vrele(searchdir);
searchdir = NULL;
}
ndp->ni_dvp = searchdir;
ndp->ni_vp = foundobj;
return 0;
}
/*
* Do namei; wrapper layer that handles TRYEMULROOT.
*/
static int
namei_tryemulroot(struct namei_state *state,
int neverfollow, int inhibitmagic, int isnfsd)
{
int error;
struct nameidata *ndp = state->ndp;
struct componentname *cnp = state->cnp;
const char *savepath = NULL;
	KASSERT(cnp == &ndp->ni_cnd);

	if (cnp->cn_flags & TRYEMULROOT) {
		savepath = pathbuf_stringcopy_get(ndp->ni_pathbuf);
}
emul_retry:
state->attempt_retry = 0;
	error = namei_oneroot(state, neverfollow, inhibitmagic, isnfsd);
	if (error) {
/*
* Once namei has started up, the existence of ni_erootdir
* tells us whether we're working from an emulation root.
* The TRYEMULROOT flag isn't necessarily authoritative.
*/
if (ndp->ni_erootdir != NULL && state->attempt_retry) {
/* Retry the whole thing using the normal root */
cnp->cn_flags &= ~TRYEMULROOT;
state->attempt_retry = 0;
/* kinda gross */
strcpy(ndp->ni_pathbuf->pb_path, savepath);
pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
savepath = NULL;
goto emul_retry;
}
}
	if (savepath != NULL) {
		pathbuf_stringcopy_put(ndp->ni_pathbuf, savepath);
}
return error;
}
/*
* External interface.
*/
int
namei(struct nameidata *ndp)
{
struct namei_state state;
int error;
namei_init(&state, ndp);
error = namei_tryemulroot(&state,
0/*!neverfollow*/, 0/*!inhibitmagic*/,
0/*isnfsd*/);
	namei_cleanup(&state);

	if (error) {
/* make sure no stray refs leak out */
		KASSERT(ndp->ni_dvp == NULL);
		KASSERT(ndp->ni_vp == NULL);
}
return error;
}
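/*
 * Illustrative sketch (not part of the original file): a plain lookup
 * through the external interface above, asking for the leaf vnode
 * locked.  With LOCKLEAF the returned ni_vp is locked as well as
 * referenced, so the caller vput()s it (or VOP_UNLOCK() + vrele())
 * when done; without LOCKLEAF a plain vrele() suffices.  The function
 * name is invented for this example.
 */
static int
example_namei_lockleaf(struct pathbuf *pb)
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
	error = namei(&nd);
	if (error)
		return error;

	/* ... examine nd.ni_vp while it is locked ... */

	vput(nd.ni_vp);		/* drops both the lock and the reference */
	return 0;
}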
////////////////////////////////////////////////////////////
/*
* External interface used by nfsd. This is basically different from
* namei only in that it has the ability to pass in the "current
* directory", and uses an extra flag "neverfollow" for which there's
* no physical flag defined in namei.h. (There used to be a cut&paste
* copy of about half of namei in nfsd to allow these minor
* adjustments to exist.)
*
* XXX: the namei interface should be adjusted so nfsd can just use
* ordinary namei().
*/
int
lookup_for_nfsd(struct nameidata *ndp, struct vnode *forcecwd, int neverfollow)
{
struct namei_state state;
int error;
KASSERT(ndp->ni_atdir == NULL);
ndp->ni_atdir = forcecwd;
namei_init(&state, ndp);
error = namei_tryemulroot(&state,
neverfollow, 1/*inhibitmagic*/, 1/*isnfsd*/);
namei_cleanup(&state);
if (error) {
/* make sure no stray refs leak out */
KASSERT(ndp->ni_dvp == NULL);
KASSERT(ndp->ni_vp == NULL);
}
return error;
}
/*
* A second external interface used by nfsd. This turns out to be a
* single lookup used by the WebNFS code (ha!) to get "index.html" or
* equivalent when asked for a directory. It should eventually evolve
* into some kind of namei_once() call; for the time being it's kind
* of a mess. XXX.
*
* dholland 20110109: I don't think it works, and I don't think it
* worked before I started hacking and slashing either, and I doubt
* anyone will ever notice.
*/
/*
* Internals. This calls lookup_once() after setting up the assorted
* pieces of state the way they ought to be.
*/
static int
do_lookup_for_nfsd_index(struct namei_state *state)
{
int error;
struct componentname *cnp = state->cnp;
struct nameidata *ndp = state->ndp;
struct vnode *startdir;
struct vnode *foundobj;
bool startdir_locked;
const char *cp; /* pointer into pathname argument */
KASSERT(cnp == &ndp->ni_cnd);
startdir = state->ndp->ni_atdir;
cnp->cn_nameptr = ndp->ni_pnbuf;
state->docache = 1;
state->rdonly = cnp->cn_flags & RDONLY;
ndp->ni_dvp = NULL;
error = VOP_PARSEPATH(startdir, cnp->cn_nameptr, &cnp->cn_namelen);
if (error) {
return error;
}
cp = cnp->cn_nameptr + cnp->cn_namelen;
KASSERT(cnp->cn_namelen <= KERNEL_NAME_MAX);
ndp->ni_pathlen -= cnp->cn_namelen;
ndp->ni_next = cp;
state->slashes = 0;
cnp->cn_flags &= ~REQUIREDIR;
cnp->cn_flags |= MAKEENTRY|ISLASTCN;
if (cnp->cn_namelen == 2 &&
cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.')
cnp->cn_flags |= ISDOTDOT;
else
cnp->cn_flags &= ~ISDOTDOT;
/*
* Because lookup_once can change the startdir, we need our
* own reference to it to avoid consuming the caller's.
*/
vref(startdir);
error = lookup_once(state, startdir, &startdir, &foundobj,
&startdir_locked);
KASSERT((cnp->cn_flags & LOCKPARENT) == 0);
if (startdir_locked) {
VOP_UNLOCK(startdir);
startdir_locked = false;
}
/*
* If the vnode we found is mounted on, then cross the mount and get
* the root vnode in foundobj. If this encounters an error, it will
* dispose of foundobj, but searchdir is untouched.
*/
if (error == 0 && foundobj != NULL &&
foundobj->v_type == VDIR &&
foundobj->v_mountedhere != NULL &&
(cnp->cn_flags & NOCROSSMOUNT) == 0) {
error = lookup_crossmount(state, &startdir, &foundobj,
&startdir_locked);
}
/* Now toss startdir and see if we have an error. */
if (startdir != NULL)
vrele(startdir);
if (error)
foundobj = NULL;
else if (foundobj != NULL && (cnp->cn_flags & LOCKLEAF) != 0)
vn_lock(foundobj, LK_EXCLUSIVE | LK_RETRY);
ndp->ni_vp = foundobj;
return (error);
}
/*
* External interface. The partitioning between this function and the
* above isn't very clear - the above function exists mostly so code
* that uses "state->" can be shuffled around without having to change
* it to "state.".
*/
int
lookup_for_nfsd_index(struct nameidata *ndp, struct vnode *startdir)
{
struct namei_state state;
int error;
KASSERT(ndp->ni_atdir == NULL);
ndp->ni_atdir = startdir;
/*
* Note: the name sent in here (is not|should not be) allowed
* to contain a slash.
*/
if (strlen(ndp->ni_pathbuf->pb_path) > KERNEL_NAME_MAX) {
return ENAMETOOLONG;
}
if (strchr(ndp->ni_pathbuf->pb_path, '/')) {
return EINVAL;
}
ndp->ni_pathlen = strlen(ndp->ni_pathbuf->pb_path) + 1;
ndp->ni_pnbuf = NULL;
ndp->ni_cnd.cn_nameptr = NULL;
namei_init(&state, ndp);
error = do_lookup_for_nfsd_index(&state);
namei_cleanup(&state);
return error;
}
////////////////////////////////////////////////////////////
/*
* Reacquire a path name component.
* dvp is locked on entry and exit.
* *vpp is locked on exit unless it's NULL.
*/
int
relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp, int dummy)
{
int rdonly; /* lookup read-only flag bit */
int error = 0;
#ifdef DEBUG
size_t newlen; /* DEBUG: check name len */
const char *cp; /* DEBUG: check name ptr */
#endif /* DEBUG */
(void)dummy;
/*
* Setup: break out flag bits into variables.
*/
rdonly = cnp->cn_flags & RDONLY;
/*
* Search a new directory.
*
* The cn_hash value is for use by vfs_cache.
* The last component of the filename is left accessible via
* cnp->cn_nameptr for callers that need the name. Callers needing
* the name set the SAVENAME flag. When done, they assume
* responsibility for freeing the pathname buffer.
*/
#ifdef DEBUG
#if 0
cp = NULL;
newhash = namei_hash(cnp->cn_nameptr, &cp);
if ((uint32_t)newhash != (uint32_t)cnp->cn_hash)
panic("relookup: bad hash");
#endif
error = VOP_PARSEPATH(dvp, cnp->cn_nameptr, &newlen);
if (error) {
panic("relookup: parsepath failed with error %d", error);
}
if (cnp->cn_namelen != newlen)
panic("relookup: bad len");
cp = cnp->cn_nameptr + cnp->cn_namelen;
while (*cp == '/')
cp++;
if (*cp != 0)
panic("relookup: not last component");
#endif /* DEBUG */
/*
* Check for degenerate name (e.g. / or "")
* which is a way of talking about a directory,
* e.g. like "/." or ".".
*/
if (cnp->cn_nameptr[0] == '\0')
panic("relookup: null name");
if (cnp->cn_flags & ISDOTDOT)
panic("relookup: lookup on dot-dot");
/*
* We now have a segment name to search for, and a directory to search.
*/
*vpp = NULL;
error = VOP_LOOKUP(dvp, vpp, cnp);
	if (error != 0) {
		KASSERTMSG((*vpp == NULL),
"leaf `%s' should be empty but is %p",
cnp->cn_nameptr, *vpp);
if (error != EJUSTRETURN)
goto bad;
}
/*
* Check for symbolic link
*/
KASSERTMSG((*vpp == NULL || (*vpp)->v_type != VLNK ||
(cnp->cn_flags & FOLLOW) == 0),
"relookup: symlink found");
/*
* Check for read-only lookups.
*/
if (rdonly && cnp->cn_nameiop != LOOKUP) {
error = EROFS;
		if (*vpp) {
			vrele(*vpp);
}
goto bad;
}
/*
* Lock result.
*/
if (*vpp && *vpp != dvp) {
error = vn_lock(*vpp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(*vpp);
goto bad;
}
}
return (0);
bad:
*vpp = NULL;
return (error);
}
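/*
 * Illustrative sketch (not part of the original file): the locking
 * contract of relookup() as stated in the comment above it.  dvp must
 * be locked by the caller and stays locked; on success *vpp is either
 * NULL (the "ok to create" case) or a locked, referenced vnode.  The
 * corner case where *vpp comes back equal to dvp is ignored here, and
 * the function name is invented for this example.
 */
static int
example_relookup_usage(struct vnode *dvp, struct componentname *cnp)
{
	struct vnode *vp;
	int error;

	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);

	error = relookup(dvp, &vp, cnp, 0);
	if (error != 0)
		return error;		/* dvp is still locked */

	if (vp != NULL)
		vput(vp);	/* relookup returned it locked and referenced */
	return 0;
}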
/*
* namei_simple - simple forms of namei.
*
* These are wrappers to allow the simple case callers of namei to be
* left alone while everything else changes under them.
*/
/* Flags */
struct namei_simple_flags_type {
int dummy;
};
static const struct namei_simple_flags_type ns_nn, ns_nt, ns_fn, ns_ft;
const namei_simple_flags_t NSM_NOFOLLOW_NOEMULROOT = &ns_nn;
const namei_simple_flags_t NSM_NOFOLLOW_TRYEMULROOT = &ns_nt;
const namei_simple_flags_t NSM_FOLLOW_NOEMULROOT = &ns_fn;
const namei_simple_flags_t NSM_FOLLOW_TRYEMULROOT = &ns_ft;
static
int
namei_simple_convert_flags(namei_simple_flags_t sflags)
{
if (sflags == NSM_NOFOLLOW_NOEMULROOT)
return NOFOLLOW | 0;
if (sflags == NSM_NOFOLLOW_TRYEMULROOT)
return NOFOLLOW | TRYEMULROOT;
if (sflags == NSM_FOLLOW_NOEMULROOT)
return FOLLOW | 0;
if (sflags == NSM_FOLLOW_TRYEMULROOT)
return FOLLOW | TRYEMULROOT;
panic("namei_simple_convert_flags: bogus sflags\n");
return 0;
}
int
namei_simple_kernel(const char *path, namei_simple_flags_t sflags,
struct vnode **vp_ret)
{
return nameiat_simple_kernel(NULL, path, sflags, vp_ret);
}
int
nameiat_simple_kernel(struct vnode *dvp, const char *path,
namei_simple_flags_t sflags, struct vnode **vp_ret)
{
struct nameidata nd;
struct pathbuf *pb;
int err;
pb = pathbuf_create(path);
if (pb == NULL) {
return ENOMEM;
}
NDINIT(&nd,
LOOKUP,
namei_simple_convert_flags(sflags),
pb);
if (dvp != NULL)
NDAT(&nd, dvp);
err = namei(&nd);
if (err != 0) {
pathbuf_destroy(pb);
return err;
}
*vp_ret = nd.ni_vp;
pathbuf_destroy(pb);
return 0;
}
int
namei_simple_user(const char *path, namei_simple_flags_t sflags,
struct vnode **vp_ret)
{
return nameiat_simple_user(NULL, path, sflags, vp_ret);
}
int
nameiat_simple_user(struct vnode *dvp, const char *path,
namei_simple_flags_t sflags, struct vnode **vp_ret)
{
struct pathbuf *pb;
struct nameidata nd;
int err;
err = pathbuf_copyin(path, &pb);
if (err) {
return err;
}
NDINIT(&nd,
LOOKUP,
namei_simple_convert_flags(sflags),
pb);
if (dvp != NULL)
NDAT(&nd, dvp);
err = namei(&nd);
if (err != 0) {
pathbuf_destroy(pb);
return err;
}
*vp_ret = nd.ni_vp;
pathbuf_destroy(pb);
return 0;
}
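/*
 * Illustrative usage sketch (not part of the build): a typical caller
 * resolves a kernel-space path to a referenced, unlocked vnode with
 * namei_simple_kernel() and drops the reference with vrele() when done.
 * The path and function name below are hypothetical.
 */
#if 0
static int
example_lookup_console(void)
{
	struct vnode *vp;
	int error;

	/* Follow symlinks; do not apply an emulation root. */
	error = namei_simple_kernel("/dev/console",
	    NSM_FOLLOW_NOEMULROOT, &vp);
	if (error != 0)
		return error;

	/* ... use vp (referenced, unlocked) ... */

	vrele(vp);
	return 0;
}
#endif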
/* $NetBSD: ufs_lookup.c,v 1.158 2023/08/10 20:49:20 mrg Exp $ */
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_lookup.c 8.9 (Berkeley) 8/11/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_lookup.c,v 1.158 2023/08/10 20:49:20 mrg Exp $");
#ifdef _KERNEL_OPT
#include "opt_ffs.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/buf.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/wapbl.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/dir.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <miscfs/genfs/genfs.h>
#ifdef DIAGNOSTIC
int dirchk = 1;
#else
int dirchk = 0;
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
# define ENDIANSWAP(needswap) ((needswap) == 0)
#else
# define ENDIANSWAP(needswap) ((needswap) != 0)
#endif
#define NAMLEN(fsfmt, needswap, dp) \
((fsfmt) && ENDIANSWAP(needswap) ? (dp)->d_type : (dp)->d_namlen)
static void
ufs_dirswap(struct direct *dirp)
{
uint8_t tmp = dirp->d_namlen;
dirp->d_namlen = dirp->d_type;
dirp->d_type = tmp;
}
struct slotinfo {
enum {
NONE, /* need to search a slot for our new entry */
COMPACT, /* a compaction can make a slot in the current
DIRBLKSIZ block */
FOUND, /* found a slot (or no need to search) */
} status;
doff_t offset; /* offset of area with free space.
a special value -1 for invalid */
int size; /* size of area at slotoffset */
int freespace; /* accumulated amount of space free in
the current DIRBLKSIZ block */
int needed; /* size of the entry we're seeking */
};
static void
calc_count(struct ufs_lookup_results *results, int dirblksiz, doff_t prevoff)
{
if ((results->ulr_offset & (dirblksiz - 1)) == 0)
results->ulr_count = 0;
else
results->ulr_count = results->ulr_offset - prevoff;
}
static void
slot_init(struct slotinfo *slot)
{
slot->status = FOUND;
slot->offset = -1;
slot->freespace = slot->size = slot->needed = 0;
}
#ifdef UFS_DIRHASH
static doff_t
slot_findfree(struct slotinfo *slot, struct inode *dp)
{
if (slot->status == FOUND)
return dp->i_size;
slot->offset = ufsdirhash_findfree(dp, slot->needed, &slot->size);
if (slot->offset < 0)
return dp->i_size;
slot->status = COMPACT;
doff_t enduseful = ufsdirhash_enduseful(dp);
if (enduseful < 0)
return dp->i_size;
return enduseful;
}
#endif
static void
slot_white(struct slotinfo *slot, uint16_t reclen,
struct ufs_lookup_results *results)
{
slot->status = FOUND;
slot->offset = results->ulr_offset;
slot->size = reclen;
results->ulr_reclen = slot->size;
}
static void
slot_update(struct slotinfo *slot, int size, uint16_t reclen, doff_t offset)
{
if (size >= slot->needed) {
slot->status = FOUND;
slot->offset = offset;
slot->size = reclen;
} else if (slot->status == NONE) {
slot->freespace += size;
if (slot->offset == -1)
slot->offset = offset;
if (slot->freespace >= slot->needed) {
slot->status = COMPACT;
slot->size = offset + reclen - slot->offset;
}
}
}
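/*
 * Worked example (illustrative numbers): suppose we need a 16-byte slot
 * (slot->needed == 16) and scan three entries in one DIRBLKSIZ block
 * with 8, 4 and 8 spare bytes respectively.  No single entry satisfies
 * the request, so slot->freespace accumulates 8, then 12, then 20; once
 * it reaches 16 the status becomes COMPACT and slot->size covers the
 * whole range from the first entry with spare space to the end of the
 * current entry, i.e. the region ufs_dircompact() will later squeeze
 * together.
 */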
/*
* Return an indication of where the new directory entry should be put.
* If we didn't find a slot, then set results->ulr_count to 0 indicating
* that the new slot belongs at the end of the directory. If we found a slot,
* then the new entry can be put in the range from results->ulr_offset to
* results->ulr_offset + results->ulr_count.
*/
static int
slot_estimate(const struct slotinfo *slot, int dirblksiz, int nameiop,
doff_t prevoff, doff_t enduseful, const struct inode *ip,
struct ufs_lookup_results *results)
{
if (slot->status == NONE) {
results->ulr_offset = roundup(ip->i_size, dirblksiz);
results->ulr_count = 0;
enduseful = results->ulr_offset;
} else if (nameiop == DELETE) {
results->ulr_offset = slot->offset;
calc_count(results, dirblksiz, prevoff);
} else {
results->ulr_offset = slot->offset;
results->ulr_count = slot->size;
if (enduseful < slot->offset + slot->size)
enduseful = slot->offset + slot->size;
}
results->ulr_endoff = roundup(enduseful, dirblksiz);
#if 0 /* commented out by dbj. none of the on disk fields changed */
ip->i_flag |= IN_CHANGE | IN_UPDATE;
#endif
return EJUSTRETURN;
}
/*
* Check if we can delete inode tdp in directory vdp with inode ip and creds.
*/
static int
ufs_can_delete(struct vnode *tdp, struct vnode *vdp, struct inode *ip,
kauth_cred_t cred)
{
int error;
#ifdef UFS_ACL
/*
* NFSv4 Minor Version 1, draft-ietf-nfsv4-minorversion1-03.txt
*
* 3.16.2.1. ACE4_DELETE vs. ACE4_DELETE_CHILD
*/
/*
* XXX: Is this check required?
*/
error = VOP_ACCESS(vdp, VEXEC, cred);
if (error)
goto out;
#if 0
/* Moved to ufs_remove, ufs_rmdir because they hold the lock */
error = VOP_ACCESSX(tdp, VDELETE, cred);
if (error == 0)
return (0);
#endif
error = VOP_ACCESSX(vdp, VDELETE_CHILD, cred);
if (error == 0)
return (0);
error = VOP_ACCESSX(vdp, VEXPLICIT_DENY | VDELETE_CHILD, cred);
if (error)
goto out;
#endif /* UFS_ACL */
/*
* Write access to directory required to delete files.
*/
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
if (!(ip->i_mode & ISVTX))
return 0;
/*
* If directory is "sticky", then user must own
* the directory, or the file in it, else she
* may not delete it (unless she's root). This
* implements append-only directories.
*/
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, tdp, vdp,
genfs_can_sticky(vdp, cred, ip->i_uid, VTOI(tdp)->i_uid));
if (error) {
error = EPERM; /* XXX why override the error? */
goto out;
}
return 0;
out:
vrele(tdp);
return error;
}
static int
ufs_getino(struct vnode *vdp, struct inode *ip, ino_t foundino,
struct vnode **tdp, bool same)
{
if (ip->i_number == foundino) {
if (same)
return EISDIR;
vref(vdp);
*tdp = vdp;
return 0;
}
return vcache_get(vdp->v_mount, &foundino, sizeof(foundino), tdp);
}
/*
* Convert a component of a pathname into a pointer to a locked inode.
* This is a very central and rather complicated routine.
* If the file system is not maintained in a strict tree hierarchy,
* this can result in a deadlock situation (see comments in code below).
*
* The cnp->cn_nameiop argument is LOOKUP, CREATE, RENAME, or DELETE depending
* on whether the name is to be looked up, created, renamed, or deleted.
* When CREATE, RENAME, or DELETE is specified, information usable in
* creating, renaming, or deleting a directory entry may be calculated.
* If flag has LOCKPARENT or'ed into it and the target of the pathname
* exists, lookup returns both the target and its parent directory locked.
* When creating or renaming and LOCKPARENT is specified, the target may
* not be ".". When deleting and LOCKPARENT is specified, the target may
* be "."., but the caller must check to ensure it does an vrele and vput
* instead of two vputs.
*
* Overall outline of ufs_lookup:
*
* check accessibility of directory
* look for name in cache, if found, then if at end of path
* and deleting or creating, drop it, else return name
* search for name in directory, to found or notfound
* notfound:
* if creating, return locked directory, leaving info on available slots
* else return error
* found:
* if at end of path and deleting, return information to allow delete
* if at end of path and rewriting (RENAME and LOCKPARENT), lock target
* inode and return info to allow rewrite
* if not at end, add name to cache; if at end and neither creating
* nor deleting, add name to cache
*/
int
ufs_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
struct vnode *vdp = ap->a_dvp; /* vnode for directory being searched */
struct inode *dp = VTOI(vdp); /* inode for directory being searched */
struct buf *bp; /* a buffer of directory entries */
struct direct *ep; /* the current directory entry */
int entryoffsetinblock; /* offset of ep in bp's buffer */
struct slotinfo slot;
int numdirpasses; /* strategy for directory search */
doff_t endsearch; /* offset to end directory search */
doff_t prevoff; /* previous value of ulr_offset */
struct vnode *tdp; /* returned by vcache_get */
doff_t enduseful; /* pointer past last used dir slot.
used for directory truncation. */
u_long bmask; /* block offset mask */
int error;
struct vnode **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
int flags;
int nameiop = cnp->cn_nameiop;
struct ufsmount *ump = dp->i_ump;
const int needswap = UFS_MPNEEDSWAP(ump);
int dirblksiz = ump->um_dirblksiz;
ino_t foundino;
struct ufs_lookup_results *results;
int iswhiteout; /* temp result from cache_lookup() */
const int fsfmt = FSFMT(vdp);
uint16_t reclen;
flags = cnp->cn_flags;
bp = NULL;
*vpp = NULL;
endsearch = 0; /* silence compiler warning */
/*
* Check accessibility of directory.
*/
if ((error = VOP_ACCESS(vdp, VEXEC, cred)) != 0)
return (error);
if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) &&
(nameiop == DELETE || nameiop == RENAME))
return (EROFS);
/*
* We now have a segment name to search for, and a directory to search.
*
* Before tediously performing a linear scan of the directory,
* check the name cache to see if the directory/name pair
* we are looking for is known already.
*/
if (cache_lookup(vdp, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) {
if (iswhiteout) {
cnp->cn_flags |= ISWHITEOUT;
}
return *vpp == NULLVP ? ENOENT : 0;
}
/* May need to restart the lookup with an exclusive lock. */
if (VOP_ISLOCKED(vdp) != LK_EXCLUSIVE) {
return ENOLCK;
}
/*
* Produce the auxiliary lookup results into i_crap. Increment
* its serial number so elsewhere we can tell if we're using
* stale results. This should not be done this way. XXX.
*/
results = &dp->i_crap;
dp->i_crapcounter++;
if (iswhiteout) {
/*
* The namecache set iswhiteout without finding a
* cache entry. As of this writing (20121014), this
* can happen if there was a whiteout entry that has
* been invalidated by the lookup. It is not clear if
* it is correct to set ISWHITEOUT in this case or
* not; however, doing so retains the prior behavior,
* so we'll go with that until some clearer answer
* appears. XXX
*/
cnp->cn_flags |= ISWHITEOUT;
}
/*
* Suppress search for slots unless creating
* file and at end of pathname, in which case
* we watch for a place to put the new file in
* case it doesn't already exist.
*/
slot_init(&slot);
if ((nameiop == CREATE || nameiop == RENAME) && (flags & ISLASTCN)) {
slot.status = NONE;
slot.needed = UFS_DIRECTSIZ(cnp->cn_namelen);
}
/*
* If there is cached information on a previous search of
* this directory, pick up where we last left off.
* We cache only lookups as these are the most common
* and have the greatest payoff. Caching CREATE has little
* benefit as it usually must search the entire directory
* to determine that the entry does not exist. Caching the
* location of the last DELETE or RENAME has not reduced
* profiling time and hence has been removed in the interest
* of simplicity.
*/
bmask = vdp->v_mount->mnt_stat.f_iosize - 1;
#ifdef UFS_DIRHASH
/*
* Use dirhash for fast operations on large directories. The logic
* to determine whether to hash the directory is contained within
* ufsdirhash_build(); a zero return means that it decided to hash
* this directory and it successfully built up the hash table.
*/
if (ufsdirhash_build(dp) == 0) {
/* Look for a free slot if needed. */
enduseful = slot_findfree(&slot, dp);
/* Look up the component. */
numdirpasses = 1;
entryoffsetinblock = 0; /* silence compiler warning */
switch (ufsdirhash_lookup(dp, cnp->cn_nameptr, cnp->cn_namelen,
&results->ulr_offset, &bp,
nameiop == DELETE ? &prevoff : NULL)) {
case 0:
ep = (void *)((char *)bp->b_data +
(results->ulr_offset & bmask));
reclen = ufs_rw16(ep->d_reclen, needswap);
goto foundentry;
case ENOENT:
results->ulr_offset = roundup(dp->i_size, dirblksiz);
goto notfound;
default:
/* Something failed; just do a linear search. */
break;
}
}
#endif /* UFS_DIRHASH */
if (nameiop != LOOKUP || results->ulr_diroff == 0 ||
results->ulr_diroff >= dp->i_size) {
entryoffsetinblock = 0;
results->ulr_offset = 0;
numdirpasses = 1;
} else {
results->ulr_offset = results->ulr_diroff;
entryoffsetinblock = results->ulr_offset & bmask;
if (entryoffsetinblock != 0 &&
(error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
NULL, &bp, false)))
goto out;
numdirpasses = 2;
namecache_count_2passes();
}
prevoff = results->ulr_offset;
endsearch = roundup(dp->i_size, dirblksiz);
enduseful = 0;
searchloop:
while (results->ulr_offset < endsearch) {
preempt_point();
/*
* If necessary, get the next directory block.
*/
if ((results->ulr_offset & bmask) == 0) {
if (bp != NULL)
brelse(bp, 0);
error = ufs_blkatoff(vdp, (off_t)results->ulr_offset,
NULL, &bp, false);
if (error)
goto out;
entryoffsetinblock = 0;
}
/*
* If still looking for a slot, and at a DIRBLKSIZ
* boundary, have to start looking for free space again.
*/
if (slot.status == NONE &&
(entryoffsetinblock & (dirblksiz - 1)) == 0) {
slot.offset = -1;
slot.freespace = 0;
}
/*
* Get pointer to next entry.
* Full validation checks are slow, so we only check
* enough to ensure forward progress through the
* directory. Complete checks can be run by patching
* "dirchk" to be true.
*/
KASSERT(bp != NULL);
ep = (void *)((char *)bp->b_data + entryoffsetinblock);
const char *msg;
reclen = ufs_rw16(ep->d_reclen, needswap);
if ((reclen == 0 && (msg = "null entry")) || (dirchk &&
(msg = ufs_dirbadentry(vdp, ep, entryoffsetinblock)))) {
ufs_dirbad(dp, results->ulr_offset, msg);
reclen = dirblksiz -
(entryoffsetinblock & (dirblksiz - 1));
goto next;
}
/*
* If an appropriate sized slot has not yet been found,
* check to see if one is available. Also accumulate space
* in the current block so that we can determine if
* compaction is viable.
*/
if (slot.status != FOUND) {
int size = reclen;
if (ep->d_ino != 0)
size -= UFS_DIRSIZ(fsfmt, ep, needswap);
if (size > 0)
slot_update(&slot, size, reclen,
results->ulr_offset);
}
if (ep->d_ino == 0)
goto next;
/*
* Check for a name match.
*/
const uint16_t namlen = NAMLEN(fsfmt, needswap, ep);
if (namlen != cnp->cn_namelen ||
memcmp(cnp->cn_nameptr, ep->d_name, (size_t)namlen))
goto next;
#ifdef UFS_DIRHASH
foundentry:
#endif
/*
* Save directory entry's inode number and
* reclen, and release directory buffer.
*/
if (!fsfmt && ep->d_type == DT_WHT) {
slot_white(&slot, reclen, results);
/*
* This is used to set results->ulr_endoff, which may
* be used by ufs_direnter() as a length to truncate
* the directory to. Therefore, it must point past the
* end of the last non-empty directory entry. We don't
* know where that is in this case, so we effectively
* disable shrinking by using the existing size of the
* directory.
*
* Note that we wouldn't expect to shrink the
* directory while rewriting an existing entry anyway.
*/
enduseful = endsearch;
cnp->cn_flags |= ISWHITEOUT;
numdirpasses--;
goto notfound;
}
foundino = ufs_rw32(ep->d_ino, needswap);
results->ulr_reclen = reclen;
goto found;
next:
prevoff = results->ulr_offset;
results->ulr_offset += reclen;
entryoffsetinblock += reclen;
if (ep->d_ino)
enduseful = results->ulr_offset;
}
notfound:
/*
* If we started in the middle of the directory and failed
* to find our target, we must check the beginning as well.
*/
if (numdirpasses == 2) {
numdirpasses--;
results->ulr_offset = 0;
endsearch = results->ulr_diroff;
goto searchloop;
}
if (bp != NULL)
brelse(bp, 0);
/*
* If creating, and at end of pathname and current
* directory has not been removed, then can consider
* allowing file to be created.
*/
if ((nameiop == CREATE || nameiop == RENAME ||
(nameiop == DELETE && (cnp->cn_flags & DOWHITEOUT) &&
(cnp->cn_flags & ISWHITEOUT))) &&
(flags & ISLASTCN) && dp->i_nlink != 0) {
/*
* Access for write is interpreted as allowing
* creation of files in the directory.
*/
if (flags & WILLBEDIR)
error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred);
else
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
error = slot_estimate(&slot, dirblksiz, nameiop,
prevoff, enduseful, dp, results);
/*
* We return with the directory locked, so that
* the parameters we set up above will still be
* valid if we actually decide to do a direnter().
* We return ni_vp == NULL to indicate that the entry
* does not currently exist; we leave a pointer to
* the (locked) directory inode in ndp->ni_dvp.
*
* NB - if the directory is unlocked, then this
* information cannot be used.
*/
goto out;
}
/*
* Insert name into cache (as non-existent) if appropriate.
*/
if (nameiop != CREATE) {
cache_enter(vdp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
cnp->cn_flags);
}
error = ENOENT;
goto out;
found:
if (numdirpasses == 2)
namecache_count_pass2();
/*
* Check that directory length properly reflects presence
* of this entry.
*/
const uint64_t newisize =
results->ulr_offset + UFS_DIRSIZ(fsfmt, ep, needswap);
if (newisize > dp->i_size) {
ufs_dirbad(dp, results->ulr_offset, "i_size too small");
dp->i_size = newisize;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
}
brelse(bp, 0);
/*
* Found component in pathname.
* If the final component of path name, save information
* in the cache as to where the entry was found.
*/
if ((flags & ISLASTCN) && nameiop == LOOKUP)
results->ulr_diroff = results->ulr_offset & ~(dirblksiz - 1);
/*
* If deleting, and at end of pathname, return
* parameters which can be used to remove file.
* Lock the inode, being careful with ".".
*/
if (nameiop == DELETE && (flags & ISLASTCN)) {
/*
* Return pointer to current entry in results->ulr_offset,
* and distance past previous entry (if there
* is a previous entry in this block) in results->ulr_count.
* Save directory inode pointer in ndp->ni_dvp for dirremove().
*/
calc_count(results, dirblksiz, prevoff);
if ((error = ufs_getino(vdp, dp, foundino, &tdp, false)) != 0)
goto out;
if ((error = ufs_can_delete(tdp, vdp, dp, cred)) != 0)
goto out;
*vpp = tdp;
goto out;
}
/*
* If rewriting (RENAME), return the inode and the
* information required to rewrite the present directory
* Must get inode of directory entry to verify it's a
* regular file, or empty directory.
*/
if (nameiop == RENAME && (flags & ISLASTCN)) {
if (flags & WILLBEDIR)
error = VOP_ACCESSX(vdp, VWRITE | VAPPEND, cred);
else
error = VOP_ACCESS(vdp, VWRITE, cred);
if (error)
goto out;
/*
* Careful about locking second inode.
* This can only occur if the target is ".".
*/
if ((error = ufs_getino(vdp, dp, foundino, &tdp, true)) != 0)
goto out;
*vpp = tdp;
goto out;
}
if ((error = ufs_getino(vdp, dp, foundino, &tdp, false)) != 0)
goto out;
*vpp = tdp;
/*
* Insert name into cache if appropriate.
*/
cache_enter(vdp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
error = 0;
out:
return error;
}
void
ufs_dirbad(struct inode *ip, doff_t offset, const char *how)
{
struct mount *mp = ITOV(ip)->v_mount;
void (*p)(const char *, ...) __printflike(1, 2) =
(mp->mnt_flag & MNT_RDONLY) == 0 ? panic : printf;
(*p)("%s: bad dir ino %ju at offset %d: %s\n",
mp->mnt_stat.f_mntonname, (uintmax_t)ip->i_number,
offset, how);
}
/*
* Do consistency checking on a directory entry:
* record length must be multiple of 4
* entry must fit in rest of its DIRBLKSIZ block
* record must be large enough to contain entry
* name is not longer than FFS_MAXNAMLEN
* name must be as long as advertised, and null terminated
*/
const char *
ufs_dirbadentry(const struct vnode *dp, const struct direct *ep,
int entryoffsetinblock)
{
const struct ufsmount *ump = VFSTOUFS(dp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int dirblksiz = ump->um_dirblksiz;
const int maxsize = dirblksiz - (entryoffsetinblock & (dirblksiz - 1));
const int fsfmt = FSFMT(dp);
const uint8_t namlen = NAMLEN(fsfmt, needswap, ep);
const uint16_t reclen = ufs_rw16(ep->d_reclen, needswap);
const int dirsiz = (int)UFS_DIRSIZ(fsfmt, ep, needswap);
const char *name = ep->d_name;
const char *str;
#ifdef DIAGNOSTIC
static char buf[512];
#endif
if ((reclen & 0x3) != 0)
str = "not rounded";
else if (reclen > maxsize)
str = "too big";
else if (reclen < dirsiz)
str = "too small";
#if FFS_MAXNAMLEN < 255
else if (namlen > FFS_MAXNAMLEN)
str = "long name";
#endif
else
str = NULL;
if (str) {
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "Bad dir (%s), reclen=%#x, "
"namlen=%d, dirsiz=%d <= reclen=%d <= maxsize=%d, "
"flags=%#x, entryoffsetinblock=%d, dirblksiz=%d",
str, reclen, namlen, dirsiz, reclen, maxsize,
dp->v_mount->mnt_flag, entryoffsetinblock, dirblksiz);
str = buf;
#endif
return str;
}
if (ep->d_ino == 0)
return NULL;
for (uint8_t i = 0; i < namlen; i++)
if (name[i] == '\0') {
str = "NUL in name";
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "%s [%s] i=%d, namlen=%d",
str, name, i, namlen);
str = buf;
#endif
return str;
}
if (name[namlen]) {
str = "missing NUL in name";
#ifdef DIAGNOSTIC
snprintf(buf, sizeof(buf), "%s [%*.*s] namlen=%d", str,
namlen, namlen, name, namlen);
str = buf;
#endif
return str;
}
return NULL;
}
/*
* Construct a new directory entry after a call to namei, using the
* name in the componentname argument cnp. The argument ip is the
* inode to which the new directory entry will refer.
*/
void
ufs_makedirentry(struct inode *ip, struct componentname *cnp,
struct direct *newdirp)
{
size_t namelen = cnp->cn_namelen;
newdirp->d_ino = ip->i_number;
newdirp->d_namlen = namelen;
memcpy(newdirp->d_name, cnp->cn_nameptr, namelen);
/* NUL terminate and zero out padding */
memset(&newdirp->d_name[namelen], 0, UFS_NAMEPAD(namelen));
if (FSFMT(ITOV(ip)))
newdirp->d_type = 0;
else
newdirp->d_type = IFTODT(ip->i_mode);
}
static int
ufs_dirgrow(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
const kauth_cred_t cr = cnp->cn_cred;
const struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int dirblksiz = ump->um_dirblksiz;
const int fsfmt = FSFMT(dvp);
const u_int newentrysize = UFS_DIRSIZ(0, dirp, 0);
struct inode *dp = VTOI(dvp);
int error, ret, blkoff;
struct timespec ts;
struct buf *bp;
/*
* If ulr_count is 0, then namei could find no
* space in the directory. Here, ulr_offset will
* be on a directory block boundary and we will write the
* new entry into a fresh block.
*/
if (ulr->ulr_offset & (dirblksiz - 1))
panic("%s: newblk", __func__); if ((error = UFS_BALLOC(dvp, (off_t)ulr->ulr_offset, dirblksiz,
cr, B_CLRBUF | B_SYNC, &bp)) != 0) {
return error;
}
dp->i_size = ulr->ulr_offset + dirblksiz;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
uvm_vnp_setsize(dvp, dp->i_size);
dirp->d_reclen = ufs_rw16(dirblksiz, needswap);
dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
if (fsfmt && ENDIANSWAP(needswap))
ufs_dirswap(dirp);
blkoff = ulr->ulr_offset & (ump->um_mountp->mnt_stat.f_iosize - 1);
memcpy((char *)bp->b_data + blkoff, dirp, newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
ufsdirhash_newblk(dp, ulr->ulr_offset);
ufsdirhash_add(dp, dirp, ulr->ulr_offset);
ufsdirhash_checkblock(dp, (char *)bp->b_data + blkoff,
ulr->ulr_offset);
}
#endif
error = VOP_BWRITE(bp->b_vp, bp);
vfs_timestamp(&ts);
ret = UFS_UPDATE(dvp, &ts, &ts, UPDATE_DIROP);
if (error == 0)
return ret;
return error;
}
static int
#if __GNUC_PREREQ__(5, 3)
/* This gets miscompiled by gcc 5.3 PR/51094 */
__attribute__((__optimize__("no-tree-vrp")))
#endif
ufs_dircompact(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
const struct ufsmount *ump = VFSTOUFS(dvp->v_mount);
const int needswap = UFS_MPNEEDSWAP(ump);
const int fsfmt = FSFMT(dvp);
const u_int newentrysize = UFS_DIRSIZ(0, dirp, 0);
struct inode *dp = VTOI(dvp);
struct buf *bp;
u_int dsize;
struct direct *ep, *nep;
int error, loc, spacefree;
char *dirbuf;
uint16_t reclen;
UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
/*
* If ulr_count is non-zero, then namei found space for the new
* entry in the range ulr_offset to ulr_offset + ulr_count
* in the directory. To use this space, we may have to compact
* the entries located there, by copying them together towards the
* beginning of the block, leaving the free space in one usable
* chunk at the end.
*/
/*
* Increase size of directory if entry eats into new space.
* This should never push the size past a new multiple of
* DIRBLKSIZ.
*
* N.B. - THIS IS AN ARTIFACT OF 4.2 AND SHOULD NEVER HAPPEN.
*/
if (ulr->ulr_offset + ulr->ulr_count > dp->i_size) {
#ifdef DIAGNOSTIC
printf("%s: reached 4.2-only block, not supposed to happen\n",
__func__);
#endif
dp->i_size = ulr->ulr_offset + ulr->ulr_count;
DIP_ASSIGN(dp, size, dp->i_size);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
}
/*
* Get the block containing the space for the new directory entry.
*/
error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &dirbuf, &bp, true);
if (error)
return error;
/*
* Find space for the new entry. In the simple case, the entry at
* offset base will have the space. If it does not, then namei
* arranged that compacting the region ulr_offset to
* ulr_offset + ulr_count would yield the space.
*/
ep = (void *)dirbuf;
dsize = (ep->d_ino != 0) ? UFS_DIRSIZ(fsfmt, ep, needswap) : 0;
reclen = ufs_rw16(ep->d_reclen, needswap);
spacefree = reclen - dsize;
for (loc = reclen; loc < ulr->ulr_count; ) {
nep = (void *)(dirbuf + loc);
/* Trim the existing slot (NB: dsize may be zero). */
ep->d_reclen = ufs_rw16(dsize, needswap);
ep = (void *)((char *)ep + dsize);
reclen = ufs_rw16(nep->d_reclen, needswap);
loc += reclen;
if (nep->d_ino == 0) {
/*
* A mid-block unused entry. Such entries are
* never created by the kernel, but fsck_ffs
* can create them (and it doesn't fix them).
*
* Add up the free space, and initialise the
* relocated entry since we don't memcpy it.
*/
spacefree += reclen;
ep->d_ino = 0;
dsize = 0;
continue;
}
dsize = UFS_DIRSIZ(fsfmt, nep, needswap);
spacefree += reclen - dsize;
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_move(dp, nep,
ulr->ulr_offset + ((char *)nep - dirbuf),
ulr->ulr_offset + ((char *)ep - dirbuf));
#endif
memcpy(ep, nep, dsize);
}
/*
* Here, `ep' points to a directory entry containing `dsize' in-use
* bytes followed by `spacefree' unused bytes. If ep->d_ino == 0,
* then the entry is completely unused (dsize == 0). The value
* of ep->d_reclen is always indeterminate.
*
* Update the pointer fields in the previous entry (if any),
* copy in the new entry, and write out the block.
*/
if (ep->d_ino == 0 || (ufs_rw32(ep->d_ino, needswap) == UFS_WINO &&
memcmp(ep->d_name, dirp->d_name, dirp->d_namlen) == 0)) {
if (spacefree + dsize < newentrysize)
panic("%s: too big", __func__); dirp->d_reclen = spacefree + dsize;
} else {
if (spacefree < newentrysize)
panic("%s: nospace", __func__);
dirp->d_reclen = spacefree;
ep->d_reclen = ufs_rw16(dsize, needswap);
ep = (void *)((char *)ep + dsize);
}
dirp->d_reclen = ufs_rw16(dirp->d_reclen, needswap);
dirp->d_ino = ufs_rw32(dirp->d_ino, needswap);
if (fsfmt && ENDIANSWAP(needswap))
ufs_dirswap(dirp);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL && (ep->d_ino == 0 ||
dirp->d_reclen == spacefree))
ufsdirhash_add(dp, dirp, ulr->ulr_offset + ((char *)ep - dirbuf));
#endif
memcpy(ep, dirp, newentrysize);
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
const int dirblkmsk = ump->um_dirblksiz - 1;
ufsdirhash_checkblock(dp, dirbuf -
(ulr->ulr_offset & dirblkmsk),
ulr->ulr_offset & ~dirblkmsk);
}
#endif
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* If all went well, and the directory can be shortened, proceed
* with the truncation. Note that we have to unlock the inode for
* the entry that we just entered, as the truncation may need to
* lock other inodes which can lead to deadlock if we also hold a
* lock on the newly entered node.
*/
if (error == 0 && ulr->ulr_endoff && ulr->ulr_endoff < dp->i_size) {
const kauth_cred_t cr = cnp->cn_cred;
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL)
ufsdirhash_dirtrunc(dp, ulr->ulr_endoff);
#endif
(void) UFS_TRUNCATE(dvp, (off_t)ulr->ulr_endoff, IO_SYNC, cr);
}
UFS_WAPBL_UPDATE(dvp, NULL, NULL, UPDATE_DIROP);
return error;
}
/*
* Write a directory entry after a call to namei, using the parameters
* that ufs_lookup left in nameidata and in the ufs_lookup_results.
*
* DVP is the directory to be updated. It must be locked.
* ULR is the ufs_lookup_results structure from the final lookup step.
* TVP is not used. (XXX: why is it here? remove it)
* DIRP is the new directory entry contents.
* CNP is the componentname from the final lookup step.
* NEWDIRBP is not used and (XXX) should be removed. The previous
* comment here said it was used by the now-removed softupdates code.
*
* The link count of the target inode is *not* incremented; the
* caller does that.
*
* If ulr->ulr_count is 0, ufs_lookup did not find space to insert the
* directory entry. ulr_offset, which is the place to put the entry,
* should be on a block boundary (and should be at the end of the
* directory AFAIK) and a fresh block is allocated to put the new
* directory entry in.
*
* If ulr->ulr_count is not zero, ufs_lookup found a slot to insert
* the entry into. This slot ranges from ulr_offset to ulr_offset +
* ulr_count. However, this slot may already be partially populated
* requiring compaction. See notes below.
*
* Furthermore, if ulr_count is not zero and ulr_endoff is not the
* same as i_size, the directory is truncated to size ulr_endoff.
*/
int
ufs_direnter(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct vnode *tvp, struct direct *dirp,
struct componentname *cnp, struct buf *newdirbp)
{
if (ulr->ulr_count == 0)
return ufs_dirgrow(dvp, ulr, tvp, dirp, cnp, newdirbp);
else
return ufs_dircompact(dvp, ulr, tvp, dirp, cnp, newdirbp);
}
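/*
 * Illustrative sketch (not part of the build) of how the pieces above
 * combine on the create path: a prior ufs_lookup() with CREATE and
 * ISLASTCN has filled in the ufs_lookup_results; the caller builds the
 * entry with ufs_makedirentry() and writes it with ufs_direnter().
 * The function name here is hypothetical and error handling is elided.
 */
#if 0
static int
example_enter_name(struct vnode *dvp, struct inode *ip,
    struct componentname *cnp)
{
	const struct ufs_lookup_results *ulr = &VTOI(dvp)->i_crap;
	struct direct newdir;

	ufs_makedirentry(ip, cnp, &newdir);
	return ufs_direnter(dvp, ulr, ITOV(ip), &newdir, cnp, NULL);
}
#endif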
/*
* Remove a directory entry after a call to namei, using the
* parameters that ufs_lookup left in nameidata and in the
* ufs_lookup_results.
*
* DVP is the directory to be updated. It must be locked.
* ULR is the ufs_lookup_results structure from the final lookup step.
* IP, if not null, is the inode being unlinked.
* FLAGS may contain DOWHITEOUT.
* ISRMDIR is not used and (XXX) should be removed.
*
* If FLAGS contains DOWHITEOUT the entry is replaced with a whiteout
* instead of being cleared.
*
* ulr->ulr_offset contains the position of the directory entry
* to be removed.
*
* ulr->ulr_reclen contains the size of the directory entry to be
* removed.
*
* ulr->ulr_count contains the size of the *previous* directory
* entry. This allows finding it, for free space management. If
* ulr_count is 0, the target entry is at the beginning of the
* directory. (Does this ever happen? The first entry should be ".",
* which should only be removed at rmdir time. Does rmdir come here
* to clear out the "." and ".." entries? Perhaps, but I doubt it.)
*
* The space is marked free by adding it to the record length (not
* name length) of the preceding entry. If the first entry becomes
* free, it is marked free by setting the inode number to 0.
*
* The link count of IP is decremented. Note that this is not the
* inverse behavior of ufs_direnter, which does not adjust link
* counts. Sigh.
*/
int
ufs_dirremove(struct vnode *dvp, const struct ufs_lookup_results *ulr,
struct inode *ip, int flags, int isrmdir)
{
struct inode *dp = VTOI(dvp);
struct direct *ep;
struct buf *bp;
int error;
const int needswap = UFS_MPNEEDSWAP(dp->i_ump);
uint16_t reclen;
UFS_WAPBL_JLOCK_ASSERT(dvp->v_mount);
if (flags & DOWHITEOUT) {
/*
* Whiteout entry: set d_ino to UFS_WINO.
*/
error = ufs_blkatoff(dvp, (off_t)ulr->ulr_offset, &ep,
&bp, true);
if (error)
return (error);
ep->d_ino = ufs_rw32(UFS_WINO, needswap);
ep->d_type = DT_WHT;
goto out;
}
if ((error = ufs_blkatoff(dvp,
(off_t)(ulr->ulr_offset - ulr->ulr_count), &ep, &bp, true)) != 0)
return (error);
reclen = ufs_rw16(ep->d_reclen, needswap);
#ifdef UFS_DIRHASH
/*
* Remove the dirhash entry. This is complicated by the fact
* that `ep' is the previous entry when ulr_count != 0.
*/
if (dp->i_dirhash != NULL)
ufsdirhash_remove(dp, (ulr->ulr_count == 0) ? ep :
(void *)((char *)ep + reclen), ulr->ulr_offset);
#endif
if (ulr->ulr_count == 0) {
/*
* First entry in block: set d_ino to zero.
*/
ep->d_ino = 0;
} else {
/*
* Collapse new free space into previous entry.
*/
ep->d_reclen = ufs_rw16(reclen + ulr->ulr_reclen, needswap);
}
#ifdef UFS_DIRHASH
if (dp->i_dirhash != NULL) {
int dirblksiz = ip->i_ump->um_dirblksiz;
ufsdirhash_checkblock(dp, (char *)ep -
((ulr->ulr_offset - ulr->ulr_count) & (dirblksiz - 1)),
ulr->ulr_offset & ~(dirblksiz - 1));
}
#endif
out:
if (ip) {
ip->i_nlink--;
DIP_ASSIGN(ip, nlink, ip->i_nlink);
ip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ITOV(ip), NULL, NULL, 0);
}
/*
* XXX did it ever occur to anyone that it might be a good
* idea to restore ip->i_nlink if this fails? Or something?
* Currently on error return from this function the state of
* ip->i_nlink depends on what happened, and callers
* definitely do not take this into account.
*/
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* If the last named reference to a snapshot goes away,
* drop its snapshot reference so that it will be reclaimed
* when last open reference goes away.
*/
if (ip != 0 && (ip->i_flags & SF_SNAPSHOT) != 0 &&
ip->i_nlink == 0)
UFS_SNAPGONE(ITOV(ip));
UFS_WAPBL_UPDATE(dvp, NULL, NULL, 0);
return (error);
}
/*
* Rewrite an existing directory entry to point at the inode supplied.
*
* DP is the directory to update.
* OFFSET is the position of the entry in question. It may come
* from ulr_offset of a ufs_lookup_results.
* OIP is the old inode the directory previously pointed to.
* NEWINUM is the number of the new inode.
* NEWTYPE is the new value for the type field of the directory entry.
* (This is ignored if the fs doesn't support that.)
* ISRMDIR is not used and (XXX) should be removed.
* IFLAGS are added to DP's inode flags.
*
* The link count of OIP is decremented. Note that the link count of
* the new inode is *not* incremented. Yay for symmetry.
*/
int
ufs_dirrewrite(struct inode *dp, off_t offset,
struct inode *oip, ino_t newinum, int newtype,
int isrmdir, int iflags)
{
struct buf *bp;
struct direct *ep;
struct vnode *vdp = ITOV(dp);
int error;
error = ufs_blkatoff(vdp, offset, &ep, &bp, true);
if (error)
return (error);
ep->d_ino = ufs_rw32(newinum, UFS_MPNEEDSWAP(dp->i_ump));
if (!FSFMT(vdp))
ep->d_type = newtype;
oip->i_nlink--;
DIP_ASSIGN(oip, nlink, oip->i_nlink);
oip->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(ITOV(oip), NULL, NULL, UPDATE_DIROP);
error = VOP_BWRITE(bp->b_vp, bp);
dp->i_flag |= iflags;
/*
* If the last named reference to a snapshot goes away,
* drop its snapshot reference so that it will be reclaimed
* when last open reference goes away.
*/
if ((oip->i_flags & SF_SNAPSHOT) != 0 && oip->i_nlink == 0)
UFS_SNAPGONE(ITOV(oip));
UFS_WAPBL_UPDATE(vdp, NULL, NULL, UPDATE_DIROP);
return (error);
}
/*
* Check if a directory is empty or not.
* Inode supplied must be locked.
*
* Using a struct dirtemplate here is not precisely
* what we want, but better than using a struct direct.
*
* NB: does not handle corrupted directories.
*/
int
ufs_dirempty(struct inode *ip, ino_t parentino, kauth_cred_t cred)
{
doff_t off;
struct direct dbuf;
struct direct *dp = &dbuf;
int error;
size_t count;
const int needswap = UFS_IPNEEDSWAP(ip);
const int fsfmt = FSFMT(ITOV(ip));
#define MINDIRSIZ (sizeof (struct dirtemplate) / 2)
for (off = 0; off < ip->i_size;
off += ufs_rw16(dp->d_reclen, needswap)) {
error = ufs_bufio(UIO_READ, ITOV(ip), dp, MINDIRSIZ,
off, IO_NODELOCKED, cred, &count, NULL);
/*
* Since we read MINDIRSIZ, residual must
* be 0 unless we're at end of file.
*/
if (error || count != 0)
return (0);
/* avoid infinite loops */
if (dp->d_reclen == 0)
return (0);
/* skip empty entries */
ino_t ino = ufs_rw32(dp->d_ino, needswap);
if (ino == 0 || ino == UFS_WINO)
continue;
/* accept only "." and ".." */
const uint8_t namlen = NAMLEN(fsfmt, needswap, dp);
if (namlen > 2)
return (0);
if (dp->d_name[0] != '.')
return (0);
/*
* At this point namlen must be 1 or 2.
* 1 implies ".", 2 implies ".." if second
* char is also "."
*/
if (namlen == 1 && ino == ip->i_number)
continue;
if (dp->d_name[1] == '.' && ino == parentino)
continue;
return (0);
}
return (1);
}
#define UFS_DIRRABLKS 0
int ufs_dirrablks = UFS_DIRRABLKS;
/*
* ufs_blkatoff: Return buffer with the contents of block "offset" from
* the beginning of directory "vp". If "res" is non-NULL, fill it in with
* a pointer to the remaining space in the directory. If the caller intends
* to modify the buffer returned, "modify" must be true.
*/
int
ufs_blkatoff(struct vnode *vp, off_t offset, void *v, struct buf **bpp,
bool modify)
{
char **res = v;
struct inode *ip __diagused;
struct buf *bp;
daddr_t lbn;
const int dirrablks = ufs_dirrablks;
daddr_t *blks;
int *blksizes;
int run, error;
struct mount *mp = vp->v_mount;
const int bshift = mp->mnt_fs_bshift;
const int bsize = 1 << bshift;
off_t eof;
blks = kmem_alloc((1 + dirrablks) * sizeof(daddr_t), KM_SLEEP);
blksizes = kmem_alloc((1 + dirrablks) * sizeof(int), KM_SLEEP);
ip = VTOI(vp);
KASSERT(vp->v_size == ip->i_size);
GOP_SIZE(vp, vp->v_size, &eof, 0);
lbn = offset >> bshift;
for (run = 0; run <= dirrablks;) {
const off_t curoff = lbn << bshift;
const int size = MIN(eof - curoff, bsize);
if (size == 0) {
break;
}
KASSERT(curoff < eof);
blks[run] = lbn;
blksizes[run] = size;
lbn++;
run++;
if (size != bsize) {
break;
}
}
KASSERT(run >= 1);
error = breadn(vp, blks[0], blksizes[0], &blks[1], &blksizes[1],
run - 1, (modify ? B_MODIFY : 0), &bp);
if (error != 0) {
*bpp = NULL;
goto out;
}
if (res) {
*res = (char *)bp->b_data + (offset & (bsize - 1));
}
*bpp = bp;
out:
kmem_free(blks, (1 + dirrablks) * sizeof(daddr_t));
kmem_free(blksizes, (1 + dirrablks) * sizeof(int));
return error;
}
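/*
 * Illustrative usage sketch (not part of the build): fetch one directory
 * block for read-only inspection and release the buffer afterwards.
 * The function name is hypothetical.
 */
#if 0
static int
example_peek_dirblock(struct vnode *dvp, off_t offset)
{
	struct buf *bp;
	char *data;
	int error;

	error = ufs_blkatoff(dvp, offset, &data, &bp, false);
	if (error != 0)
		return error;

	/* ... examine directory entries starting at "data" ... */

	brelse(bp, 0);
	return 0;
}
#endif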
/* $NetBSD: process_machdep.c,v 1.50 2023/11/20 03:05:48 simonb Exp $ */
/*
* Copyright (c) 1998, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* This file may seem a bit stylized, but that is so that it's easier to port.
* Functions to be implemented here are:
*
* process_read_regs(proc, regs)
* Get the current user-visible register set from the process
* and copy it into the regs structure (<machine/reg.h>).
* The process is stopped at the time read_regs is called.
*
* process_write_regs(proc, regs)
* Update the current register set from the passed in regs
* structure. Take care to avoid clobbering special CPU
* registers or privileged bits in the PSL.
* The process is stopped at the time write_regs is called.
*
* process_read_fpregs(proc, regs, sz)
* Get the current user-visible register set from the process
* and copy it into the regs structure (<machine/reg.h>).
* The process is stopped at the time read_fpregs is called.
*
* process_write_fpregs(proc, regs, sz)
* Update the current register set from the passed in regs
* structure. Take care to avoid clobbering special CPU
* registers or privileged bits in the PSL.
* The process is stopped at the time write_fpregs is called.
*
* process_read_dbregs(proc, regs, sz)
* Get the current user-visible register set from the process
* and copy it into the regs structure (<machine/reg.h>).
* The process is stopped at the time read_dbregs is called.
*
* process_write_dbregs(proc, regs, sz)
* Update the current register set from the passed in regs
* structure. Take care to avoid clobbering special CPU
* registers or privileged bits in the PSL.
* The process is stopped at the time write_dbregs is called.
*
* process_sstep(proc)
* Arrange for the process to trap after executing a single instruction.
*
* process_set_pc(proc)
* Set the process's program counter.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: process_machdep.c,v 1.50 2023/11/20 03:05:48 simonb Exp $");
#ifdef _KERNEL_OPT
#include "opt_xen.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/ptrace.h>
#include <sys/compat_stub.h>
#include <uvm/uvm_extern.h>
#include <compat/netbsd32/netbsd32.h>
#include <machine/psl.h>
#include <machine/reg.h>
#include <machine/segments.h>
#include <x86/dbregs.h>
#include <x86/fpu.h>
struct netbsd32_process_doxmmregs_hook_t netbsd32_process_doxmmregs_hook;
static inline struct trapframe *process_frame(struct lwp *);
static inline struct trapframe *
process_frame(struct lwp *l)
{
return l->l_md.md_regs;
}
int
process_read_regs(struct lwp *l, struct reg *regp)
{
struct trapframe *tf = process_frame(l);
long *regs = regp->regs;
const bool pk32 = (l->l_proc->p_flag & PK_32) != 0;
regs[_REG_RDI] = tf->tf_rdi;
regs[_REG_RSI] = tf->tf_rsi;
regs[_REG_RDX] = tf->tf_rdx;
regs[_REG_R10] = tf->tf_r10;
regs[_REG_R8] = tf->tf_r8;
regs[_REG_R9] = tf->tf_r9;
/* argX not touched */
regs[_REG_RCX] = tf->tf_rcx;
regs[_REG_R11] = tf->tf_r11;
regs[_REG_R12] = tf->tf_r12;
regs[_REG_R13] = tf->tf_r13;
regs[_REG_R14] = tf->tf_r14;
regs[_REG_R15] = tf->tf_r15;
regs[_REG_RBP] = tf->tf_rbp;
regs[_REG_RBX] = tf->tf_rbx;
regs[_REG_RAX] = tf->tf_rax;
if (pk32) {
regs[_REG_GS] = tf->tf_gs & 0xffff;
regs[_REG_FS] = tf->tf_fs & 0xffff;
regs[_REG_ES] = tf->tf_es & 0xffff;
regs[_REG_DS] = tf->tf_ds & 0xffff;
regs[_REG_CS] = tf->tf_cs & 0xffff;
regs[_REG_SS] = tf->tf_ss & 0xffff;
} else {
regs[_REG_GS] = 0;
regs[_REG_FS] = 0;
regs[_REG_ES] = GSEL(GUDATA_SEL, SEL_UPL);
regs[_REG_DS] = GSEL(GUDATA_SEL, SEL_UPL);
regs[_REG_CS] = LSEL(LUCODE_SEL, SEL_UPL);
regs[_REG_SS] = LSEL(LUDATA_SEL, SEL_UPL);
}
regs[_REG_TRAPNO] = tf->tf_trapno;
regs[_REG_ERR] = tf->tf_err;
regs[_REG_RIP] = tf->tf_rip;
regs[_REG_RFLAGS] = tf->tf_rflags;
regs[_REG_RSP] = tf->tf_rsp;
return 0;
}
int
process_read_fpregs(struct lwp *l, struct fpreg *regs, size_t *sz)
{
process_read_fpregs_xmm(l, &regs->fxstate);
return 0;
}
int
process_read_dbregs(struct lwp *l, struct dbreg *regs, size_t *sz)
{
x86_dbregs_read(l, regs);
return 0;
}
int
process_write_regs(struct lwp *l, const struct reg *regp)
{
struct trapframe *tf = process_frame(l);
int error;
const long *regs = regp->regs;
const bool pk32 = (l->l_proc->p_flag & PK_32) != 0;
/*
* Check for security violations. Note that struct regs is compatible
* with the __gregs array in mcontext_t.
*/
if (pk32) {
MODULE_HOOK_CALL(netbsd32_reg_validate_hook, (l, regp), EINVAL,
error);
} else {
error = cpu_mcontext_validate(l, (const mcontext_t *)regs);
}
if (error != 0)
return error;
tf->tf_rdi = regs[_REG_RDI];
tf->tf_rsi = regs[_REG_RSI];
tf->tf_rdx = regs[_REG_RDX];
tf->tf_r10 = regs[_REG_R10];
tf->tf_r8 = regs[_REG_R8];
tf->tf_r9 = regs[_REG_R9];
/* argX not touched */
tf->tf_rcx = regs[_REG_RCX];
tf->tf_r11 = regs[_REG_R11];
tf->tf_r12 = regs[_REG_R12];
tf->tf_r13 = regs[_REG_R13];
tf->tf_r14 = regs[_REG_R14];
tf->tf_r15 = regs[_REG_R15];
tf->tf_rbp = regs[_REG_RBP];
tf->tf_rbx = regs[_REG_RBX];
tf->tf_rax = regs[_REG_RAX];
if (pk32) {
tf->tf_gs = regs[_REG_GS] & 0xffff;
tf->tf_fs = regs[_REG_FS] & 0xffff;
tf->tf_es = regs[_REG_ES] & 0xffff;
tf->tf_ds = regs[_REG_DS] & 0xffff;
tf->tf_cs = regs[_REG_CS] & 0xffff;
tf->tf_ss = regs[_REG_SS] & 0xffff;
} else {
tf->tf_gs = 0;
tf->tf_fs = 0;
tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
}
/* trapno, err not touched */
tf->tf_rip = regs[_REG_RIP];
tf->tf_rflags = regs[_REG_RFLAGS];
tf->tf_rsp = regs[_REG_RSP];
return 0;
}
int
process_write_fpregs(struct lwp *l, const struct fpreg *regs, size_t sz)
{
process_write_fpregs_xmm(l, &regs->fxstate);
return 0;
}
int
process_write_dbregs(struct lwp *l, const struct dbreg *regs, size_t sz)
{
int error;
/*
* Check for security violations.
*/
error = x86_dbregs_validate(regs);
if (error != 0)
return error;
x86_dbregs_write(l, regs);
return 0;
}
int
process_sstep(struct lwp *l, int sstep)
{
struct trapframe *tf = process_frame(l);
if (sstep)
tf->tf_rflags |= PSL_T;
else
tf->tf_rflags &= ~PSL_T;
return 0;
}
int
process_set_pc(struct lwp *l, void *addr)
{
struct trapframe *tf = process_frame(l);
const bool pk32 = (l->l_proc->p_flag & PK_32) != 0;
const uint64_t rip = (uint64_t)addr;
if (rip >= (pk32 ? VM_MAXUSER_ADDRESS32 : VM_MAXUSER_ADDRESS))
return EINVAL;
tf->tf_rip = rip;
return 0;
}
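/*
 * Illustrative sketch (not part of the build): redirecting a stopped LWP
 * and arming single-step combines process_set_pc(), which validates the
 * new program counter, with process_sstep(), which sets or clears PSL_T.
 * The wrapper name is hypothetical.
 */
#if 0
static int
example_redirect_and_step(struct lwp *lt, void *newpc)
{
	int error;

	error = process_set_pc(lt, newpc);
	if (error != 0)
		return error;
	return process_sstep(lt, 1);
}
#endif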
#ifdef __HAVE_PTRACE_MACHDEP
static int
process_machdep_read_xstate(struct lwp *l, struct xstate *regs)
{
return process_read_xstate(l, regs);
}
static int
process_machdep_write_xstate(struct lwp *l, const struct xstate *regs)
{
int error;
/*
* Check for security violations.
*/
error = process_verify_xstate(regs);
if (error != 0)
return error;
return process_write_xstate(l, regs);
}
int
ptrace_machdep_dorequest(
struct lwp *l,
struct lwp **lt,
int req,
void *addr,
int data
)
{
struct uio uio;
struct iovec iov;
struct vmspace *vm;
int error;
bool write = false;
switch (req) {
case PT_SETXSTATE:
write = true;
/* FALLTHROUGH */
case PT_GETXSTATE:
/* write = false done above. */
if ((error = ptrace_update_lwp((*lt)->l_proc, lt, data)) != 0)
return error;
if (!process_machdep_validfpu((*lt)->l_proc))
return EINVAL;
if (__predict_false(l->l_proc->p_flag & PK_32)) {
struct netbsd32_iovec user_iov;
if ((error = copyin(addr, &user_iov, sizeof(user_iov)))
!= 0)
return error;
iov.iov_base = NETBSD32PTR64(user_iov.iov_base);
iov.iov_len = user_iov.iov_len;
} else {
struct iovec user_iov;
if ((error = copyin(addr, &user_iov, sizeof(user_iov)))
!= 0)
return error;
iov.iov_base = user_iov.iov_base;
iov.iov_len = user_iov.iov_len;
}
error = proc_vmspace_getref(l->l_proc, &vm);
if (error)
return error;
if (iov.iov_len > sizeof(struct xstate))
iov.iov_len = sizeof(struct xstate);
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = 0;
uio.uio_resid = iov.iov_len;
uio.uio_rw = write ? UIO_WRITE : UIO_READ;
uio.uio_vmspace = vm;
error = process_machdep_doxstate(l, *lt, &uio);
uvmspace_free(vm);
return error;
case PT_SETXMMREGS: /* only for COMPAT_NETBSD32 */
write = true;
/* FALLTHROUGH */
case PT_GETXMMREGS: /* only for COMPAT_NETBSD32 */
/* write = false done above. */
if ((error = ptrace_update_lwp((*lt)->l_proc, lt, data)) != 0)
return error;
MODULE_HOOK_CALL(netbsd32_process_doxmmregs_hook,
(l, *lt, addr, write), EINVAL, error);
return error;
}
#ifdef DIAGNOSTIC
panic("ptrace_machdep: impossible");
#endif
return 0;
}
/*
* The following functions are used by both ptrace(2) and procfs.
*/
int
process_machdep_doxstate(struct lwp *curl, struct lwp *l, struct uio *uio)
/* curl: tracer */
/* l: traced */
{
int error;
struct xstate r; /* XXX FIXME big stack object */
char *kv;
ssize_t kl;
memset(&r, 0, sizeof(r));
kl = MIN(uio->uio_iov->iov_len, sizeof(r));
kv = (char *) &r;
kv += uio->uio_offset;
kl -= uio->uio_offset;
if (kl > uio->uio_resid)
kl = uio->uio_resid;
if (kl < 0)
error = EINVAL;
else
error = process_machdep_read_xstate(l, &r);
if (error == 0)
error = uiomove(kv, kl, uio);
if (error == 0 && uio->uio_rw == UIO_WRITE)
error = process_machdep_write_xstate(l, &r);
uio->uio_offset = 0;
return error;
}
int
process_machdep_validfpu(struct proc *p)
{
if (p->p_flag & PK_SYSTEM)
return 0;
return 1;
}
#endif /* __HAVE_PTRACE_MACHDEP */
/* $NetBSD: kern_rwlock_obj.c,v 1.13 2023/10/02 21:03:55 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock_obj.c,v 1.13 2023/10/02 21:03:55 ad Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/rwlock.h>
/* Rwlock cache */
#define RW_OBJ_MAGIC 0x85d3c85d
struct krwobj {
krwlock_t ro_lock;
u_int ro_magic;
u_int ro_refcnt;
uint8_t mo_pad[COHERENCY_UNIT - sizeof(krwlock_t) -
sizeof(u_int) * 2];
};
/*
* rw_obj_alloc:
*
* Allocate a single lock object, waiting for memory if needed.
*/
krwlock_t *
rw_obj_alloc(void)
{
struct krwobj *ro;
ro = kmem_intr_alloc(sizeof(*ro), KM_SLEEP);
KASSERT(ALIGNED_POINTER(ro, coherency_unit));
_rw_init(&ro->ro_lock, (uintptr_t)__builtin_return_address(0));
ro->ro_magic = RW_OBJ_MAGIC;
ro->ro_refcnt = 1;
return (krwlock_t *)ro;
}
/*
* rw_obj_tryalloc:
*
* Allocate a single lock object, but fail if no memory is available.
*/
krwlock_t *
rw_obj_tryalloc(void)
{
struct krwobj *ro;
ro = kmem_intr_alloc(sizeof(*ro), KM_NOSLEEP);
KASSERT(ALIGNED_POINTER(ro, coherency_unit));
if (__predict_true(ro != NULL)) {
_rw_init(&ro->ro_lock, (uintptr_t)__builtin_return_address(0));
ro->ro_magic = RW_OBJ_MAGIC;
ro->ro_refcnt = 1;
}
return (krwlock_t *)ro;
}
/*
* rw_obj_hold:
*
* Add a single reference to a lock object. A reference to the object
* must already be held, and must be held across this call.
*/
void
rw_obj_hold(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
KASSERT(ro->ro_magic == RW_OBJ_MAGIC);
KASSERT(ro->ro_refcnt > 0);
atomic_inc_uint(&ro->ro_refcnt);
}
/*
* rw_obj_free:
*
* Drop a reference from a lock object. If the last reference is being
* dropped, free the object and return true. Otherwise, return false.
*/
bool
rw_obj_free(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
KASSERT(ro->ro_magic == RW_OBJ_MAGIC);
KASSERT(ro->ro_refcnt > 0);
membar_release();
if (atomic_dec_uint_nv(&ro->ro_refcnt) > 0) {
return false;
}
membar_acquire();
rw_destroy(&ro->ro_lock);
kmem_intr_free(ro, sizeof(*ro));
return true;
}
/*
* rw_obj_refcnt:
*
* Return the reference count for a lock object.
*/
u_int
rw_obj_refcnt(krwlock_t *lock)
{
struct krwobj *ro = (struct krwobj *)lock;
return ro->ro_refcnt;
}
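/*
 * Illustrative usage sketch (not part of the build): a reference-counted
 * rwlock shared by two owners.  Each additional owner takes a reference
 * with rw_obj_hold(); each owner drops its reference with rw_obj_free(),
 * and the lock is destroyed when the last reference goes away.
 */
#if 0
static void
example_shared_rwlock(void)
{
	krwlock_t *lock;

	lock = rw_obj_alloc();		/* reference count is now 1 */
	rw_obj_hold(lock);		/* second owner, count is 2 */

	(void)rw_obj_free(lock);	/* returns false, lock kept */
	(void)rw_obj_free(lock);	/* returns true, lock freed */
}
#endif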
/* $NetBSD: vfs_lockf.c,v 1.81 2023/09/23 18:21:11 ad Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Scooter Morris at Genentech Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_lockf.c 8.4 (Berkeley) 10/26/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_lockf.c,v 1.81 2023/09/23 18:21:11 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/uidinfo.h>
/*
* The lockf structure is a kernel structure which contains the information
* associated with a byte range lock. The lockf structures are linked into
* the vnode structure. Locks are sorted by the starting byte of the lock for
* efficiency.
*
* lf_next is used for two purposes, depending on whether the lock is
* being held, or is in conflict with an existing lock. If this lock
* is held, it indicates the next lock on the same vnode.
* For pending locks, if lock->lf_next is non-NULL, then lock->lf_block
* must be queued on the lf_blkhd TAILQ of lock->lf_next.
*/
TAILQ_HEAD(locklist, lockf);
struct lockf {
kcondvar_t lf_cv; /* Signalling */
short lf_flags; /* Lock semantics: F_POSIX, F_FLOCK, F_WAIT */
short lf_type; /* Lock type: F_RDLCK, F_WRLCK */
off_t lf_start; /* The byte # of the start of the lock */
off_t lf_end; /* The byte # of the end of the lock (-1=EOF)*/
void *lf_id; /* process or file description holding lock */
struct lockf **lf_head; /* Back pointer to the head of lockf list */
struct lockf *lf_next; /* Next lock on this vnode, or blocking lock */
struct locklist lf_blkhd; /* List of requests blocked on this lock */
TAILQ_ENTRY(lockf) lf_block;/* A request waiting for a lock */
struct uidinfo *lf_uip; /* Cached pointer to uidinfo */
};
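/*
 * Illustrative sketch (editor's addition): with two held locks on a vnode
 * and one pending request blocked on the first of them, the fields above
 * are used roughly like this:
 *
 *	*lf_head --> [held A] --lf_next--> [held B] --lf_next--> NULL
 *	                |
 *	             lf_blkhd
 *	                |
 *	            [pending C]    C->lf_next == A, and C is queued on
 *	                           A->lf_blkhd through C->lf_block.
 */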
/* Maximum length of sleep chains to traverse to try and detect deadlock. */
#define MAXDEPTH 50
static kmutex_t lockf_lock __cacheline_aligned;
static char lockstr[] = "lockf";
/*
* This variable controls the maximum number of processes that will
* be checked in doing deadlock detection.
*/
int maxlockdepth = MAXDEPTH;
#ifdef LOCKF_DEBUG
int lockf_debug = 0;
#endif
#define SELF 0x1
#define OTHERS 0x2
/*
* XXX TODO
* Misc cleanups: "void *id" should be visible in the API as a
* "struct proc *".
* (This requires rototilling all VFS's which support advisory locking).
*/
/*
* If there's a lot of lock contention on a single vnode, locking
 * schemes which allow for more parallelism would be needed. Given how
* infrequently byte-range locks are actually used in typical BSD
* code, a more complex approach probably isn't worth it.
*/
/*
* We enforce a limit on locks by uid, so that a single user cannot
* run the kernel out of memory. For now, the limit is pretty coarse.
* There is no limit on root.
*
* Splitting a lock will always succeed, regardless of current allocations.
* If you're slightly above the limit, we still have to permit an allocation
* so that the unlock can succeed. If the unlocking causes too many splits,
 * however, you're cut off entirely.
*/
#define MAXLOCKSPERUID (2 * maxfiles)
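/*
 * Example (editor's addition): with maxfiles at 1024, a non-root uid may
 * normally hold up to 2048 lockf structures.  Callers that pass
 * allowfail == 2 to lf_alloc() (an unlock that may have to split an
 * existing lock) are allowed up to twice that before being refused, and
 * root is never refused.
 */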
#ifdef LOCKF_DEBUG
/*
* Print out a lock.
*/
static void
lf_print(const char *tag, struct lockf *lock)
{
printf("%s: lock %p for ", tag, lock);
if (lock->lf_flags & F_POSIX)
printf("proc %d", ((struct proc *)lock->lf_id)->p_pid);
else
printf("file %p", (struct file *)lock->lf_id);
printf(" %s, start %jd, end %jd",
lock->lf_type == F_RDLCK ? "shared" :
lock->lf_type == F_WRLCK ? "exclusive" :
lock->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)lock->lf_start, (intmax_t)lock->lf_end);
if (TAILQ_FIRST(&lock->lf_blkhd))
printf(" block %p\n", TAILQ_FIRST(&lock->lf_blkhd));
else
printf("\n");
}
static void
lf_printlist(const char *tag, struct lockf *lock)
{
struct lockf *lf, *blk;
printf("%s: Lock list:\n", tag);
for (lf = *lock->lf_head; lf; lf = lf->lf_next) {
printf("\tlock %p for ", lf);
if (lf->lf_flags & F_POSIX)
printf("proc %d", ((struct proc *)lf->lf_id)->p_pid);
else
printf("file %p", (struct file *)lf->lf_id);
printf(", %s, start %jd, end %jd",
lf->lf_type == F_RDLCK ? "shared" :
lf->lf_type == F_WRLCK ? "exclusive" :
lf->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)lf->lf_start, (intmax_t)lf->lf_end);
TAILQ_FOREACH(blk, &lf->lf_blkhd, lf_block) {
if (blk->lf_flags & F_POSIX)
printf("; proc %d",
((struct proc *)blk->lf_id)->p_pid);
else
printf("; file %p", (struct file *)blk->lf_id);
printf(", %s, start %jd, end %jd",
blk->lf_type == F_RDLCK ? "shared" :
blk->lf_type == F_WRLCK ? "exclusive" :
blk->lf_type == F_UNLCK ? "unlock" :
"unknown", (intmax_t)blk->lf_start, (intmax_t)blk->lf_end);
if (TAILQ_FIRST(&blk->lf_blkhd))
panic("lf_printlist: bad list");
}
printf("\n");
}
}
#endif /* LOCKF_DEBUG */
/*
* 3 options for allowfail.
* 0 - always allocate. 1 - cutoff at limit. 2 - cutoff at double limit.
*/
static struct lockf *
lf_alloc(int allowfail)
{
struct uidinfo *uip;
struct lockf *lock;
u_long lcnt;
const uid_t uid = kauth_cred_geteuid(kauth_cred_get());
uip = uid_find(uid);
lcnt = atomic_inc_ulong_nv(&uip->ui_lockcnt);
if (uid && allowfail && lcnt >
(allowfail == 1 ? MAXLOCKSPERUID : (MAXLOCKSPERUID * 2))) {
atomic_dec_ulong(&uip->ui_lockcnt);
return NULL;
}
lock = kmem_alloc(sizeof(*lock), KM_SLEEP);
lock->lf_uip = uip;
cv_init(&lock->lf_cv, lockstr);
return lock;
}
static void
lf_free(struct lockf *lock)
{
atomic_dec_ulong(&lock->lf_uip->ui_lockcnt);
cv_destroy(&lock->lf_cv);
kmem_free(lock, sizeof(*lock));
}
/*
* Walk the list of locks for an inode to
* find an overlapping lock (if any).
*
* NOTE: this returns only the FIRST overlapping lock. There
* may be more than one.
*/
static int
lf_findoverlap(struct lockf *lf, struct lockf *lock, int type,
struct lockf ***prev, struct lockf **overlap)
{
off_t start, end;
*overlap = lf;
if (lf == NULL)
return 0;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("lf_findoverlap: looking for overlap in", lock);
#endif /* LOCKF_DEBUG */
start = lock->lf_start;
end = lock->lf_end;
	while (lf != NULL) {
		if (((type == SELF) && lf->lf_id != lock->lf_id) ||
((type == OTHERS) && lf->lf_id == lock->lf_id)) {
*prev = &lf->lf_next;
*overlap = lf = lf->lf_next;
continue;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("\tchecking", lf);
#endif /* LOCKF_DEBUG */
/*
* OK, check for overlap
*
* Six cases:
* 0) no overlap
* 1) overlap == lock
* 2) overlap contains lock
* 3) lock contains overlap
* 4) overlap starts before lock
* 5) overlap ends after lock
*/
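		/*
		 * (Editor's note) In terms of the comparisons below:
		 * case 1 means the ranges are identical; case 2 means lf
		 * encloses the request; case 3 means the request encloses
		 * lf; case 4 means lf starts earlier and overlaps the
		 * request's start; case 5 means lf starts later and
		 * extends past the request's end.
		 */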
		if ((lf->lf_end != -1 && start > lf->lf_end) ||
		    (end != -1 && lf->lf_start > end)) {
/* Case 0 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("no overlap\n");
#endif /* LOCKF_DEBUG */
if ((type & SELF) && end != -1 && lf->lf_start > end)
return 0;
*prev = &lf->lf_next;
*overlap = lf = lf->lf_next;
continue;
}
if ((lf->lf_start == start) && (lf->lf_end == end)) {
/* Case 1 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap == lock\n");
#endif /* LOCKF_DEBUG */
return 1;
}
if ((lf->lf_start <= start) &&
(end != -1) &&
((lf->lf_end >= end) || (lf->lf_end == -1))) {
/* Case 2 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap contains lock\n");
#endif /* LOCKF_DEBUG */
return 2;
}
if (start <= lf->lf_start &&
(end == -1 || (lf->lf_end != -1 && end >= lf->lf_end))) {
/* Case 3 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("lock contains overlap\n");
#endif /* LOCKF_DEBUG */
return 3;
}
if ((lf->lf_start < start) &&
((lf->lf_end >= start) || (lf->lf_end == -1))) {
/* Case 4 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap starts before lock\n");
#endif /* LOCKF_DEBUG */
return 4;
}
if ((lf->lf_start > start) &&
(end != -1) &&
((lf->lf_end > end) || (lf->lf_end == -1))) {
/* Case 5 */
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
printf("overlap ends after lock\n");
#endif /* LOCKF_DEBUG */
return 5;
}
panic("lf_findoverlap: default");
}
return 0;
}
/*
* Split a lock and a contained region into
* two or three locks as necessary.
*/
static void
lf_split(struct lockf *lock1, struct lockf *lock2, struct lockf **sparelock)
{
struct lockf *splitlock;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2) {
lf_print("lf_split", lock1);
lf_print("splitting from", lock2);
}
#endif /* LOCKF_DEBUG */
/*
* Check to see if splitting into only two pieces.
*/
if (lock1->lf_start == lock2->lf_start) {
lock1->lf_start = lock2->lf_end + 1;
lock2->lf_next = lock1;
return;
}
if (lock1->lf_end == lock2->lf_end) {
lock1->lf_end = lock2->lf_start - 1;
lock2->lf_next = lock1->lf_next;
lock1->lf_next = lock2;
return;
}
/*
* Make a new lock consisting of the last part of
* the encompassing lock
*/
splitlock = *sparelock;
*sparelock = NULL;
cv_destroy(&splitlock->lf_cv);
memcpy(splitlock, lock1, sizeof(*splitlock));
cv_init(&splitlock->lf_cv, lockstr);
splitlock->lf_start = lock2->lf_end + 1;
TAILQ_INIT(&splitlock->lf_blkhd);
lock1->lf_end = lock2->lf_start - 1;
/*
* OK, now link it in
*/
splitlock->lf_next = lock1->lf_next;
lock2->lf_next = splitlock;
lock1->lf_next = lock2;
}
/*
* Wakeup a blocklist
*/
static void
lf_wakelock(struct lockf *listhead)
{
struct lockf *wakelock;
	while ((wakelock = TAILQ_FIRST(&listhead->lf_blkhd))) {
		KASSERT(wakelock->lf_next == listhead);
		TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
wakelock->lf_next = NULL;
#ifdef LOCKF_DEBUG
if (lockf_debug & 2)
lf_print("lf_wakelock: awakening", wakelock);
#endif
cv_broadcast(&wakelock->lf_cv);
}
}
/*
* Remove a byte-range lock on an inode.
*
* Generally, find the lock (or an overlap to that lock)
* and remove it (or shrink it), then wakeup anyone we can.
*/
static int
lf_clearlock(struct lockf *unlock, struct lockf **sparelock)
{
struct lockf **head = unlock->lf_head;
struct lockf *lf = *head;
struct lockf *overlap, **prev;
int ovcase;
if (lf == NULL)
return 0;
#ifdef LOCKF_DEBUG
if (unlock->lf_type != F_UNLCK)
panic("lf_clearlock: bad type");
if (lockf_debug & 1)
lf_print("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
prev = head;
while ((ovcase = lf_findoverlap(lf, unlock, SELF,
&prev, &overlap)) != 0) {
/*
* Wakeup the list of locks to be retried.
*/
lf_wakelock(overlap);
switch (ovcase) {
case 1: /* overlap == lock */
*prev = overlap->lf_next;
lf_free(overlap);
break;
case 2: /* overlap contains lock: split it */
if (overlap->lf_start == unlock->lf_start) {
overlap->lf_start = unlock->lf_end + 1;
break;
}
lf_split(overlap, unlock, sparelock);
overlap->lf_next = unlock->lf_next;
break;
case 3: /* lock contains overlap */
*prev = overlap->lf_next;
lf = overlap->lf_next;
lf_free(overlap);
continue;
case 4: /* overlap starts before lock */
overlap->lf_end = unlock->lf_start - 1;
prev = &overlap->lf_next;
lf = overlap->lf_next;
continue;
case 5: /* overlap ends after lock */
overlap->lf_start = unlock->lf_end + 1;
break;
}
break;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_printlist("lf_clearlock", unlock);
#endif /* LOCKF_DEBUG */
return 0;
}
/*
* Walk the list of locks for an inode and
* return the first blocking lock.
*/
static struct lockf *
lf_getblock(struct lockf *lock)
{
struct lockf **prev, *overlap, *lf = *(lock->lf_head);
prev = lock->lf_head;
while (lf_findoverlap(lf, lock, OTHERS, &prev, &overlap) != 0) {
/*
* We've found an overlap, see if it blocks us
*/
if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
return overlap;
/*
* Nope, point to the next one on the list and
* see if it blocks us
*/
lf = overlap->lf_next;
}
return NULL;
}
/*
* Set a byte-range lock.
*/
static int
lf_setlock(struct lockf *lock, struct lockf **sparelock,
kmutex_t *interlock)
{
struct lockf *block;
struct lockf **head = lock->lf_head;
struct lockf **prev, *overlap, *ltmp;
int ovcase, needtolink, error;
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_print("lf_setlock", lock);
#endif /* LOCKF_DEBUG */
/*
* Scan lock list for this file looking for locks that would block us.
*/
while ((block = lf_getblock(lock)) != NULL) {
/*
* Free the structure and return if nonblocking.
*/
if ((lock->lf_flags & F_WAIT) == 0) {
lf_free(lock);
return EAGAIN;
}
/*
* We are blocked. Since flock style locks cover
* the whole file, there is no chance for deadlock.
* For byte-range locks we must check for deadlock.
*
* Deadlock detection is done by looking through the
* wait channels to see if there are any cycles that
* involve us. MAXDEPTH is set just to make sure we
* do not go off into neverneverland.
*/
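			/*
			 * (Editor's note) The walk below goes: owner of the
			 * blocking lock -> its single sleeping LWP -> the
			 * wait channel, which is the pending lockf that LWP
			 * sleeps on -> that lock's lf_next, the lock
			 * blocking it -> its owner, and so on.  If the
			 * chain ever reaches curproc, granting this request
			 * would close a cycle, so we fail with EDEADLK.
			 */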
if ((lock->lf_flags & F_POSIX) &&
(block->lf_flags & F_POSIX)) {
struct lwp *wlwp;
volatile const struct lockf *waitblock;
int i = 0;
struct proc *p;
p = (struct proc *)block->lf_id;
			KASSERT(p != NULL);
			while (i++ < maxlockdepth) {
mutex_enter(p->p_lock);
if (p->p_nlwps > 1) {
mutex_exit(p->p_lock);
break;
}
wlwp = LIST_FIRST(&p->p_lwps);
lwp_lock(wlwp);
if (wlwp->l_wchan == NULL ||
wlwp->l_wmesg != lockstr) {
lwp_unlock(wlwp);
mutex_exit(p->p_lock);
break;
}
waitblock = wlwp->l_wchan;
lwp_unlock(wlwp);
mutex_exit(p->p_lock);
/* Get the owner of the blocking lock */
waitblock = waitblock->lf_next;
if ((waitblock->lf_flags & F_POSIX) == 0)
break;
p = (struct proc *)waitblock->lf_id;
				if (p == curproc) {
					lf_free(lock);
return EDEADLK;
}
}
/*
* If we're still following a dependency chain
* after maxlockdepth iterations, assume we're in
* a cycle to be safe.
*/
if (i >= maxlockdepth) {
lf_free(lock);
return EDEADLK;
}
}
/*
* For flock type locks, we must first remove
* any shared locks that we hold before we sleep
* waiting for an exclusive lock.
*/
if ((lock->lf_flags & F_FLOCK) &&
lock->lf_type == F_WRLCK) {
lock->lf_type = F_UNLCK;
(void) lf_clearlock(lock, NULL);
lock->lf_type = F_WRLCK;
}
/*
* Add our lock to the blocked list and sleep until we're free.
* Remember who blocked us (for deadlock detection).
*/
lock->lf_next = block;
TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
#ifdef LOCKF_DEBUG
if (lockf_debug & 1) {
lf_print("lf_setlock: blocking on", block);
lf_printlist("lf_setlock", block);
}
#endif /* LOCKF_DEBUG */
error = cv_wait_sig(&lock->lf_cv, interlock);
/*
* We may have been awoken by a signal (in
* which case we must remove ourselves from the
* blocked list) and/or by another process
* releasing a lock (in which case we have already
* been removed from the blocked list and our
* lf_next field set to NULL).
*/
		if (lock->lf_next != NULL) {
			TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock,
			    lf_block);
lock->lf_next = NULL;
}
if (error) {
lf_free(lock);
return error;
}
}
/*
* No blocks!! Add the lock. Note that we will
* downgrade or upgrade any overlapping locks this
* process already owns.
*
* Skip over locks owned by other processes.
* Handle any locks that overlap and are owned by ourselves.
*/
prev = head;
block = *head;
needtolink = 1;
for (;;) {
ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
if (ovcase)
block = overlap->lf_next;
/*
* Six cases:
* 0) no overlap
* 1) overlap == lock
* 2) overlap contains lock
* 3) lock contains overlap
* 4) overlap starts before lock
* 5) overlap ends after lock
*/
switch (ovcase) {
case 0: /* no overlap */
if (needtolink) {
*prev = lock;
lock->lf_next = overlap;
}
break;
case 1: /* overlap == lock */
/*
* If downgrading lock, others may be
* able to acquire it.
*/
if (lock->lf_type == F_RDLCK &&
overlap->lf_type == F_WRLCK)
lf_wakelock(overlap);
overlap->lf_type = lock->lf_type;
lf_free(lock);
lock = overlap; /* for debug output below */
break;
case 2: /* overlap contains lock */
/*
* Check for common starting point and different types.
*/
if (overlap->lf_type == lock->lf_type) {
lf_free(lock);
lock = overlap; /* for debug output below */
break;
}
if (overlap->lf_start == lock->lf_start) {
*prev = lock;
lock->lf_next = overlap;
overlap->lf_start = lock->lf_end + 1;
} else
lf_split(overlap, lock, sparelock);
lf_wakelock(overlap);
break;
case 3: /* lock contains overlap */
/*
* If downgrading lock, others may be able to
* acquire it, otherwise take the list.
*/
if (lock->lf_type == F_RDLCK &&
overlap->lf_type == F_WRLCK) {
lf_wakelock(overlap);
} else {
			while ((ltmp = TAILQ_FIRST(&overlap->lf_blkhd))) {
				KASSERT(ltmp->lf_next == overlap);
				TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
lf_block);
ltmp->lf_next = lock;
TAILQ_INSERT_TAIL(&lock->lf_blkhd,
ltmp, lf_block);
}
}
/*
* Add the new lock if necessary and delete the overlap.
*/
if (needtolink) {
*prev = lock;
lock->lf_next = overlap->lf_next;
prev = &lock->lf_next;
needtolink = 0;
} else
*prev = overlap->lf_next;
lf_free(overlap);
continue;
case 4: /* overlap starts before lock */
/*
* Add lock after overlap on the list.
*/
lock->lf_next = overlap->lf_next;
overlap->lf_next = lock;
overlap->lf_end = lock->lf_start - 1;
prev = &lock->lf_next;
lf_wakelock(overlap);
needtolink = 0;
continue;
case 5: /* overlap ends after lock */
/*
* Add the new lock before overlap.
*/
		if (needtolink) {
			*prev = lock;
lock->lf_next = overlap;
}
overlap->lf_start = lock->lf_end + 1;
lf_wakelock(overlap);
break;
}
break;
}
#ifdef LOCKF_DEBUG
if (lockf_debug & 1) {
lf_print("lf_setlock: got the lock", lock);
lf_printlist("lf_setlock", lock);
}
#endif /* LOCKF_DEBUG */
return 0;
}
/*
* Check whether there is a blocking lock,
* and if so return its process identifier.
*/
static int
lf_getlock(struct lockf *lock, struct flock *fl)
{
struct lockf *block;
#ifdef LOCKF_DEBUG
if (lockf_debug & 1)
lf_print("lf_getlock", lock);
#endif /* LOCKF_DEBUG */
if ((block = lf_getblock(lock)) != NULL) {
fl->l_type = block->lf_type;
fl->l_whence = SEEK_SET;
fl->l_start = block->lf_start;
if (block->lf_end == -1)
fl->l_len = 0;
else
fl->l_len = block->lf_end - block->lf_start + 1;
if (block->lf_flags & F_POSIX)
fl->l_pid = ((struct proc *)block->lf_id)->p_pid;
else
fl->l_pid = -1;
} else {
fl->l_type = F_UNLCK;
}
return 0;
}
/*
* Do an advisory lock operation.
*/
int
lf_advlock(struct vop_advlock_args *ap, struct lockf **head, off_t size)
{
struct flock *fl = ap->a_fl;
struct lockf *lock = NULL;
struct lockf *sparelock;
kmutex_t *interlock = &lockf_lock;
off_t start, end;
int error = 0;
KASSERTMSG(size >= 0, "size=%jd", (intmax_t)size);
/*
* Convert the flock structure into a start and end.
*/
switch (fl->l_whence) {
case SEEK_SET:
case SEEK_CUR:
/*
* Caller is responsible for adding any necessary offset
* when SEEK_CUR is used.
*/
start = fl->l_start;
break;
case SEEK_END:
if (fl->l_start > __type_max(off_t) - size)
return EINVAL;
start = size + fl->l_start;
break;
default:
return EINVAL;
}
if (fl->l_len == 0)
end = -1;
else {
if (fl->l_len >= 0) {
if (start >= 0 &&
fl->l_len - 1 > __type_max(off_t) - start)
return EINVAL;
end = start + (fl->l_len - 1);
} else {
			/* lockf() allows negative lengths */
if (start < 0)
return EINVAL;
end = start - 1;
start += fl->l_len;
}
}
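	/*
	 * Examples of the conversion above (editor's addition):
	 *	SEEK_SET, l_start 100, l_len 10   ->  [100, 109]
	 *	SEEK_SET, l_start 100, l_len 0    ->  [100, EOF]  (end == -1)
	 *	SEEK_SET, l_start 100, l_len -10  ->  [90, 99]
	 */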
if (start < 0)
return EINVAL;
/*
* Allocate locks before acquiring the interlock. We need two
* locks in the worst case.
*/
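	/*
	 * (Editor's note) The second lock is the "sparelock" consumed by
	 * lf_split(): unlocking or re-locking the middle of a held range
	 * leaves two pieces, so one extra lockf structure may be needed
	 * beyond the one describing the request itself.
	 */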
switch (ap->a_op) {
case F_SETLK:
case F_UNLCK:
/*
* XXX For F_UNLCK case, we can re-use the lock.
*/
if ((ap->a_flags & F_FLOCK) == 0) {
/*
* Byte-range lock might need one more lock.
*/
sparelock = lf_alloc(0);
if (sparelock == NULL) {
error = ENOMEM;
goto quit;
}
break;
}
/* FALLTHROUGH */
case F_GETLK:
sparelock = NULL;
break;
default:
return EINVAL;
}
switch (ap->a_op) {
case F_SETLK:
lock = lf_alloc(1);
break;
case F_UNLCK:
if (start == 0 || end == -1) {
/* never split */
lock = lf_alloc(0);
} else {
/* might split */
lock = lf_alloc(2);
}
break;
case F_GETLK:
lock = lf_alloc(0);
break;
}
if (lock == NULL) {
error = ENOMEM;
goto quit;
}
mutex_enter(interlock);
/*
* Avoid the common case of unlocking when inode has no locks.
*/
	if (*head == (struct lockf *)0) {
		if (ap->a_op != F_SETLK) {
			fl->l_type = F_UNLCK;
error = 0;
goto quit_unlock;
}
}
/*
* Create the lockf structure.
*/
lock->lf_start = start;
lock->lf_end = end;
lock->lf_head = head;
lock->lf_type = fl->l_type;
lock->lf_next = (struct lockf *)0;
TAILQ_INIT(&lock->lf_blkhd);
lock->lf_flags = ap->a_flags;
	if (lock->lf_flags & F_POSIX) {
		KASSERT(curproc == (struct proc *)ap->a_id);
}
lock->lf_id = ap->a_id;
/*
* Do the requested operation.
*/
switch (ap->a_op) {
case F_SETLK:
error = lf_setlock(lock, &sparelock, interlock);
		lock = NULL; /* lf_setlock consumed it */
break;
case F_UNLCK:
error = lf_clearlock(lock, &sparelock);
break;
case F_GETLK:
error = lf_getlock(lock, fl);
break;
default:
break;
/* NOTREACHED */
}
quit_unlock:
mutex_exit(interlock);
quit:
if (lock)
lf_free(lock);
	if (sparelock)
		lf_free(sparelock);
return error;
}
/*
* Initialize subsystem. XXX We use a global lock. This could be the
* vnode interlock, but the deadlock detection code may need to inspect
* locks belonging to other files.
*/
void
lf_init(void)
{
mutex_init(&lockf_lock, MUTEX_DEFAULT, IPL_NONE);
}
/* $NetBSD: secmodel_extensions_vfs.c,v 1.1 2023/04/22 13:54:19 riastradh Exp $ */
/*-
* Copyright (c) 2011 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: secmodel_extensions_vfs.c,v 1.1 2023/04/22 13:54:19 riastradh Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <secmodel/secmodel.h>
#include <secmodel/extensions/extensions.h>
#include <secmodel/extensions/extensions_impl.h>
static int dovfsusermount;
static int hardlink_check_uid;
static int hardlink_check_gid;
static kauth_listener_t l_system, l_vnode;
static int secmodel_extensions_system_cb(kauth_cred_t, kauth_action_t,
void *, void *, void *, void *, void *);
static int secmodel_extensions_vnode_cb(kauth_cred_t, kauth_action_t,
void *, void *, void *, void *, void *);
void
secmodel_extensions_vfs_start(void)
{
l_system = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
secmodel_extensions_system_cb, NULL);
l_vnode = kauth_listen_scope(KAUTH_SCOPE_VNODE,
secmodel_extensions_vnode_cb, NULL);
}
void
secmodel_extensions_vfs_stop(void)
{
kauth_unlisten_scope(l_system);
kauth_unlisten_scope(l_vnode);
}
void
secmodel_extensions_vfs_sysctl(struct sysctllog **clog,
const struct sysctlnode *rnode)
{
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "usermount",
SYSCTL_DESCR("Whether unprivileged users may mount "
"filesystems"),
sysctl_extensions_user_handler, 0, &dovfsusermount, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hardlink_check_uid",
SYSCTL_DESCR("Whether unprivileged users can hardlink "\
"to files they don't own"),
sysctl_extensions_user_handler, 0,
&hardlink_check_uid, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &rnode, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hardlink_check_gid",
SYSCTL_DESCR("Whether unprivileged users can hardlink "\
"to files that are not in their " \
"group membership"),
sysctl_extensions_user_handler, 0,
&hardlink_check_gid, 0,
CTL_CREATE, CTL_EOL);
/* Compatibility: vfs.generic.usermount */
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "generic",
SYSCTL_DESCR("Non-specific vfs related information"),
NULL, 0, NULL, 0,
CTL_VFS, VFS_GENERIC, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "usermount",
SYSCTL_DESCR("Whether unprivileged users may mount "
"filesystems"),
sysctl_extensions_user_handler, 0, &dovfsusermount, 0,
CTL_VFS, VFS_GENERIC, VFS_USERMOUNT, CTL_EOL);
}
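/*
 * Example usage (editor's addition, illustrative only): once this secmodel
 * is active, an administrator can permit non-root mounts through the
 * compatibility node created above:
 *
 *	sysctl -w vfs.generic.usermount=1
 *
 * The primary nodes are created under the subtree passed in via rnode, so
 * their full sysctl names depend on the caller.
 */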
static int
secmodel_extensions_system_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
vnode_t *vp;
struct vattr va;
struct mount *mp;
u_long flags;
int result;
enum kauth_system_req req;
int error;
req = (enum kauth_system_req)(uintptr_t)arg0;
result = KAUTH_RESULT_DEFER;
switch (action) {
case KAUTH_SYSTEM_MOUNT:
if (dovfsusermount == 0)
break;
switch (req) {
case KAUTH_REQ_SYSTEM_MOUNT_NEW:
vp = (vnode_t *)arg1;
mp = vp->v_mount;
flags = (u_long)arg2;
/*
* Ensure that the user owns the directory onto which
* the mount is attempted.
*/
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, cred);
VOP_UNLOCK(vp);
if (error)
break;
if (va.va_uid != kauth_cred_geteuid(cred))
break;
error = usermount_common_policy(mp, flags);
if (error)
break;
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_SYSTEM_MOUNT_UNMOUNT:
mp = arg1;
/* Must own the mount. */
if (mp->mnt_stat.f_owner == kauth_cred_geteuid(cred))
result = KAUTH_RESULT_ALLOW;
break;
case KAUTH_REQ_SYSTEM_MOUNT_UPDATE:
mp = arg1;
flags = (u_long)arg2;
/* Must own the mount. */
if (mp->mnt_stat.f_owner == kauth_cred_geteuid(cred) &&
usermount_common_policy(mp, flags) == 0)
result = KAUTH_RESULT_ALLOW;
break;
default:
break;
}
break;
default:
break;
}
return (result);
}
static int
secmodel_extensions_vnode_cb(kauth_cred_t cred, kauth_action_t action,
void *cookie, void *arg0, void *arg1, void *arg2, void *arg3)
{
int error;
bool isroot;
struct vattr va;
if ((action & KAUTH_VNODE_ADD_LINK) == 0)
return KAUTH_RESULT_DEFER;
error = VOP_GETATTR((vnode_t *)arg0, &va, cred);
if (error)
goto checkroot;
if (hardlink_check_uid && kauth_cred_geteuid(cred) != va.va_uid)
goto checkroot;
if (hardlink_check_gid && kauth_cred_groupmember(cred, va.va_gid) != 0)
goto checkroot;
return KAUTH_RESULT_DEFER;
checkroot:
error = secmodel_eval("org.netbsd.secmodel.suser", "is-root",
cred, &isroot);
if (error || !isroot)
return KAUTH_RESULT_DENY;
return KAUTH_RESULT_DEFER;
}
/* $NetBSD: rtsock_70.c,v 1.8 2019/12/12 02:15:42 pgoyette Exp $ */
/*
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Roy Marples.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtsock_70.c,v 1.8 2019/12/12 02:15:42 pgoyette Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#include <sys/mbuf.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/route.h>
#include <compat/net/if.h>
#include <compat/net/route.h>
#include <compat/net/route_70.h>
void
compat_70_rt_newaddrmsg1(int cmd, struct ifaddr *ifa)
{
struct rt_addrinfo info;
const struct sockaddr *sa;
struct mbuf *m;
struct ifnet *ifp;
struct ifa_msghdr70 ifam;
int ncmd;
KASSERT(ifa != NULL);
ifp = ifa->ifa_ifp;
switch (cmd) {
case RTM_NEWADDR:
ncmd = RTM_ONEWADDR;
break;
case RTM_DELADDR:
ncmd = RTM_ODELADDR;
break;
case RTM_CHGADDR:
ncmd = RTM_OCHGADDR;
break;
default:
panic("%s: called with wrong command", __func__);
}
memset(&info, 0, sizeof(info));
info.rti_info[RTAX_IFA] = sa = ifa->ifa_addr;
KASSERT(ifp->if_dl != NULL);
info.rti_info[RTAX_IFP] = ifp->if_dl->ifa_addr;
info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
memset(&ifam, 0, sizeof(ifam));
ifam.ifam_index = ifp->if_index;
ifam.ifam_metric = ifa->ifa_metric;
ifam.ifam_flags = ifa->ifa_flags;
m = rt_msg1(ncmd, &info, &ifam, sizeof(ifam));
if (m == NULL)
return;
mtod(m, struct ifa_msghdr70 *)->ifam_addrs = info.rti_addrs;
route_enqueue(m, sa ? sa->sa_family : 0);
}
int
compat_70_iflist_addr(struct rt_walkarg *w, struct ifaddr *ifa,
struct rt_addrinfo *info)
{
int len, error;
if ((error = rt_msg3(RTM_ONEWADDR, info, 0, w, &len)))
return error;
if (w->w_where && w->w_tmem && w->w_needed <= 0) {
struct ifa_msghdr70 *ifam;
ifam = (struct ifa_msghdr70 *)w->w_tmem;
ifam->ifam_index = ifa->ifa_ifp->if_index;
ifam->ifam_flags = ifa->ifa_flags;
ifam->ifam_metric = ifa->ifa_metric;
ifam->ifam_addrs = info->rti_addrs;
if ((error = copyout(w->w_tmem, w->w_where, len)) == 0)
w->w_where = (char *)w->w_where + len;
}
return error;
}
void
rtsock_70_init(void)
{
MODULE_HOOK_SET(rtsock_newaddr_70_hook, compat_70_rt_newaddrmsg1);
MODULE_HOOK_SET(rtsock_iflist_70_hook, compat_70_iflist_addr);
}
void
rtsock_70_fini(void)
{
MODULE_HOOK_UNSET(rtsock_newaddr_70_hook);
MODULE_HOOK_UNSET(rtsock_iflist_70_hook);
}
/* $NetBSD: sysv_sem_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $ */
/*-
* Copyright (c) 1999 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_sem_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/proc.h>
#include <sys/sem.h>
#ifndef SYSVSEM
#define SYSVSEM
#endif
#include <sys/syscallargs.h>
#include <compat/sys/sem.h>
int
compat_50_sys_____semctl13(struct lwp *l, const struct compat_50_sys_____semctl13_args *uap, register_t *retval)
{
/* {
syscallarg(int) semid;
syscallarg(int) semnum;
syscallarg(int) cmd;
syscallarg(union __semun *) arg;
} */
union __semun arg;
struct semid_ds sembuf;
struct semid_ds13 osembuf;
int cmd, error;
void *pass_arg;
cmd = SCARG(uap, cmd);
pass_arg = get_semctl_arg(cmd, &sembuf, &arg);
if (pass_arg != NULL) {
error = copyin(SCARG(uap, arg), &arg, sizeof(arg));
if (error)
return (error);
if (cmd == IPC_SET) {
error = copyin(arg.buf, &osembuf, sizeof(osembuf));
if (error)
return (error);
__semid_ds13_to_native(&osembuf, &sembuf);
}
}
error = semctl1(l, SCARG(uap, semid), SCARG(uap, semnum), cmd,
pass_arg, retval);
	if (error == 0 && cmd == IPC_STAT) {
		__native_to_semid_ds13(&sembuf, &osembuf);
error = copyout(&osembuf, arg.buf, sizeof(osembuf));
}
return (error);
}
/* $NetBSD: if_43.c,v 1.27 2023/03/30 17:48:10 riastradh Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_43.c,v 1.27 2023/03/30 17:48:10 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/mbuf.h> /* for MLEN */
#include <sys/protosw.h>
#include <sys/compat_stub.h>
#include <sys/syscallargs.h>
#include <net/if.h>
#include <net/bpf.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <net/if_gre.h>
#include <net/if_tap.h>
#include <net80211/ieee80211_ioctl.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#include <compat/net/if.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_util.h>
#include <compat/common/compat_mod.h>
#include <uvm/uvm_extern.h>
#if defined(COMPAT_43)
/*
* Use a wrapper so that the compat_cvtcmd() can return a u_long
*/
static int
do_compat_cvtcmd(u_long *ncmd, u_long ocmd)
{
*ncmd = compat_cvtcmd(ocmd);
return 0;
}
u_long
compat_cvtcmd(u_long cmd)
{
u_long ncmd;
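	/*
	 * (Editor's note) Only ioctls whose argument is the old, smaller
	 * struct oifreq need translation; commands with any other
	 * IOCPARM length are passed through unchanged.
	 */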
if (IOCPARM_LEN(cmd) != sizeof(struct oifreq))
return cmd;
switch (cmd) {
case OSIOCSIFADDR:
return SIOCSIFADDR;
case OOSIOCGIFADDR:
return SIOCGIFADDR;
case OSIOCSIFDSTADDR:
return SIOCSIFDSTADDR;
case OOSIOCGIFDSTADDR:
return SIOCGIFDSTADDR;
case OSIOCSIFFLAGS:
return SIOCSIFFLAGS;
case OSIOCGIFFLAGS:
return SIOCGIFFLAGS;
case OOSIOCGIFBRDADDR:
return SIOCGIFBRDADDR;
case OSIOCSIFBRDADDR:
return SIOCSIFBRDADDR;
case OOSIOCGIFCONF:
return SIOCGIFCONF;
case OOSIOCGIFNETMASK:
return SIOCGIFNETMASK;
case OSIOCSIFNETMASK:
return SIOCSIFNETMASK;
case OSIOCGIFCONF:
return SIOCGIFCONF;
case OSIOCADDMULTI:
return SIOCADDMULTI;
case OSIOCDELMULTI:
return SIOCDELMULTI;
case SIOCSIFMEDIA_43:
return SIOCSIFMEDIA_80;
case OSIOCGIFMTU:
return SIOCGIFMTU;
case OSIOCGIFDATA:
return SIOCGIFDATA;
case OSIOCZIFDATA:
return SIOCZIFDATA;
case OBIOCGETIF:
return BIOCGETIF;
case OBIOCSETIF:
return BIOCSETIF;
case OTAPGIFNAME:
return TAPGIFNAME;
default:
/*
* XXX: the following code should be removed and the
* needing treatment ioctls should move to the switch
* above.
*/
ncmd = ((cmd) & ~(IOCPARM_MASK << IOCPARM_SHIFT)) |
(sizeof(struct ifreq) << IOCPARM_SHIFT);
switch (ncmd) {
case BIOCGETIF:
case BIOCSETIF:
case GREDSOCK:
case GREGADDRD:
case GREGADDRS:
case GREGPROTO:
case GRESADDRD:
case GRESADDRS:
case GRESPROTO:
case GRESSOCK:
case SIOCADDMULTI:
case SIOCDELMULTI:
case SIOCDIFADDR:
case SIOCDIFADDR_IN6:
case SIOCDIFPHYADDR:
case SIOCG80211NWID:
case SIOCG80211STATS:
case SIOCG80211ZSTATS:
case SIOCGIFADDR:
case SIOCGIFADDR_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCGIFALIFETIME_IN6:
case SIOCGIFBRDADDR:
case SIOCGIFDLT:
case SIOCGIFDSTADDR:
case SIOCGIFDSTADDR_IN6:
case SIOCGIFFLAGS:
case SIOCGIFGENERIC:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
case SIOCGIFNETMASK:
case SIOCGIFNETMASK_IN6:
case SIOCGIFPDSTADDR:
case SIOCGIFPDSTADDR_IN6:
case SIOCGIFPSRCADDR:
case SIOCGIFPSRCADDR_IN6:
case SIOCGIFSTAT_ICMP6:
case SIOCGIFSTAT_IN6:
case SIOCGVH:
case SIOCIFCREATE:
case SIOCIFDESTROY:
case SIOCS80211NWID:
case SIOCSIFADDR:
case SIOCSIFADDR_IN6:
case SIOCSIFBRDADDR:
case SIOCSIFDSTADDR:
case SIOCSIFDSTADDR_IN6:
case SIOCSIFFLAGS:
case SIOCSIFGENERIC:
case SIOCSIFMEDIA:
case SIOCSIFMETRIC:
case SIOCSIFMTU:
case SIOCSIFNETMASK:
case SIOCSIFNETMASK_IN6:
case SIOCSVH:
case TAPGIFNAME:
return ncmd;
default:
		{
			int rv;
MODULE_HOOK_CALL(if43_cvtcmd_20_hook, (ncmd), enosys(),
rv);
if (rv == 0)
return ncmd;
return cmd;
}
}
}
}
int
compat_ifioctl(struct socket *so, u_long ocmd, u_long cmd, void *data,
struct lwp *l)
{
int error;
struct ifreq *ifr = (struct ifreq *)data;
struct ifreq ifrb;
struct oifreq *oifr = NULL;
struct ifnet *ifp;
struct sockaddr *sa;
struct psref psref;
int bound = curlwp_bind();
ifp = if_get(ifr->ifr_name, &psref);
if (ifp == NULL) {
curlwp_bindx(bound);
return ENXIO;
}
/*
* If we have not been converted, make sure that we are.
* (because the upper layer handles old socket calls, but
* not oifreq calls.
*/
	if (cmd == ocmd) {
		cmd = compat_cvtcmd(ocmd);
}
	if (cmd != ocmd) {
		oifr = data;
ifr = &ifrb;
IFREQO2N_43(oifr, ifr);
}
switch (ocmd) {
enum { maxlen = sizeof(oifr->ifr_ifru) };
CTASSERT(maxlen == 16);
socklen_t famlen;
case OSIOCSIFADDR:
case OSIOCSIFDSTADDR:
case OSIOCSIFBRDADDR:
case OSIOCSIFNETMASK:
sa = &ifr->ifr_addr;
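		/*
		 * (Editor's note) The old 4.3BSD sockaddr had a 16-bit
		 * sa_family and no sa_len.  Seen through the modern layout
		 * on a little-endian machine, the old family value shows up
		 * in sa_len and sa_family reads as zero; the fixup below
		 * swaps them back and supplies a length.  On big-endian
		 * machines only the missing length needs to be filled in.
		 */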
#if BYTE_ORDER != BIG_ENDIAN
		if (sa->sa_family == 0 && sa->sa_len < maxlen) {
			sa->sa_family = sa->sa_len;
sa->sa_len = maxlen;
}
#else
if (sa->sa_len == 0)
sa->sa_len = maxlen;
#endif
famlen = sockaddr_getsize_by_family(sa->sa_family);
		if (famlen > sa->sa_len) {
			if_put(ifp, &psref);
			curlwp_bindx(bound);
return EAFNOSUPPORT;
}
break;
}
error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so, cmd, ifr, ifp);
if_put(ifp, &psref);
	curlwp_bindx(bound);

	switch (ocmd) {
case OOSIOCGIFADDR:
case OOSIOCGIFDSTADDR:
case OOSIOCGIFBRDADDR:
case OOSIOCGIFNETMASK:
*(u_int16_t *)&ifr->ifr_addr =
((struct sockaddr *)&ifr->ifr_addr)->sa_family;
break;
}
	if (cmd != ocmd)
		IFREQN2O_43(oifr, ifr);
return error;
}
int
if_43_init(void)
{
MODULE_HOOK_SET(if_cvtcmd_43_hook, do_compat_cvtcmd);
MODULE_HOOK_SET(if_ifioctl_43_hook, compat_ifioctl);
return 0;
}
int
if_43_fini(void)
{
MODULE_HOOK_UNSET(if_cvtcmd_43_hook);
MODULE_HOOK_UNSET(if_ifioctl_43_hook);
return 0;
}
#endif /* defined(COMPAT_43) */
/* $NetBSD: chacha_sse2_impl.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(1, "$NetBSD: chacha_sse2_impl.c,v 1.1 2020/07/25 22:49:20 riastradh Exp $");
#include "chacha_sse2.h"
#ifdef _KERNEL
#include <x86/cpu.h>
#include <x86/fpu.h>
#else
#include <sys/sysctl.h>
#include <cpuid.h>
#include <stddef.h>
#define fpu_kern_enter() ((void)0)
#define fpu_kern_leave() ((void)0)
#endif
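/*
 * (Editor's note) In the kernel, any use of the SSE register file must be
 * bracketed by fpu_kern_enter()/fpu_kern_leave() so that user FPU state is
 * saved and restored around it; in userland builds of this file the macros
 * above make those calls no-ops and the registers can be used directly.
 */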
static void
chacha_core_sse2_impl(uint8_t out[restrict static 64],
const uint8_t in[static 16],
const uint8_t k[static 32],
const uint8_t c[static 16],
unsigned nr)
{
fpu_kern_enter();
chacha_core_sse2(out, in, k, c, nr);
fpu_kern_leave();
}
static void
hchacha_sse2_impl(uint8_t out[restrict static 32],
const uint8_t in[static 16],
const uint8_t k[static 32],
const uint8_t c[static 16],
unsigned nr)
{
fpu_kern_enter();
hchacha_sse2(out, in, k, c, nr);
fpu_kern_leave();
}
static void
chacha_stream_sse2_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno,
const uint8_t nonce[static 12],
const uint8_t key[static 32],
unsigned nr)
{
fpu_kern_enter();
chacha_stream_sse2(s, nbytes, blkno, nonce, key, nr);
fpu_kern_leave();
}
static void
chacha_stream_xor_sse2_impl(uint8_t *c, const uint8_t *p, size_t nbytes,
uint32_t blkno,
const uint8_t nonce[static 12],
const uint8_t key[static 32],
unsigned nr)
{
fpu_kern_enter();
chacha_stream_xor_sse2(c, p, nbytes, blkno, nonce, key, nr);
fpu_kern_leave();
}
static void
xchacha_stream_sse2_impl(uint8_t *restrict s, size_t nbytes, uint32_t blkno,
const uint8_t nonce[static 24],
const uint8_t key[static 32],
unsigned nr)
{
fpu_kern_enter();
xchacha_stream_sse2(s, nbytes, blkno, nonce, key, nr);
fpu_kern_leave();
}
static void
xchacha_stream_xor_sse2_impl(uint8_t *c, const uint8_t *p, size_t nbytes,
uint32_t blkno,
const uint8_t nonce[static 24],
const uint8_t key[static 32],
unsigned nr)
{
fpu_kern_enter();
xchacha_stream_xor_sse2(c, p, nbytes, blkno, nonce, key, nr);
fpu_kern_leave();
}
static int
chacha_probe_sse2(void)
{
/* Verify that the CPU supports SSE and SSE2. */
#ifdef _KERNEL
if (!i386_has_sse)
return -1;
if (!i386_has_sse2)
return -1;
#else
unsigned eax, ebx, ecx, edx;
if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
return -1;
if ((edx & bit_SSE) == 0)
return -1;
if ((edx & bit_SSE2) == 0)
return -1;
#endif
return 0;
}
const struct chacha_impl chacha_sse2_impl = {
.ci_name = "x86 SSE2 ChaCha",
.ci_probe = chacha_probe_sse2,
.ci_chacha_core = chacha_core_sse2_impl,
.ci_hchacha = hchacha_sse2_impl,
.ci_chacha_stream = chacha_stream_sse2_impl,
.ci_chacha_stream_xor = chacha_stream_xor_sse2_impl,
.ci_xchacha_stream = xchacha_stream_sse2_impl,
.ci_xchacha_stream_xor = xchacha_stream_xor_sse2_impl,
};
/* $NetBSD: layer_vnops.c,v 1.72 2021/10/20 03:08:18 thorpej Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* John Heidemann of the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)null_vnops.c 8.6 (Berkeley) 5/27/95
*
* Ancestors:
* @(#)lofs_vnops.c 1.2 (Berkeley) 6/18/92
* Id: lofs_vnops.c,v 1.11 1992/05/30 10:05:43 jsp Exp jsp
* ...and...
* @(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
*/
/*
* Generic layer vnode operations.
*
* The layer.h, layer_extern.h, layer_vfs.c, and layer_vnops.c files provide
* the core implementation of stacked file-systems.
*
* The layerfs duplicates a portion of the file system name space under
* a new name. In this respect, it is similar to the loopback file system.
* It differs from the loopback fs in two respects: it is implemented using
 * a stackable layers technique, and its "layerfs-nodes" stack above all
 * lower-layer vnodes, not just above directory vnodes.
*
* OPERATION OF LAYERFS
*
* The layerfs is the minimum file system layer, bypassing all possible
* operations to the lower layer for processing there. The majority of its
* activity centers on the bypass routine, through which nearly all vnode
* operations pass.
*
* The bypass routine accepts arbitrary vnode operations for handling by
* the lower layer. It begins by examining vnode operation arguments and
* replacing any layered nodes by their lower-layer equivalents. It then
* invokes an operation on the lower layer. Finally, it replaces the
* layered nodes in the arguments and, if a vnode is returned by the
* operation, stacks a layered node on top of the returned vnode.
*
* The bypass routine in this file, layer_bypass(), is suitable for use
* by many different layered filesystems. It can be used by multiple
* filesystems simultaneously. Alternatively, a layered fs may provide
* its own bypass routine, in which case layer_bypass() should be used as
* a model. For instance, the main functionality provided by umapfs, the user
* identity mapping file system, is handled by a custom bypass routine.
*
* Typically a layered fs registers its selected bypass routine as the
* default vnode operation in its vnodeopv_entry_desc table. Additionally
* the filesystem must store the bypass entry point in the layerm_bypass
* field of struct layer_mount. All other layer routines in this file will
* use the layerm_bypass() routine.
*
* Although the bypass routine handles most operations outright, a number
* of operations are special cased and handled by the layerfs. For instance,
 * layer_getattr() must change the fsid being returned, while layer_lock()
* and layer_unlock() must handle any locking for the current vnode as well
* as pass the lock request down. layer_inactive() and layer_reclaim() are
* not bypassed so that they can handle freeing layerfs-specific data. Also,
* certain vnode operations (create, mknod, remove, link, rename, mkdir,
* rmdir, and symlink) change the locking state within the operation. Ideally
* these operations should not change the lock state, but should be changed
* to let the caller of the function unlock them. Otherwise, all intermediate
* vnode layers (such as union, umapfs, etc) must catch these functions to do
* the necessary locking at their layer.
*
* INSTANTIATING VNODE STACKS
*
* Mounting associates "layerfs-nodes" stack and lower layer, in effect
* stacking two VFSes. The initial mount creates a single vnode stack for
* the root of the new layerfs. All other vnode stacks are created as a
* result of vnode operations on this or other layerfs vnode stacks.
*
* New vnode stacks come into existence as a result of an operation which
* returns a vnode. The bypass routine stacks a layerfs-node above the new
* vnode before returning it to the caller.
*
* For example, imagine mounting a null layer with:
*
* "mount_null /usr/include /dev/layer/null"
*
 * Changing directory to /dev/layer/null will assign the root layerfs-node
 * (which was created when the null layer was mounted). Now consider opening
* "sys". A layer_lookup() would be performed on the root layerfs-node.
* This operation would bypass through to the lower layer which would return
* a vnode representing the UFS "sys". Then, layer_bypass() builds a
* layerfs-node aliasing the UFS "sys" and returns this to the caller.
* Later operations on the layerfs-node "sys" will repeat this process when
* constructing other vnode stacks.
*
* INVOKING OPERATIONS ON LOWER LAYERS
*
* There are two techniques to invoke operations on a lower layer when the
* operation cannot be completely bypassed. Each method is appropriate in
* different situations. In both cases, it is the responsibility of the
* aliasing layer to make the operation arguments "correct" for the lower
* layer by mapping any vnode arguments to the lower layer.
*
* The first approach is to call the aliasing layer's bypass routine. This
* method is most suitable when you wish to invoke the operation currently
* being handled on the lower layer. It has the advantage that the bypass
* routine already must do argument mapping. An example of this is
* layer_getattr().
*
* A second approach is to directly invoke vnode operations on the lower
* layer with the VOP_OPERATIONNAME interface. The advantage of this method
* is that it is easy to invoke arbitrary operations on the lower layer.
 * The disadvantage is that the vnode arguments must be manually mapped.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_vnops.c,v 1.72 2021/10/20 03:08:18 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <sys/buf.h>
#include <sys/kauth.h>
#include <sys/fcntl.h>
#include <sys/fstrans.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
/*
* This is the 08-June-99 bypass routine, based on the 10-Apr-92 bypass
* routine by John Heidemann.
* The new element for this version is that the whole nullfs
* system gained the concept of locks on the lower node.
* The 10-Apr-92 version was optimized for speed, throwing away some
* safety checks. It should still always work, but it's not as
* robust to programmer errors.
*
* In general, we map all vnodes going down and unmap them on the way back.
*
* Also, some BSD vnode operations have the side effect of vrele'ing
* their arguments. With stacking, the reference counts are held
* by the upper node, not the lower one, so we must handle these
* side-effects here. This is not of concern in Sun-derived systems
* since there are no such side-effects.
*
* New for the 08-June-99 version: we also handle operations which unlock
* the passed-in node (typically they vput the node).
*
* This makes the following assumptions:
* - only one returned vpp
* - no INOUT vpp's (Sun's vop_open has one of these)
* - the vnode operation vector of the first vnode should be used
* to determine what implementation of the op should be invoked
* - all mapped vnodes are of our vnode-type (NEEDSWORK:
* problems on rmdir'ing mount points and renaming?)
*/
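/*
 * Illustrative sketch (editor's addition; "examplefs" is hypothetical): a
 * layered file system typically registers layer_bypass() as its default
 * vnode operation and overrides only the special-cased ops, e.g.:
 *
 *	const struct vnodeopv_entry_desc examplefs_vnodeop_entries[] = {
 *		{ &vop_default_desc,	layer_bypass },
 *		{ &vop_lookup_desc,	layer_lookup },
 *		{ &vop_setattr_desc,	layer_setattr },
 *		{ &vop_getattr_desc,	layer_getattr },
 *		{ NULL, NULL }
 *	};
 *
 * and it stores the same bypass pointer in the layerm_bypass field of its
 * struct layer_mount, as described above.
 */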
int
layer_bypass(void *v)
{
struct vop_generic_args /* {
struct vnodeop_desc *a_desc;
<other random data follows, presumably>
} */ *ap = v;
int (**our_vnodeop_p)(void *);
struct vnode **this_vp_p;
int error;
struct vnode *old_vps[VDESC_MAX_VPS], *vp0;
struct vnode **vps_p[VDESC_MAX_VPS];
struct vnode ***vppp;
struct mount *mp;
struct vnodeop_desc *descp = ap->a_desc;
int reles, i, flags;
#ifdef DIAGNOSTIC
/*
* We require at least one vp.
*/
if (descp->vdesc_vp_offsets == NULL ||
descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
panic("%s: no vp's in map.\n", __func__);
#endif
vps_p[0] =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[0], ap);
vp0 = *vps_p[0];
mp = vp0->v_mount;
flags = MOUNTTOLAYERMOUNT(mp)->layerm_flags;
our_vnodeop_p = vp0->v_op;
	if (flags & LAYERFS_MBYPASSDEBUG)
		printf("%s: %s\n", __func__, descp->vdesc_name);
/*
* Map the vnodes going in.
* Later, we'll invoke the operation based on
* the first mapped vnode's operation vector.
*/
reles = descp->vdesc_flags;
	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
vps_p[i] = this_vp_p =
VOPARG_OFFSETTO(struct vnode**, descp->vdesc_vp_offsets[i],
ap);
/*
* We're not guaranteed that any but the first vnode
* are of our type. Check for and don't map any
* that aren't. (We must always map first vp or vclean fails.)
*/
if (i && (*this_vp_p == NULL ||
(*this_vp_p)->v_op != our_vnodeop_p)) {
old_vps[i] = NULL;
} else {
old_vps[i] = *this_vp_p;
*(vps_p[i]) = LAYERVPTOLOWERVP(*this_vp_p);
/*
* XXX - Several operations have the side effect
* of vrele'ing their vp's. We must account for
* that. (This should go away in the future.)
*/
			if (reles & VDESC_VP0_WILLRELE)
				vref(*this_vp_p);
}
}
/*
* Call the operation on the lower layer
* with the modified argument structure.
*/
error = VCALL(*vps_p[0], descp->vdesc_offset, ap);
/*
* Maintain the illusion of call-by-value
* by restoring vnodes in the argument structure
* to their original value.
*/
reles = descp->vdesc_flags;
for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
break; /* bail out at end of list */
if (old_vps[i]) {
*(vps_p[i]) = old_vps[i];
			if (reles & VDESC_VP0_WILLRELE)
				vrele(*(vps_p[i]));
}
}
/*
* Map the possible out-going vpp
* (Assumes that the lower layer always returns
* a VREF'ed vpp unless it gets an error.)
*/
if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET && !error) {
vppp = VOPARG_OFFSETTO(struct vnode***,
descp->vdesc_vpp_offset, ap);
/*
* Only vop_lookup, vop_create, vop_mkdir, vop_mknod
* and vop_symlink return vpp's. vop_lookup doesn't call bypass
* as a lookup on "." would generate a locking error.
* So all the calls which get us here have an unlocked vpp. :-)
*/
error = layer_node_create(mp, **vppp, *vppp);
		if (error) {
			vrele(**vppp);
**vppp = NULL;
}
}
return error;
}
/*
* We have to carry on the locking protocol on the layer vnodes
* as we progress through the tree. We also have to enforce read-only
* if this layer is mounted read-only.
*/
int
layer_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnodeop_desc *a_desc;
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
struct vnode *dvp, *lvp, *ldvp;
int error, flags = cnp->cn_flags;
dvp = ap->a_dvp;
	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) {
		*ap->a_vpp = NULL;
return EROFS;
}
ldvp = LAYERVPTOLOWERVP(dvp);
ap->a_dvp = ldvp;
error = VCALL(ldvp, ap->a_desc->vdesc_offset, ap);
lvp = *ap->a_vpp;
*ap->a_vpp = NULL;
	if (error == EJUSTRETURN && (flags & ISLASTCN) &&
	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
error = EROFS;
/*
* We must do the same locking and unlocking at this layer as
* is done in the layers below us.
*/
if (ldvp == lvp) {
/*
* Got the same object back, because we looked up ".",
* or ".." in the root node of a mount point.
* So we make another reference to dvp and return it.
*/
vref(dvp);
*ap->a_vpp = dvp;
vrele(lvp);
} else if (lvp != NULL) {
/* Note: dvp and ldvp are both locked. */
KASSERT(error != ENOLCK);
error = layer_node_create(dvp->v_mount, lvp, ap->a_vpp);
		if (error) {
			vrele(lvp);
}
}
return error;
}
/*
* Setattr call. Disallow write attempts if the layer is mounted read-only.
*/
int
layer_setattr(void *v)
{
struct vop_setattr_args /* {
struct vnodeop_desc *a_desc;
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vattr *vap = ap->a_vap;
	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
(vp->v_mount->mnt_flag & MNT_RDONLY))
return EROFS;
if (vap->va_size != VNOVAL) {
switch (vp->v_type) {
case VDIR:
return EISDIR;
case VCHR:
case VBLK:
case VSOCK:
case VFIFO:
return 0;
case VREG:
case VLNK:
default:
/*
* Disallow write attempts if the filesystem is
* mounted read-only.
*/
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
}
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* We handle getattr only to change the fsid.
*/
int
layer_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
int error;
error = LAYERFS_DO_BYPASS(vp, ap);
if (error) {
return error;
}
/* Requires that arguments be restored. */
ap->a_vap->va_fsid = vp->v_mount->mnt_stat.f_fsidx.__fsid_val[0];
return 0;
}
int
layer_access(void *v)
{
struct vop_access_args /* {
struct vnode *a_vp;
accmode_t a_accmode;
kauth_cred_t a_cred;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
accmode_t accmode = ap->a_accmode;
/*
* Disallow write attempts on read-only layers;
* unless the file is a socket, fifo, or a block or
* character device resident on the file system.
*/
	if (accmode & VWRITE) {
		switch (vp->v_type) {
case VDIR:
case VLNK:
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
break;
default:
break;
}
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* We must handle open to be able to catch MNT_NODEV and friends
* and increment the lower v_writecount.
*/
int
layer_open(void *v)
{
struct vop_open_args /* {
const struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
int error;
if (((lvp->v_type == VBLK) || (lvp->v_type == VCHR)) &&
(vp->v_mount->mnt_flag & MNT_NODEV))
return ENXIO;
error = LAYERFS_DO_BYPASS(vp, ap);
	if (error == 0 && (ap->a_mode & FWRITE)) {
		mutex_enter(lvp->v_interlock);
lvp->v_writecount++;
mutex_exit(lvp->v_interlock);
}
return error;
}
/*
* We must handle close to decrement the lower v_writecount.
*/
int
layer_close(void *v)
{
struct vop_close_args /* {
const struct vnodeop_desc *a_desc;
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
if ((ap->a_fflag & FWRITE)) {
mutex_enter(lvp->v_interlock);
KASSERT(lvp->v_writecount > 0);
lvp->v_writecount--;
mutex_exit(lvp->v_interlock);
}
return LAYERFS_DO_BYPASS(vp, ap);
}
/*
* If vinvalbuf is calling us, it's a "shallow fsync" -- don't bother
* syncing the underlying vnodes, since they'll be fsync'ed when
* reclaimed; otherwise, pass it through to the underlying layer.
*
* XXX Do we still need to worry about shallow fsync?
*/
int
layer_fsync(void *v)
{
struct vop_fsync_args /* {
struct vnode *a_vp;
kauth_cred_t a_cred;
int a_flags;
off_t offlo;
off_t offhi;
struct lwp *a_l;
} */ *ap = v;
int error;
if (ap->a_flags & FSYNC_RECLAIM) {
return 0;
}
if (ap->a_vp->v_type == VBLK || ap->a_vp->v_type == VCHR) {
error = spec_fsync(v);
if (error)
return error;
}
return LAYERFS_DO_BYPASS(ap->a_vp, ap);
}
int
layer_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
/*
* If we did a remove, don't cache the node.
*/
*ap->a_recycle = ((VTOLAYER(vp)->layer_flags & LAYERFS_REMOVED) != 0);
/*
* Do nothing (and _don't_ bypass).
* Wait to vrele lowervp until reclaim,
* so that until then our layer_node is in the
* cache and reusable.
*
* NEEDSWORK: Someday, consider inactive'ing
* the lowervp and then trying to reactivate it
* with capabilities (v_id)
* like they do in the name lookup cache code.
* That's too much work for now.
*/
return 0;
}
int
layer_remove(void *v)
{
struct vop_remove_v3_args /* {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
nlink_t ctx_vp_new_nlink;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
int error;
vref(vp);
error = LAYERFS_DO_BYPASS(vp, ap);
	if (error == 0) {
		VTOLAYER(vp)->layer_flags |= LAYERFS_REMOVED;
}
vrele(vp);
return error;
}
int
layer_rename(void *v)
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
struct vnode *fdvp = ap->a_fdvp, *tvp;
int error;
tvp = ap->a_tvp;
	if (tvp) {
		if (tvp->v_mount != fdvp->v_mount)
tvp = NULL;
else
vref(tvp);
}
error = LAYERFS_DO_BYPASS(fdvp, ap);
	if (tvp) {
		if (error == 0)
			VTOLAYER(tvp)->layer_flags |= LAYERFS_REMOVED;
vrele(tvp);
}
return error;
}
int
layer_rmdir(void *v)
{
struct vop_rmdir_v2_args /* {
struct vnode *a_dvp;
struct vnode *a_vp;
struct componentname *a_cnp;
} */ *ap = v;
int error;
struct vnode *vp = ap->a_vp;
vref(vp);
error = LAYERFS_DO_BYPASS(vp, ap);
	if (error == 0) {
		VTOLAYER(vp)->layer_flags |= LAYERFS_REMOVED;
}
vrele(vp);
return error;
}
int
layer_revoke(void *v)
{
struct vop_revoke_args /* {
struct vnode *a_vp;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct vnode *lvp = LAYERVPTOLOWERVP(vp);
int error;
/*
* We will most likely end up in vclean which uses the usecount
* to determine if a vnode is active. Take an extra reference on
* the lower vnode so it will always close and inactivate.
*/
vref(lvp);
error = LAYERFS_DO_BYPASS(vp, ap);
vrele(lvp);
return error;
}
int
layer_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
struct lwp *a_l;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct layer_mount *lmp = MOUNTTOLAYERMOUNT(vp->v_mount);
struct layer_node *xp = VTOLAYER(vp);
struct vnode *lowervp = xp->layer_lowervp;
VOP_UNLOCK(vp);
/*
* Note: in vop_reclaim, the node's struct lock has been
* decommissioned, so we have to be careful about calling
* VOP's on ourself. We must be careful as VXLOCK is set.
*/
if (vp == lmp->layerm_rootvp) {
/*
* Oops! We no longer have a root node. Most likely reason is
* that someone forcibly unmounted the underlying fs.
*
* Now getting the root vnode will fail. We're dead. :-(
*/
lmp->layerm_rootvp = NULL;
}
mutex_enter(vp->v_interlock);
KASSERT(vp->v_interlock == lowervp->v_interlock);
lowervp->v_writecount -= vp->v_writecount;
mutex_exit(vp->v_interlock);
/* After this assignment, this node will not be re-used. */
xp->layer_lowervp = NULL;
kmem_free(vp->v_data, lmp->layerm_size);
vp->v_data = NULL;
vrele(lowervp);
return 0;
}
/*
* We just feed the returned vnode up to the caller - there's no need
* to build a layer node on top of the node on which we're going to do
* i/o. :-)
*/
int
layer_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
struct vnode *vp;
vp = LAYERVPTOLOWERVP(ap->a_vp);
ap->a_vp = vp;
return VCALL(vp, ap->a_desc->vdesc_offset, ap);
}
int
layer_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
printf("\ttag VT_LAYERFS, vp=%p, lowervp=%p\n", vp, LAYERVPTOLOWERVP(vp));
return 0;
}
int
layer_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct mount *mp = vp->v_mount;
int error;
krw_t op;
	KASSERT(rw_lock_held(vp->v_uobj.vmobjlock));

	if (ap->a_flags & PGO_LOCKED) {
return EBUSY;
}
ap->a_vp = LAYERVPTOLOWERVP(vp);
KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock);
/* Just pass the request on to the underlying layer. */
op = rw_lock_op(vp->v_uobj.vmobjlock);
rw_exit(vp->v_uobj.vmobjlock);
fstrans_start(mp);
rw_enter(vp->v_uobj.vmobjlock, op);
if (mp == vp->v_mount) {
/* Will release the lock. */
error = VCALL(ap->a_vp, VOFFSET(vop_getpages), ap);
} else {
rw_exit(vp->v_uobj.vmobjlock);
error = ENOENT;
}
fstrans_done(mp);
return error;
}
int
layer_putpages(void *v)
{
struct vop_putpages_args /* {
struct vnode *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(rw_write_held(vp->v_uobj.vmobjlock));
ap->a_vp = LAYERVPTOLOWERVP(vp);
KASSERT(vp->v_uobj.vmobjlock == ap->a_vp->v_uobj.vmobjlock);
if (ap->a_flags & PGO_RECLAIM) {
rw_exit(vp->v_uobj.vmobjlock);
return 0;
}
/* Just pass the request on to the underlying layer. */
return VCALL(ap->a_vp, VOFFSET(vop_putpages), ap);
}
/* $NetBSD: mbuf.h,v 1.239 2024/01/22 21:15:02 jdolecek Exp $ */
/*
* Copyright (c) 1996, 1997, 1999, 2001, 2007 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center and Matt Thomas of 3am Software Foundry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)mbuf.h 8.5 (Berkeley) 2/19/95
*/
#ifndef _SYS_MBUF_H_
#define _SYS_MBUF_H_
#ifdef _KERNEL_OPT
#include "opt_mbuftrace.h"
#endif
#ifndef M_WAITOK
#include <sys/malloc.h>
#endif
#include <sys/pool.h>
#include <sys/queue.h>
#if defined(_KERNEL)
#include <sys/percpu_types.h>
#include <sys/socket.h> /* for AF_UNSPEC */
#include <sys/psref.h>
#endif /* defined(_KERNEL) */
/* For offsetof() */
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/systm.h>
#else
#include <stddef.h>
#endif
#include <uvm/uvm_param.h> /* for MIN_PAGE_SIZE */
#include <net/if.h>
/*
* Mbufs are of a single size, MSIZE (machine/param.h), which
* includes overhead. An mbuf may add a single "mbuf cluster" of size
* MCLBYTES (also in machine/param.h), which has no additional overhead
* and is used instead of the internal data area; this is done when
* at least MINCLSIZE of data must be stored.
*/
/* Packet tags structure */
struct m_tag {
SLIST_ENTRY(m_tag) m_tag_link; /* List of packet tags */
uint16_t m_tag_id; /* Tag ID */
uint16_t m_tag_len; /* Length of data */
};
/* mbuf ownership structure */
struct mowner {
char mo_name[16]; /* owner name (fxp0) */
char mo_descr[16]; /* owner description (input) */
LIST_ENTRY(mowner) mo_link; /* */
struct percpu *mo_counters;
};
#define MOWNER_INIT(x, y) { .mo_name = x, .mo_descr = y }
enum mowner_counter_index {
MOWNER_COUNTER_CLAIMS, /* # of small mbuf claimed */
MOWNER_COUNTER_RELEASES, /* # of small mbuf released */
MOWNER_COUNTER_CLUSTER_CLAIMS, /* # of cluster mbuf claimed */
MOWNER_COUNTER_CLUSTER_RELEASES,/* # of cluster mbuf released */
MOWNER_COUNTER_EXT_CLAIMS, /* # of M_EXT mbuf claimed */
MOWNER_COUNTER_EXT_RELEASES, /* # of M_EXT mbuf released */
MOWNER_COUNTER_NCOUNTERS,
};
#if defined(_KERNEL)
struct mowner_counter {
u_long mc_counter[MOWNER_COUNTER_NCOUNTERS];
};
#endif
/* userland-exported version of struct mowner */
struct mowner_user {
char mo_name[16]; /* owner name (fxp0) */
char mo_descr[16]; /* owner description (input) */
LIST_ENTRY(mowner) mo_link; /* unused padding; for compatibility */
u_long mo_counter[MOWNER_COUNTER_NCOUNTERS]; /* counters */
};
/*
* Macros for type conversion
* mtod(m,t) - convert mbuf pointer to data pointer of correct type
*/
#define mtod(m, t) ((t)((m)->m_data))
/* header at beginning of each mbuf */
struct m_hdr {
struct mbuf *mh_next; /* next buffer in chain */
struct mbuf *mh_nextpkt; /* next chain in queue/record */
char *mh_data; /* location of data */
struct mowner *mh_owner; /* mbuf owner */
int mh_len; /* amount of data in this mbuf */
int mh_flags; /* flags; see below */
paddr_t mh_paddr; /* physical address of mbuf */
short mh_type; /* type of data in this mbuf */
};
/*
* record/packet header in first mbuf of chain; valid if M_PKTHDR set
*
* A note about csum_data:
*
* o For the out-bound direction, the low 16 bits indicate the offset after
* the L4 header where the final L4 checksum value is to be stored, and the
* high 16 bits are the length of the L3 header (the start of the data to
* be checksummed).
*
* o For the in-bound direction, it is only valid if the M_CSUM_DATA flag is
* set. In this case, an L4 checksum has been calculated by hardware and
* is stored in csum_data, but it is up to software to perform final
* verification.
*
* Note for in-bound TCP/UDP checksums: we expect the csum_data to NOT
* be bit-wise inverted (the final step in the calculation of an IP
* checksum) -- this is so we can accumulate the checksum for fragmented
* packets during reassembly.
*
* Size ILP32: 40
* LP64: 56
*/
struct pkthdr {
union {
void *ctx; /* for M_GETCTX/M_SETCTX */
if_index_t index; /* rcv interface index */
} _rcvif;
#define rcvif_index _rcvif.index
SLIST_HEAD(packet_tags, m_tag) tags; /* list of packet tags */
int len; /* total packet length */
int csum_flags; /* checksum flags */
uint32_t csum_data; /* checksum data */
u_int segsz; /* segment size */
uint16_t ether_vtag; /* ethernet 802.1p+q vlan tag */
uint16_t pkthdr_flags; /* flags for pkthdr, see below */
#define PKTHDR_FLAG_IPSEC_SKIP_PFIL 0x0001 /* skip pfil_run_hooks() after ipsec decrypt */
/*
* Following three fields are open-coded struct altq_pktattr
* to rearrange struct pkthdr fields flexibly.
*/
int pattr_af; /* ALTQ: address family */
void *pattr_class; /* ALTQ: sched class set by classifier */
void *pattr_hdr; /* ALTQ: saved header position in mbuf */
};
/* Checksumming flags (csum_flags). */
#define M_CSUM_TCPv4 0x00000001 /* TCP header/payload */
#define M_CSUM_UDPv4 0x00000002 /* UDP header/payload */
#define M_CSUM_TCP_UDP_BAD 0x00000004 /* TCP/UDP checksum bad */
#define M_CSUM_DATA 0x00000008 /* consult csum_data */
#define M_CSUM_TCPv6 0x00000010 /* IPv6 TCP header/payload */
#define M_CSUM_UDPv6 0x00000020 /* IPv6 UDP header/payload */
#define M_CSUM_IPv4 0x00000040 /* IPv4 header */
#define M_CSUM_IPv4_BAD 0x00000080 /* IPv4 header checksum bad */
#define M_CSUM_TSOv4 0x00000100 /* TCPv4 segmentation offload */
#define M_CSUM_TSOv6 0x00000200 /* TCPv6 segmentation offload */
/* Checksum-assist quirks: keep separate from jump-table bits. */
#define M_CSUM_BLANK 0x40000000 /* csum is missing */
#define M_CSUM_NO_PSEUDOHDR 0x80000000 /* Rx csum_data does not include
* the UDP/TCP pseudo-hdr, and
* is not yet 1s-complemented.
*/
#define M_CSUM_BITS \
"\20\1TCPv4\2UDPv4\3TCP_UDP_BAD\4DATA\5TCPv6\6UDPv6\7IPv4\10IPv4_BAD" \
"\11TSOv4\12TSOv6\37BLANK\40NO_PSEUDOHDR"
/*
* Macros for manipulating csum_data on outgoing packets. These are
* used to pass information down from the L4/L3 to the L2.
*
* _IPHL: Length of the IPv{4/6} header, plus the options; in other
* words the offset of the UDP/TCP header in the packet.
* _OFFSET: Offset of the checksum field in the UDP/TCP header.
*/
#define M_CSUM_DATA_IPv4_IPHL(x) ((x) >> 16)
#define M_CSUM_DATA_IPv4_OFFSET(x) ((x) & 0xffff)
#define M_CSUM_DATA_IPv6_IPHL(x) ((x) >> 16)
#define M_CSUM_DATA_IPv6_OFFSET(x) ((x) & 0xffff)
#define M_CSUM_DATA_IPv6_SET(x, v) (x) = ((x) & 0xffff) | ((v) << 16)
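/*
 * Illustrative sketch (assuming an outgoing TCPv4 packet whose IPv4
 * header, including options, is "iphlen" bytes, and struct tcphdr from
 * <netinet/tcp.h>): a protocol requests checksum offload roughly as
 *
 *	m->m_pkthdr.csum_flags |= M_CSUM_TCPv4;
 *	m->m_pkthdr.csum_data = (iphlen << 16) |
 *	    offsetof(struct tcphdr, th_sum);
 *
 * so that a driver can recover the two values with
 * M_CSUM_DATA_IPv4_IPHL() and M_CSUM_DATA_IPv4_OFFSET().
 */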
/*
* Max # of pages we can attach to m_ext. This is carefully chosen
* to be able to handle SOSEND_LOAN_CHUNK with our minimum sized page.
*/
#ifdef MIN_PAGE_SIZE
#define M_EXT_MAXPAGES ((65536 / MIN_PAGE_SIZE) + 1)
#endif
/*
* Description of external storage mapped into mbuf, valid if M_EXT set.
*/
struct _m_ext_storage {
unsigned int ext_refcnt;
char *ext_buf; /* start of buffer */
void (*ext_free) /* free routine if not the usual */
(struct mbuf *, void *, size_t, void *);
void *ext_arg; /* argument for ext_free */
size_t ext_size; /* size of buffer, for ext_free */
union {
/* M_EXT_CLUSTER: physical address */
paddr_t extun_paddr;
#ifdef M_EXT_MAXPAGES
/* M_EXT_PAGES: pages */
struct vm_page *extun_pgs[M_EXT_MAXPAGES];
#endif
} ext_un;
#define ext_paddr ext_un.extun_paddr
#define ext_pgs ext_un.extun_pgs
};
struct _m_ext {
struct mbuf *ext_ref;
struct _m_ext_storage ext_storage;
};
#define M_PADDR_INVALID POOL_PADDR_INVALID
/*
* Definition of "struct mbuf".
* Don't change this without understanding how MHLEN/MLEN are defined.
*/
#define MBUF_DEFINE(name, mhlen, mlen) \
struct name { \
struct m_hdr m_hdr; \
union { \
struct { \
struct pkthdr MH_pkthdr; \
union { \
struct _m_ext MH_ext; \
char MH_databuf[(mhlen)]; \
} MH_dat; \
} MH; \
char M_databuf[(mlen)]; \
} M_dat; \
}
#define m_next m_hdr.mh_next
#define m_len m_hdr.mh_len
#define m_data m_hdr.mh_data
#define m_owner m_hdr.mh_owner
#define m_type m_hdr.mh_type
#define m_flags m_hdr.mh_flags
#define m_nextpkt m_hdr.mh_nextpkt
#define m_paddr m_hdr.mh_paddr
#define m_pkthdr M_dat.MH.MH_pkthdr
#define m_ext_storage M_dat.MH.MH_dat.MH_ext.ext_storage
#define m_ext_ref M_dat.MH.MH_dat.MH_ext.ext_ref
#define m_ext m_ext_ref->m_ext_storage
#define m_pktdat M_dat.MH.MH_dat.MH_databuf
#define m_dat M_dat.M_databuf
/*
* Dummy mbuf structure to calculate the right values for MLEN/MHLEN, taking
* into account inter-structure padding.
*/
MBUF_DEFINE(_mbuf_dummy, 1, 1);
/* normal data len */
#define MLEN ((int)(MSIZE - offsetof(struct _mbuf_dummy, m_dat)))
/* data len w/pkthdr */
#define MHLEN ((int)(MSIZE - offsetof(struct _mbuf_dummy, m_pktdat)))
#define MINCLSIZE (MHLEN+MLEN+1) /* smallest amount to put in cluster */
/*
* The *real* struct mbuf
*/
MBUF_DEFINE(mbuf, MHLEN, MLEN);
/* mbuf flags */
#define M_EXT 0x00000001 /* has associated external storage */
#define M_PKTHDR 0x00000002 /* start of record */
#define M_EOR 0x00000004 /* end of record */
#define M_PROTO1 0x00000008 /* protocol-specific */
/* mbuf pkthdr flags, also in m_flags */
#define M_AUTHIPHDR 0x00000010 /* authenticated (IPsec) */
#define M_DECRYPTED 0x00000020 /* decrypted (IPsec) */
#define M_LOOP 0x00000040 /* received on loopback */
#define M_BCAST 0x00000100 /* send/received as L2 broadcast */
#define M_MCAST 0x00000200 /* send/received as L2 multicast */
#define M_CANFASTFWD 0x00000400 /* packet can be fast-forwarded */
#define M_ANYCAST6 0x00000800 /* received as IPv6 anycast */
#define M_LINK0 0x00001000 /* link layer specific flag */
#define M_LINK1 0x00002000 /* link layer specific flag */
#define M_LINK2 0x00004000 /* link layer specific flag */
#define M_LINK3 0x00008000 /* link layer specific flag */
#define M_LINK4 0x00010000 /* link layer specific flag */
#define M_LINK5 0x00020000 /* link layer specific flag */
#define M_LINK6 0x00040000 /* link layer specific flag */
#define M_LINK7 0x00080000 /* link layer specific flag */
#define M_VLANTAG 0x00100000 /* ether_vtag is valid */
/* additional flags for M_EXT mbufs */
#define M_EXT_FLAGS 0xff000000
#define M_EXT_CLUSTER 0x01000000 /* ext is a cluster */
#define M_EXT_PAGES 0x02000000 /* ext_pgs is valid */
#define M_EXT_ROMAP 0x04000000 /* ext mapping is r-o at MMU */
#define M_EXT_RW 0x08000000 /* ext storage is writable */
/* for source-level compatibility */
#define M_NOTIFICATION M_PROTO1
#define M_FLAGS_BITS \
"\20\1EXT\2PKTHDR\3EOR\4PROTO1\5AUTHIPHDR\6DECRYPTED\7LOOP\10NONE" \
"\11BCAST\12MCAST\13CANFASTFWD\14ANYCAST6\15LINK0\16LINK1\17LINK2\20LINK3" \
"\21LINK4\22LINK5\23LINK6\24LINK7" \
"\25VLANTAG" \
"\31EXT_CLUSTER\32EXT_PAGES\33EXT_ROMAP\34EXT_RW"
/* flags copied when copying m_pkthdr */
#define M_COPYFLAGS (M_PKTHDR|M_EOR|M_BCAST|M_MCAST|M_CANFASTFWD| \
M_ANYCAST6|M_LINK0|M_LINK1|M_LINK2|M_AUTHIPHDR|M_DECRYPTED|M_LOOP| \
M_VLANTAG)
/* flag copied when shallow-copying external storage */
#define M_EXTCOPYFLAGS (M_EXT|M_EXT_FLAGS)
/* mbuf types */
#define MT_FREE 0 /* should be on free list */
#define MT_DATA 1 /* dynamic (data) allocation */
#define MT_HEADER 2 /* packet header */
#define MT_SONAME 3 /* socket name */
#define MT_SOOPTS 4 /* socket options */
#define MT_FTABLE 5 /* fragment reassembly header */
#define MT_CONTROL 6 /* extra-data protocol message */
#define MT_OOBDATA 7 /* expedited data */
#ifdef MBUFTYPES
const char * const mbuftypes[] = {
"mbfree",
"mbdata",
"mbheader",
"mbsoname",
"mbsopts",
"mbftable",
"mbcontrol",
"mboobdata",
};
#else
extern const char * const mbuftypes[];
#endif
/* flags to m_get/MGET */
#define M_DONTWAIT M_NOWAIT
#define M_WAIT M_WAITOK
#ifdef MBUFTRACE
/* Mbuf allocation tracing. */
void mowner_init_owner(struct mowner *, const char *, const char *);
void mowner_init(struct mbuf *, int);
void mowner_ref(struct mbuf *, int);
void m_claim(struct mbuf *, struct mowner *);
void mowner_revoke(struct mbuf *, bool, int);
void mowner_attach(struct mowner *);
void mowner_detach(struct mowner *);
void m_claimm(struct mbuf *, struct mowner *);
#else
#define mowner_init_owner(mo, n, d) __nothing
#define mowner_init(m, type) __nothing
#define mowner_ref(m, flags) __nothing
#define mowner_revoke(m, all, flags) __nothing
#define m_claim(m, mowner) __nothing
#define mowner_attach(mo) __nothing
#define mowner_detach(mo) __nothing
#define m_claimm(m, mo) __nothing
#endif
#define MCLAIM(m, mo) m_claim((m), (mo))
#define MOWNER_ATTACH(mo) mowner_attach(mo)
#define MOWNER_DETACH(mo) mowner_detach(mo)
/*
* mbuf allocation/deallocation macros:
*
* MGET(struct mbuf *m, int how, int type)
* allocates an mbuf and initializes it to contain internal data.
*
* MGETHDR(struct mbuf *m, int how, int type)
* allocates an mbuf and initializes it to contain a packet header
* and internal data.
*
* If 'how' is M_WAIT, these macros (and the corresponding functions)
* are guaranteed to return successfully.
*/
#define MGET(m, how, type) m = m_get((how), (type))
#define MGETHDR(m, how, type) m = m_gethdr((how), (type))
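/*
 * Illustrative sketch: allocating a packet header mbuf, filling in a
 * small amount of data and releasing it.  Failure only needs to be
 * handled for M_DONTWAIT, since M_WAIT cannot fail.  "hdr" is a
 * hypothetical header small enough to fit in MHLEN.
 *
 *	struct mbuf *m;
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return ENOBUFS;
 *	m->m_len = m->m_pkthdr.len = sizeof(hdr);
 *	memcpy(mtod(m, void *), &hdr, sizeof(hdr));
 *	...
 *	m_freem(m);
 */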
#if defined(_KERNEL)
#define MCLINITREFERENCE(m) \
do { \
KASSERT(((m)->m_flags & M_EXT) == 0); \
(m)->m_ext_ref = (m); \
(m)->m_ext.ext_refcnt = 1; \
} while (/* CONSTCOND */ 0)
/*
* Macros for mbuf external storage.
*
* MCLGET allocates and adds an mbuf cluster to a normal mbuf;
* the flag M_EXT is set upon success.
*
* MEXTMALLOC allocates external storage and adds it to
* a normal mbuf; the flag M_EXT is set upon success.
*
* MEXTADD adds pre-allocated external storage to
* a normal mbuf; the flag M_EXT is set upon success.
*/
#define MCLGET(m, how) m_clget((m), (how))
#define MEXTMALLOC(m, size, how) \
do { \
(m)->m_ext_storage.ext_buf = malloc((size), 0, (how)); \
if ((m)->m_ext_storage.ext_buf != NULL) { \
MCLINITREFERENCE(m); \
(m)->m_data = (m)->m_ext.ext_buf; \
(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | \
M_EXT|M_EXT_RW; \
(m)->m_ext.ext_size = (size); \
(m)->m_ext.ext_free = NULL; \
(m)->m_ext.ext_arg = NULL; \
mowner_ref((m), M_EXT); \
} \
} while (/* CONSTCOND */ 0)
#define MEXTADD(m, buf, size, type, free, arg) \
do { \
MCLINITREFERENCE(m); \
(m)->m_data = (m)->m_ext.ext_buf = (char *)(buf); \
(m)->m_flags = ((m)->m_flags & ~M_EXTCOPYFLAGS) | M_EXT; \
(m)->m_ext.ext_size = (size); \
(m)->m_ext.ext_free = (free); \
(m)->m_ext.ext_arg = (arg); \
mowner_ref((m), M_EXT); \
} while (/* CONSTCOND */ 0)
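/*
 * Illustrative sketch: attaching a cluster when the payload does not
 * fit in the mbuf's internal data area.  MCLGET() sets M_EXT only on
 * success, so the flag must be checked afterwards.  "buf" and "len"
 * are hypothetical; "len" is assumed to be at most MCLBYTES.
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	if (m == NULL)
 *		return ENOBUFS;
 *	if (len > MHLEN) {
 *		MCLGET(m, M_DONTWAIT);
 *		if ((m->m_flags & M_EXT) == 0) {
 *			m_freem(m);
 *			return ENOBUFS;
 *		}
 *	}
 *	m->m_len = m->m_pkthdr.len = len;
 *	memcpy(mtod(m, void *), buf, len);
 */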
#define M_BUFADDR(m) \
(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf : \
((m)->m_flags & M_PKTHDR) ? (m)->m_pktdat : (m)->m_dat)
#define M_BUFSIZE(m) \
(((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size : \
((m)->m_flags & M_PKTHDR) ? MHLEN : MLEN)
#define MRESETDATA(m) (m)->m_data = M_BUFADDR(m)
/*
* Compute the offset of the beginning of the data buffer of a non-ext
* mbuf.
*/
#define M_BUFOFFSET(m) \
(((m)->m_flags & M_PKTHDR) ? \
offsetof(struct mbuf, m_pktdat) : offsetof(struct mbuf, m_dat))
/*
* Determine if an mbuf's data area is read-only. This is true
* if external storage is read-only mapped, or not marked as R/W,
* or referenced by more than one mbuf.
*/
#define M_READONLY(m) \
(((m)->m_flags & M_EXT) != 0 && \
(((m)->m_flags & (M_EXT_ROMAP|M_EXT_RW)) != M_EXT_RW || \
(m)->m_ext.ext_refcnt > 1))
#define M_UNWRITABLE(__m, __len) \
((__m)->m_len < (__len) || M_READONLY((__m)))
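/*
 * Illustrative sketch: code that wants to modify the first "hlen" bytes
 * of a chain in place can test M_UNWRITABLE() and fall back to
 * m_makewritable(), declared further below.  "hlen" is hypothetical.
 *
 *	if (M_UNWRITABLE(m, hlen)) {
 *		error = m_makewritable(&m, 0, hlen, M_DONTWAIT);
 *		if (error)
 *			goto drop;
 *	}
 */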
/*
* Determine if an mbuf's data area is read-only at the MMU.
*/
#define M_ROMAP(m) \
(((m)->m_flags & (M_EXT|M_EXT_ROMAP)) == (M_EXT|M_EXT_ROMAP))
/*
* Compute the amount of space available before the current start of
* data in an mbuf.
*/
#define M_LEADINGSPACE(m) \
(M_READONLY((m)) ? 0 : ((m)->m_data - M_BUFADDR(m)))
/*
* Compute the amount of space available
* after the end of data in an mbuf.
*/
#define _M_TRAILINGSPACE(m) \
((m)->m_flags & M_EXT ? (m)->m_ext.ext_buf + (m)->m_ext.ext_size - \
((m)->m_data + (m)->m_len) : \
&(m)->m_dat[MLEN] - ((m)->m_data + (m)->m_len))
#define M_TRAILINGSPACE(m) \
(M_READONLY((m)) ? 0 : _M_TRAILINGSPACE((m)))
/*
* Arrange to prepend space of size plen to mbuf m.
* If a new mbuf must be allocated, how specifies whether to wait.
* If how is M_DONTWAIT and allocation fails, the original mbuf chain
* is freed and m is set to NULL.
*/
#define M_PREPEND(m, plen, how) \
do { \
if (M_LEADINGSPACE(m) >= (plen)) { \
(m)->m_data -= (plen); \
(m)->m_len += (plen); \
} else \
(m) = m_prepend((m), (plen), (how)); \
if ((m) && (m)->m_flags & M_PKTHDR) \
(m)->m_pkthdr.len += (plen); \
} while (/* CONSTCOND */ 0)
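/*
 * Illustrative sketch: prepending space for a link-level header before
 * handing a packet to a driver.  Because M_PREPEND() may allocate a new
 * mbuf and frees the chain on failure, "m" must be re-checked for NULL.
 * "hdrlen" and "hdr" are hypothetical.
 *
 *	M_PREPEND(m, hdrlen, M_DONTWAIT);
 *	if (m == NULL)
 *		return ENOBUFS;
 *	memcpy(mtod(m, void *), &hdr, hdrlen);
 */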
/* change mbuf to new type */
#define MCHTYPE(m, t) \
do { \
KASSERT((t) != MT_FREE); \
mbstat_type_add((m)->m_type, -1); \
mbstat_type_add(t, 1); \
(m)->m_type = t; \
} while (/* CONSTCOND */ 0)
#ifdef DIAGNOSTIC
#define M_VERIFY_PACKET(m) m_verify_packet(m)
#else
#define M_VERIFY_PACKET(m) __nothing
#endif
/* The "copy all" special length. */
#define M_COPYALL -1
/*
* Allow drivers and/or protocols to store private context information.
*/
#define M_GETCTX(m, t) ((t)(m)->m_pkthdr._rcvif.ctx)
#define M_SETCTX(m, c) ((void)((m)->m_pkthdr._rcvif.ctx = (void *)(c)))
#define M_CLEARCTX(m) M_SETCTX((m), NULL)
/*
* M_REGION_GET ensures that the "len"-sized region of type "typ" starting
* from "off" within "m" is located in a single mbuf, contiguously.
*
* The pointer to the region will be returned to pointer variable "val".
*/
#define M_REGION_GET(val, typ, m, off, len) \
do { \
struct mbuf *_t; \
int _tmp; \
if ((m)->m_len >= (off) + (len)) \
(val) = (typ)(mtod((m), char *) + (off)); \
else { \
_t = m_pulldown((m), (off), (len), &_tmp); \
if (_t) { \
if (_t->m_len < _tmp + (len)) \
panic("m_pulldown malfunction"); \
(val) = (typ)(mtod(_t, char *) + _tmp); \
} else { \
(val) = (typ)NULL; \
(m) = NULL; \
} \
} \
} while (/*CONSTCOND*/ 0)
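/*
 * Illustrative sketch (assuming an IPv4 header at offset "off" and
 * struct ip from <netinet/ip.h>):
 *
 *	struct ip *ip;
 *
 *	M_REGION_GET(ip, struct ip *, m, off, sizeof(*ip));
 *	if (ip == NULL)
 *		return;
 *
 * On failure both "ip" and "m" are set to NULL, so the caller must not
 * touch the chain again.
 */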
#endif /* defined(_KERNEL) */
/*
* Simple mbuf queueing system
*
* this is basically a SIMPLEQ adapted to mbuf use (ie using
* m_nextpkt instead of field.sqe_next).
*
* m_next is ignored, so queueing chains of mbufs is possible
*/
#define MBUFQ_HEAD(name) \
struct name { \
struct mbuf *mq_first; \
struct mbuf **mq_last; \
}
#define MBUFQ_INIT(q) do { \
(q)->mq_first = NULL; \
(q)->mq_last = &(q)->mq_first; \
} while (/*CONSTCOND*/0)
#define MBUFQ_ENQUEUE(q, m) do { \
(m)->m_nextpkt = NULL; \
*(q)->mq_last = (m); \
(q)->mq_last = &(m)->m_nextpkt; \
} while (/*CONSTCOND*/0)
#define MBUFQ_PREPEND(q, m) do { \
if (((m)->m_nextpkt = (q)->mq_first) == NULL) \
(q)->mq_last = &(m)->m_nextpkt; \
(q)->mq_first = (m); \
} while (/*CONSTCOND*/0)
#define MBUFQ_DEQUEUE(q, m) do { \
if (((m) = (q)->mq_first) != NULL) { \
if (((q)->mq_first = (m)->m_nextpkt) == NULL) \
(q)->mq_last = &(q)->mq_first; \
else \
(m)->m_nextpkt = NULL; \
} \
} while (/*CONSTCOND*/0)
#define MBUFQ_DRAIN(q) do { \
struct mbuf *__m0; \
while ((__m0 = (q)->mq_first) != NULL) { \
(q)->mq_first = __m0->m_nextpkt; \
m_freem(__m0); \
} \
(q)->mq_last = &(q)->mq_first; \
} while (/*CONSTCOND*/0)
#define MBUFQ_FIRST(q) ((q)->mq_first)
#define MBUFQ_NEXT(m) ((m)->m_nextpkt)
#define MBUFQ_LAST(q) (*(q)->mq_last)
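/*
 * Illustrative sketch: a private packet queue built from the macros
 * above.  "rxqueue", "rxq" and "m0" are hypothetical names; locking is
 * the caller's responsibility.
 *
 *	MBUFQ_HEAD(rxqueue) rxq;
 *	struct mbuf *m;
 *
 *	MBUFQ_INIT(&rxq);
 *	MBUFQ_ENQUEUE(&rxq, m0);
 *	...
 *	MBUFQ_DEQUEUE(&rxq, m);
 *	if (m != NULL)
 *		... process one packet ...
 *	MBUFQ_DRAIN(&rxq);
 */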
/*
* Mbuf statistics.
* For statistics related to mbuf and cluster allocations, see also the
* pool headers (mb_cache and mcl_cache).
*/
struct mbstat {
u_long _m_spare; /* formerly m_mbufs */
u_long _m_spare1; /* formerly m_clusters */
u_long _m_spare2; /* spare field */
u_long _m_spare3; /* formerly m_clfree - free clusters */
u_long m_drops; /* times failed to find space */
u_long m_wait; /* times waited for space */
u_long m_drain; /* times drained protocols for space */
u_short m_mtypes[256]; /* type specific mbuf allocations */
};
struct mbstat_cpu {
u_int m_mtypes[256]; /* type specific mbuf allocations */
};
/*
* Mbuf sysctl variables.
*/
#define MBUF_MSIZE 1 /* int: mbuf base size */
#define MBUF_MCLBYTES 2 /* int: mbuf cluster size */
#define MBUF_NMBCLUSTERS 3 /* int: limit on the # of clusters */
#define MBUF_MBLOWAT 4 /* int: mbuf low water mark */
#define MBUF_MCLLOWAT 5 /* int: mbuf cluster low water mark */
#define MBUF_STATS 6 /* struct: mbstat */
#define MBUF_MOWNERS 7 /* struct: m_owner[] */
#define MBUF_NMBCLUSTERS_LIMIT 8 /* int: limit of nmbclusters */
#ifdef _KERNEL
extern struct mbstat mbstat;
extern int nmbclusters; /* limit on the # of clusters */
extern int mblowat; /* mbuf low water mark */
extern int mcllowat; /* mbuf cluster low water mark */
extern int max_linkhdr; /* largest link-level header */
extern int max_protohdr; /* largest protocol header */
extern int max_hdr; /* largest link+protocol header */
extern int max_datalen; /* MHLEN - max_hdr */
extern const int msize; /* mbuf base size */
extern const int mclbytes; /* mbuf cluster size */
extern pool_cache_t mb_cache;
#ifdef MBUFTRACE
LIST_HEAD(mownerhead, mowner);
extern struct mownerhead mowners;
extern struct mowner unknown_mowners[];
extern struct mowner revoked_mowner;
#endif
MALLOC_DECLARE(M_MBUF);
MALLOC_DECLARE(M_SONAME);
struct mbuf *m_copym(struct mbuf *, int, int, int);
struct mbuf *m_copypacket(struct mbuf *, int);
struct mbuf *m_devget(char *, int, int, struct ifnet *);
struct mbuf *m_dup(struct mbuf *, int, int, int);
struct mbuf *m_get(int, int);
struct mbuf *m_gethdr(int, int);
struct mbuf *m_get_n(int, int, size_t, size_t);
struct mbuf *m_gethdr_n(int, int, size_t, size_t);
struct mbuf *m_prepend(struct mbuf *,int, int);
struct mbuf *m_pulldown(struct mbuf *, int, int, int *);
struct mbuf *m_pullup(struct mbuf *, int);
struct mbuf *m_copyup(struct mbuf *, int, int);
struct mbuf *m_split(struct mbuf *,int, int);
struct mbuf *m_getptr(struct mbuf *, int, int *);
void m_adj(struct mbuf *, int);
struct mbuf *m_defrag(struct mbuf *, int);
int m_apply(struct mbuf *, int, int,
int (*)(void *, void *, unsigned int), void *);
void m_cat(struct mbuf *,struct mbuf *);
void m_clget(struct mbuf *, int);
void m_copyback(struct mbuf *, int, int, const void *);
struct mbuf *m_copyback_cow(struct mbuf *, int, int, const void *, int);
int m_makewritable(struct mbuf **, int, int, int);
struct mbuf *m_getcl(int, int, int);
void m_copydata(struct mbuf *, int, int, void *);
void m_verify_packet(struct mbuf *);
struct mbuf *m_free(struct mbuf *);
void m_freem(struct mbuf *);
void mbinit(void);
void m_remove_pkthdr(struct mbuf *);
void m_copy_pkthdr(struct mbuf *, struct mbuf *);
void m_move_pkthdr(struct mbuf *, struct mbuf *);
void m_align(struct mbuf *, int);
bool m_ensure_contig(struct mbuf **, int);
struct mbuf *m_add(struct mbuf *, struct mbuf *);
/* Inline routines. */
static __inline u_int m_length(const struct mbuf *) __unused;
/* Statistics */
void mbstat_type_add(int, int);
/* Packet tag routines */
struct m_tag *m_tag_get(int, int, int);
void m_tag_free(struct m_tag *);
void m_tag_prepend(struct mbuf *, struct m_tag *);
void m_tag_unlink(struct mbuf *, struct m_tag *);
void m_tag_delete(struct mbuf *, struct m_tag *);
void m_tag_delete_chain(struct mbuf *);
struct m_tag *m_tag_find(const struct mbuf *, int);
struct m_tag *m_tag_copy(struct m_tag *);
int m_tag_copy_chain(struct mbuf *, struct mbuf *);
/* Packet tag types */
#define PACKET_TAG_NONE 0 /* Nothing */
#define PACKET_TAG_SO 4 /* sending socket pointer */
#define PACKET_TAG_NPF 10 /* packet filter */
#define PACKET_TAG_PF 11 /* packet filter */
#define PACKET_TAG_ALTQ_QID 12 /* ALTQ queue id */
#define PACKET_TAG_IPSEC_OUT_DONE 18
#define PACKET_TAG_IPSEC_NAT_T_PORTS 25 /* two uint16_t */
#define PACKET_TAG_INET6 26 /* IPv6 info */
#define PACKET_TAG_TUNNEL_INFO 28 /* tunnel identification and
* protocol callback, for loop
* detection/recovery
*/
#define PACKET_TAG_MPLS 29 /* Indicate it's for MPLS */
#define PACKET_TAG_SRCROUTE 30 /* IPv4 source routing */
#define PACKET_TAG_ETHERNET_SRC 31 /* Ethernet source address */
/*
* Return the number of bytes in the mbuf chain, m.
*/
static __inline u_int
m_length(const struct mbuf *m)
{
const struct mbuf *m0;
u_int pktlen;
if ((m->m_flags & M_PKTHDR) != 0)
return m->m_pkthdr.len;
pktlen = 0;
for (m0 = m; m0 != NULL; m0 = m0->m_next)
pktlen += m0->m_len;
return pktlen;
}
static __inline void
m_set_rcvif(struct mbuf *m, const struct ifnet *ifp)
{
KASSERT(m->m_flags & M_PKTHDR);
m->m_pkthdr.rcvif_index = ifp->if_index;
}
static __inline void
m_reset_rcvif(struct mbuf *m)
{
KASSERT(m->m_flags & M_PKTHDR);
/* A caller may expect the whole _rcvif union to be zeroed */
/* m->m_pkthdr.rcvif_index = 0; */
m->m_pkthdr._rcvif.ctx = NULL;
}
static __inline void
m_copy_rcvif(struct mbuf *m, const struct mbuf *n)
{
KASSERT(m->m_flags & M_PKTHDR);
KASSERT(n->m_flags & M_PKTHDR);
m->m_pkthdr.rcvif_index = n->m_pkthdr.rcvif_index;
}
#define M_GET_ALIGNED_HDR(m, type, linkhdr) \
m_get_aligned_hdr((m), __alignof(type) - 1, sizeof(type), (linkhdr))
static __inline int
m_get_aligned_hdr(struct mbuf **m, int mask, size_t hlen, bool linkhdr)
{
#ifndef __NO_STRICT_ALIGNMENT
if (((uintptr_t)mtod(*m, void *) & mask) != 0)
*m = m_copyup(*m, hlen,
linkhdr ? (max_linkhdr + mask) & ~mask : 0);
else
#endif
if (__predict_false((size_t)(*m)->m_len < hlen))
*m = m_pullup(*m, hlen);
return *m == NULL;
}
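/*
 * Illustrative sketch (assuming struct ip from <netinet/ip.h>): making
 * sure an IPv4 header is contiguous and aligned before dereferencing
 * it.  A non-zero return means the chain was consumed.
 *
 *	struct ip *ip;
 *
 *	if (M_GET_ALIGNED_HDR(&m, struct ip, false))
 *		goto dropped;
 *	ip = mtod(m, struct ip *);
 */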
void m_print(const struct mbuf *, const char *, void (*)(const char *, ...)
__printflike(1, 2));
/* from uipc_mbufdebug.c */
void m_examine(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
/* parsers for m_examine() */
void m_examine_ether(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_pppoe(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ppp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_arp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ip(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_icmp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_ip6(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_icmp6(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_tcp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_udp(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
void m_examine_hex(const struct mbuf *, int, const char *,
void (*)(const char *, ...) __printflike(1, 2));
/*
* Get rcvif of a mbuf.
*
* The caller must call m_put_rcvif after using rcvif if the returned rcvif
* isn't NULL. If the returned rcvif is NULL, the caller doesn't need to call
* m_put_rcvif (although calling it is safe).
*
* The caller must not block or sleep while using rcvif. The API ensures a
* returned rcvif isn't freed until m_put_rcvif is called.
*/
static __inline struct ifnet *
m_get_rcvif(const struct mbuf *m, int *s)
{
struct ifnet *ifp;
KASSERT(m->m_flags & M_PKTHDR);
*s = pserialize_read_enter();
ifp = if_byindex(m->m_pkthdr.rcvif_index);
if (__predict_false(ifp == NULL))
pserialize_read_exit(*s);
return ifp;
}
static __inline void
m_put_rcvif(struct ifnet *ifp, int *s)
{
if (ifp == NULL)
return;
pserialize_read_exit(*s);
}
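/*
 * Illustrative sketch of the access pattern described above:
 *
 *	struct ifnet *ifp;
 *	int s;
 *
 *	ifp = m_get_rcvif(m, &s);
 *	if (ifp != NULL) {
 *		... use ifp without blocking ...
 *		m_put_rcvif(ifp, &s);
 *	}
 */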
/*
* Get rcvif of a mbuf.
*
* The caller must call m_put_rcvif_psref after using rcvif. The API ensures
* a returned rcvif isn't freed until m_put_rcvif_psref is called.
*/
static __inline struct ifnet *
m_get_rcvif_psref(const struct mbuf *m, struct psref *psref)
{
KASSERT(m->m_flags & M_PKTHDR);
return if_get_byindex(m->m_pkthdr.rcvif_index, psref);
}
static __inline void
m_put_rcvif_psref(struct ifnet *ifp, struct psref *psref)
{
if (ifp == NULL)
return;
if_put(ifp, psref);
}
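/*
 * Illustrative sketch of the psref variant, which can be held longer
 * than a pserialize read section:
 *
 *	struct psref psref;
 *	struct ifnet *ifp;
 *
 *	ifp = m_get_rcvif_psref(m, &psref);
 *	if (ifp == NULL)
 *		return ENXIO;
 *	... use ifp ...
 *	m_put_rcvif_psref(ifp, &psref);
 */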
/*
* Get rcvif of a mbuf.
*
* This is NOT an MP-safe API and shouldn't be used where MP-safety is required.
*/
static __inline struct ifnet *
m_get_rcvif_NOMPSAFE(const struct mbuf *m)
{
KASSERT(m->m_flags & M_PKTHDR);
return if_byindex(m->m_pkthdr.rcvif_index);
}
#endif /* _KERNEL */
#endif /* !_SYS_MBUF_H_ */
/* $NetBSD: fdesc_vfsops.c,v 1.96 2020/04/13 19:23:18 ad Exp $ */
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)fdesc_vfsops.c 8.10 (Berkeley) 5/14/95
*
* #Id: fdesc_vfsops.c,v 1.9 1993/04/06 15:28:33 jsp Exp #
*/
/*
* /dev/fd Filesystem
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: fdesc_vfsops.c,v 1.96 2020/04/13 19:23:18 ad Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/filedesc.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/dirent.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/fdesc/fdesc.h>
MODULE(MODULE_CLASS_VFS, fdesc, NULL);
VFS_PROTOS(fdesc);
/*
* Mount the per-process file descriptors (/dev/fd)
*/
int
fdesc_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
int error = 0, ix;
struct vnode *rvp;
if (mp->mnt_flag & MNT_GETARGS) {
*data_len = 0;
return 0;
}
/*
* Update is a no-op
*/
if (mp->mnt_flag & MNT_UPDATE)
return (EOPNOTSUPP);
ix = FD_ROOT;
error = vcache_get(mp, &ix, sizeof(ix), &rvp);
if (error)
return error;
mp->mnt_stat.f_namemax = FDESC_MAXNAMLEN;
mp->mnt_flag |= MNT_LOCAL;
mp->mnt_data = rvp;
vfs_getnewfsid(mp);
error = set_statvfs_info(path, UIO_USERSPACE, "fdesc", UIO_SYSSPACE,
mp->mnt_op->vfs_name, mp, l);
return error;
}
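/*
 * Illustrative sketch (userland side, hypothetical error handling): the
 * file system takes no mount arguments, so it can be mounted with
 *
 *	if (mount(MOUNT_FDESC, "/dev/fd", 0, NULL, 0) == -1)
 *		err(EXIT_FAILURE, "mount fdesc");
 *
 * or with mount_fdesc(8) from the command line.
 */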
int
fdesc_start(struct mount *mp, int flags)
{
return (0);
}
int
fdesc_unmount(struct mount *mp, int mntflags)
{
int error;
int flags = 0;
struct vnode *rtvp = mp->mnt_data;
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(rtvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, rtvp, flags)) != 0)
return (error);
/*
* Blow it away for future re-use
*/
vgone(rtvp);
mp->mnt_data = NULL;
return (0);
}
int
fdesc_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct vnode *vp;
/*
* Return locked reference to root.
*/
vp = mp->mnt_data;
vref(vp);
vn_lock(vp, lktype | LK_RETRY);
*vpp = vp;
return (0);
}
/*ARGSUSED*/
int
fdesc_sync(struct mount *mp, int waitfor,
kauth_cred_t uc)
{
return (0);
}
/*
* Fdesc flat namespace lookup.
* Currently unsupported.
*/
int
fdesc_vget(struct mount *mp, ino_t ino, int lktype,
struct vnode **vpp)
{
return (EOPNOTSUPP);
}
int
fdesc_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
int ix;
struct fdescnode *fd;
KASSERT(key_len == sizeof(ix));
memcpy(&ix, key, key_len);
fd = kmem_alloc(sizeof(struct fdescnode), KM_SLEEP);
fd->fd_fd = -1;
fd->fd_link = NULL;
fd->fd_ix = ix;
fd->fd_vnode = vp;
vp->v_tag = VT_FDESC;
vp->v_op = fdesc_vnodeop_p;
vp->v_data = fd;
switch (ix) {
case FD_ROOT:
fd->fd_type = Froot;
vp->v_type = VDIR;
vp->v_vflag |= VV_ROOT;
break;
case FD_DEVFD:
fd->fd_type = Fdevfd;
vp->v_type = VDIR;
break;
case FD_CTTY:
fd->fd_type = Fctty;
vp->v_type = VCHR;
break;
case FD_STDIN:
fd->fd_type = Flink;
fd->fd_link = "fd/0";
vp->v_type = VLNK;
break;
case FD_STDOUT:
fd->fd_type = Flink;
fd->fd_link = "fd/1";
vp->v_type = VLNK;
break;
case FD_STDERR:
fd->fd_type = Flink;
fd->fd_link = "fd/2";
vp->v_type = VLNK;
break;
default:
KASSERT(ix >= FD_DESC);
fd->fd_type = Fdesc;
fd->fd_fd = ix - FD_DESC;
vp->v_type = VNON;
break;
}
uvm_vnp_setsize(vp, 0);
*new_key = &fd->fd_ix;
return 0;
}
extern const struct vnodeopv_desc fdesc_vnodeop_opv_desc;
const struct vnodeopv_desc * const fdesc_vnodeopv_descs[] = {
&fdesc_vnodeop_opv_desc,
NULL,
};
struct vfsops fdesc_vfsops = {
.vfs_name = MOUNT_FDESC,
.vfs_min_mount_data = 0,
.vfs_mount = fdesc_mount,
.vfs_start = fdesc_start,
.vfs_unmount = fdesc_unmount,
.vfs_root = fdesc_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = genfs_statvfs,
.vfs_sync = fdesc_sync,
.vfs_vget = fdesc_vget,
.vfs_loadvnode = fdesc_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = fdesc_init,
.vfs_done = fdesc_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = fdesc_vnodeopv_descs
};
SYSCTL_SETUP(fdesc_sysctl_setup, "fdesc sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "fdesc",
SYSCTL_DESCR("File-descriptor file system"),
NULL, 0, NULL, 0,
CTL_VFS, 7, CTL_EOL);
/*
* XXX the "7" above could be dynamic, thereby eliminating one
* more instance of the "number to vfs" mapping problem, but
* "7" is the order as taken from sys/mount.h
*/
}
static int
fdesc_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&fdesc_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&fdesc_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/* $NetBSD: coda_vnops.c,v 1.118 2022/03/27 16:24:58 christos Exp $ */
/*
*
* Coda: an Experimental Distributed File System
* Release 3.1
*
* Copyright (c) 1987-1998 Carnegie Mellon University
* All Rights Reserved
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation, and
* that credit is given to Carnegie Mellon University in all documents
* and publicity pertaining to direct or indirect use of this code or its
* derivatives.
*
* CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS,
* SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS
* FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON
* DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
* RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF
* ANY DERIVATIVE WORK.
*
* Carnegie Mellon encourages users of this software to return any
* improvements or extensions that they make, and to grant Carnegie
* Mellon the rights to redistribute these changes without encumbrance.
*
* @(#) coda/coda_vnops.c,v 1.1.1.1 1998/08/29 21:26:46 rvb Exp $
*/
/*
* Mach Operating System
* Copyright (c) 1990 Carnegie-Mellon University
* Copyright (c) 1989 Carnegie-Mellon University
* All rights reserved. The CMU software License Agreement specifies
* the terms and conditions for use and redistribution.
*/
/*
* This code was written for the Coda file system at Carnegie Mellon
* University. Contributors include David Steere, James Kistler, and
* M. Satyanarayanan.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: coda_vnops.c,v 1.118 2022/03/27 16:24:58 christos Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/errno.h>
#include <sys/acct.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/namei.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/dirent.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <coda/coda.h>
#include <coda/cnode.h>
#include <coda/coda_vnops.h>
#include <coda/coda_venus.h>
#include <coda/coda_opstats.h>
#include <coda/coda_subr.h>
#include <coda/coda_namecache.h>
#include <coda/coda_pioctl.h>
/*
* These flags select various performance enhancements.
*/
int coda_attr_cache = 1; /* Set to cache attributes in the kernel */
int coda_symlink_cache = 1; /* Set to cache symbolic link information */
int coda_access_cache = 1; /* Set to handle some access checks directly */
/* structure to keep track of vfs calls */
struct coda_op_stats coda_vnodeopstats[CODA_VNODEOPS_SIZE];
#define MARK_ENTRY(op) (coda_vnodeopstats[op].entries++)
#define MARK_INT_SAT(op) (coda_vnodeopstats[op].sat_intrn++)
#define MARK_INT_FAIL(op) (coda_vnodeopstats[op].unsat_intrn++)
#define MARK_INT_GEN(op) (coda_vnodeopstats[op].gen_intrn++)
/* What we are delaying for in printf */
static int coda_lockdebug = 0;
#define ENTRY if(coda_vnop_print_entry) myprintf(("Entered %s\n",__func__))
/* Definition of the vnode operation vector */
const struct vnodeopv_entry_desc coda_vnodeop_entries[] = {
{ &vop_default_desc, coda_vop_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, coda_lookup }, /* lookup */
{ &vop_create_desc, coda_create }, /* create */
{ &vop_mknod_desc, coda_vop_error }, /* mknod */
{ &vop_open_desc, coda_open }, /* open */
{ &vop_close_desc, coda_close }, /* close */
{ &vop_access_desc, coda_access }, /* access */
{ &vop_accessx_desc, genfs_accessx }, /* access */
{ &vop_getattr_desc, coda_getattr }, /* getattr */
{ &vop_setattr_desc, coda_setattr }, /* setattr */
{ &vop_read_desc, coda_read }, /* read */
{ &vop_write_desc, coda_write }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, coda_ioctl }, /* ioctl */
{ &vop_mmap_desc, genfs_mmap }, /* mmap */
{ &vop_fsync_desc, coda_fsync }, /* fsync */
{ &vop_remove_desc, coda_remove }, /* remove */
{ &vop_link_desc, coda_link }, /* link */
{ &vop_rename_desc, coda_rename }, /* rename */
{ &vop_mkdir_desc, coda_mkdir }, /* mkdir */
{ &vop_rmdir_desc, coda_rmdir }, /* rmdir */
{ &vop_symlink_desc, coda_symlink }, /* symlink */
{ &vop_readdir_desc, coda_readdir }, /* readdir */
{ &vop_readlink_desc, coda_readlink }, /* readlink */
{ &vop_abortop_desc, coda_abortop }, /* abortop */
{ &vop_inactive_desc, coda_inactive }, /* inactive */
{ &vop_reclaim_desc, coda_reclaim }, /* reclaim */
{ &vop_lock_desc, coda_lock }, /* lock */
{ &vop_unlock_desc, coda_unlock }, /* unlock */
{ &vop_bmap_desc, coda_bmap }, /* bmap */
{ &vop_strategy_desc, coda_strategy }, /* strategy */
{ &vop_print_desc, coda_vop_error }, /* print */
{ &vop_islocked_desc, coda_islocked }, /* islocked */
{ &vop_pathconf_desc, coda_pathconf }, /* pathconf */
{ &vop_advlock_desc, coda_vop_nop }, /* advlock */
{ &vop_bwrite_desc, coda_vop_error }, /* bwrite */
{ &vop_seek_desc, genfs_seek }, /* seek */
{ &vop_poll_desc, genfs_poll }, /* poll */
{ &vop_getpages_desc, coda_getpages }, /* getpages */
{ &vop_putpages_desc, coda_putpages }, /* putpages */
{ NULL, NULL }
};
static void coda_print_vattr(struct vattr *);
int (**coda_vnodeop_p)(void *);
const struct vnodeopv_desc coda_vnodeop_opv_desc =
{ &coda_vnodeop_p, coda_vnodeop_entries };
/* Definitions of NetBSD vnodeop interfaces */
/*
* A generic error routine. Return EIO without looking at arguments.
*/
int
coda_vop_error(void *anon) {
struct vnodeop_desc **desc = (struct vnodeop_desc **)anon;
if (codadebug) {
myprintf(("%s: Vnode operation %s called (error).\n",
__func__, (*desc)->vdesc_name));
}
return EIO;
}
/* A generic do-nothing. */
int
coda_vop_nop(void *anon) {
struct vnodeop_desc **desc = (struct vnodeop_desc **)anon;
if (codadebug) {
myprintf(("Vnode operation %s called, but unsupported\n",
(*desc)->vdesc_name));
}
return (0);
}
int
coda_vnodeopstats_init(void)
{
int i;
for(i=0;i<CODA_VNODEOPS_SIZE;i++) {
coda_vnodeopstats[i].opcode = i;
coda_vnodeopstats[i].entries = 0;
coda_vnodeopstats[i].sat_intrn = 0;
coda_vnodeopstats[i].unsat_intrn = 0;
coda_vnodeopstats[i].gen_intrn = 0;
}
return 0;
}
/*
* XXX The entire relationship between VOP_OPEN and having a container
* file (via venus_open) needs to be reexamined. In particular, it's
* valid to open/mmap/close and then reference. Instead of doing
* VOP_OPEN when getpages needs a container, we should do the
* venus_open part, and record that the vnode has opened the container
* for getpages, and do the matching logical close on coda_inactive.
* Further, coda_rdwr needs a container file, and sometimes needs to
* do the equivalent of open (core dumps).
*/
/*
* coda_open calls Venus to return the device and inode of the
* container file, and then obtains a vnode for that file. The
* container vnode is stored in the coda vnode, and a reference is
* added for each open file.
*/
int
coda_open(void *v)
{
/*
* NetBSD can pass the O_EXCL flag in mode, even though the check
* has already happened. Venus defensively assumes that if open
* is passed O_EXCL, it must be a bug. We strip the flag here.
*/
/* true args */
struct vop_open_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
int flag = ap->a_mode & (~O_EXCL);
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
dev_t dev; /* container file device, inode, vnode */
ino_t inode;
vnode_t *container_vp;
MARK_ENTRY(CODA_OPEN_STATS);
KASSERT(VOP_ISLOCKED(vp));
/* Check for open of control file. */
if (IS_CTL_VP(vp)) {
/* if (WRITABLE(flag)) */
if (flag & (FWRITE | O_TRUNC | O_CREAT | O_EXCL)) {
MARK_INT_FAIL(CODA_OPEN_STATS);
return(EACCES);
}
MARK_INT_SAT(CODA_OPEN_STATS);
return(0);
}
error = venus_open(vtomi(vp), &cp->c_fid, flag, cred, curlwp, &dev, &inode);
if (error)
return (error);
if (!error) {
CODADEBUG(CODA_OPEN, myprintf((
"%s: dev 0x%llx inode %llu result %d\n", __func__,
(unsigned long long)dev, (unsigned long long)inode, error));)
}
/*
* Obtain locked and referenced container vnode from container
* device/inode.
*/
error = coda_grab_vnode(vp, dev, inode, &container_vp);
if (error)
return (error);
/* Save the vnode pointer for the container file. */
if (cp->c_ovp == NULL) {
cp->c_ovp = container_vp;
} else {
if (cp->c_ovp != container_vp)
/*
* Perhaps venus returned a different container, or
* something else went wrong.
*/
panic("%s: cp->c_ovp != container_vp", __func__);
}
cp->c_ocount++;
/* Flush the attribute cache if writing the file. */
if (flag & FWRITE) {
cp->c_owrite++;
cp->c_flags &= ~C_VATTR;
}
/*
* Save the <device, inode> pair for the container file to speed
* up subsequent reads while closed (mmap, program execution).
* This is perhaps safe because venus will invalidate the node
* before changing the container file mapping.
*/
cp->c_device = dev;
cp->c_inode = inode;
/* Open the container file. */
error = VOP_OPEN(container_vp, flag, cred);
/*
* Drop the lock on the container, after we have done VOP_OPEN
* (which requires a locked vnode).
*/
VOP_UNLOCK(container_vp);
return(error);
}
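/*
 * Illustrative sketch (compiled out, hypothetical helper): the
 * bookkeeping in coda_open()/coda_close() is intended to keep a
 * cnode with a positive open count holding a container vnode, and
 * to keep the write count no larger than the total open count.
 */
#if 0
static void
coda_open_invariant_example(struct cnode *cp)
{

	KASSERT(cp->c_ocount == 0 || cp->c_ovp != NULL);
	KASSERT(cp->c_owrite <= cp->c_ocount);
}
#endif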
/*
* Close the cache file used for I/O and notify Venus.
*/
int
coda_close(void *v)
{
/* true args */
struct vop_close_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
int flag = ap->a_fflag;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_CLOSE_STATS);
/* Check for close of control file. */
if (IS_CTL_VP(vp)) {
MARK_INT_SAT(CODA_CLOSE_STATS);
return(0);
}
/*
* XXX The IS_UNMOUNTING part of this is very suspect.
*/
if (IS_UNMOUNTING(cp)) {
if (cp->c_ovp) {
#ifdef CODA_VERBOSE
printf("%s: destroying container %d, ufs vp %p of vp %p/cp %p\n",
__func__, vrefcnt(vp), cp->c_ovp, vp, cp);
#endif
#ifdef hmm
vgone(cp->c_ovp);
#else
vn_lock(cp->c_ovp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(cp->c_ovp, flag, cred); /* Do errors matter here? */
vput(cp->c_ovp);
#endif
} else {
#ifdef CODA_VERBOSE
printf("%s: NO container vp %p/cp %p\n", __func__, vp, cp);
#endif
}
return ENODEV;
}
/* Lock the container node, and VOP_CLOSE it. */
vn_lock(cp->c_ovp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(cp->c_ovp, flag, cred); /* Do errors matter here? */
/*
* Drop the lock we just obtained, and vrele the container vnode.
* Decrement reference counts, and clear container vnode pointer on
* last close.
*/
vput(cp->c_ovp);
if (flag & FWRITE)
--cp->c_owrite;
if (--cp->c_ocount == 0)
cp->c_ovp = NULL;
error = venus_close(vtomi(vp), &cp->c_fid, flag, cred, curlwp);
CODADEBUG(CODA_CLOSE, myprintf(("%s: result %d\n", __func__, error)); )
return(error);
}
int
coda_read(void *v)
{
struct vop_read_args *ap = v;
ENTRY;
return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_READ,
ap->a_ioflag, ap->a_cred, curlwp));
}
int
coda_write(void *v)
{
struct vop_write_args *ap = v;
ENTRY;
return(coda_rdwr(ap->a_vp, ap->a_uio, UIO_WRITE,
ap->a_ioflag, ap->a_cred, curlwp));
}
int
coda_rdwr(vnode_t *vp, struct uio *uiop, enum uio_rw rw, int ioflag,
kauth_cred_t cred, struct lwp *l)
{
/* upcall decl */
/* NOTE: container file operation!!! */
/* locals */
struct cnode *cp = VTOC(vp);
vnode_t *cfvp = cp->c_ovp;
struct proc *p = l->l_proc;
int opened_internally = 0;
int error = 0;
MARK_ENTRY(CODA_RDWR_STATS);
CODADEBUG(CODA_RDWR, myprintf(("coda_rdwr(%d, %p, %lu, %lld)\n", rw,
uiop->uio_iov->iov_base, (unsigned long) uiop->uio_resid,
(long long) uiop->uio_offset)); )
/* Check for rdwr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(EINVAL);
}
/* Redirect the request to UFS. */
/*
* If file is not already open this must be a page
* {read,write} request. Iget the cache file's inode
* pointer if we still have its <device, inode> pair.
* Otherwise, we must do an internal open to derive the
* pair.
* XXX Integrate this into a coherent strategy for container
* file acquisition.
*/
if (cfvp == NULL) {
/*
* If we're dumping core, do the internal open. Otherwise
* venus won't have the correct size of the core when
* it's completely written.
*/
if (cp->c_inode != 0 && !(p && (p->p_acflag & ACORE))) {
#ifdef CODA_VERBOSE
printf("%s: grabbing container vnode, losing reference\n",
__func__);
#endif
/* Get locked and refed vnode. */
error = coda_grab_vnode(vp, cp->c_device, cp->c_inode, &cfvp);
if (error) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(error);
}
/*
* Drop lock.
 * XXX Where is the reference released?
*/
VOP_UNLOCK(cfvp);
}
else {
#ifdef CODA_VERBOSE
printf("%s: internal VOP_OPEN\n", __func__);
#endif
opened_internally = 1;
MARK_INT_GEN(CODA_OPEN_STATS);
error = VOP_OPEN(vp, (rw == UIO_READ ? FREAD : FWRITE), cred);
#ifdef CODA_VERBOSE
printf("%s: Internally Opening %p\n", __func__, vp);
#endif
if (error) {
MARK_INT_FAIL(CODA_RDWR_STATS);
return(error);
}
cfvp = cp->c_ovp;
}
}
/* Have UFS handle the call. */
CODADEBUG(CODA_RDWR, myprintf(("%s: fid = %s, refcnt = %d\n", __func__,
coda_f2s(&cp->c_fid), vrefcnt(CTOV(cp)))); )
if (rw == UIO_READ) {
error = VOP_READ(cfvp, uiop, ioflag, cred);
} else {
error = VOP_WRITE(cfvp, uiop, ioflag, cred);
}
if (error)
MARK_INT_FAIL(CODA_RDWR_STATS);
else
MARK_INT_SAT(CODA_RDWR_STATS);
/* Do an internal close if necessary. */
if (opened_internally) {
MARK_INT_GEN(CODA_CLOSE_STATS);
(void)VOP_CLOSE(vp, (rw == UIO_READ ? FREAD : FWRITE), cred);
}
/* Invalidate cached attributes if writing. */
if (rw == UIO_WRITE)
cp->c_flags &= ~C_VATTR;
return(error);
}
int
coda_ioctl(void *v)
{
/* true args */
struct vop_ioctl_args *ap = v;
vnode_t *vp = ap->a_vp;
int com = ap->a_command;
void *data = ap->a_data;
int flag = ap->a_fflag;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
vnode_t *tvp;
struct PioctlData *iap = (struct PioctlData *)data;
namei_simple_flags_t sflags;
MARK_ENTRY(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("in coda_ioctl on %s\n", iap->path));)
/* Don't check for operation on a dying object, for ctlvp it
shouldn't matter */
/* Must be control object to succeed. */
if (!IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: vp != ctlvp", __func__));)
return (EOPNOTSUPP);
}
/* Look up the pathname. */
/* Should we use the name cache here? It would get it from
lookupname sooner or later anyway, right? */
sflags = iap->follow ? NSM_FOLLOW_NOEMULROOT : NSM_NOFOLLOW_NOEMULROOT;
error = namei_simple_user(iap->path, sflags, &tvp);
if (error) {
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: lookup returns %d\n",
__func__, error));)
return(error);
}
/*
* Make sure this is a coda style cnode, but it may be a
* different vfsp
*/
/* XXX: this totally violates the comment about vtagtype in vnode.h */
if (tvp->v_tag != VT_CODA) {
vrele(tvp);
MARK_INT_FAIL(CODA_IOCTL_STATS);
CODADEBUG(CODA_IOCTL, myprintf(("%s error: %s not a coda object\n",
__func__, iap->path));)
return(EINVAL);
}
if (iap->vi.in_size > VC_MAXDATASIZE || iap->vi.out_size > VC_MAXDATASIZE) {
vrele(tvp);
return(EINVAL);
}
error = venus_ioctl(vtomi(tvp), &((VTOC(tvp))->c_fid), com, flag, data,
cred, curlwp);
if (error)
MARK_INT_FAIL(CODA_IOCTL_STATS);
else
CODADEBUG(CODA_IOCTL, myprintf(("Ioctl returns %d \n", error)); )
vrele(tvp);
return(error);
}
/*
 * To reduce the cost of a user-level venus, we cache attributes in
 * the kernel. Each cnode has storage allocated for an attribute. If
 * c_vattr is valid, return a reference to it. Otherwise, get the
 * attributes from venus and store them in the cnode. There is some
 * question whether this method is a security leak. But I think that in
* order to make this call, the user must have done a lookup and
* opened the file, and therefore should already have access.
*/
int
coda_getattr(void *v)
{
/* true args */
struct vop_getattr_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct vattr *vap = ap->a_vap;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_GETATTR_STATS);
/* Check for getattr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_GETATTR_STATS);
return(ENOENT);
}
/* Check to see if the attributes have already been cached */
if (VALID_VATTR(cp)) {
CODADEBUG(CODA_GETATTR, { myprintf(("%s: attr cache hit: %s\n",
__func__, coda_f2s(&cp->c_fid)));})
CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR))
coda_print_vattr(&cp->c_vattr); )
*vap = cp->c_vattr;
MARK_INT_SAT(CODA_GETATTR_STATS);
return(0);
}
error = venus_getattr(vtomi(vp), &cp->c_fid, cred, curlwp, vap);
if (!error) {
CODADEBUG(CODA_GETATTR, myprintf(("%s miss %s: result %d\n",
__func__, coda_f2s(&cp->c_fid), error)); )
CODADEBUG(CODA_GETATTR, if (!(codadebug & ~CODA_GETATTR))
coda_print_vattr(vap); )
/* If not open for write, store attributes in cnode */
if ((cp->c_owrite == 0) && (coda_attr_cache)) {
cp->c_vattr = *vap;
cp->c_flags |= C_VATTR;
}
}
return(error);
}
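/*
 * Illustrative sketch (compiled out, hypothetical helper): the
 * attribute cache described above is one struct vattr per cnode,
 * gated by the C_VATTR flag; clearing the flag is all that is
 * needed to invalidate it, as the write paths in this file do inline.
 */
#if 0
static void
coda_vattr_invalidate_example(struct cnode *cp)
{

	cp->c_flags &= ~C_VATTR;
}
#endif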
int
coda_setattr(void *v)
{
/* true args */
struct vop_setattr_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct vattr *vap = ap->a_vap;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_SETATTR_STATS);
/* Check for setattr of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_SETATTR_STATS);
return(ENOENT);
}
if (codadebug & CODADBGMSK(CODA_SETATTR)) {
coda_print_vattr(vap);
}
error = venus_setattr(vtomi(vp), &cp->c_fid, vap, cred, curlwp);
if (!error)
cp->c_flags &= ~C_VATTR;
CODADEBUG(CODA_SETATTR, myprintf(("setattr %d\n", error)); )
return(error);
}
int
coda_access(void *v)
{
/* true args */
struct vop_access_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
accmode_t accmode = ap->a_accmode;
kauth_cred_t cred = ap->a_cred;
/* locals */
int error;
MARK_ENTRY(CODA_ACCESS_STATS);
KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0);
/* Check for access of control object. Only read access is
allowed on it. */
if (IS_CTL_VP(vp)) {
/* bogus hack - all will be marked as successes */
MARK_INT_SAT(CODA_ACCESS_STATS);
return(((accmode & VREAD) && !(accmode & (VWRITE | VEXEC)))
? 0 : EACCES);
}
/*
 * if the file is a directory, and we are checking exec (e.g. lookup)
* access, and the file is in the namecache, then the user must have
* lookup access to it.
*/
if (coda_access_cache) {
if ((vp->v_type == VDIR) && (accmode & VEXEC)) {
if (coda_nc_lookup(cp, ".", 1, cred)) {
MARK_INT_SAT(CODA_ACCESS_STATS);
return(0); /* it was in the cache */
}
}
}
error = venus_access(vtomi(vp), &cp->c_fid, accmode, cred, curlwp);
return(error);
}
/*
* CODA abort op, called after namei() when a CREATE/DELETE isn't actually
* done. If a buffer has been saved in anticipation of a coda_create or
* a coda_remove, delete it.
*/
/* ARGSUSED */
int
coda_abortop(void *v)
{
/* true args */
struct vop_abortop_args /* {
vnode_t *a_dvp;
struct componentname *a_cnp;
} */ *ap = v;
(void)ap;
/* upcall decl */
/* locals */
return (0);
}
int
coda_readlink(void *v)
{
/* true args */
struct vop_readlink_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct uio *uiop = ap->a_uio;
kauth_cred_t cred = ap->a_cred;
/* locals */
struct lwp *l = curlwp;
int error;
char *str;
int len;
MARK_ENTRY(CODA_READLINK_STATS);
/* Check for readlink of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_READLINK_STATS);
return(ENOENT);
}
if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) { /* symlink was cached */
uiop->uio_rw = UIO_READ;
error = uiomove(cp->c_symlink, (int)cp->c_symlen, uiop);
if (error)
MARK_INT_FAIL(CODA_READLINK_STATS);
else
MARK_INT_SAT(CODA_READLINK_STATS);
return(error);
}
error = venus_readlink(vtomi(vp), &cp->c_fid, cred, l, &str, &len);
if (!error) {
uiop->uio_rw = UIO_READ;
error = uiomove(str, len, uiop);
if (coda_symlink_cache) {
cp->c_symlink = str;
cp->c_symlen = len;
cp->c_flags |= C_SYMLINK;
} else
CODA_FREE(str, len);
}
CODADEBUG(CODA_READLINK, myprintf(("in readlink result %d\n",error));)
return(error);
}
int
coda_fsync(void *v)
{
/* true args */
struct vop_fsync_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
kauth_cred_t cred = ap->a_cred;
/* locals */
vnode_t *convp = cp->c_ovp;
int error;
MARK_ENTRY(CODA_FSYNC_STATS);
/* Check for fsync on an unmounting object */
/* The NetBSD kernel, in its infinite wisdom, can try to fsync
* after an unmount has been initiated. This is a Bad Thing,
* which we have to avoid. Not a legitimate failure for stats.
*/
if (IS_UNMOUNTING(cp)) {
return(ENODEV);
}
	/* Check for fsync of control object or uninitialized cnode. */
if (IS_CTL_VP(vp) || vp->v_type == VNON) {
MARK_INT_SAT(CODA_FSYNC_STATS);
return(0);
}
if (convp)
VOP_FSYNC(convp, cred, MNT_WAIT, 0, 0);
/*
 * We can expect an fsync on any vnode at all if venus is purging it.
* Venus can't very well answer the fsync request, now can it?
* Hopefully, it won't have to, because hopefully, venus preserves
* the (possibly untrue) invariant that it never purges an open
* vnode. Hopefully.
*/
if (cp->c_flags & C_PURGING) {
return(0);
}
error = venus_fsync(vtomi(vp), &cp->c_fid, cred, curlwp);
CODADEBUG(CODA_FSYNC, myprintf(("in fsync result %d\n",error)); )
return(error);
}
/*
* vp is locked on entry, and we must unlock it.
* XXX This routine is suspect and probably needs rewriting.
*/
int
coda_inactive(void *v)
{
/* true args */
struct vop_inactive_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
kauth_cred_t cred __unused = NULL;
/* We don't need to send inactive to venus - DCS */
MARK_ENTRY(CODA_INACTIVE_STATS);
if (IS_CTL_VP(vp)) {
MARK_INT_SAT(CODA_INACTIVE_STATS);
return 0;
}
CODADEBUG(CODA_INACTIVE, myprintf(("in inactive, %s, vfsp %p\n",
coda_f2s(&cp->c_fid), vp->v_mount));)
if (vp->v_mount->mnt_data == NULL) {
myprintf(("Help! vfsp->vfs_data was NULL, but vnode %p wasn't dying\n", vp));
panic("badness in coda_inactive");
}
#ifdef CODA_VERBOSE
/* Sanity checks that perhaps should be panic. */
if (vrefcnt(vp) > 1)
printf("%s: %p usecount %d\n", __func__, vp, vrefcnt(vp));
if (cp->c_ovp != NULL)
printf("%s: %p ovp != NULL\n", __func__, vp);
#endif
/* XXX Do we need to VOP_CLOSE container vnodes? */
if (!IS_UNMOUNTING(cp))
*ap->a_recycle = true;
MARK_INT_SAT(CODA_INACTIVE_STATS);
return(0);
}
/*
* Coda does not use the normal namecache, but a private version.
* Consider how to use the standard facility instead.
*/
int
coda_lookup(void *v)
{
/* true args */
struct vop_lookup_v2_args *ap = v;
/* (locked) vnode of dir in which to do lookup */
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
/* output variable for result */
vnode_t **vpp = ap->a_vpp;
/* name to lookup */
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
struct cnode *cp;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
CodaFid VFid;
int vtype;
int error = 0;
MARK_ENTRY(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: %s in %s\n", __func__,
nm, coda_f2s(&dcp->c_fid)));)
/*
* XXX componentname flags in MODMASK are not handled at all
*/
/*
* The overall strategy is to switch on the lookup type and get a
* result vnode that is vref'd but not locked.
*/
/* Check for lookup of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = coda_ctlvp;
vref(*vpp);
MARK_INT_SAT(CODA_LOOKUP_STATS);
goto exit;
}
/* Avoid trying to hand venus an unreasonably long name. */
if (len+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: name too long:, %s (%s)\n",
__func__, coda_f2s(&dcp->c_fid), nm));)
*vpp = (vnode_t *)0;
error = EINVAL;
goto exit;
}
/*
* Try to resolve the lookup in the minicache. If that fails, ask
* venus to do the lookup. XXX The interaction between vnode
* locking and any locking that coda does is not clear.
*/
cp = coda_nc_lookup(dcp, nm, len, cred);
if (cp) {
*vpp = CTOV(cp);
vref(*vpp);
CODADEBUG(CODA_LOOKUP,
myprintf(("lookup result %d vpp %p\n",error,*vpp));)
} else {
/* The name wasn't cached, so ask Venus. */
error = venus_lookup(vtomi(dvp), &dcp->c_fid, nm, len, cred, l, &VFid,
&vtype);
if (error) {
MARK_INT_FAIL(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: lookup error on %s (%s)%d\n",
__func__, coda_f2s(&dcp->c_fid), nm, error));)
*vpp = (vnode_t *)0;
} else {
MARK_INT_SAT(CODA_LOOKUP_STATS);
CODADEBUG(CODA_LOOKUP, myprintf(("%s: %s type %o result %d\n",
__func__, coda_f2s(&VFid), vtype, error)); )
cp = make_coda_node(&VFid, dvp->v_mount, vtype);
*vpp = CTOV(cp);
/* vpp is now vrefed. */
/*
* Unless this vnode is marked CODA_NOCACHE, enter it into
* the coda name cache to avoid a future venus round-trip.
* XXX Interaction with componentname NOCACHE is unclear.
*/
if (!(vtype & CODA_NOCACHE))
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
}
}
exit:
/*
* If we are creating, and this was the last name to be looked up,
* and the error was ENOENT, then make the leaf NULL and return
* success.
* XXX Check against new lookup rules.
*/
if (((cnp->cn_nameiop == CREATE) || (cnp->cn_nameiop == RENAME))
&& (cnp->cn_flags & ISLASTCN)
&& (error == ENOENT))
{
error = EJUSTRETURN;
*ap->a_vpp = NULL;
}
return(error);
}
/*ARGSUSED*/
int
coda_create(void *v)
{
/* true args */
struct vop_create_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct vattr *va = ap->a_vap;
int exclusive = 1;
int mode = ap->a_vap->va_mode;
vnode_t **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
struct cnode *cp;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
CodaFid VFid;
struct vattr attr;
MARK_ENTRY(CODA_CREATE_STATS);
/* All creates are exclusive XXX */
/* I'm assuming the 'mode' argument is the file mode bits XXX */
/* Check for create of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_CREATE_STATS);
return(EACCES);
}
error = venus_create(vtomi(dvp), &dcp->c_fid, nm, len, exclusive, mode, va, cred, l, &VFid, &attr);
if (!error) {
/*
* XXX Violation of venus/kernel invariants is a difficult case,
* but venus should not be able to cause a panic.
*/
/* If this is an exclusive create, panic if the file already exists. */
/* Venus should have detected the file and reported EEXIST. */
if ((exclusive == 1) &&
(coda_find(&VFid) != NULL))
panic("cnode existed for newly created file!");
cp = make_coda_node(&VFid, dvp->v_mount, attr.va_type);
*vpp = CTOV(cp);
/* XXX vnodeops doesn't say this argument can be changed. */
/* Update va to reflect the new attributes. */
(*va) = attr;
/* Update the attribute cache and mark it as valid */
if (coda_attr_cache) {
VTOC(*vpp)->c_vattr = attr;
VTOC(*vpp)->c_flags |= C_VATTR;
}
/* Invalidate parent's attr cache (modification time has changed). */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* enter the new vnode in the Name Cache */
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
CODADEBUG(CODA_CREATE, myprintf(("%s: %s, result %d\n", __func__,
coda_f2s(&VFid), error)); )
} else {
*vpp = (vnode_t *)0;
CODADEBUG(CODA_CREATE, myprintf(("%s: create error %d\n", __func__,
error));)
}
if (!error) {
#ifdef CODA_VERBOSE
if ((cnp->cn_flags & LOCKLEAF) == 0)
/* This should not happen; flags are for lookup only. */
printf("%s: LOCKLEAF not set!\n", __func__);
#endif
}
return(error);
}
int
coda_remove(void *v)
{
/* true args */
struct vop_remove_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *cp = VTOC(dvp);
vnode_t *vp = ap->a_vp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *tp;
MARK_ENTRY(CODA_REMOVE_STATS);
CODADEBUG(CODA_REMOVE, myprintf(("%s: %s in %s\n", __func__,
nm, coda_f2s(&cp->c_fid)));)
/* Remove the file's entry from the CODA Name Cache */
/* We're being conservative here, it might be that this person
* doesn't really have sufficient access to delete the file
* but we feel zapping the entry won't really hurt anyone -- dcs
*/
/* I'm gonna go out on a limb here. If a file and a hardlink to it
* exist, and one is removed, the link count on the other will be
* off by 1. We could either invalidate the attrs if cached, or
* fix them. I'll try to fix them. DCS 11/8/94
*/
tp = coda_nc_lookup(VTOC(dvp), nm, len, cred);
if (tp) {
if (VALID_VATTR(tp)) { /* If attrs are cached */
if (tp->c_vattr.va_nlink > 1) { /* If it's a hard link */
tp->c_vattr.va_nlink--;
}
}
coda_nc_zapfile(VTOC(dvp), nm, len);
/* No need to flush it if it doesn't exist! */
}
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* Check for remove of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_REMOVE_STATS);
return(ENOENT);
}
error = venus_remove(vtomi(dvp), &cp->c_fid, nm, len, cred, l);
CODADEBUG(CODA_REMOVE, myprintf(("in remove result %d\n",error)); )
/*
* Unlock and release child (avoiding double if ".").
*/
if (dvp == vp) {
vrele(vp);
} else {
vput(vp);
}
return(error);
}
/*
* dvp is the directory where the link is to go, and is locked.
* vp is the object to be linked to, and is unlocked.
* At exit, we must unlock dvp, and vput dvp.
*/
int
coda_link(void *v)
{
/* true args */
struct vop_link_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
MARK_ENTRY(CODA_LINK_STATS);
	if (codadebug & CODADBGMSK(CODA_LINK)) {
		myprintf(("%s: vp fid: %s\n", __func__, coda_f2s(&cp->c_fid)));
		myprintf(("%s: dvp fid: %s\n", __func__, coda_f2s(&dcp->c_fid)));
	}
/* Check for link to/from control object. */
if (IS_CTL_NAME(dvp, nm, len) || IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_LINK_STATS);
return(EACCES);
}
/* If linking . to a name, error out earlier. */
if (vp == dvp) {
#ifdef CODA_VERBOSE
printf("%s coda_link vp==dvp\n", __func__);
#endif
error = EISDIR;
goto exit;
}
/* XXX Why does venus_link need the vnode to be locked?*/
if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) {
#ifdef CODA_VERBOSE
printf("%s: couldn't lock vnode %p\n", __func__, vp);
#endif
error = EFAULT; /* XXX better value */
goto exit;
}
error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp,
dvp, 0);
	if (error) {
		VOP_UNLOCK(vp);
		goto exit;
	}
error = venus_link(vtomi(vp), &cp->c_fid, &dcp->c_fid, nm, len, cred, l);
VOP_UNLOCK(vp);
/* Invalidate parent's attr cache (the modification time has changed). */
VTOC(dvp)->c_flags &= ~C_VATTR;
/* Invalidate child's attr cache (XXX why). */
VTOC(vp)->c_flags &= ~C_VATTR;
CODADEBUG(CODA_LINK, myprintf(("in link result %d\n",error)); )
exit:
return(error);
}
int
coda_rename(void *v)
{
/* true args */
struct vop_rename_args *ap = v;
vnode_t *odvp = ap->a_fdvp;
struct cnode *odcp = VTOC(odvp);
struct componentname *fcnp = ap->a_fcnp;
vnode_t *ndvp = ap->a_tdvp;
struct cnode *ndcp = VTOC(ndvp);
struct componentname *tcnp = ap->a_tcnp;
kauth_cred_t cred = fcnp->cn_cred;
struct lwp *l = curlwp;
/* true args */
int error;
const char *fnm = fcnp->cn_nameptr;
int flen = fcnp->cn_namelen;
const char *tnm = tcnp->cn_nameptr;
int tlen = tcnp->cn_namelen;
MARK_ENTRY(CODA_RENAME_STATS);
/* Hmmm. The vnodes are already looked up. Perhaps they are locked?
This could be Bad. XXX */
#ifdef OLD_DIAGNOSTIC
if ((fcnp->cn_cred != tcnp->cn_cred)
|| (fcnp->cn_lwp != tcnp->cn_lwp))
{
panic("%s: component names don't agree", __func__);
}
#endif
/* Check for rename involving control object. */
if (IS_CTL_NAME(odvp, fnm, flen) || IS_CTL_NAME(ndvp, tnm, tlen)) {
MARK_INT_FAIL(CODA_RENAME_STATS);
return(EACCES);
}
/* Problem with moving directories -- need to flush entry for .. */
if (odvp != ndvp) {
struct cnode *ovcp = coda_nc_lookup(VTOC(odvp), fnm, flen, cred);
if (ovcp) {
vnode_t *ovp = CTOV(ovcp);
if ((ovp) &&
(ovp->v_type == VDIR)) /* If it's a directory */
coda_nc_zapfile(VTOC(ovp),"..", 2);
}
}
/* Remove the entries for both source and target files */
coda_nc_zapfile(VTOC(odvp), fnm, flen);
coda_nc_zapfile(VTOC(ndvp), tnm, tlen);
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(odvp)->c_flags &= ~C_VATTR;
VTOC(ndvp)->c_flags &= ~C_VATTR;
if (flen+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_RENAME_STATS);
error = EINVAL;
goto exit;
}
if (tlen+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_RENAME_STATS);
error = EINVAL;
goto exit;
}
error = venus_rename(vtomi(odvp), &odcp->c_fid, &ndcp->c_fid, fnm, flen, tnm, tlen, cred, l);
exit:
CODADEBUG(CODA_RENAME, myprintf(("in rename result %d\n",error));)
	/* XXX - do we need to call cache_purge on the moved vnode? */
cache_purge(ap->a_fvp);
/* It seems to be incumbent on us to drop locks on all four vnodes */
/* From-vnodes are not locked, only ref'd. To-vnodes are locked. */
vrele(ap->a_fvp);
vrele(odvp);
if (ap->a_tvp) {
if (ap->a_tvp == ndvp) {
vrele(ap->a_tvp);
} else {
vput(ap->a_tvp);
}
}
vput(ndvp);
return(error);
}
int
coda_mkdir(void *v)
{
/* true args */
struct vop_mkdir_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
struct componentname *cnp = ap->a_cnp;
struct vattr *va = ap->a_vap;
vnode_t **vpp = ap->a_vpp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *cp;
CodaFid VFid;
struct vattr ova;
MARK_ENTRY(CODA_MKDIR_STATS);
	/* Check for mkdir of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_MKDIR_STATS);
return(EACCES);
}
if (len+1 > CODA_MAXNAMLEN) {
*vpp = (vnode_t *)0;
MARK_INT_FAIL(CODA_MKDIR_STATS);
return(EACCES);
}
error = venus_mkdir(vtomi(dvp), &dcp->c_fid, nm, len, va, cred, l, &VFid, &ova);
if (!error) {
if (coda_find(&VFid) != NULL)
panic("cnode existed for newly created directory!");
cp = make_coda_node(&VFid, dvp->v_mount, va->va_type);
*vpp = CTOV(cp);
/* enter the new vnode in the Name Cache */
coda_nc_enter(VTOC(dvp), nm, len, cred, VTOC(*vpp));
/* as a side effect, enter "." and ".." for the directory */
coda_nc_enter(VTOC(*vpp), ".", 1, cred, VTOC(*vpp));
coda_nc_enter(VTOC(*vpp), "..", 2, cred, VTOC(dvp));
if (coda_attr_cache) {
VTOC(*vpp)->c_vattr = ova; /* update the attr cache */
VTOC(*vpp)->c_flags |= C_VATTR; /* Valid attributes in cnode */
}
/* Invalidate the parent's attr cache, the modification time has changed */
VTOC(dvp)->c_flags &= ~C_VATTR;
CODADEBUG( CODA_MKDIR, myprintf(("%s: %s result %d\n", __func__,
coda_f2s(&VFid), error)); )
} else {
*vpp = (vnode_t *)0;
CODADEBUG(CODA_MKDIR, myprintf(("%s error %d\n", __func__, error));)
}
return(error);
}
int
coda_rmdir(void *v)
{
/* true args */
struct vop_rmdir_v2_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
vnode_t *vp = ap->a_vp;
struct componentname *cnp = ap->a_cnp;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* true args */
int error;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
struct cnode *cp;
MARK_ENTRY(CODA_RMDIR_STATS);
/* Check for rmdir of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_RMDIR_STATS);
return(ENOENT);
}
/* Can't remove . in self. */
if (dvp == vp) {
#ifdef CODA_VERBOSE
printf("%s: dvp == vp\n", __func__);
#endif
error = EINVAL;
goto exit;
}
/*
* The caller may not have adequate permissions, and the venus
* operation may fail, but it doesn't hurt from a correctness
* viewpoint to invalidate cache entries.
* XXX Why isn't this done after the venus_rmdir call?
*/
/* Look up child in name cache (by name, from parent). */
cp = coda_nc_lookup(dcp, nm, len, cred);
/* If found, remove all children of the child (., ..). */
if (cp) coda_nc_zapParentfid(&(cp->c_fid), NOT_DOWNCALL);
/* Remove child's own entry. */
coda_nc_zapfile(dcp, nm, len);
/* Invalidate parent's attr cache (the modification time has changed). */
dcp->c_flags &= ~C_VATTR;
error = venus_rmdir(vtomi(dvp), &dcp->c_fid, nm, len, cred, l);
CODADEBUG(CODA_RMDIR, myprintf(("in rmdir result %d\n", error)); )
exit:
/* unlock and release child */
if (dvp == vp) {
vrele(vp);
} else {
vput(vp);
}
return(error);
}
int
coda_symlink(void *v)
{
/* true args */
struct vop_symlink_v3_args *ap = v;
vnode_t *dvp = ap->a_dvp;
struct cnode *dcp = VTOC(dvp);
/* a_vpp is used in place below */
struct componentname *cnp = ap->a_cnp;
struct vattr *tva = ap->a_vap;
char *path = ap->a_target;
kauth_cred_t cred = cnp->cn_cred;
struct lwp *l = curlwp;
/* locals */
int error;
u_long saved_cn_flags;
const char *nm = cnp->cn_nameptr;
int len = cnp->cn_namelen;
int plen = strlen(path);
/*
* Here's the strategy for the moment: perform the symlink, then
* do a lookup to grab the resulting vnode. I know this requires
* two communications with Venus for a new symbolic link, but
* that's the way the ball bounces. I don't yet want to change
* the way the Mach symlink works. When Mach support is
* deprecated, we should change symlink so that the common case
* returns the resultant vnode in a vpp argument.
*/
MARK_ENTRY(CODA_SYMLINK_STATS);
/* Check for symlink of control object. */
if (IS_CTL_NAME(dvp, nm, len)) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EACCES;
goto exit;
}
if (plen+1 > CODA_MAXPATHLEN) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EINVAL;
goto exit;
}
if (len+1 > CODA_MAXNAMLEN) {
MARK_INT_FAIL(CODA_SYMLINK_STATS);
error = EINVAL;
goto exit;
}
error = venus_symlink(vtomi(dvp), &dcp->c_fid, path, plen, nm, len, tva, cred, l);
/* Invalidate the parent's attr cache (modification time has changed). */
dcp->c_flags &= ~C_VATTR;
if (!error) {
/*
* VOP_SYMLINK is not defined to pay attention to cnp->cn_flags;
* these are defined only for VOP_LOOKUP. We desire to reuse
 * cnp for a VOP_LOOKUP operation, and must be sure not to pass on
 * stray flags that were handed to us. Such stray flags can occur because
* sys_symlink makes a namei call and then reuses the
* componentname structure.
*/
/*
* XXX Arguably we should create our own componentname structure
* and not reuse the one that was passed in.
*/
saved_cn_flags = cnp->cn_flags;
cnp->cn_flags &= ~(MODMASK | OPMASK);
cnp->cn_flags |= LOOKUP;
error = VOP_LOOKUP(dvp, ap->a_vpp, cnp);
cnp->cn_flags = saved_cn_flags;
}
exit:
CODADEBUG(CODA_SYMLINK, myprintf(("in symlink result %d\n",error)); )
return(error);
}
/*
* Read directory entries.
*/
int
coda_readdir(void *v)
{
/* true args */
struct vop_readdir_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
struct uio *uiop = ap->a_uio;
kauth_cred_t cred = ap->a_cred;
int *eofflag = ap->a_eofflag;
/* upcall decl */
/* locals */
size_t initial_resid = uiop->uio_resid;
int error = 0;
int opened_internally = 0;
int ncookies;
char *buf;
struct vnode *cvp;
struct dirent *dirp;
MARK_ENTRY(CODA_READDIR_STATS);
CODADEBUG(CODA_READDIR, myprintf(("%s: (%p, %lu, %lld)\n", __func__,
uiop->uio_iov->iov_base, (unsigned long) uiop->uio_resid,
(long long) uiop->uio_offset)); )
/* Check for readdir of control object. */
if (IS_CTL_VP(vp)) {
MARK_INT_FAIL(CODA_READDIR_STATS);
return ENOENT;
}
/* If directory is not already open do an "internal open" on it. */
if (cp->c_ovp == NULL) {
opened_internally = 1;
MARK_INT_GEN(CODA_OPEN_STATS);
error = VOP_OPEN(vp, FREAD, cred);
#ifdef CODA_VERBOSE
printf("%s: Internally Opening %p\n", __func__, vp);
#endif
if (error)
return error;
KASSERT(cp->c_ovp != NULL);
}
cvp = cp->c_ovp;
CODADEBUG(CODA_READDIR, myprintf(("%s: fid = %s, refcnt = %d\n",
__func__, coda_f2s(&cp->c_fid), vrefcnt(cvp))); )
if (ap->a_ncookies) {
ncookies = ap->a_uio->uio_resid / _DIRENT_RECLEN(dirp, 1);
*ap->a_ncookies = 0;
*ap->a_cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
}
buf = kmem_alloc(CODA_DIRBLKSIZ, KM_SLEEP);
dirp = kmem_alloc(sizeof(*dirp), KM_SLEEP);
vn_lock(cvp, LK_EXCLUSIVE | LK_RETRY);
while (error == 0) {
size_t resid = 0;
char *dp, *ep;
if (!ALIGNED_POINTER(uiop->uio_offset, uint32_t)) {
error = EINVAL;
break;
}
error = vn_rdwr(UIO_READ, cvp, buf,
CODA_DIRBLKSIZ, uiop->uio_offset,
UIO_SYSSPACE, IO_NODELOCKED, cred, &resid, curlwp);
if (error || resid == CODA_DIRBLKSIZ)
break;
for (dp = buf, ep = dp + CODA_DIRBLKSIZ - resid; dp < ep; ) {
off_t off;
struct venus_dirent *vd = (struct venus_dirent *)dp;
if (!ALIGNED_POINTER(vd, uint32_t) ||
!ALIGNED_POINTER(vd->d_reclen, uint32_t) ||
vd->d_reclen == 0) {
error = EINVAL;
break;
}
if (dp + vd->d_reclen > ep) {
error = ENAMETOOLONG;
break;
}
if (vd->d_namlen == 0) {
uiop->uio_offset += vd->d_reclen;
dp += vd->d_reclen;
continue;
}
dirp->d_fileno = vd->d_fileno;
dirp->d_type = vd->d_type;
dirp->d_namlen = vd->d_namlen;
dirp->d_reclen = _DIRENT_SIZE(dirp);
strlcpy(dirp->d_name, vd->d_name, dirp->d_namlen + 1);
if (uiop->uio_resid < dirp->d_reclen) {
error = ENAMETOOLONG;
break;
}
off = uiop->uio_offset;
error = uiomove(dirp, dirp->d_reclen, uiop);
uiop->uio_offset = off;
if (error)
break;
uiop->uio_offset += vd->d_reclen;
dp += vd->d_reclen;
if (ap->a_ncookies)
(*ap->a_cookies)[(*ap->a_ncookies)++] =
uiop->uio_offset;
}
}
VOP_UNLOCK(cvp);
kmem_free(dirp, sizeof(*dirp));
kmem_free(buf, CODA_DIRBLKSIZ);
if (eofflag && error == 0)
*eofflag = 1;
if (uiop->uio_resid < initial_resid && error == ENAMETOOLONG)
error = 0;
if (ap->a_ncookies && error) {
free(*ap->a_cookies, M_TEMP);
*ap->a_ncookies = 0;
*ap->a_cookies = NULL;
}
if (error)
MARK_INT_FAIL(CODA_READDIR_STATS);
else
MARK_INT_SAT(CODA_READDIR_STATS);
/* Do an "internal close" if necessary. */
if (opened_internally) {
MARK_INT_GEN(CODA_CLOSE_STATS);
(void)VOP_CLOSE(vp, FREAD, cred);
}
return error;
}
/*
* Convert from file system blocks to device blocks
*/
int
coda_bmap(void *v)
{
/* XXX on the global proc */
/* true args */
struct vop_bmap_args *ap = v;
vnode_t *vp __unused = ap->a_vp; /* file's vnode */
daddr_t bn __unused = ap->a_bn; /* fs block number */
vnode_t **vpp = ap->a_vpp; /* RETURN vp of device */
daddr_t *bnp __unused = ap->a_bnp; /* RETURN device block number */
struct lwp *l __unused = curlwp;
/* upcall decl */
/* locals */
*vpp = (vnode_t *)0;
myprintf(("coda_bmap called!\n"));
return(EINVAL);
}
/*
* I don't think the following two things are used anywhere, so I've
* commented them out
*
* struct buf *async_bufhead;
* int async_daemon_count;
*/
int
coda_strategy(void *v)
{
/* true args */
struct vop_strategy_args *ap = v;
struct buf *bp __unused = ap->a_bp;
struct lwp *l __unused = curlwp;
/* upcall decl */
/* locals */
myprintf(("coda_strategy called! "));
return(EINVAL);
}
int
coda_reclaim(void *v)
{
/* true args */
struct vop_reclaim_v2_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
VOP_UNLOCK(vp);
/*
 * Forced unmount/flush will let vnodes with a nonzero use count be destroyed!
*/
ENTRY;
if (IS_UNMOUNTING(cp)) {
#ifdef DEBUG
		if (VTOC(vp)->c_ovp)
			printf("%s: c_ovp not void: vp %p, cp %p\n",
			    __func__, vp, cp);
#endif
} else {
#ifdef OLD_DIAGNOSTIC
if (vrefcnt(vp) != 0)
print("%s: pushing active %p\n", __func__, vp);
if (VTOC(vp)->c_ovp) {
panic("%s: c_ovp not void", __func__);
}
#endif
}
/* If an array has been allocated to hold the symlink, deallocate it */
if ((coda_symlink_cache) && (VALID_SYMLINK(cp))) {
if (cp->c_symlink == NULL)
panic("%s: null symlink pointer in cnode", __func__);
CODA_FREE(cp->c_symlink, cp->c_symlen);
cp->c_flags &= ~C_SYMLINK;
cp->c_symlen = 0;
}
mutex_enter(vp->v_interlock);
mutex_enter(&cp->c_lock);
SET_VTOC(vp) = NULL;
mutex_exit(&cp->c_lock);
mutex_exit(vp->v_interlock);
mutex_destroy(&cp->c_lock);
kmem_free(cp, sizeof(*cp));
return (0);
}
int
coda_lock(void *v)
{
/* true args */
struct vop_lock_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
ENTRY;
if (coda_lockdebug) {
myprintf(("Attempting lock on %s\n",
coda_f2s(&cp->c_fid)));
}
return genfs_lock(v);
}
int
coda_unlock(void *v)
{
/* true args */
struct vop_unlock_args *ap = v;
vnode_t *vp = ap->a_vp;
struct cnode *cp = VTOC(vp);
/* upcall decl */
/* locals */
ENTRY;
if (coda_lockdebug) {
myprintf(("Attempting unlock on %s\n",
coda_f2s(&cp->c_fid)));
}
return genfs_unlock(v);
}
int
coda_islocked(void *v)
{
/* true args */
ENTRY;
return genfs_islocked(v);
}
int
coda_pathconf(void *v)
{
struct vop_pathconf_args *ap = v;
switch (ap->a_name) {
default:
return EINVAL;
}
/* NOTREACHED */
}
/*
* Given a device and inode, obtain a locked vnode. One reference is
* obtained and passed back to the caller.
*/
int
coda_grab_vnode(vnode_t *uvp, dev_t dev, ino_t ino, vnode_t **vpp)
{
int error;
struct mount *mp;
/* Obtain mount point structure from device. */
if (!(mp = devtomp(dev))) {
myprintf(("%s: devtomp(0x%llx) returns NULL\n", __func__,
(unsigned long long)dev));
return(ENXIO);
}
/*
* Obtain vnode from mount point and inode.
*/
error = VFS_VGET(mp, ino, LK_EXCLUSIVE, vpp);
if (error) {
myprintf(("%s: iget/vget(0x%llx, %llu) returns %p, err %d\n", __func__,
(unsigned long long)dev, (unsigned long long)ino, *vpp, error));
return(ENOENT);
}
/* share the underlying vnode lock with the coda vnode */
vshareilock(*vpp, uvp);
KASSERT(VOP_ISLOCKED(*vpp));
return(0);
}
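/*
 * Illustrative caller sketch (compiled out, hypothetical): the vnode
 * returned by coda_grab_vnode() comes back locked and referenced, so
 * a caller that only wants to keep the reference unlocks it at once
 * and is responsible for the eventual vrele(), which is the pattern
 * coda_open() and coda_rdwr() follow above.
 */
#if 0
static int
coda_grab_example(vnode_t *vp, dev_t dev, ino_t ino)
{
	vnode_t *cvp;
	int error;

	error = coda_grab_vnode(vp, dev, ino, &cvp);
	if (error)
		return error;
	VOP_UNLOCK(cvp);
	/* ... use the referenced container vnode ... */
	vrele(cvp);
	return 0;
}
#endif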
static void
coda_print_vattr(struct vattr *attr)
{
const char *typestr;
switch (attr->va_type) {
case VNON:
typestr = "VNON";
break;
case VREG:
typestr = "VREG";
break;
case VDIR:
typestr = "VDIR";
break;
case VBLK:
typestr = "VBLK";
break;
case VCHR:
typestr = "VCHR";
break;
case VLNK:
typestr = "VLNK";
break;
case VSOCK:
typestr = "VSCK";
break;
case VFIFO:
typestr = "VFFO";
break;
case VBAD:
typestr = "VBAD";
break;
default:
typestr = "????";
break;
}
myprintf(("attr: type %s mode %d uid %d gid %d fsid %d rdev %d\n",
typestr, (int)attr->va_mode, (int)attr->va_uid,
(int)attr->va_gid, (int)attr->va_fsid, (int)attr->va_rdev));
myprintf((" fileid %d nlink %d size %d blocksize %d bytes %d\n",
(int)attr->va_fileid, (int)attr->va_nlink,
(int)attr->va_size,
(int)attr->va_blocksize,(int)attr->va_bytes));
myprintf((" gen %ld flags %ld vaflags %d\n",
attr->va_gen, attr->va_flags, attr->va_vaflags));
myprintf((" atime sec %d nsec %d\n",
(int)attr->va_atime.tv_sec, (int)attr->va_atime.tv_nsec));
myprintf((" mtime sec %d nsec %d\n",
(int)attr->va_mtime.tv_sec, (int)attr->va_mtime.tv_nsec));
myprintf((" ctime sec %d nsec %d\n",
(int)attr->va_ctime.tv_sec, (int)attr->va_ctime.tv_nsec));
}
/*
* Return a vnode for the given fid.
* If no cnode exists for this fid create one and put it
* in a table hashed by coda_f2i(). If the cnode for
 * this fid is already in the table, return it (the ref count is
 * incremented by coda_find). The cnode will be flushed from the
* table when coda_inactive calls coda_unsave.
*/
struct cnode *
make_coda_node(CodaFid *fid, struct mount *fvsp, short type)
{
int error __diagused;
struct vnode *vp;
struct cnode *cp;
error = vcache_get(fvsp, fid, sizeof(CodaFid), &vp);
KASSERT(error == 0);
mutex_enter(vp->v_interlock);
cp = VTOC(vp);
KASSERT(cp != NULL);
mutex_enter(&cp->c_lock);
mutex_exit(vp->v_interlock);
if (vp->v_type != type) {
if (vp->v_type == VCHR || vp->v_type == VBLK)
spec_node_destroy(vp);
vp->v_type = type;
if (type == VCHR || type == VBLK)
spec_node_init(vp, NODEV);
uvm_vnp_setsize(vp, 0);
}
mutex_exit(&cp->c_lock);
return cp;
}
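/*
 * Illustrative caller sketch (compiled out, hypothetical): mapping a
 * CodaFid to a usable vnode is make_coda_node() plus CTOV(); the
 * returned vnode carries the reference taken by vcache_get() above.
 */
#if 0
static vnode_t *
coda_fid_example(CodaFid *fid, struct mount *mp, short type)
{
	struct cnode *cp = make_coda_node(fid, mp, type);

	return CTOV(cp);
}
#endif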
/*
* coda_getpages may be called on a vnode which has not been opened,
* e.g. to fault in pages to execute a program. In that case, we must
* open the file to get the container. The vnode may or may not be
* locked, and we must leave it in the same state.
*/
int
coda_getpages(void *v)
{
struct vop_getpages_args /* {
vnode_t *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp, *cvp;
struct cnode *cp = VTOC(vp);
struct lwp *l = curlwp;
kauth_cred_t cred = l->l_cred;
int error, cerror;
int waslocked; /* 1 if vnode lock was held on entry */
int didopen = 0; /* 1 if we opened container file */
krw_t op;
/*
* Handle a case that uvm_fault doesn't quite use yet.
 * See layer_vnops.c for inspiration.
*/
if (ap->a_flags & PGO_LOCKED) {
return EBUSY;
}
KASSERT(rw_lock_held(vp->v_uobj.vmobjlock));
/* Check for control object. */
if (IS_CTL_VP(vp)) {
#ifdef CODA_VERBOSE
printf("%s: control object %p\n", __func__, vp);
#endif
return(EINVAL);
}
/*
* XXX It's really not ok to be releasing the lock we get,
* because we could be overlapping with another call to
* getpages and drop a lock they are relying on. We need to
* figure out whether getpages ever is called holding the
* lock, and if we should serialize getpages calls by some
* mechanism.
*/
/* XXX VOP_ISLOCKED() may not be used for lock decisions. */
op = rw_lock_op(vp->v_uobj.vmobjlock);
waslocked = VOP_ISLOCKED(vp);
/* Get container file if not already present. */
cvp = cp->c_ovp;
if (cvp == NULL) {
/*
* VOP_OPEN requires a locked vnode. We must avoid
* locking the vnode if it is already locked, and
* leave it in the same state on exit.
*/
if (waslocked == 0) {
rw_exit(vp->v_uobj.vmobjlock);
cerror = vn_lock(vp, LK_EXCLUSIVE);
if (cerror) {
#ifdef CODA_VERBOSE
printf("%s: can't lock vnode %p\n",
__func__, vp);
#endif
return cerror;
}
#ifdef CODA_VERBOSE
printf("%s: locked vnode %p\n", __func__, vp);
#endif
}
/*
* Open file (causes upcall to venus).
* XXX Perhaps we should not fully open the file, but
* simply obtain a container file.
*/
/* XXX Is it ok to do this while holding the mutex? */
cerror = VOP_OPEN(vp, FREAD, cred);
if (cerror) {
#ifdef CODA_VERBOSE
printf("%s: cannot open vnode %p => %d\n", __func__,
vp, cerror);
#endif
if (waslocked == 0)
VOP_UNLOCK(vp);
return cerror;
}
#ifdef CODA_VERBOSE
printf("%s: opened vnode %p\n", __func__, vp);
#endif
cvp = cp->c_ovp;
didopen = 1;
if (waslocked == 0)
rw_enter(vp->v_uobj.vmobjlock, op);
}
KASSERT(cvp != NULL);
/* Munge the arg structure to refer to the container vnode. */
KASSERT(cvp->v_uobj.vmobjlock == vp->v_uobj.vmobjlock);
ap->a_vp = cp->c_ovp;
/* Finally, call getpages on it. */
error = VCALL(ap->a_vp, VOFFSET(vop_getpages), ap);
/* If we opened the vnode, we must close it. */
if (didopen) {
/*
* VOP_CLOSE requires a locked vnode, but we are still
* holding the lock (or riding a caller's lock).
*/
cerror = VOP_CLOSE(vp, FREAD, cred);
#ifdef CODA_VERBOSE
if (cerror != 0)
/* XXX How should we handle this? */
printf("%s: closed vnode %p -> %d\n", __func__,
vp, cerror);
#endif
/* If we obtained a lock, drop it. */
if (waslocked == 0)
VOP_UNLOCK(vp);
}
return error;
}
/*
 * The protocol requires the vnode's vm object lock (v_uobj.vmobjlock)
 * to be held for writing by the caller.
*/
int
coda_putpages(void *v)
{
struct vop_putpages_args /* {
vnode_t *a_vp;
voff_t a_offlo;
voff_t a_offhi;
int a_flags;
} */ *ap = v;
vnode_t *vp = ap->a_vp, *cvp;
struct cnode *cp = VTOC(vp);
int error;
KASSERT(rw_write_held(vp->v_uobj.vmobjlock));
/* Check for control object. */
if (IS_CTL_VP(vp)) {
rw_exit(vp->v_uobj.vmobjlock);
#ifdef CODA_VERBOSE
printf("%s: control object %p\n", __func__, vp);
#endif
return 0;
}
/*
* If container object is not present, then there are no pages
* to put; just return without error. This happens all the
* time, apparently during discard of a closed vnode (which
* trivially can't have dirty pages).
*/
cvp = cp->c_ovp;
if (cvp == NULL) {
rw_exit(vp->v_uobj.vmobjlock);
return 0;
}
/* Munge the arg structure to refer to the container vnode. */
KASSERT(cvp->v_uobj.vmobjlock == vp->v_uobj.vmobjlock);
ap->a_vp = cvp;
/* Finally, call putpages on it. */
error = VCALL(ap->a_vp, VOFFSET(vop_putpages), ap);
return error;
}
/* $NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Kernel reader/writer lock implementation, modeled after those
* found in Solaris, a description of which can be found in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*
* The NetBSD implementation differs from that described in the book, in
* that the locks are partially adaptive. Lock waiters spin wait while a
* lock is write held and the holder is still running on a CPU. The method
* of choosing which threads to awaken when a lock is released also differs,
* mainly to take account of the partially adaptive behaviour.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_rwlock.c,v 1.76 2023/10/15 10:28:48 riastradh Exp $");
#include "opt_lockdebug.h"
#define __RWLOCK_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/lock.h>
#include <sys/lockdebug.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
#include <dev/lockstat.h>
#include <machine/rwlock.h>
/*
* LOCKDEBUG
*/
#define RW_DEBUG_P(rw) (((rw)->rw_owner & RW_NODEBUG) == 0)
#define RW_WANTLOCK(rw, op) \
LOCKDEBUG_WANTLOCK(RW_DEBUG_P(rw), (rw), \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
#define RW_LOCKED(rw, op) \
LOCKDEBUG_LOCKED(RW_DEBUG_P(rw), (rw), NULL, \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
#define RW_UNLOCKED(rw, op) \
LOCKDEBUG_UNLOCKED(RW_DEBUG_P(rw), (rw), \
(uintptr_t)__builtin_return_address(0), op == RW_READER);
/*
* DIAGNOSTIC
*/
#if defined(DIAGNOSTIC)
#define RW_ASSERT(rw, cond) \
do { \
if (__predict_false(!(cond))) \
rw_abort(__func__, __LINE__, rw, "assertion failed: " #cond);\
} while (/* CONSTCOND */ 0)
#else
#define RW_ASSERT(rw, cond) /* nothing */
#endif /* DIAGNOSTIC */
/*
* For platforms that do not provide stubs, or for the LOCKDEBUG case.
*/
#ifdef LOCKDEBUG
#undef __HAVE_RW_STUBS
#endif
#ifndef __HAVE_RW_STUBS
__strong_alias(rw_enter,rw_vector_enter);
__strong_alias(rw_exit,rw_vector_exit);
__strong_alias(rw_tryenter,rw_vector_tryenter);
#endif
static void rw_abort(const char *, size_t, krwlock_t *, const char *);
static void rw_dump(const volatile void *, lockop_printer_t);
static lwp_t *rw_owner(wchan_t);
lockops_t rwlock_lockops = {
.lo_name = "Reader / writer lock",
.lo_type = LOCKOPS_SLEEP,
.lo_dump = rw_dump,
};
/*
* Give rwlock holders an extra-high priority boost on-blocking due to
* direct handoff. XXX To be revisited.
*/
syncobj_t rw_syncobj = {
.sobj_name = "rwlock",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KTHREAD,
.sobj_unsleep = turnstile_unsleep,
.sobj_changepri = turnstile_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = rw_owner,
};
/*
* rw_cas:
*
* Do an atomic compare-and-swap on the lock word.
*/
static inline uintptr_t
rw_cas(krwlock_t *rw, uintptr_t o, uintptr_t n)
{
return (uintptr_t)atomic_cas_ptr((volatile void *)&rw->rw_owner,
(void *)o, (void *)n);
}
/*
* rw_swap:
*
* Do an atomic swap of the lock word. This is used only when it's
* known that the lock word is set up such that it can't be changed
* behind us (assert this), so there's no point considering the result.
*/
static inline void
rw_swap(krwlock_t *rw, uintptr_t o, uintptr_t n)
{
n = (uintptr_t)atomic_swap_ptr((volatile void *)&rw->rw_owner,
(void *)n);
	RW_ASSERT(rw, n == o);
	RW_ASSERT(rw, (o & RW_HAS_WAITERS) != 0);
}
/*
* rw_dump:
*
* Dump the contents of a rwlock structure.
*/
static void
rw_dump(const volatile void *cookie, lockop_printer_t pr)
{
const volatile krwlock_t *rw = cookie;
pr("owner/count : %#018lx flags : %#018x\n",
(long)RW_OWNER(rw), (int)RW_FLAGS(rw));
}
/*
* rw_abort:
*
* Dump information about an error and panic the system. This
* generates a lot of machine code in the DIAGNOSTIC case, so
* we ask the compiler to not inline it.
*/
static void __noinline
rw_abort(const char *func, size_t line, krwlock_t *rw, const char *msg)
{
if (__predict_false(panicstr != NULL))
return;
LOCKDEBUG_ABORT(func, line, rw, &rwlock_lockops, msg);
}
/*
* rw_init:
*
* Initialize a rwlock for use.
*/
void
_rw_init(krwlock_t *rw, uintptr_t return_address)
{
#ifdef LOCKDEBUG
/* XXX only because the assembly stubs can't handle RW_NODEBUG */
if (LOCKDEBUG_ALLOC(rw, &rwlock_lockops, return_address))
rw->rw_owner = 0;
else
rw->rw_owner = RW_NODEBUG;
#else
rw->rw_owner = 0;
#endif
}
void
rw_init(krwlock_t *rw)
{
_rw_init(rw, (uintptr_t)__builtin_return_address(0));
}
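/*
 * Illustrative usage sketch (compiled out, hypothetical data): the
 * expected life cycle is rw_init() once, rw_enter()/rw_exit() pairs
 * around accesses to the protected data, and rw_destroy() when the
 * data goes away.
 */
#if 0
static int
rw_usage_example(void)
{
	krwlock_t lk;
	int v;

	rw_init(&lk);

	rw_enter(&lk, RW_WRITER);
	v = 42;			/* exclusive update of protected data */
	rw_exit(&lk);

	rw_enter(&lk, RW_READER);
	/* shared read of protected data */
	rw_exit(&lk);

	rw_destroy(&lk);
	return v;
}
#endif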
/*
* rw_destroy:
*
* Tear down a rwlock.
*/
void
rw_destroy(krwlock_t *rw)
{

	RW_ASSERT(rw, (rw->rw_owner & ~RW_NODEBUG) == 0);
	LOCKDEBUG_FREE((rw->rw_owner & RW_NODEBUG) == 0, rw);
}
/*
* rw_oncpu:
*
* Return true if an rwlock owner is running on a CPU in the system.
* If the target is waiting on the kernel big lock, then we must
* release it. This is necessary to avoid deadlock.
*/
static bool
rw_oncpu(uintptr_t owner)
{
#ifdef MULTIPROCESSOR
struct cpu_info *ci;
lwp_t *l;
	KASSERT(kpreempt_disabled());

	if ((owner & (RW_WRITE_LOCKED|RW_HAS_WAITERS)) != RW_WRITE_LOCKED) {
return false;
}
/*
* See lwp_dtor() why dereference of the LWP pointer is safe.
* We must have kernel preemption disabled for that.
*/
l = (lwp_t *)(owner & RW_THREAD);
ci = l->l_cpu;
if (ci && ci->ci_curlwp == l) {
/* Target is running; do we need to block? */
return (ci->ci_biglock_wanted != l);
}
#endif
/* Not running. It may be safe to block now. */
return false;
}
/*
* rw_vector_enter:
*
* Acquire a rwlock.
*/
void
rw_vector_enter(krwlock_t *rw, const krw_t op)
{
uintptr_t owner, incr, need_wait, set_wait, curthread, next;
turnstile_t *ts;
int queue;
lwp_t *l;
LOCKSTAT_TIMER(slptime);
LOCKSTAT_TIMER(slpcnt);
LOCKSTAT_TIMER(spintime);
LOCKSTAT_COUNTER(spincnt);
LOCKSTAT_FLAG(lsflag);
l = curlwp;
curthread = (uintptr_t)l;
	RW_ASSERT(rw, !cpu_intr_p());
	RW_ASSERT(rw, curthread != 0);
	RW_WANTLOCK(rw, op);

	if (__predict_true(panicstr == NULL)) {
		KDASSERT(pserialize_not_in_read_section());
LOCKDEBUG_BARRIER(&kernel_lock, 1);
}
/*
* We play a slight trick here. If we're a reader, we want
* increment the read count. If we're a writer, we want to
* set the owner field and the WRITE_LOCKED bit.
*
* In the latter case, we expect those bits to be zero,
* therefore we can use an add operation to set them, which
* means an add operation for both cases.
*/
if (__predict_true(op == RW_READER)) {
incr = RW_READ_INCR;
set_wait = RW_HAS_WAITERS;
need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
queue = TS_READER_Q;
} else {
RW_ASSERT(rw, op == RW_WRITER);
incr = curthread | RW_WRITE_LOCKED;
set_wait = RW_HAS_WAITERS | RW_WRITE_WANTED;
need_wait = RW_WRITE_LOCKED | RW_THREAD;
queue = TS_WRITER_Q;
}
LOCKSTAT_ENTER(lsflag);
KPREEMPT_DISABLE(curlwp);
for (owner = rw->rw_owner;;) {
/*
* Read the lock owner field. If the need-to-wait
* indicator is clear, then try to acquire the lock.
*/
if ((owner & need_wait) == 0) {
next = rw_cas(rw, owner, (owner + incr) &
~RW_WRITE_WANTED);
if (__predict_true(next == owner)) {
/* Got it! */
membar_acquire();
break;
}
/*
* Didn't get it -- spin around again (we'll
* probably sleep on the next iteration).
*/
owner = next;
continue;
}
		if (__predict_false(RW_OWNER(rw) == curthread)) {
			rw_abort(__func__, __LINE__, rw,
"locking against myself");
}
/*
* If the lock owner is running on another CPU, and
* there are no existing waiters, then spin.
*/
		if (rw_oncpu(owner)) {
			LOCKSTAT_START_TIMER(lsflag, spintime);
			u_int count = SPINLOCK_BACKOFF_MIN;
			do {
				KPREEMPT_ENABLE(curlwp);
				SPINLOCK_BACKOFF(count);
				KPREEMPT_DISABLE(curlwp);
				owner = rw->rw_owner;
			} while (rw_oncpu(owner));
			LOCKSTAT_STOP_TIMER(lsflag, spintime);
LOCKSTAT_COUNT(spincnt, 1);
if ((owner & need_wait) == 0)
continue;
}
/*
* Grab the turnstile chain lock. Once we have that, we
* can adjust the waiter bits and sleep queue.
*/
ts = turnstile_lookup(rw);
/*
* Mark the rwlock as having waiters. If the set fails,
* then we may not need to sleep and should spin again.
* Reload rw_owner because turnstile_lookup() may have
* spun on the turnstile chain lock.
*/
owner = rw->rw_owner;
if ((owner & need_wait) == 0 || rw_oncpu(owner)) {
turnstile_exit(rw);
continue;
}
next = rw_cas(rw, owner, owner | set_wait);
/* XXX membar? */
if (__predict_false(next != owner)) {
turnstile_exit(rw);
owner = next;
continue;
}
LOCKSTAT_START_TIMER(lsflag, slptime);
turnstile_block(ts, queue, rw, &rw_syncobj);
LOCKSTAT_STOP_TIMER(lsflag, slptime);
LOCKSTAT_COUNT(slpcnt, 1);
/*
* No need for a memory barrier because of context switch.
* If not handed the lock, then spin again.
*/
if (op == RW_READER || (rw->rw_owner & RW_THREAD) == curthread)
break;
owner = rw->rw_owner;
}
	KPREEMPT_ENABLE(curlwp);

	LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK |
(op == RW_WRITER ? LB_SLEEP1 : LB_SLEEP2), slpcnt, slptime,
(l->l_rwcallsite != 0 ? l->l_rwcallsite :
(uintptr_t)__builtin_return_address(0)));
LOCKSTAT_EVENT_RA(lsflag, rw, LB_RWLOCK | LB_SPIN, spincnt, spintime,
(l->l_rwcallsite != 0 ? l->l_rwcallsite :
(uintptr_t)__builtin_return_address(0)));
LOCKSTAT_EXIT(lsflag);
RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
(op == RW_READER && RW_COUNT(rw) != 0));
RW_LOCKED(rw, op);
}
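/*
 * Illustrative sketch (compiled out, hypothetical helper): the
 * "single add" trick described in rw_vector_enter() above.  A reader
 * contributes RW_READ_INCR to the count field, a writer contributes
 * its own LWP pointer plus RW_WRITE_LOCKED; both are plain additions
 * into bits known to be zero at acquisition time, so one code path
 * serves both cases.
 */
#if 0
static uintptr_t
rw_acquire_incr_example(const krw_t op, uintptr_t curthread)
{

	return op == RW_READER ? RW_READ_INCR :
	    (curthread | RW_WRITE_LOCKED);
}
#endif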
/*
* rw_vector_exit:
*
* Release a rwlock.
*/
void
rw_vector_exit(krwlock_t *rw)
{
uintptr_t curthread, owner, decr, newown, next;
turnstile_t *ts;
int rcnt, wcnt;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
/*
* Again, we use a trick. Since we used an add operation to
* set the required lock bits, we can use a subtract to clear
* them, which makes the read-release and write-release path
* the same.
*/
owner = rw->rw_owner;
if (__predict_false((owner & RW_WRITE_LOCKED) != 0)) {
		RW_UNLOCKED(rw, RW_WRITER);
		RW_ASSERT(rw, RW_OWNER(rw) == curthread);
decr = curthread | RW_WRITE_LOCKED;
} else {
		RW_UNLOCKED(rw, RW_READER);
		RW_ASSERT(rw, RW_COUNT(rw) != 0);
decr = RW_READ_INCR;
}
/*
* Compute what we expect the new value of the lock to be. Only
* proceed to do direct handoff if there are waiters, and if the
* lock would become unowned.
*/
membar_release();
for (;;) {
newown = (owner - decr);
if ((newown & (RW_THREAD | RW_HAS_WAITERS)) == RW_HAS_WAITERS)
break;
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner))
return;
owner = next;
}
/*
* Grab the turnstile chain lock. This gets the interlock
* on the sleep queue. Once we have that, we can adjust the
* waiter bits.
*/
ts = turnstile_lookup(rw);
owner = rw->rw_owner;
	RW_ASSERT(rw, ts != NULL);
	RW_ASSERT(rw, (owner & RW_HAS_WAITERS) != 0);
wcnt = TS_WAITERS(ts, TS_WRITER_Q);
rcnt = TS_WAITERS(ts, TS_READER_Q);
/*
* Give the lock away.
*
* If we are releasing a write lock, then prefer to wake all
* outstanding readers. Otherwise, wake one writer if there
* are outstanding readers, or all writers if there are no
* pending readers. If waking one specific writer, the writer
* is handed the lock here. If waking multiple writers, we
* set WRITE_WANTED to block out new readers, and let them
* do the work of acquiring the lock in rw_vector_enter().
*/
if (rcnt == 0 || decr == RW_READ_INCR) {
		RW_ASSERT(rw, wcnt != 0);
		RW_ASSERT(rw, (owner & RW_WRITE_WANTED) != 0);
if (rcnt != 0) {
/* Give the lock to the longest waiting writer. */
l = TS_FIRST(ts, TS_WRITER_Q);
newown = (uintptr_t)l | (owner & RW_NODEBUG);
newown |= RW_WRITE_LOCKED | RW_HAS_WAITERS;
if (wcnt > 1)
newown |= RW_WRITE_WANTED;
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_WRITER_Q, 1, l);
} else {
/* Wake all writers and let them fight it out. */
newown = owner & RW_NODEBUG;
newown |= RW_WRITE_WANTED;
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_WRITER_Q, wcnt, NULL);
}
} else {
RW_ASSERT(rw, rcnt != 0);
/*
* Give the lock to all blocked readers. If there
* is a writer waiting, new readers that arrive
* after the release will be blocked out.
*/
newown = owner & RW_NODEBUG;
newown += rcnt << RW_READ_COUNT_SHIFT;
if (wcnt != 0)
newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
/* Wake up all sleeping readers. */
rw_swap(rw, owner, newown);
turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
}
}
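/*
 * Illustrative note (not part of the code above): the add/subtract
 * symmetry that rw_vector_exit() relies on.  Acquisition adds "incr"
 * to the owner word and release subtracts a matching "decr", so the
 * read-release and write-release paths share one code path:
 *
 *	reader:	incr = decr = RW_READ_INCR;
 *	writer:	incr = decr = curthread | RW_WRITE_LOCKED;
 *
 * Subtracting the same quantity that acquisition added restores the
 * owner word (apart from the waiter bits), which is what lets a
 * single subtraction release either kind of hold.
 */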
/*
* rw_vector_tryenter:
*
* Try to acquire a rwlock.
*/
int
rw_vector_tryenter(krwlock_t *rw, const krw_t op)
{
uintptr_t curthread, owner, incr, need_wait, next;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
if (op == RW_READER) {
incr = RW_READ_INCR;
need_wait = RW_WRITE_LOCKED | RW_WRITE_WANTED;
} else {
RW_ASSERT(rw, op == RW_WRITER);
incr = curthread | RW_WRITE_LOCKED;
need_wait = RW_WRITE_LOCKED | RW_THREAD;
}
for (owner = rw->rw_owner;; owner = next) {
if (__predict_false((owner & need_wait) != 0))
return 0;
next = rw_cas(rw, owner, owner + incr);
if (__predict_true(next == owner)) {
/* Got it! */
break;
}
}
RW_WANTLOCK(rw, op);
RW_LOCKED(rw, op);
RW_ASSERT(rw, (op != RW_READER && RW_OWNER(rw) == curthread) ||
(op == RW_READER && RW_COUNT(rw) != 0));
membar_acquire();
return 1;
}
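/*
 * Illustrative usage sketch (hypothetical caller, not part of this
 * file): the non-blocking acquire pattern served by rw_tryenter(9),
 * for which this function provides the generic implementation.  The
 * lock name "sc_lock" and the error handling are assumptions.
 *
 *	if (!rw_tryenter(&sc->sc_lock, RW_WRITER))
 *		return EBUSY;
 *	... modify the protected structure ...
 *	rw_exit(&sc->sc_lock);
 */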
/*
* rw_downgrade:
*
* Downgrade a write lock to a read lock.
*/
void
rw_downgrade(krwlock_t *rw)
{
uintptr_t owner, newown, next, curthread __diagused;
turnstile_t *ts;
int rcnt, wcnt;
lwp_t *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) != 0);
RW_ASSERT(rw, RW_OWNER(rw) == curthread);
RW_UNLOCKED(rw, RW_WRITER);
membar_release();
for (owner = rw->rw_owner;; owner = next) {
/*
* If there are no waiters we can do this the easy way. Try
* swapping us down to one read hold. If it fails, the lock
* condition has changed and we most likely now have
* waiters.
*/
if ((owner & RW_HAS_WAITERS) == 0) {
newown = (owner & RW_NODEBUG);
next = rw_cas(rw, owner, newown + RW_READ_INCR);
if (__predict_true(next == owner)) {
RW_LOCKED(rw, RW_READER);
RW_ASSERT(rw,
(rw->rw_owner & RW_WRITE_LOCKED) == 0);
RW_ASSERT(rw, RW_COUNT(rw) != 0);
return;
}
continue;
}
/*
* Grab the turnstile chain lock. This gets the interlock
* on the sleep queue. Once we have that, we can adjust the
* waiter bits.
*/
ts = turnstile_lookup(rw);
RW_ASSERT(rw, ts != NULL);
rcnt = TS_WAITERS(ts, TS_READER_Q);
wcnt = TS_WAITERS(ts, TS_WRITER_Q);
if (rcnt == 0) {
/*
* If there are no readers, just preserve the
* waiters bits, swap us down to one read hold and
* return.
*/
RW_ASSERT(rw, wcnt != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_WANTED) != 0);
RW_ASSERT(rw, (rw->rw_owner & RW_HAS_WAITERS) != 0);
newown = owner & RW_NODEBUG;
newown |= RW_READ_INCR | RW_HAS_WAITERS |
RW_WRITE_WANTED;
next = rw_cas(rw, owner, newown);
turnstile_exit(rw);
if (__predict_true(next == owner))
break;
} else {
/*
* Give the lock to all blocked readers. We may
* retain one read hold if downgrading. If there is
* a writer waiting, new readers will be blocked
* out.
*/
newown = owner & RW_NODEBUG;
newown += (rcnt << RW_READ_COUNT_SHIFT) + RW_READ_INCR;
if (wcnt != 0)
newown |= RW_HAS_WAITERS | RW_WRITE_WANTED;
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner)) {
/* Wake up all sleeping readers. */
turnstile_wakeup(ts, TS_READER_Q, rcnt, NULL);
break;
}
turnstile_exit(rw);
}
}
RW_WANTLOCK(rw, RW_READER);
RW_LOCKED(rw, RW_READER);
RW_ASSERT(rw, (rw->rw_owner & RW_WRITE_LOCKED) == 0);
RW_ASSERT(rw, RW_COUNT(rw) != 0);
}
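/*
 * Illustrative usage sketch (hypothetical caller): acquire as a writer
 * to install state, then downgrade so other readers may proceed while
 * this thread keeps a read hold.  "sc_lock" is an assumption.
 *
 *	rw_enter(&sc->sc_lock, RW_WRITER);
 *	... install or update state ...
 *	rw_downgrade(&sc->sc_lock);
 *	... continue with read-only access ...
 *	rw_exit(&sc->sc_lock);
 */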
/*
* rw_tryupgrade:
*
* Try to upgrade a read lock to a write lock. We must be the only
* reader.
*/
int
rw_tryupgrade(krwlock_t *rw)
{
uintptr_t owner, curthread, newown, next;
struct lwp *l;
l = curlwp;
curthread = (uintptr_t)l;
RW_ASSERT(rw, curthread != 0);
RW_ASSERT(rw, rw_read_held(rw));
for (owner = RW_READ_INCR;; owner = next) {
newown = curthread | RW_WRITE_LOCKED | (owner & ~RW_THREAD);
next = rw_cas(rw, owner, newown);
if (__predict_true(next == owner)) {
membar_acquire();
break;
}
RW_ASSERT(rw, (next & RW_WRITE_LOCKED) == 0);
if (__predict_false((next & RW_THREAD) != RW_READ_INCR)) {
RW_ASSERT(rw, (next & RW_THREAD) != 0);
return 0;
}
}
RW_UNLOCKED(rw, RW_READER);
RW_WANTLOCK(rw, RW_WRITER);
RW_LOCKED(rw, RW_WRITER);
RW_ASSERT(rw, rw->rw_owner & RW_WRITE_LOCKED);
RW_ASSERT(rw, RW_OWNER(rw) == curthread);
return 1;
}
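/*
 * Illustrative usage sketch (hypothetical caller): the upgrade fails
 * whenever another reader or a writer is present, so callers must be
 * prepared to drop the read hold and reacquire as a writer, then
 * re-validate anything observed under the read hold.
 *
 *	rw_enter(&sc->sc_lock, RW_READER);
 *	if (needs_update && !rw_tryupgrade(&sc->sc_lock)) {
 *		rw_exit(&sc->sc_lock);
 *		rw_enter(&sc->sc_lock, RW_WRITER);
 *		... re-check state after reacquiring ...
 *	}
 *	...
 *	rw_exit(&sc->sc_lock);
 */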
/*
* rw_read_held:
*
* Returns true if the rwlock is held for reading. Must only be
* used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_read_held(krwlock_t *rw)
{
uintptr_t owner;
if (rw == NULL)
return 0;
owner = rw->rw_owner;
return (owner & RW_WRITE_LOCKED) == 0 && (owner & RW_THREAD) != 0;
}
/*
* rw_write_held:
*
* Returns true if the rwlock is held for writing. Must only be
* used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_write_held(krwlock_t *rw)
{
if (rw == NULL)
return 0;
return (rw->rw_owner & (RW_WRITE_LOCKED | RW_THREAD)) ==
(RW_WRITE_LOCKED | (uintptr_t)curlwp);
}
/*
* rw_lock_held:
*
* Returns true if the rwlock is held for reading or writing. Must
* only be used for diagnostic assertions, and never be used to make
* decisions about how to use a rwlock.
*/
int
rw_lock_held(krwlock_t *rw)
{
if (rw == NULL)
return 0;
return (rw->rw_owner & RW_THREAD) != 0;
}
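/*
 * Illustrative sketch: these predicates are intended for assertions
 * only, e.g.
 *
 *	KASSERT(rw_write_held(&sc->sc_lock));
 *
 * and never for control flow such as "if (!rw_write_held(...)) take
 * the lock", since the answer may already be stale when acted upon.
 * "sc_lock" is a hypothetical name.
 */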
/*
* rw_lock_op:
*
* For a rwlock that is known to be held by the caller, return
* RW_READER or RW_WRITER to describe the hold type.
*/
krw_t
rw_lock_op(krwlock_t *rw)
{
RW_ASSERT(rw, rw_lock_held(rw));
return (rw->rw_owner & RW_WRITE_LOCKED) != 0 ? RW_WRITER : RW_READER;
}
/*
* rw_owner:
*
* Return the current owner of an RW lock, but only if it is write
* held. Used for priority inheritance.
*/
static lwp_t *
rw_owner(wchan_t obj)
{
krwlock_t *rw = (void *)(uintptr_t)obj; /* discard qualifiers */
uintptr_t owner = rw->rw_owner;
if ((owner & RW_WRITE_LOCKED) == 0)
return NULL;
return (void *)(owner & RW_THREAD);
}
/* $NetBSD: if_stats.c,v 1.4 2021/06/29 21:19:58 riastradh Exp $ */
/*-
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_stats.c,v 1.4 2021/06/29 21:19:58 riastradh Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/xcall.h>
#include <net/if.h>
#define IF_STATS_SIZE (sizeof(uint64_t) * IF_NSTATS)
/*
* if_stats_init --
* Initialize statistics storage for a network interface.
*/
void
if_stats_init(ifnet_t * const ifp)
{
ifp->if_stats = percpu_alloc(IF_STATS_SIZE);
}
/*
* if_stats_fini --
* Tear down statistics storage for a network interface.
*/
void
if_stats_fini(ifnet_t * const ifp)
{
percpu_t *pc = ifp->if_stats;
ifp->if_stats = NULL;
if (pc) {
percpu_free(pc, IF_STATS_SIZE);
}
}
struct if_stats_to_if_data_ctx {
struct if_data * const ifi;
const bool zero_stats;
};
static void
if_stats_to_if_data_cb(void *v1, void *v2, struct cpu_info *ci)
{
const uint64_t * const local_counters = v1;
struct if_stats_to_if_data_ctx *ctx = v2;
int s = splnet();
if (ctx->ifi) {
ctx->ifi->ifi_ipackets += local_counters[if_ipackets];
ctx->ifi->ifi_ierrors += local_counters[if_ierrors];
ctx->ifi->ifi_opackets += local_counters[if_opackets];
ctx->ifi->ifi_oerrors += local_counters[if_oerrors];
ctx->ifi->ifi_collisions += local_counters[if_collisions];
ctx->ifi->ifi_ibytes += local_counters[if_ibytes];
ctx->ifi->ifi_obytes += local_counters[if_obytes];
ctx->ifi->ifi_imcasts += local_counters[if_imcasts];
ctx->ifi->ifi_omcasts += local_counters[if_omcasts];
ctx->ifi->ifi_iqdrops += local_counters[if_iqdrops];
ctx->ifi->ifi_noproto += local_counters[if_noproto];
}
if (ctx->zero_stats) {
memset(v1, 0, IF_STATS_SIZE);
}
splx(s);
}
/*
* if_stats_to_if_data --
* Collect the interface statistics and place them into the
* legacy if_data structure for reporting to user space.
* Optionally zeros the stats after collection.
*/
void
if_stats_to_if_data(ifnet_t * const ifp, struct if_data * const ifi,
const bool zero_stats)
{
struct if_stats_to_if_data_ctx ctx = {
.ifi = ifi,
.zero_stats = zero_stats,
};
memset(ifi, 0, sizeof(*ifi));
percpu_foreach_xcall(ifp->if_stats, XC_HIGHPRI_IPL(IPL_SOFTNET),
if_stats_to_if_data_cb, &ctx);
}
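/*
 * Illustrative usage sketch (hypothetical driver code, not part of this
 * file): interfaces bump the per-CPU counters with the if_stat*()
 * helpers, and the aggregate view is produced on demand by
 * if_stats_to_if_data(), e.g. when user space asks for interface
 * statistics.
 *
 *	if_statinc(ifp, if_ipackets);
 *	if_statadd(ifp, if_ibytes, m->m_pkthdr.len);
 *	...
 *	struct if_data ifi;
 *	if_stats_to_if_data(ifp, &ifi, false);
 */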
/* $NetBSD: uvm_aobj.c,v 1.157 2023/02/24 11:03:13 riastradh Exp $ */
/*
* Copyright (c) 1998 Chuck Silvers, Charles D. Cranor and
* Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* from: Id: uvm_aobj.c,v 1.1.2.5 1998/02/06 05:14:38 chs Exp
*/
/*
* uvm_aobj.c: anonymous memory uvm_object pager
*
* author: Chuck Silvers <chuq@chuq.com>
* started: Jan-1998
*
* - design mostly from Chuck Cranor
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_aobj.c,v 1.157 2023/02/24 11:03:13 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_uvmhist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_page_array.h>
/*
* An anonymous UVM object (aobj) manages anonymous memory. In addition to
* keeping the list of resident pages, it may also keep a list of allocated
* swap blocks. Depending on the size of the object, this list is either
* stored in an array (small objects) or in a hash table (large objects).
*
* Lock order
*
* uao_list_lock ->
* uvm_object::vmobjlock
*/
/*
* Note: for hash tables, we break the address space of the aobj into blocks
* of UAO_SWHASH_CLUSTER_SIZE pages, which shall be a power of two.
*/
#define UAO_SWHASH_CLUSTER_SHIFT 4
#define UAO_SWHASH_CLUSTER_SIZE (1 << UAO_SWHASH_CLUSTER_SHIFT)
/* Get the "tag" for this page index. */
#define UAO_SWHASH_ELT_TAG(idx) ((idx) >> UAO_SWHASH_CLUSTER_SHIFT)
#define UAO_SWHASH_ELT_PAGESLOT_IDX(idx) \
((idx) & (UAO_SWHASH_CLUSTER_SIZE - 1))
/* Given an ELT and a page index, find the swap slot. */
#define UAO_SWHASH_ELT_PAGESLOT(elt, idx) \
((elt)->slots[UAO_SWHASH_ELT_PAGESLOT_IDX(idx)])
/* Given an ELT, return its pageidx base. */
#define UAO_SWHASH_ELT_PAGEIDX_BASE(elt) \
((elt)->tag << UAO_SWHASH_CLUSTER_SHIFT)
/* The hash function. */
#define UAO_SWHASH_HASH(aobj, idx) \
(&(aobj)->u_swhash[(((idx) >> UAO_SWHASH_CLUSTER_SHIFT) \
& (aobj)->u_swhashmask)])
/*
* The threshold which determines whether we will use an array or a
* hash table to store the list of allocated swap blocks.
*/
#define UAO_SWHASH_THRESHOLD (UAO_SWHASH_CLUSTER_SIZE * 4)
#define UAO_USES_SWHASH(aobj) \
((aobj)->u_pages > UAO_SWHASH_THRESHOLD)
/* The number of buckets in a hash, with an upper bound. */
#define UAO_SWHASH_MAXBUCKETS 256
#define UAO_SWHASH_BUCKETS(aobj) \
(MIN((aobj)->u_pages >> UAO_SWHASH_CLUSTER_SHIFT, UAO_SWHASH_MAXBUCKETS))
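/*
 * Worked example (illustrative only): with UAO_SWHASH_CLUSTER_SHIFT of 4,
 * each hash element covers a cluster of 16 page indexes.  For page index
 * 0x23 (35):
 *
 *	UAO_SWHASH_ELT_TAG(0x23)          == 0x2	(cluster number)
 *	UAO_SWHASH_ELT_PAGESLOT_IDX(0x23) == 0x3	(slot within cluster)
 *
 * so the swap slot lives in slots[3] of the element tagged 2, found in
 * the bucket selected by UAO_SWHASH_HASH(aobj, 0x23).
 */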
/*
* uao_swhash_elt: when a hash table is being used, this structure defines
* the format of an entry in the bucket list.
*/
struct uao_swhash_elt {
LIST_ENTRY(uao_swhash_elt) list; /* the hash list */
voff_t tag; /* our 'tag' */
int count; /* our number of active slots */
int slots[UAO_SWHASH_CLUSTER_SIZE]; /* the slots */
};
/*
* uao_swhash: the swap hash table structure
*/
LIST_HEAD(uao_swhash, uao_swhash_elt);
/*
* uao_swhash_elt_pool: pool of uao_swhash_elt structures.
* Note: pages for this pool must not come from a pageable kernel map.
*/
static struct pool uao_swhash_elt_pool __cacheline_aligned;
/*
* uvm_aobj: the actual anon-backed uvm_object
*
* => the uvm_object is at the top of the structure, this allows
* (struct uvm_aobj *) == (struct uvm_object *)
* => only one of u_swslots and u_swhash is used in any given aobj
*/
struct uvm_aobj {
struct uvm_object u_obj; /* has: lock, pgops, #pages, #refs */
pgoff_t u_pages; /* number of pages in entire object */
int u_flags; /* the flags (see uvm_aobj.h) */
int *u_swslots; /* array of offset->swapslot mappings */
/*
* hashtable of offset->swapslot mappings
* (u_swhash is an array of bucket heads)
*/
struct uao_swhash *u_swhash;
u_long u_swhashmask; /* mask for hashtable */
LIST_ENTRY(uvm_aobj) u_list; /* global list of aobjs */
int u_freelist; /* freelist to allocate pages from */
};
static void uao_free(struct uvm_aobj *);
static int uao_get(struct uvm_object *, voff_t, struct vm_page **,
int *, int, vm_prot_t, int, int);
static int uao_put(struct uvm_object *, voff_t, voff_t, int);
#if defined(VMSWAP)
static struct uao_swhash_elt *uao_find_swhash_elt
(struct uvm_aobj *, int, bool);
static bool uao_pagein(struct uvm_aobj *, int, int);
static bool uao_pagein_page(struct uvm_aobj *, int);
#endif /* defined(VMSWAP) */
static struct vm_page *uao_pagealloc(struct uvm_object *, voff_t, int);
/*
* aobj_pager
*
* note that some functions (e.g. put) are handled elsewhere
*/
const struct uvm_pagerops aobj_pager = {
.pgo_reference = uao_reference,
.pgo_detach = uao_detach,
.pgo_get = uao_get,
.pgo_put = uao_put,
};
/*
* uao_list: global list of active aobjs, locked by uao_list_lock
*/
static LIST_HEAD(aobjlist, uvm_aobj) uao_list __cacheline_aligned;
static kmutex_t uao_list_lock __cacheline_aligned;
/*
* hash table/array related functions
*/
#if defined(VMSWAP)
/*
* uao_find_swhash_elt: find (or create) a hash table entry for a page
* offset.
*
* => the object should be locked by the caller
*/
static struct uao_swhash_elt *
uao_find_swhash_elt(struct uvm_aobj *aobj, int pageidx, bool create)
{
struct uao_swhash *swhash;
struct uao_swhash_elt *elt;
voff_t page_tag;
swhash = UAO_SWHASH_HASH(aobj, pageidx);
page_tag = UAO_SWHASH_ELT_TAG(pageidx);
/*
* now search the bucket for the requested tag
*/
LIST_FOREACH(elt, swhash, list) {
if (elt->tag == page_tag) {
return elt;
}
}
if (!create) {
return NULL;
}
/*
* allocate a new entry for the bucket and init/insert it in
*/
elt = pool_get(&uao_swhash_elt_pool, PR_NOWAIT);
if (elt == NULL) {
return NULL;
}
LIST_INSERT_HEAD(swhash, elt, list);
elt->tag = page_tag;
elt->count = 0;
memset(elt->slots, 0, sizeof(elt->slots));
return elt;
}
/*
* uao_find_swslot: find the swap slot number for an aobj/pageidx
*
* => object must be locked by caller
*/
int
uao_find_swslot(struct uvm_object *uobj, int pageidx)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash_elt *elt;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/*
* if noswap flag is set, then we never return a slot
*/
if (aobj->u_flags & UAO_FLAG_NOSWAP)
return 0;
/*
* if hashing, look in hash table.
*/
if (UAO_USES_SWHASH(aobj)) {
elt = uao_find_swhash_elt(aobj, pageidx, false);
return elt ? UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) : 0;
}
/*
* otherwise, look in the array
*/
return aobj->u_swslots[pageidx];
}
/*
* uao_set_swslot: set the swap slot for a page in an aobj.
*
* => setting a slot to zero frees the slot
* => object must be locked by caller
* => we return the old slot number, or -1 if we failed to allocate
* memory to record the new slot number
*/
int
uao_set_swslot(struct uvm_object *uobj, int pageidx, int slot)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uao_swhash_elt *elt;
int oldslot;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "aobj %#jx pageidx %jd slot %jd",
(uintptr_t)aobj, pageidx, slot, 0);
KASSERT(rw_write_held(uobj->vmobjlock) || uobj->uo_refs == 0);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
/*
* if noswap flag is set, then we can't set a non-zero slot.
*/
if (aobj->u_flags & UAO_FLAG_NOSWAP) {
KASSERTMSG(slot == 0, "uao_set_swslot: no swap object");
return 0;
}
/*
* are we using a hash table? if so, add it in the hash.
*/
if (UAO_USES_SWHASH(aobj)) {
/*
* Avoid allocating an entry just to free it again if
* the page had no swap slot in the first place, and
* we are freeing.
*/
elt = uao_find_swhash_elt(aobj, pageidx, slot != 0);
if (elt == NULL) {
return slot ? -1 : 0;
}
oldslot = UAO_SWHASH_ELT_PAGESLOT(elt, pageidx);
UAO_SWHASH_ELT_PAGESLOT(elt, pageidx) = slot;
/*
* now adjust the elt's reference counter and free it if we've
* dropped it to zero.
*/
if (slot) {
if (oldslot == 0)
elt->count++;
} else {
if (oldslot)
elt->count--;
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
} else {
/* we are using an array */
oldslot = aobj->u_swslots[pageidx];
aobj->u_swslots[pageidx] = slot;
}
return oldslot;
}
#endif /* defined(VMSWAP) */
/*
* end of hash/array functions
*/
/*
* uao_free: free all resources held by an aobj, and then free the aobj
*
* => the aobj should be dead
*/
static void
uao_free(struct uvm_aobj *aobj)
{
struct uvm_object *uobj = &aobj->u_obj;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
uao_dropswap_range(uobj, 0, 0);
rw_exit(uobj->vmobjlock);
#if defined(VMSWAP)
if (UAO_USES_SWHASH(aobj)) {
/*
* free the hash table itself.
*/
hashdone(aobj->u_swhash, HASH_LIST, aobj->u_swhashmask);
} else {
/*
* free the array itself.
*/
kmem_free(aobj->u_swslots, aobj->u_pages * sizeof(int));
}
#endif /* defined(VMSWAP) */
/*
* finally free the aobj itself
*/
uvm_obj_destroy(uobj, true);
kmem_free(aobj, sizeof(struct uvm_aobj));
}
/*
* pager functions
*/
/*
* uao_create: create an aobj of the given size and return its uvm_object.
*
* => for normal use, flags are always zero
* => for the kernel object, the flags are:
* UAO_FLAG_KERNOBJ - allocate the kernel object (can only happen once)
* UAO_FLAG_KERNSWAP - enable swapping of kernel object (" ")
*/
struct uvm_object *
uao_create(voff_t size, int flags)
{
static struct uvm_aobj kernel_object_store;
static krwlock_t bootstrap_kernel_object_lock;
static int kobj_alloced __diagused = 0;
pgoff_t pages = round_page((uint64_t)size) >> PAGE_SHIFT;
struct uvm_aobj *aobj;
int refs;
/*
* Allocate a new aobj, unless kernel object is requested.
*/
if (flags & UAO_FLAG_KERNOBJ) {
KASSERT(!kobj_alloced);
aobj = &kernel_object_store;
aobj->u_pages = pages;
aobj->u_flags = UAO_FLAG_NOSWAP;
refs = UVM_OBJ_KERN;
kobj_alloced = UAO_FLAG_KERNOBJ;
} else if (flags & UAO_FLAG_KERNSWAP) {
KASSERT(kobj_alloced == UAO_FLAG_KERNOBJ);
aobj = &kernel_object_store;
kobj_alloced = UAO_FLAG_KERNSWAP;
refs = 0xdeadbeaf; /* XXX: gcc */
} else {
aobj = kmem_alloc(sizeof(struct uvm_aobj), KM_SLEEP);
aobj->u_pages = pages;
aobj->u_flags = 0;
refs = 1;
}
/*
* no freelist by default
*/
aobj->u_freelist = VM_NFREELIST;
/*
* allocate hash/array if necessary
*
* note: in the KERNSWAP case there is no need to worry about locking,
* since we are still booting and should be the only thread around.
*/
const int kernswap = (flags & UAO_FLAG_KERNSWAP) != 0;
if (flags == 0 || kernswap) {
#if defined(VMSWAP)
/* allocate hash table or array depending on object size */
if (UAO_USES_SWHASH(aobj)) {
aobj->u_swhash = hashinit(UAO_SWHASH_BUCKETS(aobj),
HASH_LIST, true, &aobj->u_swhashmask);
} else {
aobj->u_swslots = kmem_zalloc(pages * sizeof(int),
KM_SLEEP);
}
#endif /* defined(VMSWAP) */
/*
* Replace kernel_object's temporary static lock with
* a regular rw_obj. We cannot use uvm_obj_setlock()
* because that would try to free the old lock.
*/
if (kernswap) {
aobj->u_obj.vmobjlock = rw_obj_alloc();
rw_destroy(&bootstrap_kernel_object_lock);
}
if (flags) {
aobj->u_flags &= ~UAO_FLAG_NOSWAP; /* clear noswap */
return &aobj->u_obj;
}
}
/*
* Initialise UVM object.
*/
const bool kernobj = (flags & UAO_FLAG_KERNOBJ) != 0;
uvm_obj_init(&aobj->u_obj, &aobj_pager, !kernobj, refs);
if (__predict_false(kernobj)) {
/* Use a temporary static lock for kernel_object. */
rw_init(&bootstrap_kernel_object_lock);
uvm_obj_setlock(&aobj->u_obj, &bootstrap_kernel_object_lock);
}
/*
* now that aobj is ready, add it to the global list
*/
mutex_enter(&uao_list_lock);
LIST_INSERT_HEAD(&uao_list, aobj, u_list);
mutex_exit(&uao_list_lock);
return(&aobj->u_obj);
}
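/*
 * Illustrative usage sketch (hypothetical caller): the common case is an
 * ordinary anonymous object created with flags of zero, referenced with
 * uao_reference() as needed, and released with uao_detach() when the
 * last reference is dropped.  "size" (in bytes) is an assumption.
 *
 *	struct uvm_object *uobj;
 *
 *	uobj = uao_create(size, 0);
 *	... map or otherwise use the object ...
 *	uao_detach(uobj);
 */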
/*
* uao_set_pgfl: allocate pages only from the specified freelist.
*
* => must be called before any pages are allocated for the object.
* => reset by setting it to VM_NFREELIST, meaning any freelist.
*/
void
uao_set_pgfl(struct uvm_object *uobj, int freelist)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
KASSERTMSG((0 <= freelist), "invalid freelist %d", freelist);
KASSERTMSG((freelist <= VM_NFREELIST), "invalid freelist %d",
freelist);
aobj->u_freelist = freelist;
}
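/*
 * Illustrative usage sketch (hypothetical caller): restrict a freshly
 * created aobj to a particular freelist before any pages are allocated
 * for it.  VM_FREELIST_DEFAULT is used here purely as an example value.
 *
 *	struct uvm_object *uobj = uao_create(size, 0);
 *
 *	uao_set_pgfl(uobj, VM_FREELIST_DEFAULT);
 */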
/*
* uao_pagealloc: allocate a page for aobj.
*/
static inline struct vm_page *
uao_pagealloc(struct uvm_object *uobj, voff_t offset, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
if (__predict_true(aobj->u_freelist == VM_NFREELIST))
return uvm_pagealloc(uobj, offset, NULL, flags);
else
return uvm_pagealloc_strat(uobj, offset, NULL, flags,
UVM_PGA_STRAT_ONLY, aobj->u_freelist);
}
/*
* uao_init: set up aobj pager subsystem
*
* => called at boot time from uvm_pager_init()
*/
void
uao_init(void)
{
static int uao_initialized;
if (uao_initialized)
return;
uao_initialized = true;
LIST_INIT(&uao_list);
mutex_init(&uao_list_lock, MUTEX_DEFAULT, IPL_NONE);
pool_init(&uao_swhash_elt_pool, sizeof(struct uao_swhash_elt),
0, 0, 0, "uaoeltpl", NULL, IPL_VM);
}
/*
* uao_reference: hold a reference to an anonymous UVM object.
*/
void
uao_reference(struct uvm_object *uobj)
{
/* Kernel object is persistent. */
if (UVM_OBJ_IS_KERN_OBJECT(uobj)) {
return;
}
atomic_inc_uint(&uobj->uo_refs);
}
/*
* uao_detach: drop a reference to an anonymous UVM object.
*/
void
uao_detach(struct uvm_object *uobj)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uvm_page_array a;
struct vm_page *pg;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
/*
* Detaching from kernel object is a NOP.
*/
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
return;
/*
* Drop the reference. If it was the last one, destroy the object.
*/
KASSERT(uobj->uo_refs > 0);
UVMHIST_LOG(maphist," (uobj=%#jx) ref=%jd",
(uintptr_t)uobj, uobj->uo_refs, 0, 0);
membar_release();
if (atomic_dec_uint_nv(&uobj->uo_refs) > 0) {
UVMHIST_LOG(maphist, "<- done (rc>0)", 0,0,0,0);
return;
}
membar_acquire();
/*
* Remove the aobj from the global list.
*/
mutex_enter(&uao_list_lock);
LIST_REMOVE(aobj, u_list);
mutex_exit(&uao_list_lock);
/*
* Free all the pages left in the aobj. For each page, when the
* page is no longer busy (and thus after any disk I/O that it is
* involved in is complete), release any swap resources and free
* the page itself.
*/
uvm_page_array_init(&a, uobj, 0);
rw_enter(uobj->vmobjlock, RW_WRITER);
while ((pg = uvm_page_array_fill_and_peek(&a, 0, 0)) != NULL) {
uvm_page_array_advance(&a);
pmap_page_protect(pg, VM_PROT_NONE);
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "uao_det");
uvm_page_array_clear(&a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
uvm_pagefree(pg);
}
uvm_page_array_fini(&a);
/*
* Finally, free the anonymous UVM object itself.
*/
uao_free(aobj);
}
/*
* uao_put: flush pages out of a uvm object
*
* => object should be locked by caller. we may _unlock_ the object
* if (and only if) we need to clean a page (PGO_CLEANIT).
* XXXJRT Currently, however, we don't. In the case of cleaning
* XXXJRT a page, we simply just deactivate it. Should probably
* XXXJRT handle this better, in the future (although "flushing"
* XXXJRT anonymous memory isn't terribly important).
* => if PGO_CLEANIT is not set, then we will neither unlock the object
* or block.
* => if PGO_ALLPAGE is set, then all pages in the object are valid targets
* for flushing.
* => we return 0 unless we encountered some sort of I/O error
* XXXJRT currently never happens, as we never directly initiate
* XXXJRT I/O
*/
static int
uao_put(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
struct uvm_page_array a;
struct vm_page *pg;
voff_t curoff;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_write_held(uobj->vmobjlock));
if (flags & PGO_ALLPAGES) {
start = 0;
stop = aobj->u_pages << PAGE_SHIFT;
} else {
start = trunc_page(start);
if (stop == 0) {
stop = aobj->u_pages << PAGE_SHIFT;
} else {
stop = round_page(stop);
}
if (stop > (uint64_t)(aobj->u_pages << PAGE_SHIFT)) {
printf("uao_put: strange, got an out of range "
"flush %#jx > %#jx (fixed)\n",
(uintmax_t)stop,
(uintmax_t)(aobj->u_pages << PAGE_SHIFT));
stop = aobj->u_pages << PAGE_SHIFT;
}
}
UVMHIST_LOG(maphist,
" flush start=%#jx, stop=%#jx, flags=%#jx",
start, stop, flags, 0);
/*
* Don't need to do any work here if we're not freeing
* or deactivating pages.
*/
if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) {
rw_exit(uobj->vmobjlock);
return 0;
}
/* locked: uobj */
uvm_page_array_init(&a, uobj, 0);
curoff = start;
while ((pg = uvm_page_array_fill_and_peek(&a, curoff, 0)) != NULL) {
if (pg->offset >= stop) {
break;
}
/*
* wait and try again if the page is busy.
*/
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "uao_put");
uvm_page_array_clear(&a);
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
uvm_page_array_advance(&a);
curoff = pg->offset + PAGE_SIZE;
switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
/*
* XXX In these first 3 cases, we always just
* XXX deactivate the page. We may want to
* XXX handle the different cases more specifically
* XXX in the future.
*/
case PGO_CLEANIT|PGO_FREE:
case PGO_CLEANIT|PGO_DEACTIVATE:
case PGO_DEACTIVATE:
deactivate_it:
uvm_pagelock(pg);
uvm_pagedeactivate(pg);
uvm_pageunlock(pg);
break;
case PGO_FREE:
/*
* If there are multiple references to
* the object, just deactivate the page.
*/
if (uobj->uo_refs > 1)
goto deactivate_it;
/*
* free the swap slot and the page.
*/
pmap_page_protect(pg, VM_PROT_NONE);
/*
* freeing swapslot here is not strictly necessary.
* however, leaving it here doesn't save much
* because we need to update swap accounting anyway.
*/
uao_dropswap(uobj, pg->offset >> PAGE_SHIFT);
uvm_pagefree(pg);
break;
default:
panic("%s: impossible", __func__);
}
}
rw_exit(uobj->vmobjlock);
uvm_page_array_fini(&a);
return 0;
}
/*
* uao_get: fetch me a page
*
* we have three cases:
* 1: page is resident -> just return the page.
* 2: page is zero-fill -> allocate a new page and zero it.
* 3: page is swapped out -> fetch the page from swap.
*
* case 1 can be handled with PGO_LOCKED, cases 2 and 3 cannot.
* so, if the "center" page hits case 2/3 then we will need to return EBUSY.
*
* => prefer map unlocked (not required)
* => object must be locked! we will _unlock_ it before starting any I/O.
* => flags: PGO_LOCKED: fault data structures are locked
* => NOTE: offset is the offset of pps[0], _NOT_ pps[centeridx]
* => NOTE: caller must check for released pages!!
*/
static int
uao_get(struct uvm_object *uobj, voff_t offset, struct vm_page **pps,
int *npagesp, int centeridx, vm_prot_t access_type, int advice, int flags)
{
voff_t current_offset;
struct vm_page *ptmp;
int lcv, gotpages, maxpages, swslot, pageidx;
bool overwrite = ((flags & PGO_OVERWRITE) != 0);
struct uvm_page_array a;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "aobj=%#jx offset=%jd, flags=%#jx",
(uintptr_t)uobj, offset, flags,0);
/*
* the object must be locked. it can only be a read lock when
* processing a read fault with PGO_LOCKED.
*/
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
KASSERT(rw_lock_held(uobj->vmobjlock));
KASSERT(rw_write_held(uobj->vmobjlock) ||
((flags & PGO_LOCKED) != 0 && (access_type & VM_PROT_WRITE) == 0));
/*
* get number of pages
*/
maxpages = *npagesp;
/*
* step 1: handle the case where fault data structures are locked.
*/
if (flags & PGO_LOCKED) {
/*
* step 1a: get pages that are already resident. only do
* this if the data structures are locked (i.e. the first
* time through).
*/
uvm_page_array_init(&a, uobj, 0);
gotpages = 0; /* # of pages we got so far */
for (lcv = 0; lcv < maxpages; lcv++) {
ptmp = uvm_page_array_fill_and_peek(&a,
offset + (lcv << PAGE_SHIFT), maxpages);
if (ptmp == NULL) {
break;
}
KASSERT(ptmp->offset >= offset);
lcv = (ptmp->offset - offset) >> PAGE_SHIFT;
if (lcv >= maxpages) {
break;
}
uvm_page_array_advance(&a);
/*
* to be useful must get a non-busy page
*/
if ((ptmp->flags & PG_BUSY) != 0) {
continue;
}
/*
* useful page: plug it in our result array
*/
KASSERT(uvm_pagegetdirty(ptmp) !=
UVM_PAGE_STATUS_CLEAN);
pps[lcv] = ptmp;
gotpages++;
}
uvm_page_array_fini(&a);
/*
* step 1b: now we've either done everything needed or we need
* to unlock and do some waiting or I/O.
*/
UVMHIST_LOG(pdhist, "<- done (done=%jd)",
(pps[centeridx] != NULL), 0,0,0);
*npagesp = gotpages;
return pps[centeridx] != NULL ? 0 : EBUSY;
}
/*
* step 2: get non-resident or busy pages.
* object is locked. data structures are unlocked.
*/
if ((flags & PGO_SYNCIO) == 0) {
goto done;
}
uvm_page_array_init(&a, uobj, 0);
for (lcv = 0, current_offset = offset ; lcv < maxpages ;) {
/*
* we have yet to locate the current page (pps[lcv]). we
* first look for a page that is already at the current offset.
* if we find a page, we check to see if it is busy or
* released. if that is the case, then we sleep on the page
* until it is no longer busy or released and repeat the lookup.
* if the page we found is neither busy nor released, then we
* busy it (so we own it) and plug it into pps[lcv]. we are
* ready to move on to the next page.
*/
ptmp = uvm_page_array_fill_and_peek(&a, current_offset,
maxpages - lcv);
if (ptmp != NULL && ptmp->offset == current_offset) {
/* page is there, see if we need to wait on it */
if ((ptmp->flags & PG_BUSY) != 0) {
UVMHIST_LOG(pdhist,
"sleeping, ptmp->flags %#jx\n",
ptmp->flags,0,0,0);
uvm_pagewait(ptmp, uobj->vmobjlock, "uao_get");
rw_enter(uobj->vmobjlock, RW_WRITER);
uvm_page_array_clear(&a);
continue;
}
/*
* if we get here then the page is resident and
* unbusy. we busy it now (so we own it). if
* overwriting, mark the page dirty up front as
* it will be zapped via an unmanaged mapping.
*/
KASSERT(uvm_pagegetdirty(ptmp) !=
UVM_PAGE_STATUS_CLEAN);
if (overwrite) {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_DIRTY);
}
/* we own it, caller must un-busy */
ptmp->flags |= PG_BUSY;
UVM_PAGE_OWN(ptmp, "uao_get2");
pps[lcv++] = ptmp;
current_offset += PAGE_SIZE;
uvm_page_array_advance(&a);
continue;
} else {
KASSERT(ptmp == NULL || ptmp->offset > current_offset);
}
/*
* not resident. allocate a new busy/fake/clean page in the
* object. if it's in swap we need to do I/O to fill in the
* data, otherwise the page needs to be cleared: if it's not
* destined to be overwritten, then zero it here and now.
*/
pageidx = current_offset >> PAGE_SHIFT;
swslot = uao_find_swslot(uobj, pageidx);
ptmp = uao_pagealloc(uobj, current_offset,
swslot != 0 || overwrite ? 0 : UVM_PGA_ZERO);
/* out of RAM? */
if (ptmp == NULL) {
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "sleeping, ptmp == NULL",0,0,0,0);
uvm_wait("uao_getpage");
rw_enter(uobj->vmobjlock, RW_WRITER);
uvm_page_array_clear(&a);
continue;
}
/*
* if swslot == 0, page hasn't existed before and is zeroed.
* otherwise we have a "fake/busy/clean" page that we just
* allocated. do the needed "i/o", reading from swap.
*/
if (swslot != 0) {
#if defined(VMSWAP)
int error;
UVMHIST_LOG(pdhist, "pagein from swslot %jd",
swslot, 0,0,0);
/*
* page in the swapped-out page.
* unlock object for i/o, relock when done.
*/
uvm_page_array_clear(&a);
rw_exit(uobj->vmobjlock);
error = uvm_swap_get(ptmp, swslot, PGO_SYNCIO);
rw_enter(uobj->vmobjlock, RW_WRITER);
/*
* I/O done. check for errors.
*/
if (error != 0) {
UVMHIST_LOG(pdhist, "<- done (error=%jd)",
error,0,0,0);
/*
* remove the swap slot from the aobj
* and mark the aobj as having no real slot.
* don't free the swap slot, thus preventing
* it from being used again.
*/
swslot = uao_set_swslot(uobj, pageidx,
SWSLOT_BAD);
if (swslot > 0) {
uvm_swap_markbad(swslot, 1);
}
uvm_pagefree(ptmp);
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "<- done (error)",
error,lcv,0,0);
if (lcv != 0) {
uvm_page_unbusy(pps, lcv);
}
memset(pps, 0, maxpages * sizeof(pps[0]));
uvm_page_array_fini(&a);
return error;
}
#else /* defined(VMSWAP) */
panic("%s: pagein", __func__);
#endif /* defined(VMSWAP) */
}
/*
* note that we will allow the page being writably-mapped
* (!PG_RDONLY) regardless of access_type. if overwrite,
* the page can be modified through an unmanaged mapping
* so mark it dirty up front.
*/
if (overwrite) {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_DIRTY);
} else {
uvm_pagemarkdirty(ptmp, UVM_PAGE_STATUS_UNKNOWN);
}
/*
* we got the page! clear the fake flag (indicates valid
* data now in page) and plug into our result array. note
* that page is still busy.
*
* it is the callers job to:
* => check if the page is released
* => unbusy the page
* => activate the page
*/
KASSERT(uvm_pagegetdirty(ptmp) != UVM_PAGE_STATUS_CLEAN);
KASSERT((ptmp->flags & PG_FAKE) != 0);
KASSERT(ptmp->offset == current_offset);
ptmp->flags &= ~PG_FAKE;
pps[lcv++] = ptmp;
current_offset += PAGE_SIZE;
}
uvm_page_array_fini(&a);
/*
* finally, unlock object and return.
*/
done:
rw_exit(uobj->vmobjlock);
UVMHIST_LOG(pdhist, "<- done (OK)",0,0,0,0);
return 0;
}
#if defined(VMSWAP)
/*
* uao_dropswap: release any swap resources from this aobj page.
*
* => aobj must be locked or have a reference count of 0.
*/
void
uao_dropswap(struct uvm_object *uobj, int pageidx)
{
int slot;
KASSERT(UVM_OBJ_IS_AOBJ(uobj));
slot = uao_set_swslot(uobj, pageidx, 0);
if (slot) {
uvm_swap_free(slot, 1);
}
}
/*
* page in every page in every aobj that is paged-out to a range of swslots.
*
* => nothing should be locked.
* => returns true if pagein was aborted due to lack of memory.
*/
bool
uao_swap_off(int startslot, int endslot)
{
struct uvm_aobj *aobj;
/*
* Walk the list of all anonymous UVM objects. Grab the first.
*/
mutex_enter(&uao_list_lock);
if ((aobj = LIST_FIRST(&uao_list)) == NULL) {
mutex_exit(&uao_list_lock);
return false;
}
uao_reference(&aobj->u_obj);
do {
struct uvm_aobj *nextaobj;
bool rv;
/*
* Prefetch the next object and immediately hold a reference
* on it, so neither the current nor the next entry could
* disappear while we are iterating.
*/
if ((nextaobj = LIST_NEXT(aobj, u_list)) != NULL) {
uao_reference(&nextaobj->u_obj);
}
mutex_exit(&uao_list_lock);
/*
* Page in all pages in the swap slot range.
*/
rw_enter(aobj->u_obj.vmobjlock, RW_WRITER);
rv = uao_pagein(aobj, startslot, endslot);
rw_exit(aobj->u_obj.vmobjlock);
/* Drop the reference of the current object. */
uao_detach(&aobj->u_obj);
if (rv) {
if (nextaobj) {
uao_detach(&nextaobj->u_obj);
}
return rv;
}
aobj = nextaobj;
mutex_enter(&uao_list_lock);
} while (aobj);
mutex_exit(&uao_list_lock);
return false;
}
/*
* page in any pages from aobj in the given range.
*
* => aobj must be locked and is returned locked.
* => returns true if pagein was aborted due to lack of memory.
*/
static bool
uao_pagein(struct uvm_aobj *aobj, int startslot, int endslot)
{
bool rv;
if (UAO_USES_SWHASH(aobj)) {
struct uao_swhash_elt *elt;
int buck;
restart:
for (buck = aobj->u_swhashmask; buck >= 0; buck--) {
for (elt = LIST_FIRST(&aobj->u_swhash[buck]);
elt != NULL;
elt = LIST_NEXT(elt, list)) {
int i;
for (i = 0; i < UAO_SWHASH_CLUSTER_SIZE; i++) {
int slot = elt->slots[i];
/*
* if the slot isn't in range, skip it.
*/
if (slot < startslot ||
slot >= endslot) {
continue;
}
/*
* process the page, then start over on this
* object since the swhash elt may have been
* freed.
*/
rv = uao_pagein_page(aobj,
UAO_SWHASH_ELT_PAGEIDX_BASE(elt) + i);
if (rv) {
return rv;
}
goto restart;
}
}
}
} else {
int i;
for (i = 0; i < aobj->u_pages; i++) {
int slot = aobj->u_swslots[i];
/*
* if the slot isn't in range, skip it
*/
if (slot < startslot || slot >= endslot) {
continue;
}
/*
* process the page.
*/
rv = uao_pagein_page(aobj, i);
if (rv) {
return rv;
}
}
}
return false;
}
/*
* uao_pagein_page: page in a single page from an anonymous UVM object.
*
* => Returns true if pagein was aborted due to lack of memory.
* => Object must be locked and is returned locked.
*/
static bool
uao_pagein_page(struct uvm_aobj *aobj, int pageidx)
{
struct uvm_object *uobj = &aobj->u_obj;
struct vm_page *pg;
int rv, npages;
pg = NULL;
npages = 1;
KASSERT(rw_write_held(uobj->vmobjlock));
rv = uao_get(uobj, (voff_t)pageidx << PAGE_SHIFT, &pg, &npages,
0, VM_PROT_READ | VM_PROT_WRITE, 0, PGO_SYNCIO);
/*
* relock and finish up.
*/
rw_enter(uobj->vmobjlock, RW_WRITER);
switch (rv) {
case 0:
break;
case EIO:
case ERESTART:
/*
* nothing more to do on errors.
* ERESTART can only mean that the anon was freed,
* so again there's nothing to do.
*/
return false;
default:
return true;
}
/*
* ok, we've got the page now.
* mark it as dirty, clear its swslot and un-busy it.
*/
uao_dropswap(&aobj->u_obj, pageidx);
/*
* make sure it's on a page queue.
*/
uvm_pagelock(pg);
uvm_pageenqueue(pg);
uvm_pagewakeup(pg);
uvm_pageunlock(pg);
pg->flags &= ~(PG_BUSY|PG_FAKE);
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
UVM_PAGE_OWN(pg, NULL);
return false;
}
/*
* uao_dropswap_range: drop swapslots in the range.
*
* => aobj must be locked and is returned locked.
* => start is inclusive. end is exclusive.
*/
void
uao_dropswap_range(struct uvm_object *uobj, voff_t start, voff_t end)
{
struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
int swpgonlydelta = 0;
KASSERT(UVM_OBJ_IS_AOBJ(uobj)); KASSERT(rw_write_held(uobj->vmobjlock));
if (end == 0) {
end = INT64_MAX;
}
if (UAO_USES_SWHASH(aobj)) {
int i, hashbuckets = aobj->u_swhashmask + 1;
voff_t taghi;
voff_t taglo;
taglo = UAO_SWHASH_ELT_TAG(start);
taghi = UAO_SWHASH_ELT_TAG(end);
for (i = 0; i < hashbuckets; i++) {
struct uao_swhash_elt *elt, *next;
for (elt = LIST_FIRST(&aobj->u_swhash[i]);
elt != NULL;
elt = next) {
int startidx, endidx;
int j;
next = LIST_NEXT(elt, list);
if (elt->tag < taglo || taghi < elt->tag) {
continue;
}
if (elt->tag == taglo) {
startidx =
UAO_SWHASH_ELT_PAGESLOT_IDX(start);
} else {
startidx = 0;
}
if (elt->tag == taghi) {
endidx =
UAO_SWHASH_ELT_PAGESLOT_IDX(end);
} else {
endidx = UAO_SWHASH_CLUSTER_SIZE;
}
for (j = startidx; j < endidx; j++) {
int slot = elt->slots[j];
KASSERT(uvm_pagelookup(&aobj->u_obj,
(UAO_SWHASH_ELT_PAGEIDX_BASE(elt)
+ j) << PAGE_SHIFT) == NULL);
if (slot > 0) {
uvm_swap_free(slot, 1);
swpgonlydelta++;
KASSERT(elt->count > 0);
elt->slots[j] = 0;
elt->count--;
}
}
if (elt->count == 0) {
LIST_REMOVE(elt, list);
pool_put(&uao_swhash_elt_pool, elt);
}
}
}
} else {
int i;
if (aobj->u_pages < end) {
end = aobj->u_pages;
}
for (i = start; i < end; i++) {
int slot = aobj->u_swslots[i];
if (slot > 0) {
uvm_swap_free(slot, 1);
swpgonlydelta++;
}
}
}
/*
* adjust the counter of pages only in swap for all
* the swap slots we've freed.
*/
if (swpgonlydelta > 0) {
KASSERT(uvmexp.swpgonly >= swpgonlydelta);
atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
}
#endif /* defined(VMSWAP) */
/* $NetBSD: if_loop.c,v 1.118 2022/09/04 23:34:51 thorpej Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_loop.c 8.2 (Berkeley) 1/9/95
*/
/*
* Loopback interface driver for protocol testing and timing.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_loop.c,v 1.118 2022/09/04 23:34:51 thorpej Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_atalk.h"
#include "opt_mbuftrace.h"
#include "opt_mpls.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <sys/device.h>
#include <sys/module.h>
#include <sys/cpu.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/route.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/in_offload.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#endif
#ifdef INET6
#ifndef INET
#include <netinet/in.h>
#endif
#include <netinet6/in6_var.h>
#include <netinet6/in6_offload.h>
#include <netinet/ip6.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#include <netmpls/mpls_var.h>
#endif
#ifdef NETATALK
#include <netatalk/at.h>
#include <netatalk/at_var.h>
#endif
#include <net/bpf.h>
#if defined(LARGE_LOMTU)
#define LOMTU (131072 + MHLEN + MLEN)
#define LOMTU_MAX LOMTU
#else
#define LOMTU (32768 + MHLEN + MLEN)
#define LOMTU_MAX (65536 + MHLEN + MLEN)
#endif
#ifdef ALTQ
static void lostart(struct ifnet *);
#endif
static int loop_clone_create(struct if_clone *, int);
static int loop_clone_destroy(struct ifnet *);
static void loop_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
static struct if_clone loop_cloner =
IF_CLONE_INITIALIZER("lo", loop_clone_create, loop_clone_destroy);
void
loopattach(int n)
{
#ifndef _MODULE
loop_clone_create(&loop_cloner, 0); /* lo0 always exists */
#endif
}
void
loopinit(void)
{
if (lo0ifp != NULL) /* can happen in rump kernel */
return;
#ifdef _MODULE
loop_clone_create(&loop_cloner, 0); /* lo0 always exists */
#endif
if_clone_attach(&loop_cloner);
}
static int
loopdetach(void)
{
/* no detach for now; we don't allow lo0 to be deleted */
return EBUSY;
}
static int
loop_clone_create(struct if_clone *ifc, int unit)
{
struct ifnet *ifp;
ifp = if_alloc(IFT_LOOP);
if_initname(ifp, ifc->ifc_name, unit);
ifp->if_mtu = LOMTU;
ifp->if_flags = IFF_LOOPBACK | IFF_MULTICAST;
#ifdef NET_MPSAFE
ifp->if_extflags = IFEF_MPSAFE;
#endif
ifp->if_ioctl = loioctl;
ifp->if_output = looutput;
#ifdef ALTQ
ifp->if_start = lostart;
#endif
ifp->if_type = IFT_LOOP;
ifp->if_hdrlen = 0;
ifp->if_addrlen = 0;
ifp->if_dlt = DLT_NULL;
IFQ_SET_READY(&ifp->if_snd);
if (unit == 0)
lo0ifp = ifp;
if_initialize(ifp);
ifp->if_link_state = LINK_STATE_UP;
if_alloc_sadl(ifp);
bpf_attach(ifp, DLT_NULL, sizeof(u_int));
#ifdef MBUFTRACE
ifp->if_mowner = malloc(sizeof(struct mowner), M_DEVBUF,
M_WAITOK | M_ZERO);
strlcpy(ifp->if_mowner->mo_name, ifp->if_xname,
sizeof(ifp->if_mowner->mo_name));
MOWNER_ATTACH(ifp->if_mowner);
#endif
ifp->if_flags |= IFF_RUNNING;
if_register(ifp);
return (0);
}
static int
loop_clone_destroy(struct ifnet *ifp)
{
if (ifp == lo0ifp)
return (EPERM);
ifp->if_flags &= ~IFF_RUNNING;
#ifdef MBUFTRACE
MOWNER_DETACH(ifp->if_mowner);
free(ifp->if_mowner, M_DEVBUF);
#endif
bpf_detach(ifp);
if_detach(ifp);
if_free(ifp);
return (0);
}
int
looutput(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
const struct rtentry *rt)
{
pktqueue_t *pktq = NULL;
int s;
int csum_flags;
int error = 0;
size_t pktlen;
MCLAIM(m, ifp->if_mowner);
KERNEL_LOCK_UNLESS_NET_MPSAFE();
if ((m->m_flags & M_PKTHDR) == 0)
panic("looutput: no header mbuf"); if (ifp->if_flags & IFF_LOOPBACK) bpf_mtap_af(ifp, dst->sa_family, m, BPF_D_OUT); m_set_rcvif(m, ifp); if (rt && rt->rt_flags & (RTF_REJECT|RTF_BLACKHOLE)) { m_freem(m);
error = (rt->rt_flags & RTF_BLACKHOLE ? 0 :
rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
goto out;
}
pktlen = m->m_pkthdr.len;
if_statadd2(ifp, if_opackets, 1, if_obytes, pktlen);
#ifdef ALTQ
/*
* ALTQ on the loopback interface is just for debugging. It's
* used only for loopback interfaces, not for a simplex interface.
*/
if ((ALTQ_IS_ENABLED(&ifp->if_snd) || TBR_IS_ENABLED(&ifp->if_snd)) &&
ifp->if_start == lostart) {
/*
* If the queueing discipline needs packet classification,
* do it before prepending the link headers.
*/
IFQ_CLASSIFY(&ifp->if_snd, m, dst->sa_family);
M_PREPEND(m, sizeof(uint32_t), M_DONTWAIT);
if (m == NULL) {
if_statinc(ifp, if_oerrors);
error = ENOBUFS;
goto out;
}
*(mtod(m, uint32_t *)) = dst->sa_family;
error = if_transmit_lock(ifp, m);
goto out;
}
#endif /* ALTQ */
m_tag_delete_chain(m);
#ifdef MPLS
bool is_mpls = false;
if (rt != NULL && rt_gettag(rt) != NULL &&
rt_gettag(rt)->sa_family == AF_MPLS &&
(m->m_flags & (M_MCAST | M_BCAST)) == 0) {
union mpls_shim msh;
msh.s_addr = MPLS_GETSADDR(rt);
if (msh.shim.label != MPLS_LABEL_IMPLNULL) {
is_mpls = true;
pktq = mpls_pktq;
}
}
if (!is_mpls)
#endif
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
csum_flags = m->m_pkthdr.csum_flags;
KASSERT((csum_flags & ~(M_CSUM_IPv4|M_CSUM_UDPv4)) == 0);
if (csum_flags != 0 && IN_LOOPBACK_NEED_CHECKSUM(csum_flags)) {
in_undefer_cksum(m, 0, csum_flags);
m->m_pkthdr.csum_flags = 0;
} else {
/*
* Do nothing. Pass M_CSUM_IPv4 and M_CSUM_UDPv4 through
* as-is to indicate the checksums are already computed and good.
*/
}
pktq = ip_pktq;
break;
#endif
#ifdef INET6
case AF_INET6:
csum_flags = m->m_pkthdr.csum_flags;
KASSERT((csum_flags & ~M_CSUM_UDPv6) == 0);
if (csum_flags != 0 && IN6_LOOPBACK_NEED_CHECKSUM(csum_flags)) {
in6_undefer_cksum(m, 0, csum_flags);
m->m_pkthdr.csum_flags = 0;
} else {
/*
* Do nothing. Pass M_CSUM_UDPv6 through as-is to
* indicate the checksum is already computed and good.
*/
}
m->m_flags |= M_LOOP;
pktq = ip6_pktq;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
pktq = at_pktq2;
break;
#endif
default:
printf("%s: can't handle af%d\n", ifp->if_xname,
dst->sa_family);
m_freem(m);
error = EAFNOSUPPORT;
goto out;
}
KASSERT(pktq != NULL);
error = 0;
s = splnet();
if (__predict_true(pktq_enqueue(pktq, m, 0))) {
if_statadd2(ifp, if_ipackets, 1, if_ibytes, pktlen);
} else {
m_freem(m);
if_statinc(ifp, if_oerrors);
error = ENOBUFS;
}
splx(s);
out:
KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
return error;
}
#ifdef ALTQ
static void
lostart(struct ifnet *ifp)
{
for (;;) {
pktqueue_t *pktq = NULL;
struct mbuf *m;
size_t pktlen;
uint32_t af;
int s;
IFQ_DEQUEUE(&ifp->if_snd, m);
if (m == NULL)
return;
af = *(mtod(m, uint32_t *));
m_adj(m, sizeof(uint32_t));
switch (af) {
#ifdef INET
case AF_INET:
pktq = ip_pktq;
break;
#endif
#ifdef INET6
case AF_INET6:
m->m_flags |= M_LOOP;
pktq = ip6_pktq;
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
pktq = at_pktq2;
break;
#endif
default:
printf("%s: can't handle af%d\n", ifp->if_xname, af);
m_freem(m);
return;
}
pktlen = m->m_pkthdr.len;
KASSERT(pktq != NULL);
s = splnet();
if (__predict_false(pktq_enqueue(pktq, m, 0))) {
m_freem(m);
splx(s);
return;
}
if_statadd2(ifp, if_ipackets, 1, if_ibytes, pktlen);
splx(s);
}
}
#endif /* ALTQ */
/* ARGSUSED */
static void
loop_rtrequest(int cmd, struct rtentry *rt,
const struct rt_addrinfo *info)
{
if (rt)
rt->rt_rmx.rmx_mtu = lo0ifp->if_mtu;
}
/*
* Process an ioctl request.
*/
/* ARGSUSED */
int
loioctl(struct ifnet *ifp, u_long cmd, void *data)
{
struct ifaddr *ifa;
struct ifreq *ifr = data;
int error = 0;
switch (cmd) {
case SIOCINITIFADDR:
ifp->if_flags |= IFF_UP;
ifa = (struct ifaddr *)data;
if (ifa != NULL)
ifa->ifa_rtrequest = loop_rtrequest;
/*
* Everything else is done at a higher level.
*/
break;
case SIOCSIFMTU:
if ((unsigned)ifr->ifr_mtu > LOMTU_MAX)
error = EINVAL;
else if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET){
error = 0;
}
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (ifr == NULL) {
error = EAFNOSUPPORT; /* XXX */
break;
}
switch (ifreq_getaddr(cmd, ifr)->sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
#ifdef INET6
case AF_INET6:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
default:
error = ifioctl_common(ifp, cmd, data);
}
return (error);
}
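/*
 * Illustrative sketch (userland code, not part of this driver): the MTU
 * check above is what a SIOCSIFMTU request from ifconfig(8) or similar
 * code runs into.  A minimal caller, error handling omitted; the MTU
 * value is just an example within LOMTU_MAX.
 *
 *	struct ifreq ifr;
 *	int s = socket(AF_INET, SOCK_DGRAM, 0);
 *
 *	memset(&ifr, 0, sizeof(ifr));
 *	strlcpy(ifr.ifr_name, "lo0", sizeof(ifr.ifr_name));
 *	ifr.ifr_mtu = 32768;
 *	ioctl(s, SIOCSIFMTU, &ifr);
 */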
/*
* Module infrastructure
*/
#include "if_module.h"
IF_MODULE(MODULE_CLASS_DRIVER, loop, NULL)
/* $NetBSD: uipc_socket2.c,v 1.143 2024/01/03 18:10:42 andvar Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_socket2.c,v 1.143 2024/01/03 18:10:42 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_mbuftrace.h"
#include "opt_sb_max.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/kauth.h>
#include <sys/pool.h>
#include <sys/uidinfo.h>
#ifdef DDB
#include <sys/filedesc.h>
#include <ddb/db_active.h>
#endif
/*
* Primitive routines for operating on sockets and socket buffers.
*
* Connection life-cycle:
*
* Normal sequence from the active (originating) side:
*
* - soisconnecting() is called during processing of connect() call,
* - resulting in an eventual call to soisconnected() if/when the
* connection is established.
*
* When the connection is torn down during processing of disconnect():
*
* - soisdisconnecting() is called and,
* - soisdisconnected() is called when the connection to the peer
* is totally severed.
*
* The semantics of these routines are such that connectionless protocols
* can call soisconnected() and soisdisconnected() only, bypassing the
* in-progress calls when setting up a ``connection'' takes no time.
*
* From the passive side, a socket is created with two queues of sockets:
*
* - so_q0 (0) for partial connections (i.e. connections in progress)
* - so_q (1) for connections already made and awaiting user acceptance.
*
* As a protocol is preparing incoming connections, it creates a socket
* structure queued on so_q0 by calling sonewconn(). When the connection
* is established, soisconnected() is called, and transfers the
* socket structure to so_q, making it available to accept().
*
* If a socket is closed with sockets on either so_q0 or so_q, these
* sockets are dropped.
*
* Locking rules and assumptions:
*
* o socket::so_lock can change on the fly. The low level routines used
* to lock sockets are aware of this. When so_lock is acquired, the
* routine locking must check to see if so_lock still points to the
* lock that was acquired. If so_lock has changed in the meantime, the
* now irrelevant lock that was acquired must be dropped and the lock
* operation retried. Although not proven here, this is completely safe
* on a multiprocessor system, even with relaxed memory ordering, given
* the next two rules:
*
* o In order to mutate so_lock, the lock pointed to by the current value
* of so_lock must be held: i.e., the socket must be held locked by the
* changing thread. The thread must issue membar_release() to prevent
* memory accesses being reordered, and can set so_lock to the desired
* value. If the lock pointed to by the new value of so_lock is not
* held by the changing thread, the socket must then be considered
* unlocked.
*
* o If so_lock is mutated, and the previous lock referred to by so_lock
* could still be visible to other threads in the system (e.g. via file
* descriptor or protocol-internal reference), then the old lock must
* remain valid until the socket and/or protocol control block has been
* torn down.
*
* o If a socket has a non-NULL so_head value (i.e. is in the process of
* connecting), then locking the socket must also lock the socket pointed
* to by so_head: their lock pointers must match.
*
* o If a socket has connections in progress (so_q, so_q0 not empty) then
* locking the socket must also lock the sockets attached to both queues.
* Again, their lock pointers must match.
*
* o Beyond the initial lock assignment in socreate(), assigning locks to
* sockets is the responsibility of the individual protocols / protocol
* domains.
*/
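/*
 * Illustrative sketch (not compiled) of the lock-change protocol described
 * above, under the stated rules; the helper name is hypothetical and this
 * is not the code path actually used (see solockreset() below).
 */
#if 0
static void
example_solockswitch(struct socket *so, kmutex_t *newlock)
{
	kmutex_t *oldlock = so->so_lock;

	KASSERT(mutex_owned(oldlock));	/* rule: old lock must be held */
	mutex_obj_hold(newlock);	/* take a reference on the new lock */
	membar_release();		/* order prior stores before the switch */
	so->so_lock = newlock;
	/*
	 * Unless the changing thread also holds newlock, the socket must
	 * now be considered unlocked; oldlock must stay valid while other
	 * threads may still observe the old pointer.
	 */
}
#endif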
static pool_cache_t socket_cache;
u_long sb_max = SB_MAX; /* maximum socket buffer size */
static u_long sb_max_adj; /* adjusted sb_max */
void
soisconnecting(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTING;
}
void
soisconnected(struct socket *so)
{
struct socket *head;
head = so->so_head;
KASSERT(solocked(so));
KASSERT(head == NULL || solocked2(so, head));
so->so_state &= ~(SS_ISCONNECTING | SS_ISDISCONNECTING);
so->so_state |= SS_ISCONNECTED;
if (head && so->so_onq == &head->so_q0) {
if ((so->so_options & SO_ACCEPTFILTER) == 0) {
/*
* Re-enqueue and wake up any waiters, e.g.
* processes blocking on accept().
*/
soqremque(so, 0);
soqinsque(head, so, 1);
sorwakeup(head);
cv_broadcast(&head->so_cv);
} else {
so->so_upcall =
head->so_accf->so_accept_filter->accf_callback;
so->so_upcallarg = head->so_accf->so_accept_filter_arg;
so->so_rcv.sb_flags |= SB_UPCALL;
so->so_options &= ~SO_ACCEPTFILTER;
(*so->so_upcall)(so, so->so_upcallarg,
POLLIN|POLLRDNORM, M_DONTWAIT);
}
} else {
cv_broadcast(&so->so_cv);
sorwakeup(so);
sowwakeup(so);
}
}
void
soisdisconnecting(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~SS_ISCONNECTING;
so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
cv_broadcast(&so->so_cv);
sowwakeup(so);
sorwakeup(so);
}
void
soisdisconnected(struct socket *so)
{
KASSERT(solocked(so));
so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
cv_broadcast(&so->so_cv);
sowwakeup(so);
sorwakeup(so);
}
void
soinit2(void)
{
socket_cache = pool_cache_init(sizeof(struct socket), 0, 0, 0,
"socket", NULL, IPL_SOFTNET, NULL, NULL, NULL);
}
/*
* sonewconn: accept a new connection.
*
* When an attempt at a new connection is noted on a socket which accepts
* connections, sonewconn(9) is called. If the connection is possible
* (subject to space constraints, etc) then we allocate a new structure,
* properly linked into the data structure of the original socket.
*
* => If 'soready' is true, then socket will become ready for accept() i.e.
* inserted into the so_q queue, SS_ISCONNECTED set and waiters awoken.
* => May be called from soft-interrupt context.
* => Listening socket should be locked.
* => Returns the new socket locked.
*/
struct socket *
sonewconn(struct socket *head, bool soready)
{
struct socket *so;
int soqueue, error;
KASSERT(solocked(head));
if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) {
/*
* Listen queue overflow. If there is an accept filter
* active, pass through the oldest connection it's handling.
*/
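/*
 * For example, with an unclamped listen(2) backlog of 128 (so_qlimit
 * = 128), the test above fires once so_qlen + so_q0len exceeds 192,
 * i.e. 3 * 128 / 2.
 */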
if (head->so_accf == NULL) {
return NULL;
} else {
struct socket *so2, *next;
/* Pass the oldest connection waiting in the
accept filter */
for (so2 = TAILQ_FIRST(&head->so_q0);
so2 != NULL; so2 = next) {
next = TAILQ_NEXT(so2, so_qe);
if (so2->so_upcall == NULL) {
continue;
}
so2->so_upcall = NULL;
so2->so_upcallarg = NULL;
so2->so_options &= ~SO_ACCEPTFILTER;
so2->so_rcv.sb_flags &= ~SB_UPCALL;
soisconnected(so2);
break;
}
/* If nothing was nudged out of the accept filter, bail
* out; otherwise proceed allocating the socket. */
if (so2 == NULL) {
return NULL;
}
}
}
if ((head->so_options & SO_ACCEPTFILTER) != 0) {
soready = false;
}
soqueue = soready ? 1 : 0;
if ((so = soget(false)) == NULL) {
return NULL;
}
so->so_type = head->so_type;
so->so_options = head->so_options & ~SO_ACCEPTCONN;
so->so_linger = head->so_linger;
so->so_state = head->so_state | SS_NOFDREF;
so->so_proto = head->so_proto;
so->so_timeo = head->so_timeo;
so->so_pgid = head->so_pgid;
so->so_send = head->so_send;
so->so_receive = head->so_receive;
so->so_uidinfo = head->so_uidinfo;
so->so_egid = head->so_egid;
so->so_cpid = head->so_cpid;
/*
* Share the lock with the listening-socket, it may get unshared
* once the connection is complete.
*
* so_lock is stable while we hold the socket locked, so no
* need for atomic_load_* here.
*/
mutex_obj_hold(head->so_lock);
so->so_lock = head->so_lock;
/*
* Reserve the space for socket buffers.
*/
#ifdef MBUFTRACE
so->so_mowner = head->so_mowner;
so->so_rcv.sb_mowner = head->so_rcv.sb_mowner;
so->so_snd.sb_mowner = head->so_snd.sb_mowner;
#endif
if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
goto out;
}
so->so_snd.sb_lowat = head->so_snd.sb_lowat;
so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
so->so_snd.sb_timeo = head->so_snd.sb_timeo;
so->so_rcv.sb_flags |= head->so_rcv.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
so->so_snd.sb_flags |= head->so_snd.sb_flags & (SB_AUTOSIZE | SB_ASYNC);
/*
* Finally, perform the protocol attach. Note: a new socket
* lock may be assigned at this point (if so, it will be held).
*/
error = (*so->so_proto->pr_usrreqs->pr_attach)(so, 0);
if (error) {
out:
KASSERT(solocked(so));
KASSERT(so->so_accf == NULL);
soput(so);
/* Note: the listening socket shall stay locked. */
KASSERT(solocked(head));
return NULL;
}
KASSERT(solocked2(head, so));
/*
* Insert into the queue. If ready, update the connection status
* and wake up any waiters, e.g. processes blocking on accept().
*/
soqinsque(head, so, soqueue);
if (soready) {
so->so_state |= SS_ISCONNECTED;
sorwakeup(head);
cv_broadcast(&head->so_cv);
}
return so;
}
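/*
 * Illustrative sketch (not compiled) of how a connection-oriented
 * protocol's input path might use sonewconn()/soisconnected(); the
 * function is hypothetical and elides the actual handshake.
 */
#if 0
static void
example_passive_open(struct socket *head)
{
	struct socket *so;

	KASSERT(solocked(head));

	/* Not yet ready: the new socket is queued on so_q0. */
	so = sonewconn(head, false);
	if (so == NULL)
		return;		/* queue overflow or no resources */

	/* ... protocol completes its handshake on 'so' ... */

	/* Handshake done: move to so_q and wake accept(2) sleepers. */
	soisconnected(so);
}
#endif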
struct socket *
soget(bool waitok)
{
struct socket *so;
so = pool_cache_get(socket_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
if (__predict_false(so == NULL))
return (NULL);
memset(so, 0, sizeof(*so));
TAILQ_INIT(&so->so_q0);
TAILQ_INIT(&so->so_q);
cv_init(&so->so_cv, "socket");
cv_init(&so->so_rcv.sb_cv, "netio");
cv_init(&so->so_snd.sb_cv, "netio");
selinit(&so->so_rcv.sb_sel);
selinit(&so->so_snd.sb_sel);
so->so_rcv.sb_so = so;
so->so_snd.sb_so = so;
return so;
}
void
soput(struct socket *so)
{
KASSERT(!cv_has_waiters(&so->so_cv));
KASSERT(!cv_has_waiters(&so->so_rcv.sb_cv));
KASSERT(!cv_has_waiters(&so->so_snd.sb_cv));
seldestroy(&so->so_rcv.sb_sel);
seldestroy(&so->so_snd.sb_sel);
mutex_obj_free(so->so_lock);
cv_destroy(&so->so_cv);
cv_destroy(&so->so_rcv.sb_cv);
cv_destroy(&so->so_snd.sb_cv);
pool_cache_put(socket_cache, so);
}
/*
* soqinsque: insert socket of a new connection into the specified
* accept queue of the listening socket (head).
*
* q = 0: queue of partial connections
* q = 1: queue of incoming connections
*/
void
soqinsque(struct socket *head, struct socket *so, int q)
{
KASSERT(q == 0 || q == 1);
KASSERT(solocked2(head, so));
KASSERT(so->so_onq == NULL);
KASSERT(so->so_head == NULL);
so->so_head = head;
if (q == 0) {
head->so_q0len++;
so->so_onq = &head->so_q0;
} else {
head->so_qlen++;
so->so_onq = &head->so_q;
}
TAILQ_INSERT_TAIL(so->so_onq, so, so_qe);
}
/*
* soqremque: remove socket from the specified queue.
*
* => Returns true if socket was removed from the specified queue.
* => False if socket was not removed (because it was in other queue).
*/
bool
soqremque(struct socket *so, int q)
{
struct socket *head = so->so_head;
KASSERT(q == 0 || q == 1);
KASSERT(solocked(so));
KASSERT(so->so_onq != NULL);
KASSERT(head != NULL);
if (q == 0) {
if (so->so_onq != &head->so_q0)
return false;
head->so_q0len--;
} else {
if (so->so_onq != &head->so_q)
return false;
head->so_qlen--;
}
KASSERT(solocked2(so, head));
TAILQ_REMOVE(so->so_onq, so, so_qe);
so->so_onq = NULL;
so->so_head = NULL;
return true;
}
/*
* socantsendmore: indicates that no more data will be sent on the
* socket; it would normally be applied to a socket when the user
* informs the system that no more data is to be sent, by the protocol
* code (in the case of pr_shutdown()).
*/
void
socantsendmore(struct socket *so)
{
KASSERT(solocked(so));
so->so_state |= SS_CANTSENDMORE;
sowwakeup(so);
}
/*
* socantrcvmore(): indicates that no more data will be received and
* will normally be applied to the socket by a protocol when it detects
* that the peer will send no more data. Data queued for reading in
* the socket may yet be read.
*/
void
socantrcvmore(struct socket *so)
{
KASSERT(solocked(so));
so->so_state |= SS_CANTRCVMORE;
sorwakeup(so);
}
/*
* soroverflow(): indicates that data was attempted to be sent
* but the receiving buffer overflowed.
*/
void
soroverflow(struct socket *so)
{
KASSERT(solocked(so));
so->so_rcv.sb_overflowed++;
if (so->so_options & SO_RERROR) {
so->so_rerror = ENOBUFS;
sorwakeup(so);
}
}
/*
* Wait for data to arrive at/drain from a socket buffer.
*/
int
sbwait(struct sockbuf *sb)
{
struct socket *so;
kmutex_t *lock;
int error;
so = sb->sb_so;
KASSERT(solocked(so));
sb->sb_flags |= SB_NOTIFY;
lock = so->so_lock;
if ((sb->sb_flags & SB_NOINTR) != 0)
error = cv_timedwait(&sb->sb_cv, lock, sb->sb_timeo);
else
error = cv_timedwait_sig(&sb->sb_cv, lock, sb->sb_timeo);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
return error;
}
/*
* Wakeup processes waiting on a socket buffer.
* Do asynchronous notification via SIGIO
* if the socket buffer has the SB_ASYNC flag set.
*/
void
sowakeup(struct socket *so, struct sockbuf *sb, int code)
{
int band;
KASSERT(solocked(so));
KASSERT(sb->sb_so == so);
switch (code) {
case POLL_IN:
band = POLLIN|POLLRDNORM;
break;
case POLL_OUT:
band = POLLOUT|POLLWRNORM;
break;
case POLL_HUP:
band = POLLHUP;
break;
default:
band = 0;
#ifdef DIAGNOSTIC
printf("bad siginfo code %d in socket notification.\n", code);
#endif
break;
}
sb->sb_flags &= ~SB_NOTIFY;
selnotify(&sb->sb_sel, band, NOTE_SUBMIT);
cv_broadcast(&sb->sb_cv);
if (sb->sb_flags & SB_ASYNC)
fownsignal(so->so_pgid, SIGIO, code, band, so);
if (sb->sb_flags & SB_UPCALL)
(*so->so_upcall)(so, so->so_upcallarg, band, M_DONTWAIT);
}
/*
* Reset a socket's lock pointer. Wake all threads waiting on the
* socket's condition variables so that they can restart their waits
* using the new lock. The existing lock must be held.
*
* Caller must have issued membar_release before this.
*/
void
solockreset(struct socket *so, kmutex_t *lock)
{
KASSERT(solocked(so));
so->so_lock = lock;
cv_broadcast(&so->so_snd.sb_cv);
cv_broadcast(&so->so_rcv.sb_cv);
cv_broadcast(&so->so_cv);
}
/*
* Socket buffer (struct sockbuf) utility routines.
*
* Each socket contains two socket buffers: one for sending data and
* one for receiving data. Each buffer contains a queue of mbufs,
* information about the number of mbufs and amount of data in the
* queue, and other fields allowing poll() statements and notification
* on data availability to be implemented.
*
* Data stored in a socket buffer is maintained as a list of records.
* Each record is a list of mbufs chained together with the m_next
* field. Records are chained together with the m_nextpkt field. The upper
* level routine soreceive() expects the following conventions to be
* observed when placing information in the receive buffer:
*
* 1. If the protocol requires each message be preceded by the sender's
* name, then a record containing that name must be present before
* any associated data (mbuf's must be of type MT_SONAME).
* 2. If the protocol supports the exchange of ``access rights'' (really
* just additional data associated with the message), and there are
* ``rights'' to be received, then a record containing this data
* should be present (mbuf's must be of type MT_CONTROL).
* 3. If a name or rights record exists, then it must be followed by
* a data record, perhaps of zero length.
*
* Before using a new socket structure it is first necessary to reserve
* buffer space to the socket, by calling sbreserve(). This should commit
* some of the available buffer space in the system buffer pool for the
* socket (currently, it does nothing but enforce limits). The space
* should be released by calling sbrelease() when the socket is destroyed.
*/
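/*
 * Illustrative sketch (not compiled) of convention (1) above: a datagram
 * protocol delivering a packet would normally let sbappendaddr() build
 * the MT_SONAME record for it; the function name is hypothetical.
 */
#if 0
static void
example_deliver_datagram(struct socket *so, const struct sockaddr *from,
    struct mbuf *data)
{
	KASSERT(solocked(so));

	/* Prepends an MT_SONAME mbuf holding 'from', then appends 'data'. */
	if (sbappendaddr(&so->so_rcv, from, data, NULL) == 0) {
		/* No buffer space or no mbufs: drop the datagram. */
		m_freem(data);
		soroverflow(so);
	} else
		sorwakeup(so);
}
#endif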
int
sb_max_set(u_long new_sbmax)
{
int s;
if (new_sbmax < (16 * 1024))
return (EINVAL);
s = splsoftnet();
sb_max = new_sbmax;
sb_max_adj = (u_quad_t)new_sbmax * MCLBYTES / (MSIZE + MCLBYTES);
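/*
 * Illustration (exact values are port-dependent): assuming MSIZE is 256
 * and MCLBYTES is 2048, sb_max_adj = sb_max * 2048 / 2304, i.e. roughly
 * 8/9 of sb_max, so a 256 KB sb_max allows about 233 KB of actual data
 * once per-mbuf overhead is accounted for.
 */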
splx(s);
return (0);
}
int
soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
{
KASSERT(so->so_pcb == NULL || solocked(so));
/*
* there's at least one application (a configure script of screen)
* which expects a fifo to be writable even if it has "some" bytes
* in its buffer.
* so we want to make sure (hiwat - lowat) >= (some bytes).
*
* PIPE_BUF here is an arbitrary value chosen as (some bytes) above.
* we expect it's large enough for such applications.
*/
u_long lowat = MAX(sock_loan_thresh, MCLBYTES);
u_long hiwat = lowat + PIPE_BUF;
if (sndcc < hiwat)
sndcc = hiwat;
if (sbreserve(&so->so_snd, sndcc, so) == 0)
goto bad;
if (sbreserve(&so->so_rcv, rcvcc, so) == 0)
goto bad2;
if (so->so_rcv.sb_lowat == 0)
so->so_rcv.sb_lowat = 1;
if (so->so_snd.sb_lowat == 0)
so->so_snd.sb_lowat = lowat;
if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
return (0);
bad2:
sbrelease(&so->so_snd, so);
bad:
return (ENOBUFS);
}
/*
* Allot mbufs to a sockbuf.
* Attempt to scale mbmax so that mbcnt doesn't become limiting
* if buffering efficiency is near the normal case.
*/
int
sbreserve(struct sockbuf *sb, u_long cc, struct socket *so)
{
struct lwp *l = curlwp; /* XXX */
rlim_t maxcc;
struct uidinfo *uidinfo;
KASSERT(so->so_pcb == NULL || solocked(so));
KASSERT(sb->sb_so == so);
KASSERT(sb_max_adj != 0);
if (cc == 0 || cc > sb_max_adj)
return (0);
maxcc = l->l_proc->p_rlimit[RLIMIT_SBSIZE].rlim_cur;
uidinfo = so->so_uidinfo;
if (!chgsbsize(uidinfo, &sb->sb_hiwat, cc, maxcc))
return 0;
sb->sb_mbmax = uimin(cc * 2, sb_max);
if (sb->sb_lowat > sb->sb_hiwat)
sb->sb_lowat = sb->sb_hiwat;
return (1);
}
/*
* Free mbufs held by a socket, and reserved mbuf space. We do not assert
* that the socket is held locked here: see sorflush().
*/
void
sbrelease(struct sockbuf *sb, struct socket *so)
{
KASSERT(sb->sb_so == so);
sbflush(sb);
(void)chgsbsize(so->so_uidinfo, &sb->sb_hiwat, 0, RLIM_INFINITY);
sb->sb_mbmax = 0;
}
/*
* Routines to add and remove
* data from an mbuf queue.
*
* The routines sbappend() or sbappendrecord() are normally called to
* append new mbufs to a socket buffer, after checking that adequate
* space is available, comparing the function sbspace() with the amount
* of data to be added. sbappendrecord() differs from sbappend() in
* that data supplied is treated as the beginning of a new record.
* To place a sender's address, optional access rights, and data in a
* socket receive buffer, sbappendaddr() should be used. To place
* access rights and data in a socket receive buffer, sbappendrights()
* should be used. In either case, the new data begins a new record.
* Note that unlike sbappend() and sbappendrecord(), these routines check
* for the caller that there will be enough space to store the data.
* Each fails if there is not enough space, or if it cannot find mbufs
* to store additional information in.
*
* Reliable protocols may use the socket send buffer to hold data
* awaiting acknowledgement. Data is normally copied from a socket
* send buffer in a protocol with m_copym for output to a peer,
* and then removing the data from the socket buffer with sbdrop()
* or sbdroprecord() when the data is acknowledged by the peer.
*/
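/*
 * Illustrative sketch (not compiled) of the send-buffer pattern described
 * above; offsets, lengths and the output step are placeholders, and real
 * protocols (e.g. TCP) track sequence numbers and do much more.
 */
#if 0
static void
example_sendbuf_cycle(struct socket *so, int off, int len, int acked)
{
	struct mbuf *m;

	KASSERT(solocked(so));

	/* Copy (do not remove) data held for possible retransmission. */
	m = m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT);
	if (m != NULL) {
		/* ... hand 'm' to the protocol's output routine ... */
		m_freem(m);	/* placeholder for the real output call */
	}

	/* When the peer acknowledges 'acked' bytes, release them. */
	sbdrop(&so->so_snd, acked);
	sowwakeup(so);
}
#endif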
#ifdef SOCKBUF_DEBUG
void
sblastrecordchk(struct sockbuf *sb, const char *where)
{
struct mbuf *m = sb->sb_mb;
KASSERT(solocked(sb->sb_so));
while (m && m->m_nextpkt)
m = m->m_nextpkt;
if (m != sb->sb_lastrecord) {
printf("sblastrecordchk: sb_mb %p sb_lastrecord %p last %p\n",
sb->sb_mb, sb->sb_lastrecord, m);
printf("packet chain:\n");
for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
printf("\t%p\n", m);
panic("sblastrecordchk from %s", where);
}
}
void
sblastmbufchk(struct sockbuf *sb, const char *where)
{
struct mbuf *m = sb->sb_mb;
struct mbuf *n;
KASSERT(solocked(sb->sb_so));
while (m && m->m_nextpkt)
m = m->m_nextpkt;
while (m && m->m_next)
m = m->m_next;
if (m != sb->sb_mbtail) {
printf("sblastmbufchk: sb_mb %p sb_mbtail %p last %p\n",
sb->sb_mb, sb->sb_mbtail, m);
printf("packet tree:\n");
for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
printf("\t");
for (n = m; n != NULL; n = n->m_next)
printf("%p ", n);
printf("\n");
}
panic("sblastmbufchk from %s", where);
}
}
#endif /* SOCKBUF_DEBUG */
/*
* Link a chain of records onto a socket buffer
*/
#define SBLINKRECORDCHAIN(sb, m0, mlast) \
do { \
if ((sb)->sb_lastrecord != NULL) \
(sb)->sb_lastrecord->m_nextpkt = (m0); \
else \
(sb)->sb_mb = (m0); \
(sb)->sb_lastrecord = (mlast); \
} while (/*CONSTCOND*/0)
#define SBLINKRECORD(sb, m0) \
SBLINKRECORDCHAIN(sb, m0, m0)
/*
* Append mbuf chain m to the last record in the
* socket buffer sb. The additional space associated
* the mbuf chain is recorded in sb. Empty mbufs are
* discarded and mbufs are compacted where possible.
*/
void
sbappend(struct sockbuf *sb, struct mbuf *m)
{
struct mbuf *n;
KASSERT(solocked(sb->sb_so));
if (m == NULL)
return;
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
SBLASTRECORDCHK(sb, "sbappend 1");
if ((n = sb->sb_lastrecord) != NULL) {
/*
* XXX Would like to simply use sb_mbtail here, but
* XXX I need to verify that I won't miss an EOR that
* XXX way.
*/
do {
if (n->m_flags & M_EOR) {
sbappendrecord(sb, m); /* XXXXXX!!!! */
return;
}
} while (n->m_next && (n = n->m_next));
} else {
/*
* If this is the first record in the socket buffer, it's
* also the last record.
*/
sb->sb_lastrecord = m;
}
sbcompress(sb, m, n);
SBLASTRECORDCHK(sb, "sbappend 2");
}
/*
* This version of sbappend() should only be used when the caller
* absolutely knows that there will never be more than one record
* in the socket buffer, that is, a stream protocol (such as TCP).
*/
void
sbappendstream(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
KDASSERT(m->m_nextpkt == NULL);
KASSERT(sb->sb_mb == sb->sb_lastrecord);
SBLASTMBUFCHK(sb, __func__);
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
sbcompress(sb, m, sb->sb_mbtail);
sb->sb_lastrecord = sb->sb_mb;
SBLASTRECORDCHK(sb, __func__);
}
#ifdef SOCKBUF_DEBUG
void
sbcheck(struct sockbuf *sb)
{
struct mbuf *m, *m2;
u_long len, mbcnt;
KASSERT(solocked(sb->sb_so));
len = 0;
mbcnt = 0;
for (m = sb->sb_mb; m; m = m->m_nextpkt) {
for (m2 = m; m2 != NULL; m2 = m2->m_next) {
len += m2->m_len;
mbcnt += MSIZE;
if (m2->m_flags & M_EXT)
mbcnt += m2->m_ext.ext_size;
if (m2->m_nextpkt != NULL)
panic("sbcheck nextpkt");
}
}
if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
printf("cc %lu != %lu || mbcnt %lu != %lu\n", len, sb->sb_cc,
mbcnt, sb->sb_mbcnt);
panic("sbcheck");
}
}
#endif
/*
* As above, except the mbuf chain
* begins a new record.
*/
void
sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
{
struct mbuf *m;
KASSERT(solocked(sb->sb_so));
if (m0 == NULL)
return;
#ifdef MBUFTRACE
m_claimm(m0, sb->sb_mowner);
#endif
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(sb, m0);
SBLASTRECORDCHK(sb, "sbappendrecord 1");
SBLINKRECORD(sb, m0);
m = m0->m_next;
m0->m_next = 0;
if (m && (m0->m_flags & M_EOR)) {
m0->m_flags &= ~M_EOR;
m->m_flags |= M_EOR;
}
sbcompress(sb, m, m0);
SBLASTRECORDCHK(sb, "sbappendrecord 2");
}
/*
* As above except that OOB data
* is inserted at the beginning of the sockbuf,
* but after any other OOB data.
*/
void
sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
{
struct mbuf *m, **mp;
KASSERT(solocked(sb->sb_so));
if (m0 == NULL)
return;
SBLASTRECORDCHK(sb, "sbinsertoob 1");
for (mp = &sb->sb_mb; (m = *mp) != NULL; mp = &((*mp)->m_nextpkt)) {
again:
switch (m->m_type) {
case MT_OOBDATA:
continue; /* WANT next train */
case MT_CONTROL:
if ((m = m->m_next) != NULL)
goto again; /* inspect THIS train further */
}
break;
}
/*
* Put the first mbuf on the queue.
* Note this permits zero length records.
*/
sballoc(sb, m0);
m0->m_nextpkt = *mp;
if (*mp == NULL) {
/* m0 is actually the new tail */
sb->sb_lastrecord = m0;
}
*mp = m0;
m = m0->m_next;
m0->m_next = 0;
if (m && (m0->m_flags & M_EOR)) {
m0->m_flags &= ~M_EOR;
m->m_flags |= M_EOR;
}
sbcompress(sb, m, m0);
SBLASTRECORDCHK(sb, "sbinsertoob 2");
}
/*
* Append address and data, and optionally, control (ancillary) data
* to the receive queue of a socket. If present,
* m0 must include a packet header with total length.
* Returns 0 if no space in sockbuf or insufficient mbufs.
*/
int
sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa, struct mbuf *m0,
struct mbuf *control)
{
struct mbuf *m, *n, *nlast;
int space, len;
KASSERT(solocked(sb->sb_so));
space = asa->sa_len;
if (m0 != NULL) {
if ((m0->m_flags & M_PKTHDR) == 0)
panic("sbappendaddr"); space += m0->m_pkthdr.len;
#ifdef MBUFTRACE
m_claimm(m0, sb->sb_mowner);
#endif
}
for (n = control; n; n = n->m_next) {
space += n->m_len;
MCLAIM(n, sb->sb_mowner);
if (n->m_next == NULL) /* keep pointer to last control buf */
break;
}
if (space > sbspace(sb))
return (0);
m = m_get(M_DONTWAIT, MT_SONAME);
if (m == NULL)
return (0);
MCLAIM(m, sb->sb_mowner);
/*
* XXX avoid 'comparison always true' warning which isn't easily
* avoided.
*/
len = asa->sa_len;
if (len > MLEN) {
MEXTMALLOC(m, asa->sa_len, M_NOWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return (0);
}
}
m->m_len = asa->sa_len;
memcpy(mtod(m, void *), asa, asa->sa_len);
if (n)
n->m_next = m0; /* concatenate data to control */
else
control = m0;
m->m_next = control;
SBLASTRECORDCHK(sb, "sbappendaddr 1");
for (n = m; n->m_next != NULL; n = n->m_next)
sballoc(sb, n);
sballoc(sb, n);
nlast = n;
SBLINKRECORD(sb, m);
sb->sb_mbtail = nlast;
SBLASTMBUFCHK(sb, "sbappendaddr");
SBLASTRECORDCHK(sb, "sbappendaddr 2");
return (1);
}
/*
* Helper for sbappendchainaddr: prepend a struct sockaddr* to
* an mbuf chain.
*/
static inline struct mbuf *
m_prepend_sockaddr(struct sockbuf *sb, struct mbuf *m0,
const struct sockaddr *asa)
{
struct mbuf *m;
const int salen = asa->sa_len;
KASSERT(solocked(sb->sb_so));
/* only the first in each chain need be a pkthdr */
m = m_gethdr(M_DONTWAIT, MT_SONAME);
if (m == NULL)
return NULL;
MCLAIM(m, sb->sb_mowner);
#ifdef notyet
if (salen > MHLEN) {
MEXTMALLOC(m, salen, M_NOWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
#else
KASSERT(salen <= MHLEN);
#endif
m->m_len = salen;
memcpy(mtod(m, void *), asa, salen);
m->m_next = m0;
m->m_pkthdr.len = salen + m0->m_pkthdr.len;
return m;
}
int
sbappendaddrchain(struct sockbuf *sb, const struct sockaddr *asa,
struct mbuf *m0, int sbprio)
{
struct mbuf *m, *n, *n0, *nlast;
int error;
KASSERT(solocked(sb->sb_so));
/*
* XXX sbprio reserved for encoding priority of this request:
* SB_PRIO_NONE --> honour normal sb limits
* SB_PRIO_ONESHOT_OVERFLOW --> if socket has any space,
* take whole chain. Intended for large requests
* that should be delivered atomically (all, or none).
* SB_PRIO_OVERDRAFT -- allow a small (2*MLEN) overflow
* over normal socket limits, for messages indicating
* buffer overflow in earlier normal/lower-priority messages
* SB_PRIO_BESTEFFORT --> ignore limits entirely.
* Intended for kernel-generated messages only.
* Up to generator to avoid total mbuf resource exhaustion.
*/
(void)sbprio;
if (m0 && (m0->m_flags & M_PKTHDR) == 0)
panic("sbappendaddrchain");
#ifdef notyet
space = sbspace(sb);
/*
* Enforce SB_PRIO_* limits as described above.
*/
#endif
n0 = NULL;
nlast = NULL;
for (m = m0; m; m = m->m_nextpkt) {
struct mbuf *np;
#ifdef MBUFTRACE
m_claimm(m, sb->sb_mowner);
#endif
/* Prepend sockaddr to this record (m) of input chain m0 */
n = m_prepend_sockaddr(sb, m, asa);
if (n == NULL) {
error = ENOBUFS;
goto bad;
}
/* Append record (asa+m) to end of new chain n0 */
if (n0 == NULL) {
n0 = n;
} else {
nlast->m_nextpkt = n;
}
/* Keep track of last record on new chain */
nlast = n;
for (np = n; np; np = np->m_next)
sballoc(sb, np);
}
SBLASTRECORDCHK(sb, "sbappendaddrchain 1");
/* Drop the entire chain of (asa+m) records onto the socket */
SBLINKRECORDCHAIN(sb, n0, nlast);
SBLASTRECORDCHK(sb, "sbappendaddrchain 2");
for (m = nlast; m->m_next; m = m->m_next)
;
sb->sb_mbtail = m;
SBLASTMBUFCHK(sb, "sbappendaddrchain");
return (1);
bad:
/*
* On error, free the prepended addresses. For consistency
* with sbappendaddr(), leave it to our caller to free
* the input record chain passed to us as m0.
*/
while ((n = n0) != NULL) {
struct mbuf *np;
/* Undo the sballoc() of this record */
for (np = n; np; np = np->m_next)
sbfree(sb, np);
n0 = n->m_nextpkt; /* iterate at next prepended address */
np = m_free(n); /* free prepended address (not data) */
}
return error;
}
int
sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
{
struct mbuf *m, *mlast, *n;
int space;
KASSERT(solocked(sb->sb_so));
space = 0;
if (control == NULL)
panic("sbappendcontrol");
for (m = control; ; m = m->m_next) {
space += m->m_len;
MCLAIM(m, sb->sb_mowner);
if (m->m_next == NULL)
break;
}
n = m; /* save pointer to last control buffer */
for (m = m0; m; m = m->m_next) {
MCLAIM(m, sb->sb_mowner);
space += m->m_len;
}
if (space > sbspace(sb))
return (0);
n->m_next = m0; /* concatenate data to control */
SBLASTRECORDCHK(sb, "sbappendcontrol 1");
for (m = control; m->m_next != NULL; m = m->m_next)
sballoc(sb, m);
sballoc(sb, m);
mlast = m;
SBLINKRECORD(sb, control);
sb->sb_mbtail = mlast;
SBLASTMBUFCHK(sb, "sbappendcontrol");
SBLASTRECORDCHK(sb, "sbappendcontrol 2");
return (1);
}
/*
* Compress mbuf chain m into the socket
* buffer sb following mbuf n. If n
* is null, the buffer is presumed empty.
*/
void
sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
{
int eor;
struct mbuf *o;
KASSERT(solocked(sb->sb_so));
eor = 0;
while (m) {
eor |= m->m_flags & M_EOR;
if (m->m_len == 0 && (eor == 0 || (((o = m->m_next) || (o = n)) &&
o->m_type == m->m_type))) {
if (sb->sb_lastrecord == m)
sb->sb_lastrecord = m->m_next;
m = m_free(m);
continue;
}
if (n && (n->m_flags & M_EOR) == 0 &&
/* M_TRAILINGSPACE() checks buffer writeability */
m->m_len <= MCLBYTES / 4 && /* XXX Don't copy too much */
m->m_len <= M_TRAILINGSPACE(n) &&
n->m_type == m->m_type) {
memcpy(mtod(n, char *) + n->m_len, mtod(m, void *),
(unsigned)m->m_len);
n->m_len += m->m_len;
sb->sb_cc += m->m_len;
m = m_free(m);
continue;
}
if (n)
n->m_next = m;
else
sb->sb_mb = m;
sb->sb_mbtail = m;
sballoc(sb, m);
n = m;
m->m_flags &= ~M_EOR;
m = m->m_next;
n->m_next = 0;
}
if (eor) {
if (n)
n->m_flags |= eor;
else
printf("semi-panic: sbcompress\n");
}
SBLASTMBUFCHK(sb, __func__);
}
/*
* Free all mbufs in a sockbuf.
* Check that all resources are reclaimed.
*/
void
sbflush(struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
KASSERT((sb->sb_flags & SB_LOCK) == 0);
while (sb->sb_mbcnt)
sbdrop(sb, (int)sb->sb_cc);
KASSERT(sb->sb_cc == 0);
KASSERT(sb->sb_mb == NULL);
KASSERT(sb->sb_mbtail == NULL);
KASSERT(sb->sb_lastrecord == NULL);
}
/*
* Drop data from (the front of) a sockbuf.
*/
void
sbdrop(struct sockbuf *sb, int len)
{
struct mbuf *m, *next;
KASSERT(solocked(sb->sb_so));
next = (m = sb->sb_mb) ? m->m_nextpkt : NULL;
while (len > 0) {
if (m == NULL) {
if (next == NULL)
panic("sbdrop(%p,%d): cc=%lu",
sb, len, sb->sb_cc);
m = next;
next = m->m_nextpkt;
continue;
}
if (m->m_len > len) {
m->m_len -= len;
m->m_data += len;
sb->sb_cc -= len;
break;
}
len -= m->m_len;
sbfree(sb, m);
m = m_free(m);
}
while (m && m->m_len == 0) {
sbfree(sb, m);
m = m_free(m);
}
if (m) {
sb->sb_mb = m;
m->m_nextpkt = next;
} else
sb->sb_mb = next;
/*
* First part is an inline SB_EMPTY_FIXUP(). Second part
* makes sure sb_lastrecord is up-to-date if we dropped
* part of the last record.
*/
m = sb->sb_mb;
if (m == NULL) {
sb->sb_mbtail = NULL;
sb->sb_lastrecord = NULL;
} else if (m->m_nextpkt == NULL)
sb->sb_lastrecord = m;
}
/*
* Drop a record off the front of a sockbuf
* and move the next record to the front.
*/
void
sbdroprecord(struct sockbuf *sb)
{
struct mbuf *m, *mn;
KASSERT(solocked(sb->sb_so));
m = sb->sb_mb;
if (m) {
sb->sb_mb = m->m_nextpkt;
do {
sbfree(sb, m);
mn = m_free(m);
} while ((m = mn) != NULL);
}
SB_EMPTY_FIXUP(sb);
}
/*
* Create a "control" mbuf containing the specified data
* with the specified type for presentation on a socket buffer.
*/
struct mbuf *
sbcreatecontrol1(void **p, int size, int type, int level, int flags)
{
struct cmsghdr *cp;
struct mbuf *m;
int space = CMSG_SPACE(size);
if ((flags & M_DONTWAIT) && space > MCLBYTES) {
printf("%s: message too large %d\n", __func__, space);
return NULL;
}
if ((m = m_get(flags, MT_CONTROL)) == NULL)
return NULL;
if (space > MLEN) {
if (space > MCLBYTES)
MEXTMALLOC(m, space, M_WAITOK);
else
MCLGET(m, flags);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
return NULL;
}
}
cp = mtod(m, struct cmsghdr *);
*p = CMSG_DATA(cp);
m->m_len = space;
cp->cmsg_len = CMSG_LEN(size);
cp->cmsg_level = level;
cp->cmsg_type = type;
memset(cp + 1, 0, CMSG_LEN(0) - sizeof(*cp));
memset((uint8_t *)*p + size, 0, CMSG_ALIGN(size) - size);
return m;
}
struct mbuf *
sbcreatecontrol(void *p, int size, int type, int level)
{
struct mbuf *m;
void *v;
m = sbcreatecontrol1(&v, size, type, level, M_DONTWAIT);
if (m == NULL)
return NULL;
memcpy(v, p, size);
return m;
}
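/*
 * Illustrative sketch (not compiled) of sbcreatecontrol() usage; the
 * level/type pair is only an example (a single descriptor passed as
 * SCM_RIGHTS ancillary data is an int).
 */
#if 0
static struct mbuf *
example_make_control(int fd)
{
	/* Returns NULL if no mbuf (or cluster) could be allocated. */
	return sbcreatecontrol(&fd, sizeof(fd), SCM_RIGHTS, SOL_SOCKET);
}
#endif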
void
solockretry(struct socket *so, kmutex_t *lock)
{
while (lock != atomic_load_relaxed(&so->so_lock)) {
mutex_exit(lock);
lock = atomic_load_consume(&so->so_lock);
mutex_enter(lock);
}
}
bool
solocked(const struct socket *so)
{
/*
* Used only for diagnostic assertions, so so_lock should be
* stable at this point, hence no need for atomic_load_*.
*/
return mutex_owned(so->so_lock);
}
bool
solocked2(const struct socket *so1, const struct socket *so2)
{
const kmutex_t *lock;
/*
* Used only for diagnostic assertions, so so_lock should be
* stable at this point, hence no need for atomic_load_*.
*/
lock = so1->so_lock;
if (lock != so2->so_lock)
return false;
return mutex_owned(lock);
}
/*
* sosetlock: assign a default lock to a new socket.
*/
void
sosetlock(struct socket *so)
{
if (so->so_lock == NULL) {
kmutex_t *lock = softnet_lock;
so->so_lock = lock;
mutex_obj_hold(lock);
mutex_enter(lock);
}
KASSERT(solocked(so));
}
/*
* Set lock on sockbuf sb; sleep if lock is already held.
* Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
* Returns error without lock if sleep is interrupted.
*/
int
sblock(struct sockbuf *sb, int wf)
{
struct socket *so;
kmutex_t *lock;
int error;
KASSERT(solocked(sb->sb_so));
for (;;) {
if (__predict_true((sb->sb_flags & SB_LOCK) == 0)) {
sb->sb_flags |= SB_LOCK;
return 0;
}
if (wf != M_WAITOK)
return EWOULDBLOCK;
so = sb->sb_so;
lock = so->so_lock;
if ((sb->sb_flags & SB_NOINTR) != 0) {
cv_wait(&so->so_cv, lock);
error = 0;
} else
error = cv_wait_sig(&so->so_cv, lock);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
if (error != 0)
return error;
}
}
void
sbunlock(struct sockbuf *sb)
{
struct socket *so;
so = sb->sb_so;
KASSERT(solocked(so));
KASSERT((sb->sb_flags & SB_LOCK) != 0);
sb->sb_flags &= ~SB_LOCK;
cv_broadcast(&so->so_cv);
}
int
sowait(struct socket *so, bool catch_p, int timo)
{
kmutex_t *lock;
int error;
KASSERT(solocked(so));
KASSERT(catch_p || timo != 0);
lock = so->so_lock;
if (catch_p)
error = cv_timedwait_sig(&so->so_cv, lock, timo);
else
error = cv_timedwait(&so->so_cv, lock, timo);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
return error;
}
#ifdef DDB
/*
* Currently, sofindproc() is used only from DDB. It could be used from
* other places by using db_mutex_enter().
*/
static inline int
db_mutex_enter(kmutex_t *mtx)
{
int rv;
if (!db_active) {
mutex_enter(mtx);
rv = 1;
} else
rv = mutex_tryenter(mtx);
return rv;
}
int
sofindproc(struct socket *so, int all, void (*pr)(const char *, ...))
{
proc_t *p;
filedesc_t *fdp;
fdtab_t *dt;
fdfile_t *ff;
file_t *fp = NULL;
int found = 0;
int i, t;
if (so == NULL)
return 0;
t = db_mutex_enter(&proc_lock);
if (!t) {
pr("could not acquire proc_lock mutex\n");
return 0;
}
PROCLIST_FOREACH(p, &allproc) {
if (p->p_stat == SIDL)
continue;
fdp = p->p_fd;
t = db_mutex_enter(&fdp->fd_lock);
if (!t) {
pr("could not acquire fd_lock mutex\n");
continue;
}
dt = atomic_load_consume(&fdp->fd_dt);
for (i = 0; i < dt->dt_nfiles; i++) {
ff = dt->dt_ff[i];
if (ff == NULL)
continue;
fp = atomic_load_consume(&ff->ff_file);
if (fp == NULL)
continue;
t = db_mutex_enter(&fp->f_lock);
if (!t) {
pr("could not acquire f_lock mutex\n");
continue;
}
if ((struct socket *)fp->f_data != so) {
mutex_exit(&fp->f_lock);
continue;
}
found++;
if (pr)
pr("socket %p: owner %s(pid=%d)\n",
so, p->p_comm, p->p_pid);
mutex_exit(&fp->f_lock);
if (all == 0)
break;
}
mutex_exit(&fdp->fd_lock);
if (all == 0 && found != 0)
break;
}
mutex_exit(&proc_lock);
return found;
}
void
socket_print(const char *modif, void (*pr)(const char *, ...))
{
file_t *fp;
struct socket *so;
struct sockbuf *sb_snd, *sb_rcv;
struct mbuf *m_rec, *m;
bool opt_v = false;
bool opt_m = false;
bool opt_a = false;
bool opt_p = false;
int nrecs, nmbufs;
char ch;
const char *family;
while ( (ch = *(modif++)) != '\0') {
switch (ch) {
case 'v':
opt_v = true;
break;
case 'm':
opt_m = true;
break;
case 'a':
opt_a = true;
break;
case 'p':
opt_p = true;
break;
}
}
if (opt_v == false && pr)
(pr)("Ignore empty sockets. use /v to print all.\n");
if (opt_p == true && pr)
(pr)("Don't search owner process.\n");
LIST_FOREACH(fp, &filehead, f_list) {
if (fp->f_type != DTYPE_SOCKET)
continue;
so = (struct socket *)fp->f_data;
if (so == NULL)
continue;
if (so->so_proto->pr_domain->dom_family == AF_INET)
family = "INET";
#ifdef INET6
else if (so->so_proto->pr_domain->dom_family == AF_INET6)
family = "INET6";
#endif
else if (so->so_proto->pr_domain->dom_family == pseudo_AF_KEY)
family = "KEY";
else if (so->so_proto->pr_domain->dom_family == AF_ROUTE)
family = "ROUTE";
else
continue;
sb_snd = &so->so_snd;
sb_rcv = &so->so_rcv;
if (opt_v != true &&
sb_snd->sb_cc == 0 && sb_rcv->sb_cc == 0)
continue;
pr("---SOCKET %p: type %s\n", so, family);
if (opt_p != true)
sofindproc(so, opt_a == true ? 1 : 0, pr);
pr("Send Buffer Bytes: %d [bytes]\n", sb_snd->sb_cc);
pr("Send Buffer mbufs:\n");
m_rec = m = sb_snd->sb_mb;
nrecs = 0;
nmbufs = 0;
while (m_rec) {
nrecs++;
if (opt_m == true)
pr(" mbuf chain %p\n", m_rec);
while (m) {
nmbufs++;
m = m->m_next;
}
m_rec = m = m_rec->m_nextpkt;
}
pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);
pr("Recv Buffer Usage: %d [bytes]\n", sb_rcv->sb_cc);
pr("Recv Buffer mbufs:\n");
m_rec = m = sb_rcv->sb_mb;
nrecs = 0;
nmbufs = 0;
while (m_rec) {
nrecs++;
if (opt_m == true)
pr(" mbuf chain %p\n", m_rec);
while (m) {
nmbufs++;
m = m->m_next;
}
m_rec = m = m_rec->m_nextpkt;
}
pr(" Total %d records, %d mbufs.\n", nrecs, nmbufs);
}
}
#endif /* DDB */
/* $NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $ */
/*
* Copyright (c) 1995, 1996, 1997, 2009 Matthew R. Green
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: NetBSD: vm_swap.c,v 1.52 1997/12/02 13:47:37 pk Exp
* from: Id: uvm_swap.c,v 1.1.2.42 1998/02/02 20:38:06 chuck Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_swap.c,v 1.208 2023/04/09 09:00:56 riastradh Exp $");
#include "opt_uvmhist.h"
#include "opt_compat_netbsd.h"
#include "opt_ddb.h"
#include "opt_vmswap.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/conf.h>
#include <sys/cprng.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/disklabel.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/vmem.h>
#include <sys/blist.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/syscallargs.h>
#include <sys/swap.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/workqueue.h>
#include <uvm/uvm.h>
#include <miscfs/specfs/specdev.h>
#include <crypto/aes/aes.h>
#include <crypto/aes/aes_cbc.h>
/*
* uvm_swap.c: manage configuration and i/o to swap space.
*/
/*
* swap space is managed in the following way:
*
* each swap partition or file is described by a "swapdev" structure.
* each "swapdev" structure contains a "swapent" structure which contains
* information that is passed up to the user (via system calls).
*
* each swap partition is assigned a "priority" (int) which controls
* swap partition usage.
*
* the system maintains a global data structure describing all swap
* partitions/files. there is a sorted LIST of "swappri" structures
* which describe "swapdev"'s at that priority. this LIST is headed
* by the "swap_priority" global var. each "swappri" contains a
* TAILQ of "swapdev" structures at that priority.
*
* locking:
* - swap_syscall_lock (krwlock_t): this lock serializes the swapctl
* system call and prevents the swap priority list from changing
* while we are in the middle of a system call (e.g. SWAP_STATS).
* - uvm_swap_data_lock (kmutex_t): this lock protects all swap data
* structures including the priority list, the swapdev structures,
* and the swapmap arena.
*
* each swap device has the following info:
* - swap device in use (could be disabled, preventing future use)
* - swap enabled (allows new allocations on swap)
* - map info in /dev/drum
* - vnode pointer
* for swap files only:
* - block size
* - max byte count in buffer
* - buffer
*
* userland controls and configures swap with the swapctl(2) system call.
* the sys_swapctl performs the following operations:
* [1] SWAP_NSWAP: returns the number of swap devices currently configured
* [2] SWAP_STATS: given a pointer to an array of swapent structures
* (passed in via "arg") of a size passed in via "misc" ... we load
* the current swap config into the array. The actual work is done
* in the uvm_swap_stats() function.
* [3] SWAP_ON: given a pathname in arg (could be device or file) and a
* priority in "misc", start swapping on it.
* [4] SWAP_OFF: as SWAP_ON, but stops swapping to a device
* [5] SWAP_CTL: changes the priority of a swap device (new priority in
* "misc")
*/
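/*
 * Illustrative userland sketch (not compiled into the kernel) exercising
 * operations [1] and [2] above via swapctl(2); headers and struct swapent
 * are as documented in the swapctl(2) manual page.
 */
#if 0
#include <sys/swap.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
	struct swapent *sep;
	int i, n;

	n = swapctl(SWAP_NSWAP, NULL, 0);	/* [1] number of swap devices */
	if (n <= 0)
		return 0;
	if ((sep = calloc(n, sizeof(*sep))) == NULL)
		return 1;
	n = swapctl(SWAP_STATS, sep, n);	/* [2] load the current config */
	for (i = 0; i < n; i++)
		printf("%s: %d blocks, %d in use, priority %d\n",
		    sep[i].se_path, sep[i].se_nblks, sep[i].se_inuse,
		    sep[i].se_priority);
	free(sep);
	return 0;
}
#endif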
/*
* swapdev: describes a single swap partition/file
*
* note the following should be true:
* swd_inuse <= swd_nblks [number of blocks in use is <= total blocks]
* swd_nblks <= swd_mapsize [because mapsize includes miniroot+disklabel]
*/
struct swapdev {
dev_t swd_dev; /* device id */
int swd_flags; /* flags:inuse/enable/fake */
int swd_priority; /* our priority */
int swd_nblks; /* blocks in this device */
char *swd_path; /* saved pathname of device */
int swd_pathlen; /* length of pathname */
int swd_npages; /* #pages we can use */
int swd_npginuse; /* #pages in use */
int swd_npgbad; /* #pages bad */
int swd_drumoffset; /* page0 offset in drum */
int swd_drumsize; /* #pages in drum */
blist_t swd_blist; /* blist for this swapdev */
struct vnode *swd_vp; /* backing vnode */
TAILQ_ENTRY(swapdev) swd_next; /* priority tailq */
int swd_bsize; /* blocksize (bytes) */
int swd_maxactive; /* max active i/o reqs */
struct bufq_state *swd_tab; /* buffer list */
int swd_active; /* number of active buffers */
volatile uint32_t *swd_encmap; /* bitmap of encrypted slots */
struct aesenc swd_enckey; /* AES key expanded for enc */
struct aesdec swd_deckey; /* AES key expanded for dec */
bool swd_encinit; /* true if keys initialized */
};
/*
* swap device priority entry; the list is kept sorted on `spi_priority'.
*/
struct swappri {
int spi_priority; /* priority */
TAILQ_HEAD(spi_swapdev, swapdev) spi_swapdev;
/* tailq of swapdevs at this priority */
LIST_ENTRY(swappri) spi_swappri; /* global list of pri's */
};
/*
* The following two structures are used to keep track of data transfers
* on swap devices associated with regular files.
* NOTE: this code is more or less a copy of vnd.c; we use the same
* structure names here to ease porting.
*/
struct vndxfer {
struct buf *vx_bp; /* Pointer to parent buffer */
struct swapdev *vx_sdp;
int vx_error;
int vx_pending; /* # of pending aux buffers */
int vx_flags;
#define VX_BUSY 1
#define VX_DEAD 2
};
struct vndbuf {
struct buf vb_buf;
struct vndxfer *vb_xfer;
};
/*
* We keep a pool of vndbuf's and vndxfer structures.
*/
static struct pool vndxfer_pool, vndbuf_pool;
/*
* local variables
*/
static vmem_t *swapmap; /* controls the mapping of /dev/drum */
/* list of all active swap devices [by priority] */
LIST_HEAD(swap_priority, swappri);
static struct swap_priority swap_priority;
/* locks */
static kmutex_t uvm_swap_data_lock __cacheline_aligned;
static krwlock_t swap_syscall_lock;
bool uvm_swap_init_done = false;
/* workqueue and use counter for swap to regular files */
static int sw_reg_count = 0;
static struct workqueue *sw_reg_workqueue;
/* tuneables */
u_int uvm_swapisfull_factor = 99;
#if VMSWAP_DEFAULT_PLAINTEXT
bool uvm_swap_encrypt = false;
#else
bool uvm_swap_encrypt = true;
#endif
/*
* prototypes
*/
static struct swapdev *swapdrum_getsdp(int);
static struct swapdev *swaplist_find(struct vnode *, bool);
static void swaplist_insert(struct swapdev *,
struct swappri *, int);
static void swaplist_trim(void);
static int swap_on(struct lwp *, struct swapdev *);
static int swap_off(struct lwp *, struct swapdev *);
static void sw_reg_strategy(struct swapdev *, struct buf *, int);
static void sw_reg_biodone(struct buf *);
static void sw_reg_iodone(struct work *wk, void *dummy);
static void sw_reg_start(struct swapdev *);
static int uvm_swap_io(struct vm_page **, int, int, int);
static void uvm_swap_genkey(struct swapdev *);
static void uvm_swap_encryptpage(struct swapdev *, void *, int);
static void uvm_swap_decryptpage(struct swapdev *, void *, int);
static size_t
encmap_size(size_t npages)
{
struct swapdev *sdp;
const size_t bytesperword = sizeof(sdp->swd_encmap[0]);
const size_t bitsperword = NBBY * bytesperword;
const size_t nbits = npages; /* one bit for each page */
const size_t nwords = howmany(nbits, bitsperword);
const size_t nbytes = nwords * bytesperword;
return nbytes;
}
/*
* uvm_swap_init: init the swap system data structures and locks
*
* => called at boot time from init_main.c after the filesystems
* are brought up (which happens after uvm_init())
*/
void
uvm_swap_init(void)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLED(pdhist);
/*
* first, init the swap list, its counter, and its lock.
* then get a handle on the vnode for /dev/drum by using
* its dev_t number ("swapdev", from MD conf.c).
*/
LIST_INIT(&swap_priority);
uvmexp.nswapdev = 0;
rw_init(&swap_syscall_lock);
mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
if (bdevvp(swapdev, &swapdev_vp))
panic("%s: can't get vnode for swap device", __func__);
if (vn_lock(swapdev_vp, LK_EXCLUSIVE | LK_RETRY))
panic("%s: can't lock swap device", __func__);
if (VOP_OPEN(swapdev_vp, FREAD | FWRITE, NOCRED))
panic("%s: can't open swap device", __func__);
VOP_UNLOCK(swapdev_vp);
/*
* create swap block resource map to map /dev/drum. the range
* from 1 to INT_MAX allows 2 gigablocks of swap space. note
* that block 0 is reserved (used to indicate an allocation
* failure, or no allocation).
*/
swapmap = vmem_create("swapmap", 1, INT_MAX - 1, 1, NULL, NULL, NULL, 0,
VM_NOSLEEP, IPL_NONE);
if (swapmap == 0) {
panic("%s: vmem_create failed", __func__);
}
pool_init(&vndxfer_pool, sizeof(struct vndxfer), 0, 0, 0, "swp vnx",
NULL, IPL_BIO);
pool_init(&vndbuf_pool, sizeof(struct vndbuf), 0, 0, 0, "swp vnd",
NULL, IPL_BIO);
uvm_swap_init_done = true;
UVMHIST_LOG(pdhist, "<- done", 0, 0, 0, 0);
}
/*
* swaplist functions: functions that operate on the list of swap
* devices on the system.
*/
/*
* swaplist_insert: insert swap device "sdp" into the global list
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
* => caller must provide a newly allocated swappri structure (we will
* FREE it if we don't need it... this is to prevent allocation from
* blocking here while adding swap)
*/
static void
swaplist_insert(struct swapdev *sdp, struct swappri *newspp, int priority)
{
struct swappri *spp, *pspp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
/*
* find entry at or after which to insert the new device.
*/
pspp = NULL;
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
if (priority <= spp->spi_priority)
break;
pspp = spp;
}
/*
* new priority?
*/
if (spp == NULL || spp->spi_priority != priority) {
spp = newspp; /* use newspp! */
UVMHIST_LOG(pdhist, "created new swappri = %jd",
priority, 0, 0, 0);
spp->spi_priority = priority;
TAILQ_INIT(&spp->spi_swapdev);
if (pspp)
LIST_INSERT_AFTER(pspp, spp, spi_swappri);
else
LIST_INSERT_HEAD(&swap_priority, spp, spi_swappri);
} else {
/* we don't need a new priority structure, free it */
kmem_free(newspp, sizeof(*newspp));
}
/*
* priority found (or created). now insert on the priority's
* tailq list and bump the total number of swapdevs.
*/
sdp->swd_priority = priority;
TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev++;
}
/*
* swaplist_find: find and optionally remove a swap device from the
* global list.
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
* => we return the swapdev we found (and removed)
*/
static struct swapdev *
swaplist_find(struct vnode *vp, bool remove)
{
struct swapdev *sdp;
struct swappri *spp;
KASSERT(rw_lock_held(&swap_syscall_lock));
KASSERT(remove ? rw_write_held(&swap_syscall_lock) : 1);
KASSERT(mutex_owned(&uvm_swap_data_lock));
/*
* search the lists for the requested vp
*/
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_vp == vp) {
if (remove) {
TAILQ_REMOVE(&spp->spi_swapdev,
sdp, swd_next);
uvmexp.nswapdev--;
}
return(sdp);
}
}
}
return (NULL);
}
/*
* swaplist_trim: scan priority list for empty priority entries and kill
* them.
*
* => caller must hold both swap_syscall_lock and uvm_swap_data_lock
*/
static void
swaplist_trim(void)
{
struct swappri *spp, *nextspp;
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
LIST_FOREACH_SAFE(spp, &swap_priority, spi_swappri, nextspp) {
if (!TAILQ_EMPTY(&spp->spi_swapdev))
continue;
LIST_REMOVE(spp, spi_swappri);
kmem_free(spp, sizeof(*spp));
}
}
/*
* swapdrum_getsdp: given a page offset in /dev/drum, convert it back
* to the "swapdev" that maps that section of the drum.
*
* => each swapdev takes one big contig chunk of the drum
* => caller must hold uvm_swap_data_lock
*/
static struct swapdev *
swapdrum_getsdp(int pgno)
{
struct swapdev *sdp;
struct swappri *spp;
KASSERT(mutex_owned(&uvm_swap_data_lock));
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_flags & SWF_FAKE)
continue;
if (pgno >= sdp->swd_drumoffset &&
pgno < (sdp->swd_drumoffset + sdp->swd_drumsize)) {
return sdp;
}
}
}
return NULL;
}
/*
* swapdrum_sdp_is: true iff the swap device for pgno is sdp
*
* => for use in positive assertions only; result is not stable
*/
static bool __debugused
swapdrum_sdp_is(int pgno, struct swapdev *sdp)
{
bool result;
mutex_enter(&uvm_swap_data_lock);
result = swapdrum_getsdp(pgno) == sdp;
mutex_exit(&uvm_swap_data_lock);
return result;
}
void
swapsys_lock(krw_t op)
{
rw_enter(&swap_syscall_lock, op);
}
void
swapsys_unlock(void)
{
rw_exit(&swap_syscall_lock);
}
static void
swapent_cvt(struct swapent *se, const struct swapdev *sdp, int inuse)
{
se->se_dev = sdp->swd_dev;
se->se_flags = sdp->swd_flags;
se->se_nblks = sdp->swd_nblks;
se->se_inuse = inuse;
se->se_priority = sdp->swd_priority;
KASSERT(sdp->swd_pathlen < sizeof(se->se_path));
strcpy(se->se_path, sdp->swd_path);
}
int (*uvm_swap_stats13)(const struct sys_swapctl_args *, register_t *) =
(void *)enosys;
int (*uvm_swap_stats50)(const struct sys_swapctl_args *, register_t *) =
(void *)enosys;
/*
* sys_swapctl: main entry point for swapctl(2) system call
* [with two helper functions: swap_on and swap_off]
*/
int
sys_swapctl(struct lwp *l, const struct sys_swapctl_args *uap, register_t *retval)
{
/* {
syscallarg(int) cmd;
syscallarg(void *) arg;
syscallarg(int) misc;
} */
struct vnode *vp;
struct nameidata nd;
struct swappri *spp;
struct swapdev *sdp;
#define SWAP_PATH_MAX (PATH_MAX + 1)
char *userpath;
size_t len = 0;
int error;
int priority;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* we handle the non-priv NSWAP and STATS request first.
*
* SWAP_NSWAP: return number of config'd swap devices
* [can also be obtained with uvmexp sysctl]
*/
if (SCARG(uap, cmd) == SWAP_NSWAP) {
const int nswapdev = uvmexp.nswapdev;
UVMHIST_LOG(pdhist, "<- done SWAP_NSWAP=%jd", nswapdev,
0, 0, 0);
*retval = nswapdev;
return 0;
}
userpath = kmem_alloc(SWAP_PATH_MAX, KM_SLEEP);
/*
* ensure serialized syscall access by grabbing the swap_syscall_lock
*/
rw_enter(&swap_syscall_lock, RW_WRITER);
/*
* SWAP_STATS: get stats on current # of configured swap devs
*
* note that the swap_priority list can't change as long
* as we are holding the swap_syscall_lock. we don't want
* to grab the uvm_swap_data_lock because we may fault&sleep during
* copyout() and we don't want to be holding that lock then!
*/
switch (SCARG(uap, cmd)) {
case SWAP_STATS13:
error = (*uvm_swap_stats13)(uap, retval);
goto out;
case SWAP_STATS50:
error = (*uvm_swap_stats50)(uap, retval);
goto out;
case SWAP_STATS:
error = uvm_swap_stats(SCARG(uap, arg), SCARG(uap, misc),
NULL, sizeof(struct swapent), retval);
UVMHIST_LOG(pdhist, "<- done SWAP_STATS", 0, 0, 0, 0);
goto out;
case SWAP_GETDUMPDEV:
error = copyout(&dumpdev, SCARG(uap, arg), sizeof(dumpdev));
goto out;
default:
break;
}
/*
* all other requests require superuser privs. verify.
*/
if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_SWAPCTL,
0, NULL, NULL, NULL)))
goto out;
if (SCARG(uap, cmd) == SWAP_DUMPOFF) {
/* drop the current dump device */
dumpdev = NODEV;
dumpcdev = NODEV;
cpu_dumpconf();
goto out;
}
/*
* at this point we expect a path name in arg. we will
* use namei() to gain a vnode reference (vref), and lock
* the vnode (VOP_LOCK).
*
* XXX: a NULL arg means use the root vnode pointer (e.g. for
* miniroot)
*/
if (SCARG(uap, arg) == NULL) {
vp = rootvp; /* miniroot */
vref(vp);
if (vn_lock(vp, LK_EXCLUSIVE)) {
vrele(vp);
error = EBUSY;
goto out;
}
if (SCARG(uap, cmd) == SWAP_ON &&
copystr("miniroot", userpath, SWAP_PATH_MAX, &len))
panic("swapctl: miniroot copy failed");
} else {
struct pathbuf *pb;
/*
* This used to allow copying in one extra byte
* (SWAP_PATH_MAX instead of PATH_MAX) for SWAP_ON.
* This was completely pointless because if anyone
* used that extra byte namei would fail with
* ENAMETOOLONG anyway, so I've removed the excess
* logic. - dholland 20100215
*/
error = pathbuf_copyin(SCARG(uap, arg), &pb);
if (error) {
goto out;
}
if (SCARG(uap, cmd) == SWAP_ON) {
/* get a copy of the string */
pathbuf_copystring(pb, userpath, SWAP_PATH_MAX);
len = strlen(userpath) + 1;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
if ((error = namei(&nd))) {
pathbuf_destroy(pb);
goto out;
}
vp = nd.ni_vp;
pathbuf_destroy(pb);
}
/* note: "vp" is referenced and locked */
error = 0; /* assume no error */
switch(SCARG(uap, cmd)) {
case SWAP_DUMPDEV:
if (vp->v_type != VBLK) {
error = ENOTBLK;
break;
}
if (bdevsw_lookup(vp->v_rdev)) {
dumpdev = vp->v_rdev;
dumpcdev = devsw_blk2chr(dumpdev);
} else
dumpdev = NODEV;
cpu_dumpconf();
break;
case SWAP_CTL:
/*
* get new priority, remove old entry (if any) and then
* reinsert it in the correct place. finally, prune out
* any empty priority structures.
*/
priority = SCARG(uap, misc);
spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
mutex_enter(&uvm_swap_data_lock);
if ((sdp = swaplist_find(vp, true)) == NULL) {
error = ENOENT;
} else {
swaplist_insert(sdp, spp, priority);
swaplist_trim();
}
mutex_exit(&uvm_swap_data_lock);
if (error)
kmem_free(spp, sizeof(*spp));
break;
case SWAP_ON:
/*
* check for duplicates. if none found, then insert a
* dummy entry on the list to prevent someone else from
* trying to enable this device while we are working on
* it.
*/
priority = SCARG(uap, misc);
sdp = kmem_zalloc(sizeof(*sdp), KM_SLEEP);
spp = kmem_alloc(sizeof(*spp), KM_SLEEP);
sdp->swd_flags = SWF_FAKE;
sdp->swd_vp = vp;
sdp->swd_dev = (vp->v_type == VBLK) ? vp->v_rdev : NODEV;
bufq_alloc(&sdp->swd_tab, "disksort", BUFQ_SORT_RAWBLOCK);
mutex_enter(&uvm_swap_data_lock);
if (swaplist_find(vp, false) != NULL) {
error = EBUSY;
mutex_exit(&uvm_swap_data_lock);
bufq_free(sdp->swd_tab);
kmem_free(sdp, sizeof(*sdp));
kmem_free(spp, sizeof(*spp));
break;
}
swaplist_insert(sdp, spp, priority);
mutex_exit(&uvm_swap_data_lock);
KASSERT(len > 0);
sdp->swd_pathlen = len;
sdp->swd_path = kmem_alloc(len, KM_SLEEP);
if (copystr(userpath, sdp->swd_path, len, 0) != 0)
panic("swapctl: copystr");
/*
* we've now got a FAKE placeholder in the swap list.
* now attempt to enable swap on it. if we fail, undo
* what we've done and kill the fake entry we just inserted.
* if swap_on is a success, it will clear the SWF_FAKE flag
*/
if ((error = swap_on(l, sdp)) != 0) {
mutex_enter(&uvm_swap_data_lock);
(void) swaplist_find(vp, true); /* kill fake entry */
swaplist_trim();
mutex_exit(&uvm_swap_data_lock);
bufq_free(sdp->swd_tab);
kmem_free(sdp->swd_path, sdp->swd_pathlen);
kmem_free(sdp, sizeof(*sdp));
break;
}
break;
case SWAP_OFF:
mutex_enter(&uvm_swap_data_lock);
if ((sdp = swaplist_find(vp, false)) == NULL) {
mutex_exit(&uvm_swap_data_lock);
error = ENXIO;
break;
}
/*
* If a device isn't in use or enabled, we
* can't stop swapping from it (again).
*/
if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0) {
mutex_exit(&uvm_swap_data_lock);
error = EBUSY;
break;
}
/*
* do the real work.
*/
error = swap_off(l, sdp);
break;
default:
error = EINVAL;
}
/*
* done! release the ref gained by namei() and unlock.
*/
vput(vp);
out:
rw_exit(&swap_syscall_lock);
kmem_free(userpath, SWAP_PATH_MAX);
UVMHIST_LOG(pdhist, "<- done! error=%jd", error, 0, 0, 0);
return (error);
}
/*
* uvm_swap_stats: implements swapctl(SWAP_STATS). The function is kept
* away from sys_swapctl() in order to allow COMPAT_* swapctl()
* emulation to use it directly without going through sys_swapctl().
* The problem with using sys_swapctl() there is that it involves
* copying the swapent array to the stackgap, and this array's size
* is not known at build time. Hence it would not be possible to
* ensure it would fit in the stackgap in any case.
*/
int
uvm_swap_stats(char *ptr, int misc,
void (*f)(void *, const struct swapent *), size_t len,
register_t *retval)
{
struct swappri *spp;
struct swapdev *sdp;
struct swapent sep;
int count = 0;
int error;
KASSERT(len <= sizeof(sep));
if (len == 0)
return ENOSYS;
if (misc < 0)
return EINVAL;
if (misc == 0 || uvmexp.nswapdev == 0)
return 0;
/* Make sure userland cannot exhaust kernel memory */
if ((size_t)misc > (size_t)uvmexp.nswapdev)
misc = uvmexp.nswapdev;
KASSERT(rw_lock_held(&swap_syscall_lock));
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
int inuse;
if (misc-- <= 0)
break;
inuse = btodb((uint64_t)sdp->swd_npginuse <<
PAGE_SHIFT);
memset(&sep, 0, sizeof(sep));
swapent_cvt(&sep, sdp, inuse);
if (f)
(*f)(&sep, &sep);
if ((error = copyout(&sep, ptr, len)) != 0)
return error;
ptr += len;
count++;
}
}
*retval = count;
return 0;
}
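/*
 * Illustrative userland sketch (not part of this file): the usual consumer
 * of the copyout above is swapctl(2) with SWAP_NSWAP followed by SWAP_STATS,
 * roughly as follows (struct swapent field names as documented for
 * swapctl(2)).
 */
#if 0
#include <sys/swap.h>

#include <err.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
        struct swapent *sep;
        int i, n;

        /* how many swap devices are configured? */
        if ((n = swapctl(SWAP_NSWAP, NULL, 0)) == -1)
                err(1, "swapctl(SWAP_NSWAP)");
        if (n == 0)
                return 0;
        if ((sep = calloc(n, sizeof(*sep))) == NULL)
                err(1, "calloc");
        /* fetch one swapent per device; the return value is the count */
        if ((n = swapctl(SWAP_STATS, sep, n)) == -1)
                err(1, "swapctl(SWAP_STATS)");
        for (i = 0; i < n; i++)
                printf("%s: %d of %d blocks in use\n",
                    sep[i].se_path, sep[i].se_inuse, sep[i].se_nblks);
        free(sep);
        return 0;
}
#endif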
/*
* swap_on: attempt to enable a swapdev for swapping. note that the
* swapdev is already on the global list, but disabled (marked
* SWF_FAKE).
*
* => we avoid the start of the disk (to protect disk labels)
* => we also avoid the miniroot, if we are swapping to root.
* => caller should leave uvm_swap_data_lock unlocked, we may lock it
* if needed.
*/
static int
swap_on(struct lwp *l, struct swapdev *sdp)
{
struct vnode *vp;
int error, npages, nblocks, size;
long addr;
vmem_addr_t result;
struct vattr va;
dev_t dev;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* we want to enable swapping on sdp. the swd_vp contains
* the vnode we want (locked and ref'd), and the swd_dev
* contains the dev_t of the file, if it is a block device.
*/
vp = sdp->swd_vp;
dev = sdp->swd_dev;
/*
* open the swap file (mostly useful for block device files to
* let device driver know what is up).
*
* we skip the open/close for root on swap because the root
* has already been opened when root was mounted (mountroot).
*/
if (vp != rootvp) {
if ((error = VOP_OPEN(vp, FREAD|FWRITE, l->l_cred)))
return (error);
}
/* XXX this only works for block devices */
UVMHIST_LOG(pdhist, " dev=%jd, major(dev)=%jd", dev, major(dev), 0, 0);
/*
* we now need to determine the size of the swap area. for
* block specials we can call the d_psize function.
* for normal files, we must stat [get attrs].
*
* we put the result in nblks.
* for normal files, we also want the filesystem block size
* (which we get with statfs).
*/
switch (vp->v_type) {
case VBLK:
if ((nblocks = bdev_size(dev)) == -1) {
error = ENXIO;
goto bad;
}
break;
case VREG:
if ((error = VOP_GETATTR(vp, &va, l->l_cred)))
goto bad;
nblocks = (int)btodb(va.va_size);
sdp->swd_bsize = 1 << vp->v_mount->mnt_fs_bshift;
/*
* limit the max # of outstanding I/O requests we issue
* at any one time. take it easy on NFS servers.
*/
if (vp->v_tag == VT_NFS)
sdp->swd_maxactive = 2; /* XXX */
else
sdp->swd_maxactive = 8; /* XXX */
break;
default:
error = ENXIO;
goto bad;
}
/*
* save nblocks in a safe place and convert to pages.
*/
sdp->swd_nblks = nblocks;
npages = dbtob((uint64_t)nblocks) >> PAGE_SHIFT;
/*
* for block special files, we want to make sure that we leave
* the disklabel and bootblocks alone, so we arrange to skip
* over them (arbitrarily choosing to skip PAGE_SIZE bytes).
* note that because of this the "size" can be less than the
* actual number of blocks on the device.
*/
if (vp->v_type == VBLK) {
/* we use pages 1 to (size - 1) [inclusive] */
size = npages - 1;
addr = 1;
} else {
/* we use pages 0 to (size - 1) [inclusive] */
size = npages;
addr = 0;
}
/*
* make sure we have enough blocks for a reasonable sized swap
* area. we want at least one page.
*/
if (size < 1) {
UVMHIST_LOG(pdhist, " size <= 1!!", 0, 0, 0, 0);
error = EINVAL;
goto bad;
}
UVMHIST_LOG(pdhist, " dev=%#jx: size=%jd addr=%jd", dev, size, addr, 0);
/*
* now we need to allocate an extent to manage this swap device
*/
sdp->swd_blist = blist_create(npages);
/* mark all except the `saved' region free. */
blist_free(sdp->swd_blist, addr, size);
/*
* allocate space for the swap encryption state and mark the
* keys uninitialized so we generate them lazily
*/
sdp->swd_encmap = kmem_zalloc(encmap_size(npages), KM_SLEEP);
sdp->swd_encinit = false;
/*
* if the vnode we are swapping to is the root vnode
* (i.e. we are swapping to the miniroot) then we want
* to make sure we don't overwrite it. do a statfs to
* find its size and skip over it.
*/
if (vp == rootvp) {
struct mount *mp;
struct statvfs *sp;
int rootblocks, rootpages;
mp = rootvnode->v_mount;
sp = &mp->mnt_stat;
rootblocks = sp->f_blocks * btodb(sp->f_frsize);
/*
* XXX: sp->f_blocks isn't the total number of
* blocks in the filesystem, it's the number of
* data blocks. so, our rootblocks almost
* definitely underestimates the total size
* of the filesystem - how badly depends on the
* details of the filesystem type. there isn't
* an obvious way to deal with this cleanly
* and perfectly, so for now we just pad our
* rootblocks estimate with an extra 5 percent.
*/
rootblocks += (rootblocks >> 5) +
(rootblocks >> 6) +
(rootblocks >> 7);
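/*
 * (Worked out: the three shifts add 1/32 + 1/64 + 1/128 of
 * rootblocks, i.e. about 5.5 percent, which is where the "extra
 * 5 percent" above comes from.)
 */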
rootpages = round_page(dbtob(rootblocks)) >> PAGE_SHIFT;
if (rootpages > size)
panic("swap_on: miniroot larger than swap?");
if (rootpages != blist_fill(sdp->swd_blist, addr, rootpages)) {
panic("swap_on: unable to preserve miniroot");
}
size -= rootpages;
printf("Preserved %d pages of miniroot ", rootpages);
printf("leaving %d pages of swap\n", size);
}
/*
* add a ref to vp to reflect usage as a swap device.
*/
vref(vp);
/*
* now add the new swapdev to the drum and enable.
*/
error = vmem_alloc(swapmap, npages, VM_BESTFIT | VM_SLEEP, &result);
if (error != 0)
panic("swapdrum_add");
/*
* If this is the first regular swap create the workqueue.
* => Protected by swap_syscall_lock.
*/
if (vp->v_type != VBLK) {
if (sw_reg_count++ == 0) {
KASSERT(sw_reg_workqueue == NULL);
if (workqueue_create(&sw_reg_workqueue, "swapiod",
sw_reg_iodone, NULL, PRIBIO, IPL_BIO, 0) != 0)
panic("%s: workqueue_create failed", __func__);
}
}
sdp->swd_drumoffset = (int)result;
sdp->swd_drumsize = npages;
sdp->swd_npages = size;
mutex_enter(&uvm_swap_data_lock);
sdp->swd_flags &= ~SWF_FAKE; /* going live */
sdp->swd_flags |= (SWF_INUSE|SWF_ENABLE);
uvmexp.swpages += size;
uvmexp.swpgavail += size;
mutex_exit(&uvm_swap_data_lock);
return (0);
/*
* failure: clean up and return error.
*/
bad:
if (sdp->swd_blist) {
blist_destroy(sdp->swd_blist);
}
if (vp != rootvp) {
(void)VOP_CLOSE(vp, FREAD|FWRITE, l->l_cred);
}
return (error);
}
/*
* swap_off: stop swapping on swapdev
*
* => swap data should be locked, we will unlock.
*/
static int
swap_off(struct lwp *l, struct swapdev *sdp)
{
int npages = sdp->swd_npages;
int error = 0;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx, npages=%jd", sdp->swd_dev,npages, 0, 0);
KASSERT(rw_write_held(&swap_syscall_lock));
KASSERT(mutex_owned(&uvm_swap_data_lock));
/* disable the swap area being removed */
sdp->swd_flags &= ~SWF_ENABLE;
uvmexp.swpgavail -= npages;
mutex_exit(&uvm_swap_data_lock);
/*
* the idea is to find all the pages that are paged out to this
* device, and page them all in. in uvm, swap-backed pageable
* memory can take two forms: aobjs and anons. call the
* swapoff hook for each subsystem to bring in pages.
*/
if (uao_swap_off(sdp->swd_drumoffset,
sdp->swd_drumoffset + sdp->swd_drumsize) ||
amap_swap_off(sdp->swd_drumoffset,
sdp->swd_drumoffset + sdp->swd_drumsize)) {
error = ENOMEM;
} else if (sdp->swd_npginuse > sdp->swd_npgbad) {
error = EBUSY;
}
if (error) {
mutex_enter(&uvm_swap_data_lock);
sdp->swd_flags |= SWF_ENABLE;
uvmexp.swpgavail += npages;
mutex_exit(&uvm_swap_data_lock);
return error;
}
/*
* If this is the last regular swap destroy the workqueue.
* => Protected by swap_syscall_lock.
*/
if (sdp->swd_vp->v_type != VBLK) {
KASSERT(sw_reg_count > 0);
KASSERT(sw_reg_workqueue != NULL);
if (--sw_reg_count == 0) {
workqueue_destroy(sw_reg_workqueue);
sw_reg_workqueue = NULL;
}
}
/*
* done with the vnode.
* drop our ref on the vnode before calling VOP_CLOSE()
* so that spec_close() can tell if this is the last close.
*/
vrele(sdp->swd_vp);
if (sdp->swd_vp != rootvp) {
(void) VOP_CLOSE(sdp->swd_vp, FREAD|FWRITE, l->l_cred);
}
mutex_enter(&uvm_swap_data_lock);
uvmexp.swpages -= npages;
uvmexp.swpginuse -= sdp->swd_npgbad;
if (swaplist_find(sdp->swd_vp, true) == NULL)
panic("%s: swapdev not in list", __func__);
swaplist_trim();
mutex_exit(&uvm_swap_data_lock);
/*
* free all resources!
*/
vmem_free(swapmap, sdp->swd_drumoffset, sdp->swd_drumsize);
blist_destroy(sdp->swd_blist);
bufq_free(sdp->swd_tab);
kmem_free(__UNVOLATILE(sdp->swd_encmap),
encmap_size(sdp->swd_drumsize));
explicit_memset(&sdp->swd_enckey, 0, sizeof sdp->swd_enckey);
explicit_memset(&sdp->swd_deckey, 0, sizeof sdp->swd_deckey);
kmem_free(sdp, sizeof(*sdp));
return (0);
}
void
uvm_swap_shutdown(struct lwp *l)
{
struct swapdev *sdp;
struct swappri *spp;
struct vnode *vp;
int error;
if (!uvm_swap_init_done || uvmexp.nswapdev == 0)
return;
printf("turning off swap...");
rw_enter(&swap_syscall_lock, RW_WRITER);
mutex_enter(&uvm_swap_data_lock);
again:
LIST_FOREACH(spp, &swap_priority, spi_swappri)
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
if (sdp->swd_flags & SWF_FAKE)
continue;
if ((sdp->swd_flags & (SWF_INUSE|SWF_ENABLE)) == 0)
continue;
#ifdef DEBUG
printf("\nturning off swap on %s...", sdp->swd_path);
#endif
/* Have to lock and reference vnode for swap_off(). */
vn_lock(vp = sdp->swd_vp, LK_EXCLUSIVE|LK_RETRY);
vref(vp);
error = swap_off(l, sdp);
vput(vp);
mutex_enter(&uvm_swap_data_lock);
if (error) {
printf("stopping swap on %s failed "
"with error %d\n", sdp->swd_path, error);
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
uvmexp.nswapdev--;
swaplist_trim();
}
goto again;
}
printf(" done\n");
mutex_exit(&uvm_swap_data_lock);
rw_exit(&swap_syscall_lock);
}
/*
* /dev/drum interface and i/o functions
*/
/*
* swopen: allow the initial open from uvm_swap_init() and reject all others.
*/
static int
swopen(dev_t dev, int flag, int mode, struct lwp *l)
{
static bool inited = false;
if (!inited) {
inited = true;
return 0;
}
return ENODEV;
}
/*
* swstrategy: perform I/O on the drum
*
* => we must map the i/o request from the drum to the correct swapdev.
*/
static void
swstrategy(struct buf *bp)
{
struct swapdev *sdp;
struct vnode *vp;
int pageno, bn;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* convert block number to swapdev. note that swapdev can't
* be yanked out from under us because we are holding resources
* in it (i.e. the blocks we are doing I/O on).
*/
pageno = dbtob((int64_t)bp->b_blkno) >> PAGE_SHIFT;
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(pageno);
mutex_exit(&uvm_swap_data_lock);
if (sdp == NULL) {
bp->b_error = EINVAL;
bp->b_resid = bp->b_bcount;
biodone(bp);
UVMHIST_LOG(pdhist, " failed to get swap device", 0, 0, 0, 0);
return;
}
/*
* convert drum page number to block number on this swapdev.
*/
pageno -= sdp->swd_drumoffset; /* page # on swapdev */
bn = btodb((uint64_t)pageno << PAGE_SHIFT); /* convert to diskblock */
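/*
 * Worked example (assuming 4 KiB pages and 512-byte disk blocks,
 * i.e. PAGE_SHIFT 12 and DEV_BSIZE 512): a request with b_blkno 8
 * is drum page 1 (8 * 512 >> 12); on a swapdev whose swd_drumoffset
 * is 1 that is page 0 of the device, so bn becomes 0.
 */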
UVMHIST_LOG(pdhist, " Rd/Wr (0/1) %jd: mapoff=%#jx bn=%#jx bcount=%jd",
((bp->b_flags & B_READ) == 0) ? 1 : 0,
sdp->swd_drumoffset, bn, bp->b_bcount);
/*
* for block devices we finish up here.
* for regular files we have to do more work which we delegate
* to sw_reg_strategy().
*/
vp = sdp->swd_vp; /* swapdev vnode pointer */
switch (vp->v_type) {
default:
panic("%s: vnode type 0x%x", __func__, vp->v_type);
case VBLK:
/*
* must convert "bp" from an I/O on /dev/drum to an I/O
* on the swapdev (sdp).
*/
bp->b_blkno = bn; /* swapdev block number */
bp->b_dev = sdp->swd_dev; /* swapdev dev_t */
/*
* if we are doing a write, we have to redirect the i/o on
* drum's v_numoutput counter to the swapdevs.
*/
if ((bp->b_flags & B_READ) == 0) {
mutex_enter(bp->b_objlock);
vwakeup(bp); /* kills one 'v_numoutput' on drum */
mutex_exit(bp->b_objlock);
mutex_enter(vp->v_interlock);
vp->v_numoutput++; /* put it on swapdev */
mutex_exit(vp->v_interlock);
}
/*
* finally plug in swapdev vnode and start I/O
*/
bp->b_vp = vp;
bp->b_objlock = vp->v_interlock;
VOP_STRATEGY(vp, bp);
return;
case VREG:
/*
* delegate to sw_reg_strategy function.
*/
sw_reg_strategy(sdp, bp, bn);
return;
}
/* NOTREACHED */
}
/*
* swread: the read function for the drum (just a call to physio)
*/
/*ARGSUSED*/
static int
swread(dev_t dev, struct uio *uio, int ioflag)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
return (physio(swstrategy, NULL, dev, B_READ, minphys, uio));
}
/*
* swwrite: the write function for the drum (just a call to physio)
*/
/*ARGSUSED*/
static int
swwrite(dev_t dev, struct uio *uio, int ioflag)
{
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " dev=%#jx offset=%#jx", dev, uio->uio_offset, 0, 0);
return (physio(swstrategy, NULL, dev, B_WRITE, minphys, uio));
}
const struct bdevsw swap_bdevsw = {
.d_open = swopen,
.d_close = noclose,
.d_strategy = swstrategy,
.d_ioctl = noioctl,
.d_dump = nodump,
.d_psize = nosize,
.d_discard = nodiscard,
.d_flag = D_OTHER
};
const struct cdevsw swap_cdevsw = {
.d_open = nullopen,
.d_close = nullclose,
.d_read = swread,
.d_write = swwrite,
.d_ioctl = noioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER,
};
/*
* sw_reg_strategy: handle swap i/o to regular files
*/
static void
sw_reg_strategy(struct swapdev *sdp, struct buf *bp, int bn)
{
struct vnode *vp;
struct vndxfer *vnx;
daddr_t nbn;
char *addr;
off_t byteoff;
int s, off, nra, error, sz, resid;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* allocate a vndxfer head for this transfer and point it to
* our buffer.
*/
vnx = pool_get(&vndxfer_pool, PR_WAITOK);
vnx->vx_flags = VX_BUSY;
vnx->vx_error = 0;
vnx->vx_pending = 0;
vnx->vx_bp = bp;
vnx->vx_sdp = sdp;
/*
* setup for main loop where we read filesystem blocks into
* our buffer.
*/
error = 0;
bp->b_resid = bp->b_bcount; /* nothing transferred yet! */
addr = bp->b_data; /* current position in buffer */
byteoff = dbtob((uint64_t)bn);
for (resid = bp->b_resid; resid; resid -= sz) {
struct vndbuf *nbp;
/*
* translate byteoffset into block number. return values:
* vp = vnode of underlying device
* nbn = new block number (on underlying vnode dev)
* nra = num blocks we can read-ahead (excludes requested
* block)
*/
nra = 0;
error = VOP_BMAP(sdp->swd_vp, byteoff / sdp->swd_bsize,
&vp, &nbn, &nra);
if (error == 0 && nbn == (daddr_t)-1) {
/*
* this used to just set error, but that doesn't
* do the right thing. Instead, it causes random
* memory errors. The panic() should remain until
* this condition doesn't destabilize the system.
*/
#if 1
panic("%s: swap to sparse file", __func__);
#else
error = EIO; /* failure */
#endif
}
/*
* punt if there was an error or a hole in the file.
* we must wait for any i/o ops we have already started
* to finish before returning.
*
* XXX we could deal with holes here but it would be
* a hassle (in the write case).
*/
if (error) {
s = splbio();
vnx->vx_error = error; /* pass error up */
goto out;
}
/*
* compute the size ("sz") of this transfer (in bytes).
*/
off = byteoff % sdp->swd_bsize;
sz = (1 + nra) * sdp->swd_bsize - off;
if (sz > resid)
sz = resid;
UVMHIST_LOG(pdhist, "sw_reg_strategy: "
"vp %#jx/%#jx offset %#jx/%#jx",
(uintptr_t)sdp->swd_vp, (uintptr_t)vp, byteoff, nbn);
/*
* now get a buf structure. note that the vb_buf is
* at the front of the nbp structure so that you can
* cast pointers between the two structures easily.
*/
nbp = pool_get(&vndbuf_pool, PR_WAITOK);
buf_init(&nbp->vb_buf);
nbp->vb_buf.b_flags = bp->b_flags;
nbp->vb_buf.b_cflags = bp->b_cflags;
nbp->vb_buf.b_oflags = bp->b_oflags;
nbp->vb_buf.b_bcount = sz;
nbp->vb_buf.b_bufsize = sz;
nbp->vb_buf.b_error = 0;
nbp->vb_buf.b_data = addr;
nbp->vb_buf.b_lblkno = 0;
nbp->vb_buf.b_blkno = nbn + btodb(off);
nbp->vb_buf.b_rawblkno = nbp->vb_buf.b_blkno;
nbp->vb_buf.b_iodone = sw_reg_biodone;
nbp->vb_buf.b_vp = vp;
nbp->vb_buf.b_objlock = vp->v_interlock;
if (vp->v_type == VBLK) {
nbp->vb_buf.b_dev = vp->v_rdev;
}
nbp->vb_xfer = vnx; /* patch it back in to vnx */
/*
* Just sort by block number
*/
s = splbio();
if (vnx->vx_error != 0) {
buf_destroy(&nbp->vb_buf);
pool_put(&vndbuf_pool, nbp);
goto out;
}
vnx->vx_pending++;
/* sort it in and start I/O if we are not over our limit */
/* XXXAD locking */
bufq_put(sdp->swd_tab, &nbp->vb_buf);
sw_reg_start(sdp);
splx(s);
/*
* advance to the next I/O
*/
byteoff += sz;
addr += sz;
}
s = splbio();
out: /* Arrive here at splbio */
vnx->vx_flags &= ~VX_BUSY;
if (vnx->vx_pending == 0) {
error = vnx->vx_error;
pool_put(&vndxfer_pool, vnx);
bp->b_error = error;
biodone(bp);
}
splx(s);
}
/*
* sw_reg_start: start an I/O request on the requested swapdev
*
* => reqs are sorted by b_rawblkno (above)
*/
static void
sw_reg_start(struct swapdev *sdp)
{
struct buf *bp;
struct vnode *vp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/* recursion control */
if ((sdp->swd_flags & SWF_BUSY) != 0)
return;
sdp->swd_flags |= SWF_BUSY;
while (sdp->swd_active < sdp->swd_maxactive) {
bp = bufq_get(sdp->swd_tab);
if (bp == NULL)
break;
sdp->swd_active++;
UVMHIST_LOG(pdhist,
"sw_reg_start: bp %#jx vp %#jx blkno %#jx cnt %#jx",
(uintptr_t)bp, (uintptr_t)bp->b_vp, (uintptr_t)bp->b_blkno,
bp->b_bcount);
vp = bp->b_vp;
KASSERT(bp->b_objlock == vp->v_interlock);
if ((bp->b_flags & B_READ) == 0) {
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
}
VOP_STRATEGY(vp, bp);
}
sdp->swd_flags &= ~SWF_BUSY;
}
/*
* sw_reg_biodone: one of our i/o's has completed
*/
static void
sw_reg_biodone(struct buf *bp)
{
workqueue_enqueue(sw_reg_workqueue, &bp->b_work, NULL);
}
/*
* sw_reg_iodone: one of our i/o's has completed and needs post-i/o cleanup
*
* => note that we can recover the vndbuf struct by casting the buf ptr
*/
static void
sw_reg_iodone(struct work *wk, void *dummy)
{
struct vndbuf *vbp = (void *)wk;
struct vndxfer *vnx = vbp->vb_xfer;
struct buf *pbp = vnx->vx_bp; /* parent buffer */
struct swapdev *sdp = vnx->vx_sdp;
int s, resid, error;
KASSERT(&vbp->vb_buf.b_work == wk);
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, " vbp=%#jx vp=%#jx blkno=%#jx addr=%#jx",
(uintptr_t)vbp, (uintptr_t)vbp->vb_buf.b_vp, vbp->vb_buf.b_blkno,
(uintptr_t)vbp->vb_buf.b_data);
UVMHIST_LOG(pdhist, " cnt=%#jx resid=%#jx",
vbp->vb_buf.b_bcount, vbp->vb_buf.b_resid, 0, 0);
/*
* protect vbp at splbio and update.
*/
s = splbio();
resid = vbp->vb_buf.b_bcount - vbp->vb_buf.b_resid;
pbp->b_resid -= resid;
vnx->vx_pending--;
if (vbp->vb_buf.b_error != 0) {
/* pass error upward */
error = vbp->vb_buf.b_error ? vbp->vb_buf.b_error : EIO;
UVMHIST_LOG(pdhist, " got error=%jd !", error, 0, 0, 0);
vnx->vx_error = error;
}
/*
* kill vbp structure
*/
buf_destroy(&vbp->vb_buf);
pool_put(&vndbuf_pool, vbp);
/*
* wrap up this transaction if it has run to completion or, in
* case of an error, when all auxiliary buffers have returned.
*/
if (vnx->vx_error != 0) {
/* pass error upward */
error = vnx->vx_error;
if ((vnx->vx_flags & VX_BUSY) == 0 && vnx->vx_pending == 0) {
pbp->b_error = error;
biodone(pbp);
pool_put(&vndxfer_pool, vnx);
}
} else if (pbp->b_resid == 0) {
KASSERT(vnx->vx_pending == 0);
if ((vnx->vx_flags & VX_BUSY) == 0) {
UVMHIST_LOG(pdhist, " iodone, pbp=%#jx error=%jd !",
(uintptr_t)pbp, vnx->vx_error, 0, 0);
biodone(pbp);
pool_put(&vndxfer_pool, vnx);
}
}
/*
* done! start next swapdev I/O if one is pending
*/
sdp->swd_active--;
sw_reg_start(sdp);
splx(s);
}
/*
* uvm_swap_alloc: allocate space on swap
*
* => allocation is done "round robin" down the priority list, as we
* allocate in a priority we "rotate" the circle queue.
* => space can be freed with uvm_swap_free
* => we return the page slot number in /dev/drum (0 == invalid slot)
* => we lock uvm_swap_data_lock
* => XXXMRG: "LESSOK" INTERFACE NEEDED TO EXTENT SYSTEM
*/
int
uvm_swap_alloc(int *nslots /* IN/OUT */, bool lessok)
{
struct swapdev *sdp;
struct swappri *spp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
/*
* no swap devices configured yet? definite failure.
*/
if (uvmexp.nswapdev < 1)
return 0;
/*
* XXXJAK: BEGIN HACK
*
* blist_alloc() in subr_blist.c will panic if we try to allocate
* too many slots.
*/
if (*nslots > BLIST_MAX_ALLOC) {
if (__predict_false(lessok == false))
return 0;
*nslots = BLIST_MAX_ALLOC;
}
/* XXXJAK: END HACK */
/*
* lock data lock, convert slots into blocks, and enter loop
*/
mutex_enter(&uvm_swap_data_lock);
ReTry: /* XXXMRG */
LIST_FOREACH(spp, &swap_priority, spi_swappri) {
TAILQ_FOREACH(sdp, &spp->spi_swapdev, swd_next) {
uint64_t result;
/* if it's not enabled, then we can't swap from it */
if ((sdp->swd_flags & SWF_ENABLE) == 0)
continue;
if (sdp->swd_npginuse + *nslots > sdp->swd_npages)
continue;
result = blist_alloc(sdp->swd_blist, *nslots);
if (result == BLIST_NONE) {
continue;
}
KASSERT(result < sdp->swd_drumsize);
/*
* successful allocation! now rotate the tailq.
*/
TAILQ_REMOVE(&spp->spi_swapdev, sdp, swd_next);
TAILQ_INSERT_TAIL(&spp->spi_swapdev, sdp, swd_next);
sdp->swd_npginuse += *nslots;
uvmexp.swpginuse += *nslots;
mutex_exit(&uvm_swap_data_lock);
/* done! return drum slot number */
UVMHIST_LOG(pdhist,
"success! returning %jd slots starting at %jd",
*nslots, result + sdp->swd_drumoffset, 0, 0);
return (result + sdp->swd_drumoffset);
}
}
/* XXXMRG: BEGIN HACK */
if (*nslots > 1 && lessok) {
*nslots = 1;
/* XXXMRG: ugh! blist should support this for us */
goto ReTry;
}
/* XXXMRG: END HACK */
mutex_exit(&uvm_swap_data_lock);
return 0;
}
/*
* uvm_swapisfull: return true if most of available swap is allocated
* and in use. we don't count some small portion as it may be inaccessible
* to us at any given moment, for example if there is lock contention or if
* pages are busy.
*/
bool
uvm_swapisfull(void)
{
int swpgonly;
bool rv;
if (uvmexp.swpages == 0) {
return true;
}
mutex_enter(&uvm_swap_data_lock);
KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
swpgonly = (int)((uint64_t)uvmexp.swpgonly * 100 /
uvm_swapisfull_factor);
rv = (swpgonly >= uvmexp.swpgavail);
mutex_exit(&uvm_swap_data_lock);
return (rv);
}
/*
* uvm_swap_markbad: keep track of swap ranges where we've had i/o errors
*
* => we lock uvm_swap_data_lock
*/
void
uvm_swap_markbad(int startslot, int nslots)
{
struct swapdev *sdp;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
KASSERT(sdp != NULL);
/*
* we just keep track of how many pages have been marked bad
* in this device, to make everything add up in swap_off().
* we assume here that the range of slots will all be within
* one swap device.
*/
KASSERT(uvmexp.swpgonly >= nslots);
atomic_add_int(&uvmexp.swpgonly, -nslots);
sdp->swd_npgbad += nslots;
UVMHIST_LOG(pdhist, "now %jd bad", sdp->swd_npgbad, 0,0,0);
mutex_exit(&uvm_swap_data_lock);
}
/*
* uvm_swap_free: free swap slots
*
* => this can be all or part of an allocation made by uvm_swap_alloc
* => we lock uvm_swap_data_lock
*/
void
uvm_swap_free(int startslot, int nslots)
{
struct swapdev *sdp;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "freeing %jd slots starting at %jd", nslots,
startslot, 0, 0);
/*
* ignore attempts to free the "bad" slot.
*/
if (startslot == SWSLOT_BAD) {
return;
}
/*
* convert drum slot offset back to sdp, free the blocks
* in the extent, and return. must hold pri lock to do
* lookup and access the extent.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
KASSERT(uvmexp.nswapdev >= 1);
KASSERT(sdp != NULL);
KASSERT(sdp->swd_npginuse >= nslots);
blist_free(sdp->swd_blist, startslot - sdp->swd_drumoffset, nslots);
sdp->swd_npginuse -= nslots;
uvmexp.swpginuse -= nslots;
mutex_exit(&uvm_swap_data_lock);
}
/*
* uvm_swap_put: put any number of pages into a contig place on swap
*
* => can be sync or async
*/
int
uvm_swap_put(int swslot, struct vm_page **ppsp, int npages, int flags)
{
int error;
error = uvm_swap_io(ppsp, swslot, npages, B_WRITE |
((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
return error;
}
/*
* uvm_swap_get: get a single page from swap
*
* => usually a sync op (from fault)
*/
int
uvm_swap_get(struct vm_page *page, int swslot, int flags)
{
int error;
atomic_inc_uint(&uvmexp.nswget);
KASSERT(flags & PGO_SYNCIO);
if (swslot == SWSLOT_BAD) {
return EIO;
}
error = uvm_swap_io(&page, swslot, 1, B_READ |
((flags & PGO_SYNCIO) ? 0 : B_ASYNC));
if (error == 0) {
/*
* this page is no longer only in swap.
*/
KASSERT(uvmexp.swpgonly > 0);
atomic_dec_uint(&uvmexp.swpgonly);
}
return error;
}
/*
* uvm_swap_io: do an i/o operation to swap
*/
static int
uvm_swap_io(struct vm_page **pps, int startslot, int npages, int flags)
{
daddr_t startblk;
struct buf *bp;
vaddr_t kva;
int error, mapinflags;
bool write, async, swap_encrypt;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(pdhist, "<- called, startslot=%jd, npages=%jd, flags=%#jx",
startslot, npages, flags, 0);
write = (flags & B_READ) == 0;
async = (flags & B_ASYNC) != 0;
swap_encrypt = atomic_load_relaxed(&uvm_swap_encrypt);
/*
* allocate a buf for the i/o.
*/
KASSERT(curlwp != uvm.pagedaemon_lwp || write);
KASSERT(curlwp != uvm.pagedaemon_lwp || async);
bp = getiobuf(swapdev_vp, curlwp != uvm.pagedaemon_lwp);
if (bp == NULL) {
uvm_aio_aiodone_pages(pps, npages, true, ENOMEM);
return ENOMEM;
}
/*
* convert starting drum slot to block number
*/
startblk = btodb((uint64_t)startslot << PAGE_SHIFT);
/*
* first, map the pages into the kernel.
*/
mapinflags = !write ?
UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_READ :
UVMPAGER_MAPIN_WAITOK|UVMPAGER_MAPIN_WRITE;
if (write && swap_encrypt) /* need to encrypt in-place */
mapinflags |= UVMPAGER_MAPIN_READ;
kva = uvm_pagermapin(pps, npages, mapinflags);
/*
* encrypt writes in place if requested
*/
if (write) do {
struct swapdev *sdp;
int i;
/*
* Get the swapdev so we can discriminate on the
* encryption state. There may or may not be an
* encryption key generated; we may or may not be asked
* to encrypt swap.
*
* 1. NO KEY, NO ENCRYPTION: Nothing to do.
*
* 2. NO KEY, BUT ENCRYPTION: Generate a key, encrypt,
* and mark the slots encrypted.
*
* 3. KEY, BUT NO ENCRYPTION: The slots may already be
* marked encrypted from a past life. Mark them not
* encrypted.
*
* 4. KEY, ENCRYPTION: Encrypt and mark the slots
* encrypted.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
if (!sdp->swd_encinit) {
if (!swap_encrypt) {
mutex_exit(&uvm_swap_data_lock);
break;
}
uvm_swap_genkey(sdp);
}
KASSERT(sdp->swd_encinit);
mutex_exit(&uvm_swap_data_lock);
for (i = 0; i < npages; i++) {
int s = startslot + i;
KDASSERT(swapdrum_sdp_is(s, sdp));
KASSERT(s >= sdp->swd_drumoffset);
s -= sdp->swd_drumoffset;
KASSERT(s < sdp->swd_drumsize);
if (swap_encrypt) {
uvm_swap_encryptpage(sdp,
(void *)(kva + (vsize_t)i*PAGE_SIZE), s);
atomic_or_32(&sdp->swd_encmap[s/32],
__BIT(s%32));
} else {
atomic_and_32(&sdp->swd_encmap[s/32],
~__BIT(s%32));
}
}
} while (0);
/*
* fill in the bp/sbp. we currently route our i/o through
* /dev/drum's vnode [swapdev_vp].
*/
bp->b_cflags = BC_BUSY | BC_NOCACHE;
bp->b_flags = (flags & (B_READ|B_ASYNC));
bp->b_proc = &proc0; /* XXX */
bp->b_vnbufs.le_next = NOLIST;
bp->b_data = (void *)kva;
bp->b_blkno = startblk;
bp->b_bufsize = bp->b_bcount = npages << PAGE_SHIFT;
/*
* bump v_numoutput (counter of number of active outputs).
*/
if (write) {
mutex_enter(swapdev_vp->v_interlock);
swapdev_vp->v_numoutput++;
mutex_exit(swapdev_vp->v_interlock);
}
/*
* for async ops we must set up the iodone handler.
*/
if (async) {
bp->b_iodone = uvm_aio_aiodone;
UVMHIST_LOG(pdhist, "doing async!", 0, 0, 0, 0);
if (curlwp == uvm.pagedaemon_lwp)
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
else
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
} else {
bp->b_iodone = NULL;
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
}
UVMHIST_LOG(pdhist,
"about to start io: data = %#jx blkno = %#jx, bcount = %jd",
(uintptr_t)bp->b_data, bp->b_blkno, bp->b_bcount, 0);
/*
* now we start the I/O, and if async, return.
*/
VOP_STRATEGY(swapdev_vp, bp);
if (async) {
/*
* Reads are always synchronous; if this changes, we
* need to add an asynchronous path for decryption.
*/
KASSERT(write);
return 0;
}
/*
* must be sync i/o. wait for it to finish
*/
error = biowait(bp);
if (error)
goto out;
/*
* decrypt reads in place if needed
*/
if (!write) do {
struct swapdev *sdp;
bool encinit;
int i;
/*
* Get the sdp. Everything about it except the encinit
* bit, saying whether the encryption key is
* initialized or not, and the encrypted bit for each
* page, is stable until all swap pages have been
* released and the device is removed.
*/
mutex_enter(&uvm_swap_data_lock);
sdp = swapdrum_getsdp(startslot);
encinit = sdp->swd_encinit;
mutex_exit(&uvm_swap_data_lock);
if (!encinit)
/*
* If there's no encryption key, there's no way
* any of these slots can be encrypted, so
* nothing to do here.
*/
break;
for (i = 0; i < npages; i++) {
int s = startslot + i;
KDASSERT(swapdrum_sdp_is(s, sdp));
KASSERT(s >= sdp->swd_drumoffset);
s -= sdp->swd_drumoffset;
KASSERT(s < sdp->swd_drumsize);
if ((atomic_load_relaxed(&sdp->swd_encmap[s/32]) &
__BIT(s%32)) == 0)
continue;
uvm_swap_decryptpage(sdp,
(void *)(kva + (vsize_t)i*PAGE_SIZE), s);
}
} while (0);
out:
/*
* kill the pager mapping
*/
uvm_pagermapout(kva, npages);
/*
* now dispose of the buf and we're done.
*/
if (write) {
mutex_enter(swapdev_vp->v_interlock);
vwakeup(bp);
mutex_exit(swapdev_vp->v_interlock);
}
putiobuf(bp);
UVMHIST_LOG(pdhist, "<- done (sync) error=%jd", error, 0, 0, 0);
return (error);
}
/*
* uvm_swap_genkey(sdp)
*
* Generate a key for swap encryption.
*/
static void
uvm_swap_genkey(struct swapdev *sdp)
{
uint8_t key[32];
KASSERT(!sdp->swd_encinit);
cprng_strong(kern_cprng, key, sizeof key, 0);
aes_setenckey256(&sdp->swd_enckey, key);
aes_setdeckey256(&sdp->swd_deckey, key);
explicit_memset(key, 0, sizeof key);
sdp->swd_encinit = true;
}
/*
* uvm_swap_encryptpage(sdp, kva, slot)
*
* Encrypt one page of data at kva for the specified slot number
* in the swap device.
*/
static void
uvm_swap_encryptpage(struct swapdev *sdp, void *kva, int slot)
{
uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
/* iv := AES_k(le32enc(slot) || 0^96) */
le32enc(preiv, slot);
aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
/* *kva := AES-CBC_k(iv, *kva) */
aes_cbc_enc(&sdp->swd_enckey, kva, kva, PAGE_SIZE, iv,
AES_256_NROUNDS);
explicit_memset(&iv, 0, sizeof iv);
}
/*
* uvm_swap_decryptpage(sdp, kva, slot)
*
* Decrypt one page of data at kva for the specified slot number
* in the swap device.
*/
static void
uvm_swap_decryptpage(struct swapdev *sdp, void *kva, int slot)
{
uint8_t preiv[16] __aligned(16) = {0}, iv[16] __aligned(16);
/* iv := AES_k(le32enc(slot) || 0^96) */
le32enc(preiv, slot);
aes_enc(&sdp->swd_enckey, (const void *)preiv, iv, AES_256_NROUNDS);
/* *kva := AES-CBC^{-1}_k(iv, *kva) */
aes_cbc_dec(&sdp->swd_deckey, kva, kva, PAGE_SIZE, iv,
AES_256_NROUNDS);
explicit_memset(&iv, 0, sizeof iv);
}
SYSCTL_SETUP(sysctl_uvmswap_setup, "sysctl uvmswap setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "swap_encrypt",
SYSCTL_DESCR("Encrypt data when swapped out to disk"),
NULL, 0, &uvm_swap_encrypt, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
}
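/*
 * Usage note (sketch): the node created above shows up as vm.swap_encrypt,
 * so e.g. "sysctl -w vm.swap_encrypt=1" enables encryption of pages as they
 * are swapped out.
 */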
/* $NetBSD: sysmon.c,v 1.32 2022/03/28 12:33:21 riastradh Exp $ */
/*-
* Copyright (c) 2000 Zembu Labs, Inc.
* All rights reserved.
*
* Author: Jason R. Thorpe <thorpej@zembu.com>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Zembu Labs, Inc.
* 4. Neither the name of Zembu Labs nor the names of its employees may
* be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ZEMBU LABS, INC. ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WAR-
* RANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DIS-
* CLAIMED. IN NO EVENT SHALL ZEMBU LABS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Clearing house for system monitoring hardware. We currently
* handle environmental sensors, watchdog timers, and power management.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysmon.c,v 1.32 2022/03/28 12:33:21 riastradh Exp $");
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/fcntl.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/device.h>
#include <sys/once.h>
#include <dev/sysmon/sysmonvar.h>
dev_type_open(sysmonopen);
dev_type_close(sysmonclose);
dev_type_ioctl(sysmonioctl);
dev_type_read(sysmonread);
dev_type_poll(sysmonpoll);
dev_type_kqfilter(sysmonkqfilter);
const struct cdevsw sysmon_cdevsw = {
.d_open = sysmonopen,
.d_close = sysmonclose,
.d_read = sysmonread,
.d_write = nowrite,
.d_ioctl = sysmonioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = sysmonpoll,
.d_mmap = nommap,
.d_kqfilter = sysmonkqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
static int sysmon_modcmd(modcmd_t, void *);
static int sm_init_once(void);
/*
* Info about our minor "devices"
*/
static struct sysmon_opvec *sysmon_opvec_table[] = { NULL, NULL, NULL };
static int sysmon_refcnt[] = { 0, 0, 0 };
static const char *sysmon_mod[] = { "sysmon_envsys",
"sysmon_wdog",
"sysmon_power" };
static kmutex_t sysmon_minor_mtx;
#ifdef _MODULE
static bool sm_is_attached;
#endif
ONCE_DECL(once_sm);
/*
* sysmon_attach_minor
*
* Attach a minor device for wdog, power, or envsys. Manage a
* reference count so we can prevent the device from being
* detached if there are still users with the minor device opened.
*
* If the opvec argument is NULL, this is a request to detach the
* minor device - make sure the refcnt is zero!
*/
int
sysmon_attach_minor(int minor, struct sysmon_opvec *opvec)
{
int ret;
mutex_enter(&sysmon_minor_mtx);
if (opvec) {
if (sysmon_opvec_table[minor] == NULL) {
sysmon_refcnt[minor] = 0;
sysmon_opvec_table[minor] = opvec;
ret = 0;
} else
ret = EEXIST;
} else {
if (sysmon_refcnt[minor] == 0) {
sysmon_opvec_table[minor] = NULL;
ret = 0;
} else
ret = EBUSY;
}
mutex_exit(&sysmon_minor_mtx);
return ret;
}
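/*
 * Illustrative sketch (not part of this file): a back-end driver, for
 * example a watchdog, would register and later detach its minor device
 * roughly like this.  The mydrv_* names are hypothetical; the opvec fields
 * are the ones dispatched on by the functions below.
 */
#if 0
static int
mydrv_open(dev_t dev, int flag, int mode, struct lwp *l)
{

        return 0;
}

static int
mydrv_close(dev_t dev, int flag, int mode, struct lwp *l)
{

        return 0;
}

static int
mydrv_ioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{

        return ENOTTY;
}

static struct sysmon_opvec mydrv_opvec = {
        .so_open = mydrv_open,
        .so_close = mydrv_close,
        .so_ioctl = mydrv_ioctl,
};

static int
mydrv_attach_sysmon(void)
{

        /* attach at module/driver load time */
        return sysmon_attach_minor(SYSMON_MINOR_WDOG, &mydrv_opvec);
}

static int
mydrv_detach_sysmon(void)
{

        /* detach at unload; fails with EBUSY while someone has it open */
        return sysmon_attach_minor(SYSMON_MINOR_WDOG, NULL);
}
#endif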
/*
* sysmonopen:
*
* Open the system monitor device.
*/
int
sysmonopen(dev_t dev, int flag, int mode, struct lwp *l)
{
int error;
mutex_enter(&sysmon_minor_mtx);
switch (minor(dev)) {
case SYSMON_MINOR_ENVSYS:
case SYSMON_MINOR_WDOG:
case SYSMON_MINOR_POWER:
if (sysmon_opvec_table[minor(dev)] == NULL) {
mutex_exit(&sysmon_minor_mtx);
error = module_autoload(sysmon_mod[minor(dev)],
MODULE_CLASS_DRIVER);
if (error)
return error;
mutex_enter(&sysmon_minor_mtx);
if (sysmon_opvec_table[minor(dev)] == NULL) {
error = ENODEV;
break;
}
}
error = (sysmon_opvec_table[minor(dev)]->so_open)(dev, flag,
mode, l);
if (error == 0)
sysmon_refcnt[minor(dev)]++;
break;
default:
error = ENODEV;
}
mutex_exit(&sysmon_minor_mtx);
return error;
}
/*
* sysmonclose:
*
* Close the system monitor device.
*/
int
sysmonclose(dev_t dev, int flag, int mode, struct lwp *l)
{
int error;
switch (minor(dev)) {
case SYSMON_MINOR_ENVSYS:
case SYSMON_MINOR_WDOG:
case SYSMON_MINOR_POWER:
if (sysmon_opvec_table[minor(dev)] == NULL)
error = ENODEV;
else {
error = (sysmon_opvec_table[minor(dev)]->so_close)(dev,
flag, mode, l);
if (error == 0) {
sysmon_refcnt[minor(dev)]--;
KASSERT(sysmon_refcnt[minor(dev)] >= 0);
}
}
break;
default:
error = ENODEV;
}
return (error);
}
/*
* sysmonioctl:
*
* Perform a control request.
*/
int
sysmonioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
int error;
switch (minor(dev)) {
case SYSMON_MINOR_ENVSYS:
case SYSMON_MINOR_WDOG:
case SYSMON_MINOR_POWER:
if (sysmon_opvec_table[minor(dev)] == NULL)
error = ENODEV;
else
error = (sysmon_opvec_table[minor(dev)]->so_ioctl)(dev,
cmd, data, flag, l);
break;
default:
error = ENODEV;
}
return (error);
}
/*
* sysmonread:
*
* Perform a read request.
*/
int
sysmonread(dev_t dev, struct uio *uio, int flags)
{
int error;
switch (minor(dev)) {
case SYSMON_MINOR_POWER:
if (sysmon_opvec_table[minor(dev)] == NULL)
error = ENODEV;
else
error = (sysmon_opvec_table[minor(dev)]->so_read)(dev,
uio, flags);
break;
default:
error = ENODEV;
}
return (error);
}
/*
* sysmonpoll:
*
* Poll the system monitor device.
*/
int
sysmonpoll(dev_t dev, int events, struct lwp *l)
{
int rv;
switch (minor(dev)) {
case SYSMON_MINOR_POWER:
if (sysmon_opvec_table[minor(dev)] == NULL)
rv = events;
else
rv = (sysmon_opvec_table[minor(dev)]->so_poll)(dev,
events, l);
break;
default:
rv = events;
}
return (rv);
}
/*
* sysmonkqfilter:
*
* Kqueue filter for the system monitor device.
*/
int
sysmonkqfilter(dev_t dev, struct knote *kn)
{
int error;
switch (minor(dev)) {
case SYSMON_MINOR_POWER:
if (sysmon_opvec_table[minor(dev)] == NULL)
error = ENODEV;
else
error = (sysmon_opvec_table[minor(dev)]->so_filter)(dev,
kn);
break;
default:
error = 1;
}
return (error);
}
MODULE(MODULE_CLASS_DRIVER, sysmon, NULL);
static int
sm_init_once(void)
{
mutex_init(&sysmon_minor_mtx, MUTEX_DEFAULT, IPL_NONE);
return 0;
}
int
sysmon_init(void)
{
int error;
#ifdef _MODULE
devmajor_t bmajor, cmajor;
#endif
error = RUN_ONCE(&once_sm, sm_init_once);
#ifdef _MODULE
mutex_enter(&sysmon_minor_mtx);
if (!sm_is_attached) {
bmajor = cmajor = -1;
error = devsw_attach("sysmon", NULL, &bmajor,
&sysmon_cdevsw, &cmajor);
sm_is_attached = (error == 0);
}
mutex_exit(&sysmon_minor_mtx);
#endif
return error;
}
int
sysmon_fini(void)
{
int error = 0;
if ((sysmon_opvec_table[SYSMON_MINOR_ENVSYS] != NULL) ||
(sysmon_opvec_table[SYSMON_MINOR_WDOG] != NULL) ||
(sysmon_opvec_table[SYSMON_MINOR_POWER] != NULL))
error = EBUSY;
#ifdef _MODULE
if (error == 0) {
mutex_enter(&sysmon_minor_mtx);
sm_is_attached = false;
devsw_detach(NULL, &sysmon_cdevsw);
mutex_exit(&sysmon_minor_mtx);
}
#endif
return error;
}
static int
sysmon_modcmd(modcmd_t cmd, void *arg)
{
int ret;
switch (cmd) {
case MODULE_CMD_INIT:
ret = sysmon_init();
break;
case MODULE_CMD_FINI:
ret = sysmon_fini();
break;
case MODULE_CMD_STAT:
default:
ret = ENOTTY;
}
return ret;
}
/* $NetBSD: umap_subr.c,v 1.29 2014/11/09 18:08:07 maxv Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_subr.c, v 1.11 1992/05/30 10:05:43 jsp Exp
* @(#)umap_subr.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_subr.c,v 1.29 2014/11/09 18:08:07 maxv Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/umapfs/umap.h>
u_long umap_findid(u_long, u_long [][2], int);
int umap_node_alloc(struct mount *, struct vnode *,
struct vnode **);
/*
* umap_findid is called by various routines in umap_vnodeops.c to
* find a user or group id in a map.
*/
u_long
umap_findid(u_long id, u_long map[][2], int nentries)
{
int i;
/* Find uid entry in map */
i = 0;
while ((i<nentries) && ((map[i][0]) != id))
i++;
if (i < nentries)
return (map[i][1]);
else
return (-1);
}
/*
* umap_reverse_findid is called by umap_getattr() in umap_vnodeops.c to
* find a user or group id in a map, in reverse.
*/
u_long
umap_reverse_findid(u_long id, u_long map[][2], int nentries)
{
int i;
/* Find uid entry in map */
i = 0;
while ((i<nentries) && ((map[i][1]) != id))
i++;
if (i < nentries)
return (map[i][0]);
else
return (-1);
}
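/*
 * Worked example (hypothetical mapping table): with
 *
 *	u_long map[1][2] = { { 1000, 2000 } };
 *
 * umap_findid(1000, map, 1) returns 2000, umap_reverse_findid(2000, map, 1)
 * returns 1000, and either function returns -1 for an id with no entry.
 */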
/* umap_mapids maps all of the ids in a credential, both user and group. */
void
umap_mapids(struct mount *v_mount, kauth_cred_t credp)
{
int i, unentries, gnentries;
uid_t uid;
gid_t gid;
u_long (*usermap)[2], (*groupmap)[2];
gid_t groups[NGROUPS];
uint16_t ngroups;
if (credp == NOCRED || credp == FSCRED)
return;
unentries = MOUNTTOUMAPMOUNT(v_mount)->info_nentries;
usermap = MOUNTTOUMAPMOUNT(v_mount)->info_mapdata;
gnentries = MOUNTTOUMAPMOUNT(v_mount)->info_gnentries;
groupmap = MOUNTTOUMAPMOUNT(v_mount)->info_gmapdata;
/* Find uid entry in map */
uid = (uid_t) umap_findid(kauth_cred_geteuid(credp), usermap, unentries);
if (uid != -1)
kauth_cred_seteuid(credp, uid);
else
kauth_cred_seteuid(credp, (uid_t)NOBODY);
#if 1
/* cr_gid is the same as cr_groups[0] in 4BSD, but not in NetBSD */
/* Find gid entry in map */
gid = (gid_t) umap_findid(kauth_cred_getegid(credp), groupmap, gnentries);
if (gid != -1)
kauth_cred_setegid(credp, gid);
else
kauth_cred_setegid(credp, NULLGROUP);
#endif
/* Now we must map each of the set of groups in the cr_groups
structure. */
ngroups = kauth_cred_ngroups(credp);
for (i = 0; i < ngroups; i++) {
/* XXX elad: can't we just skip cases where gid == -1? */
groups[i] = kauth_cred_group(credp, i);
gid = (gid_t) umap_findid(groups[i],
groupmap, gnentries);
if (gid != -1)
groups[i] = gid;
else
groups[i] = NULLGROUP;
}
kauth_cred_setgroups(credp, groups, ngroups, -1, UIO_SYSSPACE);
}
/* $NetBSD: kern_cfglock.c,v 1.1 2010/08/21 13:17:31 pgoyette Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cfglock.c,v 1.1 2010/08/21 13:17:31 pgoyette Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/mutex.h>
#include <sys/lwp.h>
#include <sys/systm.h>
static kmutex_t kernconfig_mutex;
static lwp_t *kernconfig_lwp;
static int kernconfig_recurse;
/*
* Functions for manipulating the kernel configuration lock. This
* recursive lock should be used to protect all additions and removals
* of kernel functionality, such as device configuration and loading
* of modular kernel components.
*/
void
kernconfig_lock_init(void)
{
mutex_init(&kernconfig_mutex, MUTEX_DEFAULT, IPL_NONE);
kernconfig_lwp = NULL;
kernconfig_recurse = 0;
}
void
kernconfig_lock(void)
{
lwp_t *my_lwp;
/*
* It's OK to check this unlocked, since it could only be set to
* curlwp by the current thread itself, and not by an interrupt
* or any other LWP.
*/
KASSERT(!cpu_intr_p());
my_lwp = curlwp;
if (kernconfig_lwp == my_lwp) {
kernconfig_recurse++;
KASSERT(kernconfig_recurse > 1);
} else {
mutex_enter(&kernconfig_mutex);
kernconfig_lwp = my_lwp;
kernconfig_recurse = 1;
}
}
void
kernconfig_unlock(void)
{
KASSERT(kernconfig_is_held());
KASSERT(kernconfig_recurse != 0);
if (--kernconfig_recurse == 0) {
kernconfig_lwp = NULL;
mutex_exit(&kernconfig_mutex);
}
}
bool
kernconfig_is_held(void)
{
return mutex_owned(&kernconfig_mutex);
}
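/*
 * Illustrative sketch (not part of this file): the lock recurses for the
 * owning LWP, so a configuration path may take it again without
 * deadlocking; it is only dropped by the matching final unlock.
 */
#if 0
static void
example_config_path(void)
{

        kernconfig_lock();
        kernconfig_lock();              /* same LWP: recursion, no deadlock */
        KASSERT(kernconfig_is_held());
        kernconfig_unlock();            /* still held by this LWP */
        kernconfig_unlock();            /* now actually released */
}
#endif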
/* $NetBSD: procfs.h,v 1.84 2024/01/17 10:20:12 hannken Exp $ */
/*
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs.h 8.9 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs.h 8.9 (Berkeley) 5/14/95
*/
/* This also pulls in __HAVE_PROCFS_MACHDEP */
#include <sys/ptrace.h>
#ifdef _KERNEL
#include <sys/proc.h>
/*
* The different types of node in a procfs filesystem
*/
typedef enum {
PFSauxv, /* ELF Auxiliary Vector */
PFSchroot, /* the process's current root directory */
PFScmdline, /* process command line args */
PFScpuinfo, /* CPU info (if -o linux) */
PFScpustat, /* status info (if -o linux) */
PFScurproc, /* symbolic link for curproc */
PFScwd, /* the process's current working directory */
PFSdevices, /* major/device name mappings (if -o linux) */
PFSemul, /* the process's emulation */
PFSenviron, /* process environment */
PFSexe, /* symlink to the executable file */
PFSfd, /* a directory containing the processes open fd's */
PFSfile, /* the executable file */
PFSfpregs, /* the process's FP register set */
PFSloadavg, /* load average (if -o linux) */
PFSlimit, /* resource limits */
PFSmap, /* memory map */
PFSmaps, /* memory map, Linux style (if -o linux) */
PFSmem, /* the process's memory image */
PFSmeminfo, /* system memory info (if -o linux) */
PFSmounts, /* mounted filesystems (if -o linux) */
PFSnote, /* process notifier */
PFSnotepg, /* process group notifier */
PFSproc, /* a process-specific sub-directory */
PFSregs, /* the process's register set */
PFSroot, /* the filesystem root */
PFSself, /* like curproc, but this is the Linux name */
PFSstat, /* process status (if -o linux) */
PFSstatm, /* process memory info (if -o linux) */
PFSstatus, /* process status */
PFStask, /* task subdirectory (if -o linux) */
PFSuptime, /* elapsed time since boot (if -o linux) */
PFSversion, /* kernel version (if -o linux) */
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODE_TYPES
#endif
PFSlast, /* track number of types */
} pfstype;
/*
* control data for the proc file system.
*/
struct pfskey {
pfstype pk_type; /* type of procfs node */
pid_t pk_pid; /* associated process */
int pk_fd; /* associated fd if not -1 */
};
struct pfsnode {
LIST_ENTRY(pfsnode) pfs_hash; /* per pid hash list */
struct vnode *pfs_vnode; /* vnode associated with this pfsnode */
struct mount *pfs_mount; /* mount associated with this pfsnode */
struct pfskey pfs_key;
#define pfs_type pfs_key.pk_type
#define pfs_pid pfs_key.pk_pid
#define pfs_fd pfs_key.pk_fd
mode_t pfs_mode; /* mode bits for stat() */
u_long pfs_flags; /* open flags */
uint64_t pfs_fileno; /* unique file id */
};
#define PROCFS_NOTELEN 64 /* max length of a note (/proc/$pid/note) */
#define PROCFS_MAXNAMLEN 255
#endif /* _KERNEL */
struct procfs_args {
int version;
int flags;
};
#define PROCFS_ARGSVERSION 1
#define PROCFSMNT_LINUXCOMPAT 0x01
#define PROCFSMNT_BITS "\177\20" \
"b\00linuxcompat\0"
/*
* Kernel stuff follows
*/
#ifdef _KERNEL
#define CNEQ(cnp, s, len) \
((cnp)->cn_namelen == (len) && \
(memcmp((s), (cnp)->cn_nameptr, (len)) == 0))
#define UIO_MX 32
static __inline ino_t
procfs_fileno(pid_t _pid, pfstype _type, int _fd)
{
ino_t _ino;
switch (_type) {
case PFSroot:
return 2;
case PFScurproc:
return 3;
case PFSself:
return 4;
default:
_ino = _pid + 1;
if (_fd != -1)
_ino = _ino << 32 | _fd;
return _ino * PFSlast + _type;
}
}
#define PROCFS_FILENO(pid, type, fd) procfs_fileno(pid, type, fd)
#define PROCFS_TYPE(type) ((type) % PFSlast)
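/*
 * Worked example (hypothetical pid): for pid 123, PROCFS_FILENO computes
 * (123 + 1) * PFSlast + PFSstatus for the status node; for an open-fd node
 * the fd is folded in first, e.g. (((123 + 1) << 32) | 4) * PFSlast + PFSfd
 * for fd 4.  PFSroot, PFScurproc and PFSself keep the fixed numbers 2, 3
 * and 4.
 */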
struct procfsmount {
int pmnt_flags;
};
#define VFSTOPROC(mp) ((struct procfsmount *)(mp)->mnt_data)
/*
* Convert between pfsnode vnode
*/
#define VTOPFS(vp) ((struct pfsnode *)(vp)->v_data)
#define PFSTOV(pfs) ((pfs)->pfs_vnode)
typedef struct vfs_namemap vfs_namemap_t;
struct vfs_namemap {
const char *nm_name;
int nm_val;
};
int vfs_getuserstr(struct uio *, char *, int *);
const vfs_namemap_t *vfs_findname(const vfs_namemap_t *, const char *, int);
struct mount;
struct proc *procfs_proc_find(struct mount *, pid_t);
bool procfs_use_linux_compat(struct mount *);
static inline bool
procfs_proc_is_linux_compat(void)
{
const char *emulname = curlwp->l_proc->p_emul->e_name;
return (strncmp(emulname, "linux", 5) == 0);
}
int procfs_proc_lock(struct mount *, int, struct proc **, int);
void procfs_proc_unlock(struct proc *);
int procfs_allocvp(struct mount *, struct vnode **, pid_t, pfstype, int);
int procfs_donote(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doregs(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dofpregs(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_domem(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_do_pid_stat(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dostatus(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_domap(struct lwp *, struct proc *, struct pfsnode *,
struct uio *, int);
int procfs_doprocargs(struct lwp *, struct proc *, struct pfsnode *,
struct uio *, int);
int procfs_domeminfo(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_dodevices(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_docpuinfo(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_docpustat(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doloadavg(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_do_pid_statm(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_dofd(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_douptime(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_domounts(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doemul(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doversion(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_doauxv(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
int procfs_dolimit(struct lwp *, struct proc *, struct pfsnode *,
struct uio *);
void procfs_hashrem(struct pfsnode *);
int procfs_getfp(struct pfsnode *, struct proc *, struct file **);
/* functions to check whether or not files should be displayed */
int procfs_validauxv(struct lwp *, struct mount *);
int procfs_validfile(struct lwp *, struct mount *);
int procfs_validfpregs(struct lwp *, struct mount *);
int procfs_validregs(struct lwp *, struct mount *);
int procfs_validmap(struct lwp *, struct mount *);
int procfs_rw(void *);
int procfs_getcpuinfstr(char *, size_t *);
#define PROCFS_LOCKED 0x01
#define PROCFS_WANT 0x02
extern int (**procfs_vnodeop_p)(void *);
extern struct vfsops procfs_vfsops;
int procfs_root(struct mount *, int, struct vnode **);
#ifdef __HAVE_PROCFS_MACHDEP
struct vattr;
void procfs_machdep_allocvp(struct vnode *);
int procfs_machdep_rw(struct lwp *, struct lwp *, struct pfsnode *,
struct uio *);
int procfs_machdep_getattr(struct vnode *, struct vattr *, struct proc *);
#endif
#endif /* _KERNEL */
/* $NetBSD: uvm_amap.c,v 1.129 2023/09/10 14:54:34 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_amap.c: amap operations
*/
/*
* this file contains functions that perform operations on amaps. see
* uvm_amap.h for a brief explanation of the role of amaps in uvm.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_amap.c,v 1.129 2023/09/10 14:54:34 ad Exp $");
#include "opt_uvmhist.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/pool.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
#include <uvm/uvm_swap.h>
/*
* cache for allocation of vm_amap structures. note that in order to
* avoid an endless loop, the amap cache's allocator cannot allocate
* memory from an amap (it currently goes through the kernel uobj, so
* we are ok).
*/
static struct pool_cache uvm_amap_cache;
static kmutex_t amap_list_lock __cacheline_aligned;
static LIST_HEAD(, vm_amap) amap_list;
/*
* local functions
*/
static int
amap_roundup_slots(int slots)
{
return kmem_roundup_size(slots * sizeof(int)) / sizeof(int);
}
#ifdef UVM_AMAP_PPREF
/*
* what is ppref? ppref is an _optional_ amap feature which is used
* to keep track of reference counts on a per-page basis. it is enabled
* when UVM_AMAP_PPREF is defined.
*
* when enabled, an array of ints is allocated for the pprefs. this
* array is allocated only when a partial reference is added to the
* map (either by unmapping part of the amap, or gaining a reference
* to only a part of an amap). if the allocation of the array fails
* (KM_NOSLEEP), then we set the array pointer to PPREF_NONE to indicate
* that we tried to do ppref's but couldn't alloc the array so just
* give up (after all, this is an optional feature!).
*
* the array is divided into page sized "chunks." for chunks of length 1,
* the chunk reference count plus one is stored in that chunk's slot.
* for chunks of length > 1 the first slot contains (the reference count
* plus one) * -1. [the negative value indicates that the length is
* greater than one.] the second slot of the chunk contains the length
* of the chunk. here is an example:
*
* actual REFS: 2 2 2 2 3 1 1 0 0 0 4 4 0 1 1 1
* ppref: -3 4 x x 4 -2 2 -1 3 x -5 2 1 -2 3 x
* <----------><-><----><-------><----><-><------->
* (x = don't care)
*
* this allows us to allow one int to contain the ref count for the whole
* chunk. note that the "plus one" part is needed because a reference
* count of zero is neither positive nor negative (need a way to tell
* if we've got one zero or a bunch of them).
*
* here are some in-line functions to help us.
*/
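/*
 * Illustrative sketch only: walking a ppref array chunk by chunk with
 * pp_getreflen() (below) recovers exactly the "actual REFS" row of the
 * example above.  The printf is purely for illustration.
 */
#if 0	/* illustrative example only, not compiled */
static void
ppref_walk_example(int *ppref, int nslot)
{
	int lcv, ref, len;

	for (lcv = 0; lcv < nslot; lcv += len) {
		pp_getreflen(ppref, lcv, &ref, &len);
		printf("slots %d..%d have %d reference(s)\n",
		    lcv, lcv + len - 1, ref);
	}
}
#endif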
/*
* pp_getreflen: get the reference and length for a specific offset
*
* => ppref's amap must be locked
*/
static inline void
pp_getreflen(int *ppref, int offset, int *refp, int *lenp)
{
if (ppref[offset] > 0) { /* chunk size must be 1 */
*refp = ppref[offset] - 1; /* don't forget to adjust */
*lenp = 1;
} else {
*refp = (ppref[offset] * -1) - 1;
*lenp = ppref[offset+1];
}
}
/*
* pp_setreflen: set the reference and length for a specific offset
*
* => ppref's amap must be locked
*/
static inline void
pp_setreflen(int *ppref, int offset, int ref, int len)
{
if (len == 0)
return;
if (len == 1) {
ppref[offset] = ref + 1;
} else {
ppref[offset] = (ref + 1) * -1;
ppref[offset+1] = len;
}
}
#endif /* UVM_AMAP_PPREF */
/*
* amap_alloc1: allocate an amap, but do not initialise the overlay.
*
* => Note: lock is not set.
*/
static struct vm_amap *
amap_alloc1(int slots, int padslots, int flags)
{
const bool nowait = (flags & UVM_FLAG_NOWAIT) != 0;
const km_flag_t kmflags = nowait ? KM_NOSLEEP : KM_SLEEP;
struct vm_amap *amap;
krwlock_t *newlock, *oldlock;
int totalslots;
amap = pool_cache_get(&uvm_amap_cache, nowait ? PR_NOWAIT : PR_WAITOK);
if (amap == NULL) {
return NULL;
}
	KASSERT(amap->am_lock != NULL);
	KASSERT(amap->am_nused == 0);
/* Try to privatize the lock if currently shared. */
if (rw_obj_refcnt(amap->am_lock) > 1) {
newlock = rw_obj_tryalloc();
		if (newlock != NULL) {
			oldlock = amap->am_lock;
mutex_enter(&amap_list_lock);
amap->am_lock = newlock;
mutex_exit(&amap_list_lock);
rw_obj_free(oldlock);
}
}
totalslots = amap_roundup_slots(slots + padslots);
amap->am_ref = 1;
amap->am_flags = 0;
#ifdef UVM_AMAP_PPREF
amap->am_ppref = NULL;
#endif
amap->am_maxslot = totalslots;
amap->am_nslot = slots;
/*
* Note: since allocations are likely big, we expect to reduce the
* memory fragmentation by allocating them in separate blocks.
*/
amap->am_slots = kmem_alloc(totalslots * sizeof(int), kmflags);
if (amap->am_slots == NULL)
goto fail1;
amap->am_bckptr = kmem_alloc(totalslots * sizeof(int), kmflags);
if (amap->am_bckptr == NULL)
goto fail2;
amap->am_anon = kmem_alloc(totalslots * sizeof(struct vm_anon *),
kmflags);
if (amap->am_anon == NULL)
goto fail3;
return amap;
fail3:
kmem_free(amap->am_bckptr, totalslots * sizeof(int));
fail2:
kmem_free(amap->am_slots, totalslots * sizeof(int));
fail1:
pool_cache_put(&uvm_amap_cache, amap);
/*
* XXX hack to tell the pagedaemon how many pages we need,
* since we can need more than it would normally free.
*/
if (nowait) {
extern u_int uvm_extrapages;
atomic_add_int(&uvm_extrapages,
((sizeof(int) * 2 + sizeof(struct vm_anon *)) *
totalslots) >> PAGE_SHIFT);
}
return NULL;
}
/*
* amap_alloc: allocate an amap to manage "sz" bytes of anonymous VM
*
* => caller should ensure sz is a multiple of PAGE_SIZE
* => reference count to new amap is set to one
* => new amap is returned unlocked
*/
struct vm_amap *
amap_alloc(vaddr_t sz, vaddr_t padsz, int waitf)
{
struct vm_amap *amap;
int slots, padslots;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	AMAP_B2SLOT(slots, sz);
	AMAP_B2SLOT(padslots, padsz);
amap = amap_alloc1(slots, padslots, waitf);
	if (amap) {
		memset(amap->am_anon, 0,
amap->am_maxslot * sizeof(struct vm_anon *));
}
UVMHIST_LOG(maphist,"<- done, amap = %#jx, sz=%jd", (uintptr_t)amap,
sz, 0, 0);
return(amap);
}
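/*
 * Illustrative sketch only: a minimal, hypothetical caller of amap_alloc()
 * that wants an amap covering four pages and is allowed to sleep.  The
 * constraints follow the comment above: size in bytes, a multiple of
 * PAGE_SIZE, and the amap comes back unlocked with a reference of one.
 */
#if 0	/* illustrative example only, not compiled */
static struct vm_amap *
amap_alloc_example(void)
{
	struct vm_amap *amap;

	amap = amap_alloc(4 * PAGE_SIZE, 0, 0 /* waitf: may sleep */);
	if (amap == NULL)
		return NULL;	/* only possible with UVM_FLAG_NOWAIT */
	KASSERT(amap_refs(amap) == 1);
	return amap;
}
#endif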
/*
* amap_ctor: pool_cache constructor for new amaps
*
* => carefully synchronize with amap_swap_off()
*/
static int
amap_ctor(void *arg, void *obj, int flags)
{
struct vm_amap *amap = obj;
if ((flags & PR_NOWAIT) != 0) {
amap->am_lock = rw_obj_tryalloc();
if (amap->am_lock == NULL) {
return ENOMEM;
}
} else {
amap->am_lock = rw_obj_alloc();
}
amap->am_nused = 0;
amap->am_flags = 0;
mutex_enter(&amap_list_lock);
LIST_INSERT_HEAD(&amap_list, amap, am_list);
mutex_exit(&amap_list_lock);
return 0;
}
/*
* amap_ctor: pool_cache destructor for amaps
*
* => carefully synchronize with amap_swap_off()
*/
static void
amap_dtor(void *arg, void *obj)
{
struct vm_amap *amap = obj;
KASSERT(amap->am_nused == 0);
mutex_enter(&amap_list_lock);
LIST_REMOVE(amap, am_list);
mutex_exit(&amap_list_lock);
rw_obj_free(amap->am_lock);
}
/*
* uvm_amap_init: initialize the amap system.
*/
void
uvm_amap_init(void)
{
mutex_init(&amap_list_lock, MUTEX_DEFAULT, IPL_NONE);
pool_cache_bootstrap(&uvm_amap_cache, sizeof(struct vm_amap),
COHERENCY_UNIT, 0, 0, "amappl", NULL, IPL_NONE,
amap_ctor, amap_dtor, NULL);
}
/*
* amap_free: free an amap
*
* => the amap must be unlocked
* => the amap should have a zero reference count and be empty
*/
void
amap_free(struct vm_amap *amap)
{
int slots;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	KASSERT(amap->am_ref == 0);
	KASSERT(amap->am_nused == 0);
	KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
slots = amap->am_maxslot;
kmem_free(amap->am_slots, slots * sizeof(*amap->am_slots));
kmem_free(amap->am_bckptr, slots * sizeof(*amap->am_bckptr));
kmem_free(amap->am_anon, slots * sizeof(*amap->am_anon));
#ifdef UVM_AMAP_PPREF
	if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
		kmem_free(amap->am_ppref, slots * sizeof(*amap->am_ppref));
#endif
pool_cache_put(&uvm_amap_cache, amap);
UVMHIST_LOG(maphist,"<- done, freed amap = %#jx", (uintptr_t)amap,
0, 0, 0);
}
/*
* amap_extend: extend the size of an amap (if needed)
*
* => called from uvm_map when we want to extend an amap to cover
* a new mapping (rather than allocate a new one)
* => amap should be unlocked (we will lock it)
* => to safely extend an amap it should have a reference count of
* one (thus it can't be shared)
*/
int
amap_extend(struct vm_map_entry *entry, vsize_t addsize, int flags)
{
struct vm_amap *amap = entry->aref.ar_amap;
int slotoff = entry->aref.ar_pageoff;
int slotmapped, slotadd, slotneed, slotadded, slotalloc;
int slotadj, slotarea, slotendoff;
int oldnslots;
#ifdef UVM_AMAP_PPREF
int *newppref, *oldppref;
#endif
int i, *newsl, *newbck, *oldsl, *oldbck;
struct vm_anon **newover, **oldover;
const km_flag_t kmflags =
(flags & AMAP_EXTEND_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (entry=%#jx, addsize=%#jx, flags=%#jx)",
(uintptr_t)entry, addsize, flags, 0);
/*
* first, determine how many slots we need in the amap. don't
* forget that ar_pageoff could be non-zero: this means that
* there are some unused slots before us in the amap.
*/
amap_lock(amap, RW_WRITER);
	KASSERT(amap_refs(amap) == 1);	/* amap can't be shared */
	AMAP_B2SLOT(slotmapped, entry->end - entry->start); /* slots mapped */
	AMAP_B2SLOT(slotadd, addsize);			    /* slots to add */
if (flags & AMAP_EXTEND_FORWARDS) {
slotneed = slotoff + slotmapped + slotadd;
slotadj = 0;
slotarea = 0;
} else {
slotneed = slotadd + slotmapped;
slotadj = slotadd - slotoff;
slotarea = amap->am_maxslot - slotmapped;
}
/*
* Because this amap only has 1 ref, we know that there is
* only one vm_map_entry pointing to it, and the one entry is
* using slots between slotoff and slotoff + slotmapped. If
* we have been using ppref then we know that only slots in
* the one map entry's range can have anons, since ppref
* allowed us to free any anons outside that range as other map
* entries which used this amap were removed. But without ppref,
* we couldn't know which slots were still needed by other map
* entries, so we couldn't free any anons as we removed map
* entries, and so any slot from 0 to am_nslot can have an
* anon. But now that we know there is only one map entry
* left and we know its range, we can free up any anons
* outside that range. This is necessary because the rest of
* this function assumes that there are no anons in the amap
* outside of the one map entry's range.
*/
slotendoff = slotoff + slotmapped;
	if (amap->am_ppref == PPREF_NONE) {
		amap_wiperange(amap, 0, slotoff);
amap_wiperange(amap, slotendoff, amap->am_nslot - slotendoff);
}
	for (i = 0; i < slotoff; i++) {
		KASSERT(amap->am_anon[i] == NULL);
}
	for (i = slotendoff; i < amap->am_nslot - slotendoff; i++) {
		KASSERT(amap->am_anon[i] == NULL);
}
/*
* case 1: we already have enough slots in the map and thus
* only need to bump the reference counts on the slots we are
* adding.
*/
if (flags & AMAP_EXTEND_FORWARDS) {
if (amap->am_nslot >= slotneed) {
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
amap_pp_adjref(amap, slotoff + slotmapped,
slotadd, 1);
}
#endif
amap_unlock(amap);
UVMHIST_LOG(maphist,
"<- done (case 1f), amap = %#jx, sltneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
} else {
if (slotadj <= 0) {
slotoff -= slotadd;
entry->aref.ar_pageoff = slotoff;
#ifdef UVM_AMAP_PPREF
			if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
				amap_pp_adjref(amap, slotoff, slotadd, 1);
}
#endif
amap_unlock(amap);
UVMHIST_LOG(maphist,
"<- done (case 1b), amap = %#jx, sltneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
}
/*
* case 2: we pre-allocated slots for use and we just need to
* bump nslot up to take account for these slots.
*/
if (amap->am_maxslot >= slotneed) {
if (flags & AMAP_EXTEND_FORWARDS) {
#ifdef UVM_AMAP_PPREF
			if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
				if ((slotoff + slotmapped) < amap->am_nslot)
amap_pp_adjref(amap,
slotoff + slotmapped,
(amap->am_nslot -
(slotoff + slotmapped)), 1);
pp_setreflen(amap->am_ppref, amap->am_nslot, 1,
slotneed - amap->am_nslot);
}
#endif
amap->am_nslot = slotneed;
amap_unlock(amap);
/*
* no need to zero am_anon since that was done at
* alloc time and we never shrink an allocation.
*/
UVMHIST_LOG(maphist,"<- done (case 2f), amap = %#jx, "
"slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0);
return 0;
} else {
#ifdef UVM_AMAP_PPREF
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
/*
* Slide up the ref counts on the pages that
* are actually in use.
*/
memmove(amap->am_ppref + slotarea,
amap->am_ppref + slotoff,
slotmapped * sizeof(int));
/*
* Mark the (adjusted) gap at the front as
* referenced/not referenced.
*/
pp_setreflen(amap->am_ppref,
0, 0, slotarea - slotadd);
pp_setreflen(amap->am_ppref,
slotarea - slotadd, 1, slotadd);
}
#endif
/*
* Slide the anon pointers up and clear out
* the space we just made.
*/
memmove(amap->am_anon + slotarea,
amap->am_anon + slotoff,
slotmapped * sizeof(struct vm_anon*));
memset(amap->am_anon + slotoff, 0,
(slotarea - slotoff) * sizeof(struct vm_anon *));
/*
* Slide the backpointers up, but don't bother
* wiping out the old slots.
*/
memmove(amap->am_bckptr + slotarea,
amap->am_bckptr + slotoff,
slotmapped * sizeof(int));
/*
* Adjust all the useful active slot numbers.
*/
for (i = 0; i < amap->am_nused; i++)
amap->am_slots[i] += (slotarea - slotoff);
/*
* We just filled all the empty space in the
* front of the amap by activating a few new
* slots.
*/
amap->am_nslot = amap->am_maxslot;
entry->aref.ar_pageoff = slotarea - slotadd;
amap_unlock(amap);
UVMHIST_LOG(maphist,"<- done (case 2b), amap = %#jx, "
"slotneed=%jd", (uintptr_t)amap, slotneed, 0, 0);
return 0;
}
}
/*
* Case 3: we need to allocate a new amap and copy all the amap
* data over from old amap to the new one. Drop the lock before
* performing allocation.
*
* Note: since allocations are likely big, we expect to reduce the
* memory fragmentation by allocating them in separate blocks.
*/
amap_unlock(amap);
if (slotneed >= UVM_AMAP_LARGE) {
return E2BIG;
}
slotalloc = amap_roundup_slots(slotneed);
#ifdef UVM_AMAP_PPREF
newppref = NULL;
if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
/* Will be handled later if fails. */
newppref = kmem_alloc(slotalloc * sizeof(*newppref), kmflags);
}
#endif
newsl = kmem_alloc(slotalloc * sizeof(*newsl), kmflags);
newbck = kmem_alloc(slotalloc * sizeof(*newbck), kmflags);
newover = kmem_alloc(slotalloc * sizeof(*newover), kmflags);
if (newsl == NULL || newbck == NULL || newover == NULL) {
#ifdef UVM_AMAP_PPREF
		if (newppref != NULL) {
			kmem_free(newppref, slotalloc * sizeof(*newppref));
}
#endif
		if (newsl != NULL) {
			kmem_free(newsl, slotalloc * sizeof(*newsl));
}
		if (newbck != NULL) {
			kmem_free(newbck, slotalloc * sizeof(*newbck));
}
		if (newover != NULL) {
			kmem_free(newover, slotalloc * sizeof(*newover));
}
return ENOMEM;
}
amap_lock(amap, RW_WRITER);
KASSERT(amap->am_maxslot < slotneed);
/*
* Copy everything over to new allocated areas.
*/
slotadded = slotalloc - amap->am_nslot;
if (!(flags & AMAP_EXTEND_FORWARDS))
slotarea = slotalloc - slotmapped;
/* do am_slots */
oldsl = amap->am_slots;
if (flags & AMAP_EXTEND_FORWARDS)
memcpy(newsl, oldsl, sizeof(int) * amap->am_nused);
else
for (i = 0; i < amap->am_nused; i++)
newsl[i] = oldsl[i] + slotarea - slotoff;
amap->am_slots = newsl;
/* do am_anon */
oldover = amap->am_anon;
if (flags & AMAP_EXTEND_FORWARDS) {
memcpy(newover, oldover,
sizeof(struct vm_anon *) * amap->am_nslot);
memset(newover + amap->am_nslot, 0,
sizeof(struct vm_anon *) * slotadded);
} else {
memcpy(newover + slotarea, oldover + slotoff,
sizeof(struct vm_anon *) * slotmapped);
memset(newover, 0,
sizeof(struct vm_anon *) * slotarea);
}
amap->am_anon = newover;
/* do am_bckptr */
oldbck = amap->am_bckptr;
if (flags & AMAP_EXTEND_FORWARDS)
memcpy(newbck, oldbck, sizeof(int) * amap->am_nslot);
else
memcpy(newbck + slotarea, oldbck + slotoff,
sizeof(int) * slotmapped);
amap->am_bckptr = newbck;
#ifdef UVM_AMAP_PPREF
/* do ppref */
oldppref = amap->am_ppref;
if (newppref) {
if (flags & AMAP_EXTEND_FORWARDS) {
memcpy(newppref, oldppref,
sizeof(int) * amap->am_nslot);
memset(newppref + amap->am_nslot, 0,
sizeof(int) * slotadded);
} else {
memcpy(newppref + slotarea, oldppref + slotoff,
sizeof(int) * slotmapped);
}
amap->am_ppref = newppref;
if ((flags & AMAP_EXTEND_FORWARDS) &&
(slotoff + slotmapped) < amap->am_nslot)
amap_pp_adjref(amap, slotoff + slotmapped,
(amap->am_nslot - (slotoff + slotmapped)), 1);
if (flags & AMAP_EXTEND_FORWARDS)
pp_setreflen(newppref, amap->am_nslot, 1,
slotneed - amap->am_nslot);
else {
pp_setreflen(newppref, 0, 0,
slotalloc - slotneed);
pp_setreflen(newppref, slotalloc - slotneed, 1,
slotneed - slotmapped);
}
} else {
		if (amap->am_ppref)
			amap->am_ppref = PPREF_NONE;
}
#endif
/* update master values */
if (flags & AMAP_EXTEND_FORWARDS)
amap->am_nslot = slotneed;
else {
entry->aref.ar_pageoff = slotarea - slotadd;
amap->am_nslot = slotalloc;
}
oldnslots = amap->am_maxslot;
amap->am_maxslot = slotalloc;
amap_unlock(amap);
kmem_free(oldsl, oldnslots * sizeof(*oldsl));
kmem_free(oldbck, oldnslots * sizeof(*oldbck));
kmem_free(oldover, oldnslots * sizeof(*oldover));
#ifdef UVM_AMAP_PPREF
	if (oldppref && oldppref != PPREF_NONE)
		kmem_free(oldppref, oldnslots * sizeof(*oldppref));
#endif
UVMHIST_LOG(maphist,"<- done (case 3), amap = %#jx, slotneed=%jd",
(uintptr_t)amap, slotneed, 0, 0);
return 0;
}
/*
* amap_share_protect: change protection of anons in a shared amap
*
* for shared amaps, given the current data structure layout, it is
* not possible for us to directly locate all maps referencing the
* shared anon (to change the protection). in order to protect data
* in shared maps we use pmap_page_protect(). [this is useful for IPC
* mechanisms like map entry passing that may want to write-protect
* all mappings of a shared amap.] we traverse am_anon or am_slots
* depending on the current state of the amap.
*
* => entry's map and amap must be locked by the caller
*/
void
amap_share_protect(struct vm_map_entry *entry, vm_prot_t prot)
{
struct vm_amap *amap = entry->aref.ar_amap;
u_int slots, lcv, slot, stop;
struct vm_anon *anon;
KASSERT(rw_write_held(amap->am_lock));
AMAP_B2SLOT(slots, (entry->end - entry->start));
stop = entry->aref.ar_pageoff + slots;
if (slots < amap->am_nused) {
/*
* Cheaper to traverse am_anon.
*/
for (lcv = entry->aref.ar_pageoff ; lcv < stop ; lcv++) {
anon = amap->am_anon[lcv];
if (anon == NULL) {
continue;
}
if (anon->an_page) {
pmap_page_protect(anon->an_page, prot);
}
}
return;
}
/*
* Cheaper to traverse am_slots.
*/
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
slot = amap->am_slots[lcv];
if (slot < entry->aref.ar_pageoff || slot >= stop) {
continue;
}
anon = amap->am_anon[slot];
if (anon->an_page) {
pmap_page_protect(anon->an_page, prot);
}
}
}
/*
* amap_wipeout: wipeout all anon's in an amap; then free the amap!
*
* => Called from amap_unref(), when reference count drops to zero.
* => amap must be locked.
*/
void
amap_wipeout(struct vm_amap *amap)
{
u_int lcv;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist,"(amap=%#jx)", (uintptr_t)amap, 0,0,0);
	KASSERT(rw_write_held(amap->am_lock));
	KASSERT(amap->am_ref == 0);
if (__predict_false(amap->am_flags & AMAP_SWAPOFF)) {
/*
* Note: amap_swap_off() will call us again.
*/
amap_unlock(amap);
return;
}
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
struct vm_anon *anon;
u_int slot;
slot = amap->am_slots[lcv];
anon = amap->am_anon[slot];
		KASSERT(anon != NULL);
		KASSERT(anon->an_ref != 0);
		KASSERT(anon->an_lock == amap->am_lock);
UVMHIST_LOG(maphist," processing anon %#jx, ref=%jd",
(uintptr_t)anon, anon->an_ref, 0, 0);
/*
* Drop the reference.
*/
		if (__predict_true(--anon->an_ref == 0)) {
			uvm_anfree(anon);
}
		if (__predict_false((lcv & 31) == 31)) {
			preempt_point();
}
}
/*
* Finally, destroy the amap.
*/
amap->am_nused = 0;
amap_unlock(amap);
amap_free(amap);
UVMHIST_LOG(maphist,"<- done!", 0,0,0,0);
}
/*
* amap_copy: ensure that a map entry's "needs_copy" flag is false
* by copying the amap if necessary.
*
* => an entry with a null amap pointer will get a new (blank) one.
* => the map that the map entry belongs to must be locked by caller.
* => the amap currently attached to "entry" (if any) must be unlocked.
* => if canchunk is true, then we may clip the entry into a chunk
* => "startva" and "endva" are used only if canchunk is true. they are
* used to limit chunking (e.g. if you have a large space that you
* know you are going to need to allocate amaps for, there is no point
* in allowing that to be chunked)
*/
void
amap_copy(struct vm_map *map, struct vm_map_entry *entry, int flags,
vaddr_t startva, vaddr_t endva)
{
const int waitf = (flags & AMAP_COPY_NOWAIT) ? UVM_FLAG_NOWAIT : 0;
struct vm_amap *amap, *srcamap;
u_int slots, lcv;
krwlock_t *oldlock;
vsize_t len;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, " (map=%#jx, entry=%#jx, flags=%#jx)",
(uintptr_t)map, (uintptr_t)entry, flags, -2);
KASSERT(map != kernel_map); /* we use nointr pool */
srcamap = entry->aref.ar_amap;
len = entry->end - entry->start;
/*
* Is there an amap to copy? If not, create one.
*/
if (srcamap == NULL) {
const bool canchunk = (flags & AMAP_COPY_NOCHUNK) == 0;
/*
* Check to see if we have a large amap that we can
* chunk. We align startva/endva to chunk-sized
* boundaries and then clip to them.
*/
if (canchunk && atop(len) >= UVM_AMAP_LARGE) {
vsize_t chunksize;
/* Convert slots to bytes. */
chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT;
startva = (startva / chunksize) * chunksize;
endva = roundup(endva, chunksize);
UVMHIST_LOG(maphist,
" chunk amap ==> clip %#jx->%#jx to %#jx->%#jx",
entry->start, entry->end, startva, endva);
UVM_MAP_CLIP_START(map, entry, startva);
/* Watch out for endva wrap-around! */
			if (endva >= startva) {
				UVM_MAP_CLIP_END(map, entry, endva);
}
}
if ((flags & AMAP_COPY_NOMERGE) == 0 &&
uvm_mapent_trymerge(map, entry, UVM_MERGE_COPYING)) {
return;
}
UVMHIST_LOG(maphist, "<- done [creating new amap %#jx->%#jx]",
entry->start, entry->end, 0, 0);
/*
* Allocate an initialised amap and install it.
* Note: we must update the length after clipping.
*/
len = entry->end - entry->start;
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap_alloc(len, 0, waitf);
if (entry->aref.ar_amap != NULL) {
entry->etype &= ~UVM_ET_NEEDSCOPY;
}
return;
}
/*
	 * First check and see if we are the only map entry referencing
	 * the amap we currently have.  If so, then just take it over instead
	 * of copying it.  Note that we are reading am_ref without the lock
	 * held, as the value can only be one if we have the only reference
	 * to the amap (via our locked map).  If the value is greater than
	 * one, then allocate a new amap and re-check the value.
*/
if (srcamap->am_ref == 1) {
entry->etype &= ~UVM_ET_NEEDSCOPY;
UVMHIST_LOG(maphist, "<- done [ref cnt = 1, took it over]",
0, 0, 0, 0);
return;
}
UVMHIST_LOG(maphist," amap=%#jx, ref=%jd, must copy it",
(uintptr_t)srcamap, srcamap->am_ref, 0, 0);
/*
* Allocate a new amap (note: not initialised, etc).
*/
AMAP_B2SLOT(slots, len);
amap = amap_alloc1(slots, 0, waitf);
if (amap == NULL) {
UVMHIST_LOG(maphist, " amap_alloc1 failed", 0,0,0,0);
return;
}
/*
* Make the new amap share the source amap's lock, and then lock
* both. We must do this before we set am_nused != 0, otherwise
* amap_swap_off() can become interested in the amap.
*/
oldlock = amap->am_lock;
mutex_enter(&amap_list_lock);
amap->am_lock = srcamap->am_lock;
mutex_exit(&amap_list_lock);
rw_obj_hold(amap->am_lock);
rw_obj_free(oldlock);
amap_lock(srcamap, RW_WRITER);
/*
* Re-check the reference count with the lock held. If it has
* dropped to one - we can take over the existing map.
*/
if (srcamap->am_ref == 1) {
/* Just take over the existing amap. */
entry->etype &= ~UVM_ET_NEEDSCOPY;
amap_unlock(srcamap);
/* Destroy the new (unused) amap. */
amap->am_ref--;
amap_free(amap);
return;
}
/*
* Copy the slots. Zero the padded part.
*/
UVMHIST_LOG(maphist, " copying amap now",0, 0, 0, 0);
for (lcv = 0 ; lcv < slots; lcv++) {
amap->am_anon[lcv] =
srcamap->am_anon[entry->aref.ar_pageoff + lcv];
if (amap->am_anon[lcv] == NULL)
continue;
		KASSERT(amap->am_anon[lcv]->an_lock == srcamap->am_lock);
		KASSERT(amap->am_anon[lcv]->an_ref > 0);
		KASSERT(amap->am_nused < amap->am_maxslot);
amap->am_anon[lcv]->an_ref++;
amap->am_bckptr[lcv] = amap->am_nused;
amap->am_slots[amap->am_nused] = lcv;
amap->am_nused++;
}
memset(&amap->am_anon[lcv], 0,
(amap->am_maxslot - lcv) * sizeof(struct vm_anon *));
/*
* Drop our reference to the old amap (srcamap) and unlock.
* Since the reference count on srcamap is greater than one,
* (we checked above), it cannot drop to zero while it is locked.
*/
srcamap->am_ref--;
	KASSERT(srcamap->am_ref > 0);
	if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0) {
		srcamap->am_flags &= ~AMAP_SHARED;
}
#ifdef UVM_AMAP_PPREF
	if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) {
		amap_pp_adjref(srcamap, entry->aref.ar_pageoff,
len >> PAGE_SHIFT, -1);
}
#endif
amap_unlock(srcamap);
/*
* Install new amap.
*/
entry->aref.ar_pageoff = 0;
entry->aref.ar_amap = amap;
entry->etype &= ~UVM_ET_NEEDSCOPY;
UVMHIST_LOG(maphist, "<- done",0, 0, 0, 0);
}
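/*
 * Illustrative sketch only: the chunk clipping arithmetic used inside
 * amap_copy() above, shown in isolation.  The comments assume a
 * hypothetical 4 KB page size and a 16-page UVM_AMAP_CHUNK purely for
 * the sake of the worked numbers.
 */
#if 0	/* illustrative example only, not compiled */
static void
amap_copy_chunk_example(void)
{
	const vsize_t chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT;
	vaddr_t startva = 0x12345000;
	vaddr_t endva = 0x12371000;

	/* Round the start down and the end up to chunk boundaries. */
	startva = (startva / chunksize) * chunksize;
	endva = roundup(endva, chunksize);
	/* With 4 KB pages and a 16-page chunk: 0x12340000 .. 0x12380000. */
	printf("clip to %#jx..%#jx\n", (uintmax_t)startva, (uintmax_t)endva);
}
#endif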
/*
* amap_cow_now: resolve all copy-on-write faults in an amap now for fork(2)
*
* called during fork(2) when the parent process has a wired map
* entry. in that case we want to avoid write-protecting pages
* in the parent's map (e.g. like what you'd do for a COW page)
* so we resolve the COW here.
*
* => assume parent's entry was wired, thus all pages are resident.
* => assume pages that are loaned out (loan_count) are already mapped
* read-only in all maps, and thus no need for us to worry about them
* => assume both parent and child vm_map's are locked
* => caller passes child's map/entry in to us
* => if we run out of memory we will unlock the amap and sleep _with_ the
* parent and child vm_map's locked(!). we have to do this since
* we are in the middle of a fork(2) and we can't let the parent
* map change until we are done copying all the map entrys.
* => XXXCDC: out of memory should cause fork to fail, but there is
* currently no easy way to do this (needs fix)
*/
void
amap_cow_now(struct vm_map *map, struct vm_map_entry *entry)
{
struct vm_amap *amap = entry->aref.ar_amap;
struct vm_anon *anon, *nanon;
struct vm_page *pg, *npg;
u_int lcv, slot;
/*
* note that if we unlock the amap then we must ReStart the "lcv" for
* loop because some other process could reorder the anon's in the
* am_anon[] array on us while the lock is dropped.
*/
ReStart:
amap_lock(amap, RW_WRITER);
for (lcv = 0 ; lcv < amap->am_nused ; lcv++) {
slot = amap->am_slots[lcv];
anon = amap->am_anon[slot];
KASSERT(anon->an_lock == amap->am_lock);
/*
* If anon has only one reference - we must have already
* copied it. This can happen if we needed to sleep waiting
* for memory in a previous run through this loop. The new
		 * page might even have been paged out, since it is not wired.
*/
if (anon->an_ref == 1) {
KASSERT(anon->an_page != NULL || anon->an_swslot != 0);
continue;
}
/*
* The old page must be resident since the parent is wired.
*/
pg = anon->an_page;
		KASSERT(pg != NULL);
		KASSERT(pg->wire_count > 0);
/*
* If the page is loaned then it must already be mapped
* read-only and we don't need to copy it.
*/
if (pg->loan_count != 0) {
continue;
}
		KASSERT(pg->uanon == anon);
		KASSERT(pg->uobject == NULL);
/*
* If the page is busy, then we have to unlock, wait for
* it and then restart.
*/
if (pg->flags & PG_BUSY) {
uvm_pagewait(pg, amap->am_lock, "cownow");
goto ReStart;
}
/*
* Perform a copy-on-write.
* First - get a new anon and a page.
*/
nanon = uvm_analloc();
if (nanon) {
nanon->an_lock = amap->am_lock;
npg = uvm_pagealloc(NULL, 0, nanon, 0);
} else {
npg = NULL;
}
if (nanon == NULL || npg == NULL) {
amap_unlock(amap);
if (nanon) {
nanon->an_lock = NULL;
nanon->an_ref--;
KASSERT(nanon->an_ref == 0);
uvm_anfree(nanon);
}
uvm_wait("cownowpage");
goto ReStart;
}
/*
* Copy the data and replace anon with the new one.
		 * Also, set up its lock (share it with the amap's lock).
*/
uvm_pagecopy(pg, npg);
anon->an_ref--;
KASSERT(anon->an_ref > 0);
amap->am_anon[slot] = nanon;
/*
* Drop PG_BUSY on new page. Since its owner was write
* locked all this time - it cannot be PG_RELEASED or
* waited on.
*/
uvm_pagelock(npg);
uvm_pageactivate(npg);
uvm_pageunlock(npg);
npg->flags &= ~(PG_BUSY|PG_FAKE);
UVM_PAGE_OWN(npg, NULL);
}
amap_unlock(amap);
}
/*
* amap_splitref: split a single reference into two separate references
*
* => called from uvm_map's clip routines
* => origref's map should be locked
* => origref->ar_amap should be unlocked (we will lock)
*/
void
amap_splitref(struct vm_aref *origref, struct vm_aref *splitref, vaddr_t offset)
{
struct vm_amap *amap = origref->ar_amap;
u_int leftslots;
	KASSERT(splitref->ar_amap == origref->ar_amap);
	AMAP_B2SLOT(leftslots, offset);
	KASSERT(leftslots != 0);
amap_lock(amap, RW_WRITER);
KASSERT(amap->am_nslot - origref->ar_pageoff - leftslots > 0);
#ifdef UVM_AMAP_PPREF
/* Establish ppref before we add a duplicate reference to the amap. */
	if (amap->am_ppref == NULL) {
		amap_pp_establish(amap, origref->ar_pageoff);
}
#endif
/* Note: not a share reference. */
amap->am_ref++;
splitref->ar_pageoff = origref->ar_pageoff + leftslots;
amap_unlock(amap);
}
#ifdef UVM_AMAP_PPREF
/*
* amap_pp_establish: add a ppref array to an amap, if possible.
*
* => amap should be locked by caller.
*/
void
amap_pp_establish(struct vm_amap *amap, vaddr_t offset)
{
const size_t sz = amap->am_maxslot * sizeof(*amap->am_ppref);
KASSERT(rw_write_held(amap->am_lock));
amap->am_ppref = kmem_zalloc(sz, KM_NOSLEEP);
if (amap->am_ppref == NULL) {
/* Failure - just do not use ppref. */
amap->am_ppref = PPREF_NONE;
return;
}
	pp_setreflen(amap->am_ppref, 0, 0, offset);
	pp_setreflen(amap->am_ppref, offset, amap->am_ref,
amap->am_nslot - offset);
}
/*
* amap_pp_adjref: adjust reference count to a part of an amap using the
* per-page reference count array.
*
* => caller must check that ppref != PPREF_NONE before calling.
* => map and amap must be locked.
*/
void
amap_pp_adjref(struct vm_amap *amap, int curslot, vsize_t slotlen, int adjval)
{
int stopslot, *ppref, lcv, prevlcv;
int ref, len, prevref, prevlen;
KASSERT(rw_write_held(amap->am_lock));
stopslot = curslot + slotlen;
ppref = amap->am_ppref;
prevlcv = 0;
/*
* Advance to the correct place in the array, fragment if needed.
*/
	for (lcv = 0 ; lcv < curslot ; lcv += len) {
		pp_getreflen(ppref, lcv, &ref, &len);
		if (lcv + len > curslot) {	/* goes past start? */
			pp_setreflen(ppref, lcv, ref, curslot - lcv);
			pp_setreflen(ppref, curslot, ref,
			    len - (curslot - lcv));
			len = curslot - lcv;	/* new length of entry @ lcv */
		}
		prevlcv = lcv;
	}
if (lcv == 0) {
/*
* Ensure that the "prevref == ref" test below always
* fails, since we are starting from the beginning of
* the ppref array; that is, there is no previous chunk.
*/
prevref = -1;
prevlen = 0;
} else {
pp_getreflen(ppref, prevlcv, &prevref, &prevlen);
}
/*
* Now adjust reference counts in range. Merge the first
* changed entry with the last unchanged entry if possible.
*/
	KASSERT(lcv == curslot);
	for (/* lcv already set */; lcv < stopslot ; lcv += len) {
		pp_getreflen(ppref, lcv, &ref, &len);
		if (lcv + len > stopslot) {	/* goes past end? */
			pp_setreflen(ppref, lcv, ref, stopslot - lcv);
			pp_setreflen(ppref, stopslot, ref,
			    len - (stopslot - lcv));
			len = stopslot - lcv;
		}
ref += adjval;
		KASSERT(ref >= 0);
		KASSERT(ref <= amap->am_ref);
if (lcv == prevlcv + prevlen && ref == prevref) {
pp_setreflen(ppref, prevlcv, ref, prevlen + len);
} else {
pp_setreflen(ppref, lcv, ref, len);
}
if (ref == 0) {
amap_wiperange(amap, lcv, len);
}
}
}
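/*
 * Illustrative sketch only: how a ppref array changes when a sub-range
 * gains a reference, using nothing but pp_setreflen() and pp_getreflen().
 * The 8-slot array and the reference counts are arbitrary example values.
 */
#if 0	/* illustrative example only, not compiled */
static void
ppref_adjust_example(void)
{
	int ppref[8], ref, len;

	/* Start with all 8 slots at one reference: encoded as (-2, 8). */
	pp_setreflen(ppref, 0, 1, 8);
	pp_getreflen(ppref, 0, &ref, &len);	/* ref = 1, len = 8 */

	/* Give slots 2..5 an extra reference, splitting the chunk. */
	pp_setreflen(ppref, 0, 1, 2);		/* slots 0..1: ref 1 */
	pp_setreflen(ppref, 2, 2, 4);		/* slots 2..5: ref 2 */
	pp_setreflen(ppref, 6, 1, 2);		/* slots 6..7: ref 1 */
}
#endif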
/*
* amap_wiperange: wipe out a range of an amap.
* Note: different from amap_wipeout because the amap is kept intact.
*
* => Both map and amap must be locked by caller.
*/
void
amap_wiperange(struct vm_amap *amap, int slotoff, int slots)
{
u_int lcv, stop, slotend;
bool byanon;
KASSERT(rw_write_held(amap->am_lock));
/*
* We can either traverse the amap by am_anon or by am_slots.
* Determine which way is less expensive.
*/
if (slots < amap->am_nused) {
byanon = true;
lcv = slotoff;
stop = slotoff + slots;
slotend = 0;
} else {
byanon = false;
lcv = 0;
stop = amap->am_nused;
slotend = slotoff + slots;
}
while (lcv < stop) {
struct vm_anon *anon;
u_int curslot, ptr, last;
if (byanon) {
curslot = lcv++; /* lcv advances here */
if (amap->am_anon[curslot] == NULL)
continue;
} else {
curslot = amap->am_slots[lcv];
if (curslot < slotoff || curslot >= slotend) {
lcv++; /* lcv advances here */
continue;
}
stop--; /* drop stop, since anon will be removed */
}
anon = amap->am_anon[curslot];
KASSERT(anon->an_lock == amap->am_lock);
/*
* Remove anon from the amap.
*/
amap->am_anon[curslot] = NULL;
ptr = amap->am_bckptr[curslot];
last = amap->am_nused - 1;
		if (ptr != last) {
			amap->am_slots[ptr] = amap->am_slots[last];
amap->am_bckptr[amap->am_slots[ptr]] = ptr;
}
amap->am_nused--;
/*
* Drop its reference count.
*/
		KASSERT(anon->an_lock == amap->am_lock);
		if (--anon->an_ref == 0) {
			uvm_anfree(anon);
}
}
}
#endif
#if defined(VMSWAP)
/*
* amap_swap_off: pagein anonymous pages in amaps and drop swap slots.
*
* => called with swap_syscall_lock held.
* => note that we don't always traverse all anons.
* eg. amaps being wiped out, released anons.
* => return true if failed.
*/
bool
amap_swap_off(int startslot, int endslot)
{
struct vm_amap *am;
struct vm_amap *am_next;
struct vm_amap marker_prev;
struct vm_amap marker_next;
bool rv = false;
#if defined(DIAGNOSTIC)
memset(&marker_prev, 0, sizeof(marker_prev));
memset(&marker_next, 0, sizeof(marker_next));
#endif /* defined(DIAGNOSTIC) */
mutex_enter(&amap_list_lock);
for (am = LIST_FIRST(&amap_list); am != NULL && !rv; am = am_next) {
int i;
LIST_INSERT_BEFORE(am, &marker_prev, am_list);
LIST_INSERT_AFTER(am, &marker_next, am_list);
/* amap_list_lock prevents the lock pointer from changing. */
if (!amap_lock_try(am, RW_WRITER)) {
(void)kpause("amapswpo", false, 1, &amap_list_lock);
am_next = LIST_NEXT(&marker_prev, am_list);
if (am_next == &marker_next) {
am_next = LIST_NEXT(am_next, am_list);
} else {
KASSERT(LIST_NEXT(am_next, am_list) ==
&marker_next);
}
LIST_REMOVE(&marker_prev, am_list);
LIST_REMOVE(&marker_next, am_list);
continue;
}
mutex_exit(&amap_list_lock);
/* If am_nused == 0, the amap could be free - careful. */
for (i = 0; i < am->am_nused; i++) {
int slot;
int swslot;
struct vm_anon *anon;
slot = am->am_slots[i];
anon = am->am_anon[slot];
KASSERT(anon->an_lock == am->am_lock);
swslot = anon->an_swslot;
if (swslot < startslot || endslot <= swslot) {
continue;
}
am->am_flags |= AMAP_SWAPOFF;
rv = uvm_anon_pagein(am, anon);
amap_lock(am, RW_WRITER);
am->am_flags &= ~AMAP_SWAPOFF;
if (amap_refs(am) == 0) {
amap_wipeout(am);
am = NULL;
break;
}
if (rv) {
break;
}
i = 0;
}
if (am) {
amap_unlock(am);
}
mutex_enter(&amap_list_lock);
KASSERT(LIST_NEXT(&marker_prev, am_list) == &marker_next ||
LIST_NEXT(LIST_NEXT(&marker_prev, am_list), am_list) ==
&marker_next);
am_next = LIST_NEXT(&marker_next, am_list);
LIST_REMOVE(&marker_prev, am_list);
LIST_REMOVE(&marker_next, am_list);
}
mutex_exit(&amap_list_lock);
return rv;
}
#endif /* defined(VMSWAP) */
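/*
 * Illustrative sketch only: the marker-entry trick used by amap_swap_off()
 * above, shown on a hypothetical list of "struct item".  A stack-allocated
 * marker parks after the current element so the list lock can be dropped
 * and the scan can still find its place afterwards.  A real user must also
 * ensure that any other code walking the list can cope with (or never
 * sees) the marker entries.
 */
#if 0	/* illustrative example only, not compiled */
struct item {
	LIST_ENTRY(item) i_list;
};
static LIST_HEAD(, item) item_list;
static kmutex_t item_list_lock;

static void
item_list_scan(void (*work)(struct item *))
{
	struct item *it, *it_next, marker;

	mutex_enter(&item_list_lock);
	for (it = LIST_FIRST(&item_list); it != NULL; it = it_next) {
		/* Park a marker after the current element. */
		LIST_INSERT_AFTER(it, &marker, i_list);
		mutex_exit(&item_list_lock);
		(*work)(it);		/* may sleep, may even free "it" */
		mutex_enter(&item_list_lock);
		/* Resume from the marker, whatever happened to "it". */
		it_next = LIST_NEXT(&marker, i_list);
		LIST_REMOVE(&marker, i_list);
	}
	mutex_exit(&item_list_lock);
}
#endif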
/*
* amap_lookup: look up a page in an amap.
*
* => amap should be locked by caller.
*/
struct vm_anon *
amap_lookup(struct vm_aref *aref, vaddr_t offset)
{
struct vm_amap *amap = aref->ar_amap;
struct vm_anon *an;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	KASSERT(rw_lock_held(amap->am_lock));
	AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
an = amap->am_anon[slot];
UVMHIST_LOG(maphist,
"<- done (amap=%#jx, offset=%#jx, result=%#jx)",
(uintptr_t)amap, offset, (uintptr_t)an, 0);
	KASSERT(slot < amap->am_nslot);
	KASSERT(an == NULL || an->an_ref != 0);
	KASSERT(an == NULL || an->an_lock == amap->am_lock);
return an;
}
/*
* amap_lookups: look up a range of pages in an amap.
*
* => amap should be locked by caller.
*/
void
amap_lookups(struct vm_aref *aref, vaddr_t offset, struct vm_anon **anons,
int npages)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	KASSERT(rw_lock_held(amap->am_lock));
	AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
UVMHIST_LOG(maphist, " slot=%u, npages=%d, nslot=%d",
slot, npages, amap->am_nslot, 0);
KASSERT((slot + (npages - 1)) < amap->am_nslot);
memcpy(anons, &amap->am_anon[slot], npages * sizeof(struct vm_anon *));
#if defined(DIAGNOSTIC)
for (int i = 0; i < npages; i++) {
struct vm_anon * const an = anons[i];
if (an == NULL) {
continue;
}
		KASSERT(an->an_ref != 0);
		KASSERT(an->an_lock == amap->am_lock);
}
#endif
UVMHIST_LOG(maphist, "<- done", 0, 0, 0, 0);
}
/*
* amap_add: add (or replace) a page to an amap.
*
* => amap should be locked by caller.
* => anon must have the lock associated with this amap.
*/
void
amap_add(struct vm_aref *aref, vaddr_t offset, struct vm_anon *anon,
bool replace)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	KASSERT(rw_write_held(amap->am_lock));
	KASSERT(anon->an_lock == amap->am_lock);
	AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
KASSERT(slot < amap->am_nslot);
if (replace) {
struct vm_anon *oanon = amap->am_anon[slot];
		KASSERT(oanon != NULL);
		if (oanon->an_page && (amap->am_flags & AMAP_SHARED) != 0) {
			pmap_page_protect(oanon->an_page, VM_PROT_NONE);
/*
* XXX: suppose page is supposed to be wired somewhere?
*/
}
} else {
		KASSERT(amap->am_anon[slot] == NULL);
		KASSERT(amap->am_nused < amap->am_maxslot);
amap->am_bckptr[slot] = amap->am_nused;
amap->am_slots[amap->am_nused] = slot;
amap->am_nused++;
}
amap->am_anon[slot] = anon;
UVMHIST_LOG(maphist,
"<- done (amap=%#jx, offset=%#x, anon=%#jx, rep=%d)",
(uintptr_t)amap, offset, (uintptr_t)anon, replace);
}
/*
* amap_unadd: remove a page from an amap.
*
* => amap should be locked by caller.
*/
void
amap_unadd(struct vm_aref *aref, vaddr_t offset)
{
struct vm_amap *amap = aref->ar_amap;
u_int slot, ptr, last;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
	KASSERT(rw_write_held(amap->am_lock));
	AMAP_B2SLOT(slot, offset);
slot += aref->ar_pageoff;
	KASSERT(slot < amap->am_nslot);
	KASSERT(amap->am_anon[slot] != NULL);
	KASSERT(amap->am_anon[slot]->an_lock == amap->am_lock);
amap->am_anon[slot] = NULL;
ptr = amap->am_bckptr[slot];
last = amap->am_nused - 1;
if (ptr != last) {
/* Move the last entry to keep the slots contiguous. */
amap->am_slots[ptr] = amap->am_slots[last];
amap->am_bckptr[amap->am_slots[ptr]] = ptr;
}
amap->am_nused--;
UVMHIST_LOG(maphist, "<- done (amap=%#jx, slot=%#jx)",
(uintptr_t)amap, slot,0, 0);
}
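/*
 * Illustrative sketch only: am_slots/am_bckptr form the usual "dense array
 * with back pointers" set that amap_add() and amap_unadd() above maintain.
 * The same bookkeeping is shown here on hypothetical plain arrays: a slot
 * is appended and then removed again by swapping in the last dense entry.
 */
#if 0	/* illustrative example only, not compiled */
static void
dense_set_example(u_int *slots, u_int *bckptr, u_int *nused, u_int slot)
{
	u_int ptr, last;

	/* Insert: append to the dense array, remember its position. */
	bckptr[slot] = *nused;
	slots[*nused] = slot;
	(*nused)++;

	/* Remove: overwrite our position with the last dense entry. */
	ptr = bckptr[slot];
	last = *nused - 1;
	if (ptr != last) {
		slots[ptr] = slots[last];
		bckptr[slots[ptr]] = ptr;
	}
	(*nused)--;
}
#endif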
/*
* amap_adjref_anons: adjust the reference count(s) on amap and its anons.
*/
static void
amap_adjref_anons(struct vm_amap *amap, vaddr_t offset, vsize_t len,
int refv, bool all)
{
#ifdef UVM_AMAP_PPREF
KASSERT(rw_write_held(amap->am_lock));
/*
* We must establish the ppref array before changing am_ref
* so that the ppref values match the current amap refcount.
*/
	if (amap->am_ppref == NULL) {
		amap_pp_establish(amap, offset);
}
#endif
amap->am_ref += refv;
#ifdef UVM_AMAP_PPREF
	if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
		amap_pp_adjref(amap, offset, len, refv);
}
#endif
amap_unlock(amap);
}
/*
* amap_ref: gain a reference to an amap.
*
* => amap must not be locked (we will lock).
* => "offset" and "len" are in units of pages.
* => Called at fork time to gain the child's reference.
*/
void
amap_ref(struct vm_amap *amap, vaddr_t offset, vsize_t len, int flags)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
amap_lock(amap, RW_WRITER);
	if (flags & AMAP_SHARED) {
		amap->am_flags |= AMAP_SHARED;
}
amap_adjref_anons(amap, offset, len, 1, (flags & AMAP_REFALL) != 0);
UVMHIST_LOG(maphist,"<- done! amap=%#jx", (uintptr_t)amap, 0, 0, 0);
}
/*
* amap_unref: remove a reference to an amap.
*
* => All pmap-level references to this amap must be already removed.
* => Called from uvm_unmap_detach(); entry is already removed from the map.
* => We will lock amap, so it must be unlocked.
*/
void
amap_unref(struct vm_amap *amap, vaddr_t offset, vsize_t len, bool all)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
amap_lock(amap, RW_WRITER);
UVMHIST_LOG(maphist," amap=%#jx refs=%d, nused=%d",
(uintptr_t)amap, amap->am_ref, amap->am_nused, 0);
	KASSERT(amap->am_ref > 0);
	if (amap->am_ref == 1) {
/*
* If the last reference - wipeout and destroy the amap.
*/
amap->am_ref--;
amap_wipeout(amap);
UVMHIST_LOG(maphist,"<- done (was last ref)!", 0, 0, 0, 0);
return;
}
/*
* Otherwise, drop the reference count(s) on anons.
*/
	if (amap->am_ref == 2 && (amap->am_flags & AMAP_SHARED) != 0) {
		amap->am_flags &= ~AMAP_SHARED;
}
amap_adjref_anons(amap, offset, len, -1, all);
UVMHIST_LOG(maphist,"<- done!", 0, 0, 0, 0);
}
/* $NetBSD: kern_uipc_socket_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */
/*
* Copyright (c) 2002, 2007, 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of Wasabi Systems, Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2004 The FreeBSD Foundation
* Copyright (c) 2004 Robert Watson
* Copyright (c) 1982, 1986, 1988, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
*/
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vn.c 1.13 94/04/02$
*
* @(#)vn.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_uipc_socket_50.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/compat_stub.h>
#include <sys/socketvar.h>
#include <compat/sys/time.h>
#include <compat/sys/socket.h>
#include <compat/common/compat_mod.h>
static int
uipc_socket_50_getopt1(int opt, struct socket *so, struct sockopt *sopt)
{
int optval, error;
struct timeval50 otv;
switch (opt) {
case SO_OSNDTIMEO:
case SO_ORCVTIMEO:
optval = (opt == SO_OSNDTIMEO ?
so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
otv.tv_sec = optval / hz;
otv.tv_usec = (optval % hz) * tick;
error = sockopt_set(sopt, &otv, sizeof(otv));
break;
case SO_OTIMESTAMP:
error = sockopt_setint(sopt, (so->so_options & opt) ? 1 : 0);
break;
default:
error = EPASSTHROUGH;
}
return error;
}
static int
uipc_socket_50_setopt1(int opt, struct socket *so, const struct sockopt *sopt)
{
int optval, error;
struct timeval50 otv;
struct timeval tv;
switch (opt) {
case SO_OSNDTIMEO:
case SO_ORCVTIMEO:
solock(so);
error = sockopt_get(sopt, &otv, sizeof(otv));
if (error)
break;
timeval50_to_timeval(&otv, &tv);
/* Code duplicated from sys/kern/uipc_socket.c */
if (tv.tv_sec < 0 || tv.tv_usec < 0 || tv.tv_usec >= 1000000) {
error = EDOM;
break;
}
if (tv.tv_sec > (INT_MAX - tv.tv_usec / tick) / hz) {
error = EDOM;
break;
}
optval = tv.tv_sec * hz + tv.tv_usec / tick;
if (optval == 0 && tv.tv_usec != 0)
optval = 1;
switch (opt) {
case SO_OSNDTIMEO:
so->so_snd.sb_timeo = optval;
break;
case SO_ORCVTIMEO:
so->so_rcv.sb_timeo = optval;
break;
}
break;
case SO_OTIMESTAMP:
error = sockopt_getint(sopt, &optval);
		solock(so);
		if (error)
break;
if (optval)
so->so_options |= opt;
else
so->so_options &= ~opt;
break;
default:
error = EPASSTHROUGH;
}
return error;
}
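/*
 * Illustrative sketch only: the timeval <-> tick arithmetic used by the two
 * handlers above, with hz = 100 and tick = 10000 (microseconds per tick) as
 * purely hypothetical example values.
 */
#if 0	/* illustrative example only, not compiled */
static void
sotimeo_conversion_example(void)
{
	const int hz_ex = 100, tick_ex = 10000;
	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
	int optval;

	/* setopt direction: 2.5 seconds -> 250 ticks. */
	optval = tv.tv_sec * hz_ex + tv.tv_usec / tick_ex;
	KASSERT(optval == 250);

	/* getopt direction: 250 ticks -> 2 seconds, 500000 microseconds. */
	KASSERT(optval / hz_ex == 2);
	KASSERT((optval % hz_ex) * tick_ex == 500000);
}
#endif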
static int
uipc_socket_50_sbts(int opt, struct mbuf ***mp)
{
struct timeval50 tv50;
struct timeval tv;
microtime(&tv);
if (opt & SO_OTIMESTAMP) {
timeval_to_timeval50(&tv, &tv50);
**mp = sbcreatecontrol(&tv50, sizeof(tv50), SCM_OTIMESTAMP,
SOL_SOCKET);
if (**mp)
*mp = &(**mp)->m_next;
return 0;
} else
return EPASSTHROUGH;
}
void
kern_uipc_socket_50_init(void)
{
MODULE_HOOK_SET(uipc_socket_50_setopt1_hook, uipc_socket_50_setopt1);
MODULE_HOOK_SET(uipc_socket_50_getopt1_hook, uipc_socket_50_getopt1);
MODULE_HOOK_SET(uipc_socket_50_sbts_hook, uipc_socket_50_sbts);
}
void
kern_uipc_socket_50_fini(void)
{
MODULE_HOOK_UNSET(uipc_socket_50_setopt1_hook);
MODULE_HOOK_UNSET(uipc_socket_50_getopt1_hook);
MODULE_HOOK_UNSET(uipc_socket_50_sbts_hook);
}
/* $NetBSD: exec_elf32.c,v 1.143 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_elf32.c,v 1.143 2019/11/20 19:37:53 pgoyette Exp $");
#define ELFSIZE 32
#include "exec_elf.c"
#include <sys/module.h>
#define ELF32_AUXSIZE (ELF_AUX_ENTRIES * sizeof(Aux32Info) \
+ MAXPATHLEN + ALIGN(1))
MODULE(MODULE_CLASS_EXEC, exec_elf32, NULL);
static struct execsw exec_elf32_execsw[] = {
{
.es_hdrsz = sizeof (Elf32_Ehdr),
.es_makecmds = exec_elf32_makecmds,
.u = {
.elf_probe_func = netbsd_elf32_probe,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_FIRST,
.es_arglen = ELF32_AUXSIZE,
.es_copyargs = elf32_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf32,
.es_setup_stack = exec_setup_stack,
},
#if EXEC_ELF_NOTELESS
{
.es_hdrsz = sizeof (Elf32_Ehdr),
.es_makecmds = exec_elf32_makecmds,
		.u = {
			.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_LAST,
.es_arglen = ELF32_AUXSIZE,
.es_copyargs = elf32_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf32,
.es_setup_stack = exec_setup_stack,
},
#endif
};
static int
exec_elf32_modcmd(modcmd_t cmd, void *arg)
{
#if ARCH_ELFSIZE == 64
/*
* If we are on a 64bit system, we don't want the 32bit execsw[] to be
* added in the global array, because the exec_elf32 module only works
* on 32bit systems.
*
* However, we need the exec_elf32 module, because it will make the 32bit
* functions available for netbsd32 and linux32.
*
* Therefore, allow this module on 64bit systems, but make it dormant.
*/
(void)exec_elf32_execsw; /* unused */
switch (cmd) {
case MODULE_CMD_INIT:
case MODULE_CMD_FINI:
return 0;
default:
return ENOTTY;
}
#else /* ARCH_ELFSIZE == 64 */
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(exec_elf32_execsw,
__arraycount(exec_elf32_execsw));
case MODULE_CMD_FINI:
return exec_remove(exec_elf32_execsw,
__arraycount(exec_elf32_execsw));
default:
return ENOTTY;
}
#endif /* ARCH_ELFSIZE == 64 */
}
/* $NetBSD: rndpseudo_50.c,v 1.7 2020/04/30 03:30:10 riastradh Exp $ */
/*-
* Copyright (c) 1997-2011 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Michael Graff <explorer@flame.org> and Thor Lancelot Simon.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rndpseudo_50.c,v 1.7 2020/04/30 03:30:10 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/file.h>
#include <sys/module_hook.h>
#include <sys/compat_stub.h>
#include <compat/sys/rnd.h>
#include <compat/common/compat_mod.h>
/*
* Convert from rndsource_t to rndsource50_t, for the results from
* RNDGETNUM50 and RNDGETNAME50.
*/
static void
rndsource_to_rndsource50(rndsource_t *r, rndsource50_t *r50)
{
memset(r50, 0, sizeof(*r50));
strlcpy(r50->name, r->name, sizeof(r50->name));
r50->total = r->total;
r50->type = r->type;
r50->flags = r->flags;
}
/*
* COMPAT_50 handling for rnd_ioctl. This is called from rnd_ioctl.
*
* It also handles the case of (COMPAT_50 && COMPAT_NETBSD32).
*/
int
compat_50_rnd_ioctl(struct file *fp, u_long cmd, void *addr)
{
int ret = 0;
switch (cmd) {
case RNDGETSRCNUM50:
{
rndstat_t rstbuf = {.start = 0};
rndstat50_t *rst50 = (rndstat50_t *)addr;
size_t count;
if (rst50->count > RND_MAXSTATCOUNT50)
return EINVAL;
rstbuf.start = rst50->start;
rstbuf.count = rst50->count;
ret = (fp->f_ops->fo_ioctl)(fp, RNDGETSRCNUM, &rstbuf);
if (ret != 0)
return ret;
for (count = 0; count < rst50->count; count++) {
rndsource_to_rndsource50(&rstbuf.source[count],
&rst50->source[count]);
}
rst50->count = rstbuf.count;
break;
}
case RNDGETSRCNAME50:
{
rndstat_name_t rstnmbuf = {.name[0] = 0};
rndstat_name50_t *rstnm50;
rstnm50 = (rndstat_name50_t *)addr;
strlcpy(rstnmbuf.name, rstnm50->name, sizeof(rstnmbuf.name));
ret = (fp->f_ops->fo_ioctl)(fp, RNDGETSRCNAME, &rstnmbuf);
if (ret != 0)
return ret;
rndsource_to_rndsource50(&rstnmbuf.source, &rstnm50->source);
break;
}
default:
return ENOTTY;
}
return ret;
}
void
rndpseudo_50_init(void)
{
MODULE_HOOK_SET(rnd_ioctl_50_hook, compat_50_rnd_ioctl);
}
void
rndpseudo_50_fini(void)
{
MODULE_HOOK_UNSET(rnd_ioctl_50_hook);
}
/* $NetBSD: tty_60.c,v 1.11 2021/07/21 06:35:44 skrll Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Alan Barrett
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_60.c,v 1.11 2021/07/21 06:35:44 skrll Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/compat_stub.h>
#include <sys/kmem.h>
#include <sys/tty.h>
#include <compat/common/compat_mod.h>
#include <compat/sys/ttycom.h>
/* convert struct ptmget to struct compat_60_ptmget */
static int
ptmget_to_ptmget60(struct ptmget *pg, struct compat_60_ptmget *pg60)
{
memset(pg60, 0, sizeof(*pg60));
pg60->cfd = pg->cfd;
pg60->sfd = pg->sfd;
strlcpy(pg60->cn, pg->cn, sizeof(pg60->cn));
strlcpy(pg60->sn, pg->sn, sizeof(pg60->sn));
if (strlen(pg->cn) >= sizeof(pg60->cn)
|| strlen(pg->sn) >= sizeof(pg60->sn))
return E2BIG;
return 0;
}
/* Helper for compat ioctls that use struct compat_60_ptmget. */
static int
compat_60_ptmget_ioctl(dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
int ret;
u_long newcmd;
struct ptmget *pg;
const struct cdevsw *cd = cdevsw_lookup(dev);
if (cd == NULL || cd->d_ioctl == NULL)
return ENXIO;
switch (cmd) {
case COMPAT_60_TIOCPTMGET: newcmd = TIOCPTMGET; break;
case COMPAT_60_TIOCPTSNAME: newcmd = TIOCPTSNAME; break;
default: return ENOTTY;
}
pg = kmem_alloc(sizeof(*pg), KM_SLEEP);
ret = (cd->d_ioctl)(dev, newcmd, pg, flag, l);
if (ret != 0)
goto out;
ret = ptmget_to_ptmget60(pg, data);
out:
kmem_free(pg, sizeof(*pg));
return ret;
}
/*
* COMPAT_60 versions of ttioctl and ptmioctl.
*/
int
compat_60_ttioctl(struct tty *tp, u_long cmd, void *data, int flag,
struct lwp *l)
{
switch (cmd) {
case COMPAT_60_TIOCPTMGET:
case COMPAT_60_TIOCPTSNAME:
return compat_60_ptmget_ioctl(tp->t_dev, cmd, data, flag, l);
default:
return EPASSTHROUGH;
}
}
int
compat_60_ptmioctl(dev_t dev, u_long cmd, void *data, int flag,
struct lwp *l)
{
switch (cmd) {
case COMPAT_60_TIOCPTMGET:
return compat_60_ptmget_ioctl(dev, cmd, data, flag, l);
default:
return EPASSTHROUGH;
}
}
void
kern_tty_60_init(void)
{
MODULE_HOOK_SET(tty_ttioctl_60_hook, compat_60_ttioctl);
MODULE_HOOK_SET(tty_ptmioctl_60_hook, compat_60_ptmioctl);
}
void
kern_tty_60_fini(void)
{
MODULE_HOOK_UNSET(tty_ttioctl_60_hook);
MODULE_HOOK_UNSET(tty_ptmioctl_60_hook);
}
/* $NetBSD: exec_elf64.c,v 1.8 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou
* for the NetBSD Project.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_elf64.c,v 1.8 2019/11/20 19:37:53 pgoyette Exp $");
#define ELFSIZE 64
#include "exec_elf.c"
#include <sys/module.h>
#define ELF64_AUXSIZE (ELF_AUX_ENTRIES * sizeof(Aux64Info) \
+ MAXPATHLEN + ALIGN(1))
MODULE(MODULE_CLASS_EXEC, exec_elf64, NULL);
static struct execsw exec_elf64_execsw[] = {
/* Native Elf64 */
{
.es_hdrsz = sizeof (Elf64_Ehdr),
.es_makecmds = exec_elf64_makecmds,
.u = {
.elf_probe_func = netbsd_elf64_probe,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_FIRST,
.es_arglen = ELF64_AUXSIZE,
.es_copyargs = elf64_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf64,
.es_setup_stack = exec_setup_stack,
},
#if EXEC_ELF_NOTELESS
/* Generic Elf64 -- run as NetBSD Elf64 */
{
.es_hdrsz = sizeof (Elf64_Ehdr),
.es_makecmds = exec_elf64_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = ELF64_AUXSIZE,
.es_copyargs = elf64_copyargs,
.es_setregs = NULL,
.es_coredump = coredump_elf64,
.es_setup_stack = exec_setup_stack,
},
#endif
};
static int
exec_elf64_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(exec_elf64_execsw,
__arraycount(exec_elf64_execsw));
case MODULE_CMD_FINI:
return exec_remove(exec_elf64_execsw,
__arraycount(exec_elf64_execsw));
default:
return ENOTTY;
}
}
/* $NetBSD: tty_subr.c,v 1.43 2019/12/27 09:41:51 msaitoh Exp $ */
/*
* Copyright (c) 1993, 1994 Theo de Raadt
* All rights reserved.
*
* Per Lindqvist <pgd@compuram.bbt.se> supplied an almost fully working
* set of true clist functions that this is very loosely based on.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty_subr.c,v 1.43 2019/12/27 09:41:51 msaitoh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/ioctl.h>
#include <sys/tty.h>
#include <sys/kmem.h>
/*
* At compile time, choose:
* There are two ways the TTY_QUOTE bit can be stored. If QBITS is
* defined we allocate an array of bits -- 1/8th as much memory but
* setbit(), clrbit(), and isset() take more CPU. If QBITS is
* undefined, we just use an array of bytes.
*
* If TTY_QUOTE functionality isn't required by a line discipline,
* it can free c_cq and set it to NULL. This speeds things up,
* and also does not use any extra memory. This is useful for (say)
* a SLIP line discipline that wants a 32K ring buffer for data
* but doesn't need quoting.
*/
#define QBITS
#ifdef QBITS
#define QMEM(n) ((((n)-1)/NBBY)+1)
#else
#define QMEM(n) (n)
#endif
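/*
 * Worked example (editor's illustration): with QBITS defined and
 * NBBY == 8, QMEM(1024) == ((1024 - 1) / 8) + 1 == 128, so the quote
 * mask for a 1024-byte clist costs 128 bytes; without QBITS the same
 * mask would cost the full 1024 bytes.
 */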
#ifdef QBITS
static void clrbits(u_char *, unsigned int, unsigned int);
#endif
/*
* Initialize a particular clist. Ok, they are really ring buffers,
* of the specified length, with/without quoting support.
*/
int
clalloc(struct clist *clp, int size, int quot)
{
clp->c_cs = kmem_zalloc(size, KM_SLEEP);
if (quot)
clp->c_cq = kmem_zalloc(QMEM(size), KM_SLEEP);
else
clp->c_cq = NULL;
clp->c_cf = clp->c_cl = NULL;
clp->c_ce = clp->c_cs + size;
clp->c_cn = size;
clp->c_cc = 0;
return (0);
}
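/*
 * Minimal usage sketch (editor's illustration, not part of the
 * original source; the size and character values are arbitrary):
 * allocate a quoting clist, push one quoted character through it,
 * and free it again.
 */
#if 0
{
	struct clist cl;

	clalloc(&cl, 1024, 1);		/* 1024-byte ring plus quote bits */
	(void)putc('A' | TTY_QUOTE, &cl);	/* store 'A' with its quote bit set */
	KASSERT(getc(&cl) == ('A' | TTY_QUOTE));	/* quote bit comes back OR'd in */
	clfree(&cl);
}
#endif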
void
clfree(struct clist *clp)
{
if (clp->c_cs)
kmem_free(clp->c_cs, clp->c_cn);
if (clp->c_cq)
kmem_free(clp->c_cq, QMEM(clp->c_cn));
clp->c_cs = clp->c_cq = NULL;
}
/*
* Get a character from a clist.
*/
int
getc(struct clist *clp)
{
int c = -1;
int s;
s = spltty();
if (clp->c_cc == 0)
goto out;
c = *clp->c_cf & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, clp->c_cf - clp->c_cs) )
c |= TTY_QUOTE;
#else
if (*(clp->c_cf - clp->c_cs + clp->c_cq))
c |= TTY_QUOTE;
#endif
}
*clp->c_cf = 0; /* wipe out to avoid information disclosure */
if (++clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
if (--clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
return c;
}
/*
* Copy clist to buffer.
* Return number of bytes moved.
*/
int
q_to_b(struct clist *clp, u_char *cp, int count)
{
int cc;
u_char *p = cp;
int s;
s = spltty();
/* optimize this while loop */
while (count > 0 && clp->c_cc > 0) {
cc = clp->c_cl - clp->c_cf;
if (clp->c_cf >= clp->c_cl)
cc = clp->c_ce - clp->c_cf;
if (cc > count)
cc = count;
memcpy(p, clp->c_cf, cc);
count -= cc;
p += cc;
clp->c_cc -= cc;
clp->c_cf += cc;
if (clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
splx(s);
return p - cp;
}
/*
* Return count of contiguous characters in clist.
* Stop counting if flag&character is non-null.
*/
int
ndqb(struct clist *clp, int flag)
{
int count = 0;
int i;
int cc;
int s;
s = spltty();
if ((cc = clp->c_cc) == 0)
goto out;
if (flag == 0) {
count = clp->c_cl - clp->c_cf;
if (count <= 0)
count = clp->c_ce - clp->c_cf;
goto out;
}
i = clp->c_cf - clp->c_cs;
if (flag & TTY_QUOTE) {
while (cc-- > 0 && !(clp->c_cs[i++] & (flag & ~TTY_QUOTE) ||
isset(clp->c_cq, i))) {
count++;
if (i == clp->c_cn)
break;
}
} else {
while (cc-- > 0 && !(clp->c_cs[i++] & flag)) {
count++;
if (i == clp->c_cn)
break;
}
}
out:
splx(s);
return count;
}
/*
* Flush count bytes from clist.
*/
void
ndflush(struct clist *clp, int count)
{
int cc;
int s;
s = spltty();
if (count == clp->c_cc) {
clp->c_cc = 0;
clp->c_cf = clp->c_cl = (u_char *)0;
goto out;
}
/* optimize this while loop */
while (count > 0 && clp->c_cc > 0) {
cc = clp->c_cl - clp->c_cf;
if (clp->c_cf >= clp->c_cl)
cc = clp->c_ce - clp->c_cf;
if (cc > count)
cc = count;
count -= cc;
clp->c_cc -= cc;
clp->c_cf += cc;
if (clp->c_cf == clp->c_ce)
clp->c_cf = clp->c_cs;
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
}
/*
* Put a character into the output queue.
*/
int
putc(int c, struct clist *clp)
{
int i;
int s;
s = spltty();
if (clp->c_cc == clp->c_cn)
goto out;
if (clp->c_cc == 0) {
if (!clp->c_cs) {
#if defined(DIAGNOSTIC) || 1
printf("putc: required clalloc\n");
#endif
if (clalloc(clp, clp->c_cn, 1)) {
out:
splx(s);
return -1;
}
}
clp->c_cf = clp->c_cl = clp->c_cs;
}
*clp->c_cl = c & 0xff;
i = clp->c_cl - clp->c_cs;
if (clp->c_cq) {
#ifdef QBITS
if (c & TTY_QUOTE)
setbit(clp->c_cq, i);
else
clrbit(clp->c_cq, i);
#else
*(clp->c_cq + i) = (c & TTY_QUOTE) ? 1 : 0;
#endif
}
clp->c_cc++;
clp->c_cl++;
if (clp->c_cl == clp->c_ce)
clp->c_cl = clp->c_cs;
splx(s);
return 0;
}
#ifdef QBITS
/*
* optimized version of
*
* for (i = 0; i < len; i++)
* clrbit(cp, off + i);
*/
static void
clrbits(u_char *cp, unsigned int off, unsigned int len)
{
unsigned int sbi, ebi;
u_char *scp, *ecp;
unsigned int end;
unsigned char mask;
scp = cp + off / NBBY;
sbi = off % NBBY;
end = off + len + NBBY - 1;
ecp = cp + end / NBBY - 1;
ebi = end % NBBY + 1;
if (scp >= ecp) {
mask = ((1 << len) - 1) << sbi;
*scp &= ~mask;
} else {
mask = (1 << sbi) - 1;
*scp++ &= mask;
mask = (1 << ebi) - 1;
*ecp &= ~mask;
while (scp < ecp)
*scp++ = 0x00;
}
}
#endif
/*
* Copy buffer to clist.
* Return number of bytes not transferred.
*/
int
b_to_q(const u_char *cp, int count, struct clist *clp)
{
int cc;
const u_char *p = cp;
int s;
if (count <= 0)
return 0;
s = spltty();
if (clp->c_cc == clp->c_cn)
goto out;
if (clp->c_cc == 0) {
if (!clp->c_cs) {
#if defined(DIAGNOSTIC) || 1
printf("b_to_q: required clalloc\n");
#endif
if (clalloc(clp, clp->c_cn, 1))
goto out;
}
clp->c_cf = clp->c_cl = clp->c_cs;
}
/* optimize this while loop */
while (count > 0 && clp->c_cc < clp->c_cn) {
cc = clp->c_ce - clp->c_cl;
if (clp->c_cf > clp->c_cl)
cc = clp->c_cf - clp->c_cl;
if (cc > count)
cc = count;
memcpy(clp->c_cl, p, cc);
if (clp->c_cq) {
#ifdef QBITS
clrbits(clp->c_cq, clp->c_cl - clp->c_cs, cc);
#else
memset(clp->c_cl - clp->c_cs + clp->c_cq, 0, cc);
#endif
}
p += cc;
count -= cc;
clp->c_cc += cc;
clp->c_cl += cc;
if (clp->c_cl == clp->c_ce)
clp->c_cl = clp->c_cs;
}
out:
splx(s);
return count;
}
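/*
 * Residual character count shared between firstc() and nextc() while
 * walking a clist; see the caveat below about not interleaving getc()
 * calls with such a walk.
 */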
static int tty_global_cc;
/*
* Given a non-NULL pointer into the clist return the pointer
* to the next character in the list or return NULL if no more chars.
*
* Callers must not allow getc's to happen between firstc's and nextc's,
* so that the pointer does not become invalid. Note that interrupts are NOT
* masked.
*/
u_char *
nextc(struct clist *clp, u_char *cp, int *c)
{
if (clp->c_cf == cp) {
/*
* First time initialization.
*/
tty_global_cc = clp->c_cc;
}
if (tty_global_cc == 0 || cp == NULL)
return NULL;
if (--tty_global_cc == 0)
return NULL;
if (++cp == clp->c_ce)
cp = clp->c_cs;
*c = *cp & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, cp - clp->c_cs))
*c |= TTY_QUOTE;
#else
if (*(cp - clp->c_cs + clp->c_cq))
*c |= TTY_QUOTE;
#endif
}
return cp;
}
/*
* Return a pointer to the first character in the clist, or NULL if
* the clist is empty.
*
* Callers must not allow getc's to happen between firstc's and nextc's,
* so that the pointer does not become invalid. Note that interrupts are NOT
* masked.
*
* *c is set to the first character.
*/
u_char *
firstc(struct clist *clp, int *c)
{
u_char *cp;
tty_global_cc = clp->c_cc;
if (tty_global_cc == 0)
return NULL;
cp = clp->c_cf;
*c = *cp & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, cp - clp->c_cs))
*c |= TTY_QUOTE;
#else
if (*(cp - clp->c_cs + clp->c_cq))
*c |= TTY_QUOTE;
#endif
}
return clp->c_cf;
}
/*
* Remove the last character in the clist and return it.
*/
int
unputc(struct clist *clp)
{
unsigned int c = -1;
int s;
s = spltty();
if (clp->c_cc == 0)
goto out;
if (clp->c_cl == clp->c_cs)
clp->c_cl = clp->c_ce - 1;
else
--clp->c_cl;
clp->c_cc--;
c = *clp->c_cl & 0xff;
if (clp->c_cq) {
#ifdef QBITS
if (isset(clp->c_cq, clp->c_cl - clp->c_cs))
c |= TTY_QUOTE;
#else
if (*(clp->c_cl - clp->c_cs + clp->c_cq))
c |= TTY_QUOTE;
#endif
}
if (clp->c_cc == 0)
clp->c_cf = clp->c_cl = (u_char *)0;
out:
splx(s);
return c;
}
/*
* Put the chars in the from queue on the end of the to queue.
*/
void
catq(struct clist *from, struct clist *to)
{
int c;
while ((c = getc(from)) != -1)
putc(c, to);
}
/* $NetBSD: lfs_accessors.h,v 1.51 2022/04/24 20:32:44 rillig Exp $ */
/* from NetBSD: lfs.h,v 1.165 2015/07/24 06:59:32 dholland Exp */
/* from NetBSD: dinode.h,v 1.25 2016/01/22 23:06:10 dholland Exp */
/* from NetBSD: dir.h,v 1.25 2015/09/01 06:16:03 dholland Exp */
/*-
* Copyright (c) 1999, 2000, 2001, 2002, 2003 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Konrad E. Schroder <perseant@hhhh.org>.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)lfs.h 8.9 (Berkeley) 5/8/95
*/
/*
* Copyright (c) 2002 Networks Associates Technology, Inc.
* All rights reserved.
*
* This software was developed for the FreeBSD Project by Marshall
* Kirk McKusick and Network Associates Laboratories, the Security
* Research Division of Network Associates, Inc. under DARPA/SPAWAR
* contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
* research program
*
* Copyright (c) 1982, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)dinode.h 8.9 (Berkeley) 3/29/95
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)dir.h 8.5 (Berkeley) 4/27/95
*/
#ifndef _UFS_LFS_LFS_ACCESSORS_H_
#define _UFS_LFS_LFS_ACCESSORS_H_
#if defined(_KERNEL_OPT)
#include "opt_lfs.h"
#endif
#include <sys/bswap.h>
#include <ufs/lfs/lfs.h>
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <assert.h>
#include <string.h>
#define KASSERT assert
#else
#include <sys/systm.h>
#endif
/*
* STRUCT_LFS is used by the libsa code to get accessors that work
* with struct salfs instead of struct lfs, and by the cleaner to
* get accessors that work with struct clfs.
*/
#ifndef STRUCT_LFS
#define STRUCT_LFS struct lfs
#endif
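/*
 * For example (editor's sketch; the exact consumer spelling is an
 * assumption), a libsa or cleaner source file can do
 *
 *	#define STRUCT_LFS struct salfs
 *	#include <ufs/lfs/lfs_accessors.h>
 *
 * and every accessor below is then generated in terms of struct salfs
 * rather than struct lfs.
 */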
/*
* byte order
*/
/*
* For now at least, the bootblocks shall not be endian-independent.
* We can see later if it fits in the size budget. Also disable the
* byteswapping if LFS_EI is off.
*
* Caution: these functions "know" that bswap16/32/64 are unsigned,
* and if that changes will likely break silently.
*/
#if defined(_STANDALONE) || (defined(_KERNEL) && !defined(LFS_EI))
#define LFS_SWAP_int16_t(fs, val) (val)
#define LFS_SWAP_int32_t(fs, val) (val)
#define LFS_SWAP_int64_t(fs, val) (val)
#define LFS_SWAP_uint16_t(fs, val) (val)
#define LFS_SWAP_uint32_t(fs, val) (val)
#define LFS_SWAP_uint64_t(fs, val) (val)
#else
#define LFS_SWAP_int16_t(fs, val) \
((fs)->lfs_dobyteswap ? (int16_t)bswap16(val) : (val))
#define LFS_SWAP_int32_t(fs, val) \
((fs)->lfs_dobyteswap ? (int32_t)bswap32(val) : (val))
#define LFS_SWAP_int64_t(fs, val) \
((fs)->lfs_dobyteswap ? (int64_t)bswap64(val) : (val))
#define LFS_SWAP_uint16_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap16(val) : (val))
#define LFS_SWAP_uint32_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap32(val) : (val))
#define LFS_SWAP_uint64_t(fs, val) \
((fs)->lfs_dobyteswap ? bswap64(val) : (val))
#endif
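/*
 * Illustrative example (editor's note; values are arbitrary): with
 * LFS_EI enabled and lfs_dobyteswap set -- i.e. the volume's byte
 * order differs from the host's -- LFS_SWAP_uint32_t(fs, 0x11223344)
 * evaluates to bswap32(0x11223344) == 0x44332211; with lfs_dobyteswap
 * clear it returns the value unchanged.
 */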
/*
* For handling directories we will need to know if the volume is
* little-endian.
*/
#if BYTE_ORDER == LITTLE_ENDIAN
#define LFS_LITTLE_ENDIAN_ONDISK(fs) (!(fs)->lfs_dobyteswap)
#else
#define LFS_LITTLE_ENDIAN_ONDISK(fs) ((fs)->lfs_dobyteswap)
#endif
/*
* Suppress spurious warnings -- we use
*
* type *foo = &obj->member;
*
* in macros to verify that obj->member has the right type. When the
* object is a packed structure with misaligned members, this causes
* some compiles to squeal that taking the address might lead to
* undefined behaviour later on -- which is helpful in general, not
* relevant in this case, because we don't do anything with foo
* afterward; we only declare it to get a type check and then we
* discard it.
*/
#ifdef __GNUC__
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Waddress-of-packed-member"
#elif __GNUC_PREREQ__(9,0)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
#endif
/*
* directories
*/
#define LFS_DIRHEADERSIZE(fs) \
((fs)->lfs_is64 ? sizeof(struct lfs_dirheader64) : sizeof(struct lfs_dirheader32))
/*
* The LFS_DIRSIZ macro gives the minimum record length which will hold
* the directory entry. This requires the amount of space in struct lfs_direct
* without the d_name field, plus enough space for the name with a terminating
* null byte (dp->d_namlen+1), rounded up to a 4 byte boundary.
*/
#define LFS_DIRECTSIZ(fs, namlen) \
(LFS_DIRHEADERSIZE(fs) + (((namlen)+1 + 3) &~ 3))
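/*
 * Worked example (editor's illustration; the 8-byte header size is an
 * assumption about struct lfs_dirheader32): on an lfs32 volume a
 * 5-character name needs LFS_DIRECTSIZ(fs, 5) ==
 * 8 + ((5 + 1 + 3) & ~3) == 8 + 8 == 16 bytes.
 */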
/*
* The size of the largest possible directory entry. This is
* used by ulfs_dirhash to figure the size of an array, so we
* need a single constant value true for both lfs32 and lfs64.
*/
#define LFS_MAXDIRENTRYSIZE \
(sizeof(struct lfs_dirheader64) + (((LFS_MAXNAMLEN+1)+1 + 3) & ~3))
#if (BYTE_ORDER == LITTLE_ENDIAN)
#define LFS_OLDDIRSIZ(oldfmt, dp, needswap) \
(((oldfmt) && !(needswap)) ? \
LFS_DIRECTSIZ((dp)->d_type) : LFS_DIRECTSIZ((dp)->d_namlen))
#else
#define LFS_OLDDIRSIZ(oldfmt, dp, needswap) \
(((oldfmt) && (needswap)) ? \
LFS_DIRECTSIZ((dp)->d_type) : LFS_DIRECTSIZ((dp)->d_namlen))
#endif
#define LFS_DIRSIZ(fs, dp) LFS_DIRECTSIZ(fs, lfs_dir_getnamlen(fs, dp))
/* Constants for the first argument of LFS_OLDDIRSIZ */
#define LFS_OLDDIRFMT 1
#define LFS_NEWDIRFMT 0
#define LFS_NEXTDIR(fs, dp) \
((LFS_DIRHEADER *)((char *)(dp) + lfs_dir_getreclen(fs, dp)))
static __inline char *
lfs_dir_nameptr(const STRUCT_LFS *fs, LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return (char *)(&dh->u_64 + 1);
} else {
return (char *)(&dh->u_32 + 1);
}
}
static __inline uint64_t
lfs_dir_getino(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return LFS_SWAP_uint64_t(fs, dh->u_64.dh_ino);
} else {
return LFS_SWAP_uint32_t(fs, dh->u_32.dh_ino);
}
}
static __inline uint16_t
lfs_dir_getreclen(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
return LFS_SWAP_uint16_t(fs, dh->u_64.dh_reclen);
} else {
return LFS_SWAP_uint16_t(fs, dh->u_32.dh_reclen);
}
}
static __inline uint8_t
lfs_dir_gettype(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
return dh->u_64.dh_type;
} else if (fs->lfs_hasolddirfmt) {
return LFS_DT_UNKNOWN;
} else {
return dh->u_32.dh_type;
}
}
static __inline uint8_t
lfs_dir_getnamlen(const STRUCT_LFS *fs, const LFS_DIRHEADER *dh)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
return dh->u_64.dh_namlen;
} else if (fs->lfs_hasolddirfmt && LFS_LITTLE_ENDIAN_ONDISK(fs)) {
/* low-order byte of old 16-bit namlen field */
return dh->u_32.dh_type;
} else {
return dh->u_32.dh_namlen;
}
}
static __inline void
lfs_dir_setino(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint64_t ino)
{
if (fs->lfs_is64) {
dh->u_64.dh_ino = LFS_SWAP_uint64_t(fs, ino);
} else {
dh->u_32.dh_ino = LFS_SWAP_uint32_t(fs, ino);
}
}
static __inline void
lfs_dir_setreclen(STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint16_t reclen)
{
if (fs->lfs_is64) {
dh->u_64.dh_reclen = LFS_SWAP_uint16_t(fs, reclen);
} else {
dh->u_32.dh_reclen = LFS_SWAP_uint16_t(fs, reclen);
}
}
static __inline void
lfs_dir_settype(const STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint8_t type)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
dh->u_64.dh_type = type;
} else if (fs->lfs_hasolddirfmt) {
/* do nothing */
return;
} else {
dh->u_32.dh_type = type;
}
}
static __inline void
lfs_dir_setnamlen(const STRUCT_LFS *fs, LFS_DIRHEADER *dh, uint8_t namlen)
{
if (fs->lfs_is64) {
KASSERT(fs->lfs_hasolddirfmt == 0);
dh->u_64.dh_namlen = namlen;
} else if (fs->lfs_hasolddirfmt && LFS_LITTLE_ENDIAN_ONDISK(fs)) {
/* low-order byte of old 16-bit namlen field */
dh->u_32.dh_type = namlen;
} else {
dh->u_32.dh_namlen = namlen;
}
}
static __inline void
lfs_copydirname(STRUCT_LFS *fs, char *dest, const char *src,
unsigned namlen, unsigned reclen)
{
unsigned spacelen;
KASSERT(reclen > LFS_DIRHEADERSIZE(fs));
spacelen = reclen - LFS_DIRHEADERSIZE(fs);
/* must always be at least 1 byte as a null terminator */
KASSERT(spacelen > namlen);
memcpy(dest, src, namlen);
memset(dest + namlen, '\0', spacelen - namlen);
}
static __inline LFS_DIRHEADER *
lfs_dirtemplate_dotdot(STRUCT_LFS *fs, union lfs_dirtemplate *dt)
{
/* XXX blah, be nice to have a way to do this w/o casts */
if (fs->lfs_is64) {
return (LFS_DIRHEADER *)&dt->u_64.dotdot_header;
} else {
return (LFS_DIRHEADER *)&dt->u_32.dotdot_header;
}
}
static __inline char *
lfs_dirtemplate_dotdotname(STRUCT_LFS *fs, union lfs_dirtemplate *dt)
{
if (fs->lfs_is64) {
return dt->u_64.dotdot_name;
} else {
return dt->u_32.dotdot_name;
}
}
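/*
 * Traversal sketch (editor's illustration; dirbuf and dirblksiz are
 * assumed names for a directory block and its size): the accessors
 * above are typically combined like
 *
 *	LFS_DIRHEADER *dh = (LFS_DIRHEADER *)dirbuf;
 *
 *	while ((char *)dh < (char *)dirbuf + dirblksiz) {
 *		if (lfs_dir_getino(fs, dh) != 0)
 *			... use lfs_dir_nameptr(fs, dh) and
 *			    lfs_dir_getnamlen(fs, dh) ...
 *		dh = LFS_NEXTDIR(fs, dh);
 *	}
 *
 * where an inode number of 0 conventionally marks an unused entry.
 */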
/*
* dinodes
*/
/*
* Maximum length of a symlink that can be stored within the inode.
*/
#define LFS32_MAXSYMLINKLEN ((ULFS_NDADDR + ULFS_NIADDR) * sizeof(int32_t))
#define LFS64_MAXSYMLINKLEN ((ULFS_NDADDR + ULFS_NIADDR) * sizeof(int64_t))
#define LFS_MAXSYMLINKLEN(fs) \
((fs)->lfs_is64 ? LFS64_MAXSYMLINKLEN : LFS32_MAXSYMLINKLEN)
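/*
 * Worked example (editor's note; assumes the usual ULFS_NDADDR == 12
 * and ULFS_NIADDR == 3): LFS32_MAXSYMLINKLEN == 15 * 4 == 60 bytes and
 * LFS64_MAXSYMLINKLEN == 15 * 8 == 120 bytes of in-inode symlink
 * storage.
 */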
#define DINOSIZE(fs) ((fs)->lfs_is64 ? sizeof(struct lfs64_dinode) : sizeof(struct lfs32_dinode))
#define DINO_IN_BLOCK(fs, base, ix) \
((union lfs_dinode *)((char *)(base) + DINOSIZE(fs) * (ix)))
static __inline void
lfs_copy_dinode(STRUCT_LFS *fs,
union lfs_dinode *dst, const union lfs_dinode *src)
{
/*
* We can do structure assignment of the structs, but not of
* the whole union, as the union is the size of the (larger)
* 64-bit struct and on a 32-bit fs the upper half of it might
* be off the end of a buffer or otherwise invalid.
*/
if (fs->lfs_is64) {
dst->u_64 = src->u_64;
} else {
dst->u_32 = src->u_32;
}
}
#define LFS_DEF_DINO_ACCESSOR(type, type32, field) \
static __inline type \
lfs_dino_get##field(STRUCT_LFS *fs, union lfs_dinode *dip) \
{ \
if (fs->lfs_is64) { \
return LFS_SWAP_##type(fs, dip->u_64.di_##field); \
} else { \
return LFS_SWAP_##type32(fs, dip->u_32.di_##field); \
} \
} \
static __inline void \
lfs_dino_set##field(STRUCT_LFS *fs, union lfs_dinode *dip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &dip->u_64.di_##field; \
(void)p; \
dip->u_64.di_##field = LFS_SWAP_##type(fs, val); \
} else { \
type32 *p = &dip->u_32.di_##field; \
(void)p; \
dip->u_32.di_##field = LFS_SWAP_##type32(fs, val); \
} \
}
LFS_DEF_DINO_ACCESSOR(uint16_t, uint16_t, mode)
LFS_DEF_DINO_ACCESSOR(int16_t, int16_t, nlink)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint32_t, inumber)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint64_t, size)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, atime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, atimensec)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, mtime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, mtimensec)
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, ctime)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, ctimensec)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, flags)
LFS_DEF_DINO_ACCESSOR(uint64_t, uint32_t, blocks)
LFS_DEF_DINO_ACCESSOR(int32_t, int32_t, gen)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, uid)
LFS_DEF_DINO_ACCESSOR(uint32_t, uint32_t, gid)
/* XXX this should be done differently (it's a fake field) */
LFS_DEF_DINO_ACCESSOR(int64_t, int32_t, rdev)
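/*
 * For example (editor's note), the invocation
 * LFS_DEF_DINO_ACCESSOR(uint16_t, uint16_t, mode) above expands to a
 * pair of inline functions, lfs_dino_getmode(fs, dip) and
 * lfs_dino_setmode(fs, dip, val), which read or write di_mode from
 * whichever of the 32-bit or 64-bit dinode layouts is in use,
 * byteswapping as needed.
 */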
static __inline daddr_t
lfs_dino_getdb(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix)
{
KASSERT(ix < ULFS_NDADDR);
if (fs->lfs_is64) {
return LFS_SWAP_int64_t(fs, dip->u_64.di_db[ix]);
} else {
/* note: this must sign-extend or UNWRITTEN gets trashed */
return (int32_t)LFS_SWAP_int32_t(fs, dip->u_32.di_db[ix]);
}
}
static __inline daddr_t
lfs_dino_getib(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix)
{
KASSERT(ix < ULFS_NIADDR);
if (fs->lfs_is64) {
return LFS_SWAP_int64_t(fs, dip->u_64.di_ib[ix]);
} else {
/* note: this must sign-extend or UNWRITTEN gets trashed */
return (int32_t)LFS_SWAP_int32_t(fs, dip->u_32.di_ib[ix]);
}
}
static __inline void
lfs_dino_setdb(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix, daddr_t val)
{
KASSERT(ix < ULFS_NDADDR);
if (fs->lfs_is64) {
dip->u_64.di_db[ix] = LFS_SWAP_int64_t(fs, val);
} else {
dip->u_32.di_db[ix] = LFS_SWAP_uint32_t(fs, val);
}
}
static __inline void
lfs_dino_setib(STRUCT_LFS *fs, union lfs_dinode *dip, unsigned ix, daddr_t val)
{
KASSERT(ix < ULFS_NIADDR);
if (fs->lfs_is64) {
dip->u_64.di_ib[ix] = LFS_SWAP_int64_t(fs, val);
} else {
dip->u_32.di_ib[ix] = LFS_SWAP_uint32_t(fs, val);
}
}
/* birthtime is present only in the 64-bit inode */
static __inline void
lfs_dino_setbirthtime(STRUCT_LFS *fs, union lfs_dinode *dip,
const struct timespec *ts)
{
if (fs->lfs_is64) {
dip->u_64.di_birthtime = ts->tv_sec;
dip->u_64.di_birthnsec = ts->tv_nsec;
} else {
/* drop it on the floor */
}
}
/*
* indirect blocks
*/
static __inline daddr_t
lfs_iblock_get(STRUCT_LFS *fs, void *block, unsigned ix)
{
if (fs->lfs_is64) {
// XXX re-enable these asserts after reorging this file
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int64_t));
return (daddr_t)(((int64_t *)block)[ix]);
} else {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int32_t));
/* must sign-extend or UNWRITTEN gets trashed */
return (daddr_t)(int64_t)(((int32_t *)block)[ix]);
}
}
static __inline void
lfs_iblock_set(STRUCT_LFS *fs, void *block, unsigned ix, daddr_t val)
{
if (fs->lfs_is64) {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int64_t));
((int64_t *)block)[ix] = val;
} else {
//KASSERT(ix < lfs_sb_getbsize(fs) / sizeof(int32_t));
((int32_t *)block)[ix] = val;
}
}
/*
* "struct buf" associated definitions
*/
# define LFS_LOCK_BUF(bp) do { \
if (((bp)->b_flags & B_LOCKED) == 0 && bp->b_iodone == NULL) { \
mutex_enter(&lfs_lock); \
++locked_queue_count; \
locked_queue_bytes += bp->b_bufsize; \
mutex_exit(&lfs_lock); \
} \
(bp)->b_flags |= B_LOCKED; \
} while (0)
# define LFS_UNLOCK_BUF(bp) do { \
if (((bp)->b_flags & B_LOCKED) != 0 && bp->b_iodone == NULL) { \
mutex_enter(&lfs_lock); \
--locked_queue_count; \
locked_queue_bytes -= bp->b_bufsize; \
if (locked_queue_count < LFS_WAIT_BUFS && \
locked_queue_bytes < LFS_WAIT_BYTES) \
cv_broadcast(&locked_queue_cv); \
mutex_exit(&lfs_lock); \
} \
(bp)->b_flags &= ~B_LOCKED; \
} while (0)
/*
* "struct inode" associated definitions
*/
#define LFS_SET_UINO(ip, states) do { \
if (((states) & IN_ACCESSED) && !((ip)->i_state & IN_ACCESSED)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
if (((states) & IN_CLEANING) && !((ip)->i_state & IN_CLEANING)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
if (((states) & IN_MODIFIED) && !((ip)->i_state & IN_MODIFIED)) \
lfs_sb_adduinodes((ip)->i_lfs, 1); \
(ip)->i_state |= (states); \
} while (0)
#define LFS_CLR_UINO(ip, states) do { \
if (((states) & IN_ACCESSED) && ((ip)->i_state & IN_ACCESSED)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
if (((states) & IN_CLEANING) && ((ip)->i_state & IN_CLEANING)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
if (((states) & IN_MODIFIED) && ((ip)->i_state & IN_MODIFIED)) \
lfs_sb_subuinodes((ip)->i_lfs, 1); \
(ip)->i_state &= ~(states); \
if (lfs_sb_getuinodes((ip)->i_lfs) < 0) { \
panic("lfs_uinodes < 0"); \
} \
} while (0)
#define LFS_ITIMES(ip, acc, mod, cre) \
while ((ip)->i_state & (IN_ACCESS | IN_CHANGE | IN_UPDATE | IN_MODIFY)) \
lfs_itimes(ip, acc, mod, cre)
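/*
 * Usage sketch (editor's illustration): marking an inode dirty is
 * typically spelled LFS_SET_UINO(ip, IN_MODIFIED); the macro bumps the
 * superblock's count of unwritten inodes only if IN_MODIFIED was not
 * already set, and LFS_CLR_UINO(ip, IN_MODIFIED) undoes both.
 */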
/*
* On-disk and in-memory checkpoint segment usage structure.
*/
#define SEGUPB(fs) (lfs_sb_getsepb(fs))
#define SEGTABSIZE_SU(fs) \
((lfs_sb_getnseg(fs) + SEGUPB(fs) - 1) / lfs_sb_getsepb(fs))
#ifdef _KERNEL
# define SHARE_IFLOCK(F) \
do { \
rw_enter(&(F)->lfs_iflock, RW_READER); \
} while(0)
# define UNSHARE_IFLOCK(F) \
do { \
rw_exit(&(F)->lfs_iflock); \
} while(0)
#else /* ! _KERNEL */
# define SHARE_IFLOCK(F)
# define UNSHARE_IFLOCK(F)
#endif /* ! _KERNEL */
/* Read in the block with a specific segment usage entry from the ifile. */
#define LFS_SEGENTRY(SP, F, IN, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
if ((_e = bread((F)->lfs_ivnode, \
((IN) / lfs_sb_getsepb(F)) + lfs_sb_getcleansz(F), \
lfs_sb_getbsize(F), 0, &(BP))) != 0) \
panic("lfs: ifile read: segentry %llu: error %d\n", \
(unsigned long long)(IN), _e); \
if (lfs_sb_getversion(F) == 1) \
(SP) = (SEGUSE *)((SEGUSE_V1 *)(BP)->b_data + \
((IN) & (lfs_sb_getsepb(F) - 1))); \
else \
(SP) = (SEGUSE *)(BP)->b_data + ((IN) % lfs_sb_getsepb(F)); \
UNSHARE_IFLOCK(F); \
} while (0)
#define LFS_WRITESEGENTRY(SP, F, IN, BP) do { \
if ((SP)->su_nbytes == 0) \
(SP)->su_flags |= SEGUSE_EMPTY; \
else \
(SP)->su_flags &= ~SEGUSE_EMPTY; \
(F)->lfs_suflags[(F)->lfs_activesb][(IN)] = (SP)->su_flags; \
LFS_BWRITE_LOG(BP); \
} while (0)
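/*
 * Usage sketch (editor's illustration; segnum is an assumed variable):
 *
 *	SEGUSE *sup;
 *	struct buf *bp;
 *
 *	LFS_SEGENTRY(sup, fs, segnum, bp);
 *	... read or update sup->su_nbytes, sup->su_flags ...
 *	LFS_WRITESEGENTRY(sup, fs, segnum, bp);
 *
 * or brelse(bp, 0) instead of LFS_WRITESEGENTRY if the entry was only
 * read.
 */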
/*
* FINFO (file info) entries.
*/
/* Size of an on-disk block pointer, e.g. in an indirect block. */
/* XXX: move to a more suitable location in this file */
#define LFS_BLKPTRSIZE(fs) ((fs)->lfs_is64 ? sizeof(int64_t) : sizeof(int32_t))
/* Size of an on-disk inode number. */
/* XXX: move to a more suitable location in this file */
#define LFS_INUMSIZE(fs) ((fs)->lfs_is64 ? sizeof(int64_t) : sizeof(int32_t))
/* size of a FINFO, without the block pointers */
#define FINFOSIZE(fs) ((fs)->lfs_is64 ? sizeof(FINFO64) : sizeof(FINFO32))
/* Full size of the provided FINFO record, including its block pointers. */
#define FINFO_FULLSIZE(fs, fip) \
(FINFOSIZE(fs) + lfs_fi_getnblocks(fs, fip) * LFS_BLKPTRSIZE(fs))
#define NEXT_FINFO(fs, fip) \
((FINFO *)((char *)(fip) + FINFO_FULLSIZE(fs, fip)))
#define LFS_DEF_FI_ACCESSOR(type, type32, field) \
static __inline type \
lfs_fi_get##field(STRUCT_LFS *fs, FINFO *fip) \
{ \
if (fs->lfs_is64) { \
return fip->u_64.fi_##field; \
} else { \
return fip->u_32.fi_##field; \
} \
} \
static __inline void \
lfs_fi_set##field(STRUCT_LFS *fs, FINFO *fip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &fip->u_64.fi_##field; \
(void)p; \
fip->u_64.fi_##field = val; \
} else { \
type32 *p = &fip->u_32.fi_##field; \
(void)p; \
fip->u_32.fi_##field = val; \
} \
}
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, nblocks)
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, version)
LFS_DEF_FI_ACCESSOR(uint64_t, uint32_t, ino)
LFS_DEF_FI_ACCESSOR(uint32_t, uint32_t, lastlength)
static __inline daddr_t
lfs_fi_getblock(STRUCT_LFS *fs, FINFO *fip, unsigned idx)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
KASSERT(idx < lfs_fi_getnblocks(fs, fip));
if (fs->lfs_is64) {
return ((int64_t *)firstblock)[idx];
} else {
return ((int32_t *)firstblock)[idx];
}
}
static __inline void
lfs_fi_setblock(STRUCT_LFS *fs, FINFO *fip, unsigned idx, daddr_t blk)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
KASSERT(idx < lfs_fi_getnblocks(fs, fip));
if (fs->lfs_is64) {
((int64_t *)firstblock)[idx] = blk;
} else {
((int32_t *)firstblock)[idx] = blk;
}
}
/*
* inode info entries (in the segment summary)
*/
#define IINFOSIZE(fs) ((fs)->lfs_is64 ? sizeof(IINFO64) : sizeof(IINFO32))
/* iinfos scroll backward from the end of the segment summary block */
#define SEGSUM_IINFOSTART(fs, buf) \
((IINFO *)((char *)buf + lfs_sb_getsumsize(fs) - IINFOSIZE(fs)))
#define NEXTLOWER_IINFO(fs, iip) \
((IINFO *)((char *)(iip) - IINFOSIZE(fs)))
#define NTH_IINFO(fs, buf, n) \
((IINFO *)((char *)SEGSUM_IINFOSTART(fs, buf) - (n)*IINFOSIZE(fs)))
static __inline uint64_t
lfs_ii_getblock(STRUCT_LFS *fs, IINFO *iip)
{
if (fs->lfs_is64) {
return iip->u_64.ii_block;
} else {
return iip->u_32.ii_block;
}
}
static __inline void
lfs_ii_setblock(STRUCT_LFS *fs, IINFO *iip, uint64_t block)
{
if (fs->lfs_is64) {
iip->u_64.ii_block = block;
} else {
iip->u_32.ii_block = block;
}
}
/*
* Index file inode entries.
*/
#define IFILE_ENTRYSIZE(fs) \
((fs)->lfs_is64 ? sizeof(IFILE64) : sizeof(IFILE32))
/*
* LFSv1 compatibility code is not allowed to touch if_atime, since it
* may not be mapped!
*/
/* Read in the block with a specific inode from the ifile. */
#define LFS_IENTRY(IP, F, IN, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
if ((_e = bread((F)->lfs_ivnode, \
(IN) / lfs_sb_getifpb(F) + lfs_sb_getcleansz(F) + lfs_sb_getsegtabsz(F), \
lfs_sb_getbsize(F), 0, &(BP))) != 0) \
panic("lfs: ifile ino %d read %d", (int)(IN), _e); \
if ((F)->lfs_is64) { \
(IP) = (IFILE *)((IFILE64 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} else if (lfs_sb_getversion(F) > 1) { \
(IP) = (IFILE *)((IFILE32 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} else { \
(IP) = (IFILE *)((IFILE_V1 *)(BP)->b_data + \
(IN) % lfs_sb_getifpb(F)); \
} \
UNSHARE_IFLOCK(F); \
} while (0)
#define LFS_IENTRY_NEXT(IP, F) do { \
if ((F)->lfs_is64) { \
(IP) = (IFILE *)((IFILE64 *)(IP) + 1); \
} else if (lfs_sb_getversion(F) > 1) { \
(IP) = (IFILE *)((IFILE32 *)(IP) + 1); \
} else { \
(IP) = (IFILE *)((IFILE_V1 *)(IP) + 1); \
} \
} while (0)
#define LFS_DEF_IF_ACCESSOR(type, type32, field) \
static __inline type \
lfs_if_get##field(STRUCT_LFS *fs, IFILE *ifp) \
{ \
if (fs->lfs_is64) { \
return ifp->u_64.if_##field; \
} else { \
return ifp->u_32.if_##field; \
} \
} \
static __inline void \
lfs_if_set##field(STRUCT_LFS *fs, IFILE *ifp, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &ifp->u_64.if_##field; \
(void)p; \
ifp->u_64.if_##field = val; \
} else { \
type32 *p = &ifp->u_32.if_##field; \
(void)p; \
ifp->u_32.if_##field = val; \
} \
}
LFS_DEF_IF_ACCESSOR(uint32_t, uint32_t, version)
LFS_DEF_IF_ACCESSOR(int64_t, int32_t, daddr)
LFS_DEF_IF_ACCESSOR(uint64_t, uint32_t, nextfree)
LFS_DEF_IF_ACCESSOR(uint64_t, uint32_t, atime_sec)
LFS_DEF_IF_ACCESSOR(uint32_t, uint32_t, atime_nsec)
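/*
 * Usage sketch (editor's illustration; ino is an assumed inode
 * number):
 *
 *	IFILE *ifp;
 *	struct buf *bp;
 *	daddr_t daddr;
 *
 *	LFS_IENTRY(ifp, fs, ino, bp);
 *	daddr = lfs_if_getdaddr(fs, ifp);
 *	brelse(bp, 0);
 */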
/*
* Cleaner information structure. This resides in the ifile and is used
* to pass information from the kernel to the cleaner.
*/
#define CLEANSIZE_SU(fs) \
((((fs)->lfs_is64 ? sizeof(CLEANERINFO64) : sizeof(CLEANERINFO32)) + \
lfs_sb_getbsize(fs) - 1) >> lfs_sb_getbshift(fs))
#define LFS_DEF_CI_ACCESSOR(type, type32, field) \
static __inline type \
lfs_ci_get##field(STRUCT_LFS *fs, CLEANERINFO *cip) \
{ \
if (fs->lfs_is64) { \
return cip->u_64.field; \
} else { \
return cip->u_32.field; \
} \
} \
static __inline void \
lfs_ci_set##field(STRUCT_LFS *fs, CLEANERINFO *cip, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &cip->u_64.field; \
(void)p; \
cip->u_64.field = val; \
} else { \
type32 *p = &cip->u_32.field; \
(void)p; \
cip->u_32.field = val; \
} \
}
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, clean)
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, dirty)
LFS_DEF_CI_ACCESSOR(int64_t, int32_t, bfree)
LFS_DEF_CI_ACCESSOR(int64_t, int32_t, avail)
LFS_DEF_CI_ACCESSOR(uint64_t, uint32_t, free_head)
LFS_DEF_CI_ACCESSOR(uint64_t, uint32_t, free_tail)
LFS_DEF_CI_ACCESSOR(uint32_t, uint32_t, flags)
static __inline void
lfs_ci_shiftcleantodirty(STRUCT_LFS *fs, CLEANERINFO *cip, unsigned num)
{
lfs_ci_setclean(fs, cip, lfs_ci_getclean(fs, cip) - num);
lfs_ci_setdirty(fs, cip, lfs_ci_getdirty(fs, cip) + num);
}
static __inline void
lfs_ci_shiftdirtytoclean(STRUCT_LFS *fs, CLEANERINFO *cip, unsigned num)
{
lfs_ci_setdirty(fs, cip, lfs_ci_getdirty(fs, cip) - num);
lfs_ci_setclean(fs, cip, lfs_ci_getclean(fs, cip) + num);
}
/* Read in the block with the cleaner info from the ifile. */
#define LFS_CLEANERINFO(CP, F, BP) do { \
int _e; \
SHARE_IFLOCK(F); \
VTOI((F)->lfs_ivnode)->i_state |= IN_ACCESS; \
_e = bread((F)->lfs_ivnode, \
(daddr_t)0, lfs_sb_getbsize(F), 0, &(BP)); \
if (_e) \
panic("lfs: ifile read: cleanerinfo: error %d\n", _e); \
(CP) = (CLEANERINFO *)(BP)->b_data; \
UNSHARE_IFLOCK(F); \
} while (0)
/*
* Synchronize the Ifile cleaner info with current avail and bfree.
*/
#define LFS_SYNC_CLEANERINFO(cip, fs, bp, w) do { \
mutex_enter(&lfs_lock); \
if ((w) || lfs_ci_getbfree(fs, cip) != lfs_sb_getbfree(fs) || \
lfs_ci_getavail(fs, cip) != lfs_sb_getavail(fs) - fs->lfs_ravail - \
fs->lfs_favail) { \
lfs_ci_setbfree(fs, cip, lfs_sb_getbfree(fs)); \
lfs_ci_setavail(fs, cip, lfs_sb_getavail(fs) - fs->lfs_ravail - \
fs->lfs_favail); \
if (((bp)->b_flags & B_GATHERED) == 0) { \
fs->lfs_flags |= LFS_IFDIRTY; \
} \
mutex_exit(&lfs_lock); \
(void) LFS_BWRITE_LOG(bp); /* Ifile */ \
} else { \
mutex_exit(&lfs_lock); \
brelse(bp, 0); \
} \
} while (0)
/*
* Get the head of the inode free list.
* Always called with the segment lock held.
*/
#define LFS_GET_HEADFREE(FS, CIP, BP, FREEP) do { \
if (lfs_sb_getversion(FS) > 1) { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_sb_setfreehd(FS, lfs_ci_getfree_head(FS, CIP)); \
brelse(BP, 0); \
} \
*(FREEP) = lfs_sb_getfreehd(FS); \
} while (0)
#define LFS_PUT_HEADFREE(FS, CIP, BP, VAL) do { \
lfs_sb_setfreehd(FS, VAL); \
if (lfs_sb_getversion(FS) > 1) { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_ci_setfree_head(FS, CIP, VAL); \
LFS_BWRITE_LOG(BP); \
mutex_enter(&lfs_lock); \
(FS)->lfs_flags |= LFS_IFDIRTY; \
mutex_exit(&lfs_lock); \
} \
} while (0)
#define LFS_GET_TAILFREE(FS, CIP, BP, FREEP) do { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
*(FREEP) = lfs_ci_getfree_tail(FS, CIP); \
brelse(BP, 0); \
} while (0)
#define LFS_PUT_TAILFREE(FS, CIP, BP, VAL) do { \
LFS_CLEANERINFO((CIP), (FS), (BP)); \
lfs_ci_setfree_tail(FS, CIP, VAL); \
LFS_BWRITE_LOG(BP); \
mutex_enter(&lfs_lock); \
(FS)->lfs_flags |= LFS_IFDIRTY; \
mutex_exit(&lfs_lock); \
} while (0)
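/*
 * Usage sketch (editor's illustration): with the segment lock held,
 *
 *	CLEANERINFO *cip;
 *	struct buf *bp;
 *	ino_t head;
 *
 *	LFS_GET_HEADFREE(fs, cip, bp, &head);
 *
 * leaves the current free-list head in `head'; LFS_PUT_HEADFREE(fs,
 * cip, bp, newhead) writes a new head back to the superblock and, on
 * v2 and later filesystems, to the cleaner info block as well.
 */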
/*
* On-disk segment summary information
*/
#define SEGSUM_SIZE(fs) \
(fs->lfs_is64 ? sizeof(SEGSUM64) : \
lfs_sb_getversion(fs) > 1 ? sizeof(SEGSUM32) : sizeof(SEGSUM_V1))
/*
* The SEGSUM structure is followed by FINFO structures. Get the pointer
* to the first FINFO.
*
* XXX this can't be a macro yet; this file needs to be resorted.
*/
#if 0
static __inline FINFO *
segsum_finfobase(STRUCT_LFS *fs, SEGSUM *ssp)
{
return (FINFO *)((char *)ssp + SEGSUM_SIZE(fs));
}
#else
#define SEGSUM_FINFOBASE(fs, ssp) \
((FINFO *)((char *)(ssp) + SEGSUM_SIZE(fs)))
#endif
#define LFS_DEF_SS_ACCESSOR(type, type32, field) \
static __inline type \
lfs_ss_get##field(STRUCT_LFS *fs, SEGSUM *ssp) \
{ \
if (fs->lfs_is64) { \
return ssp->u_64.ss_##field; \
} else { \
return ssp->u_32.ss_##field; \
} \
} \
static __inline void \
lfs_ss_set##field(STRUCT_LFS *fs, SEGSUM *ssp, type val) \
{ \
if (fs->lfs_is64) { \
type *p = &ssp->u_64.ss_##field; \
(void)p; \
ssp->u_64.ss_##field = val; \
} else { \
type32 *p = &ssp->u_32.ss_##field; \
(void)p; \
ssp->u_32.ss_##field = val; \
} \
}
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, sumsum)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, datasum)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, magic)
LFS_DEF_SS_ACCESSOR(uint32_t, uint32_t, ident)
LFS_DEF_SS_ACCESSOR(int64_t, int32_t, next)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, nfinfo)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, ninos)
LFS_DEF_SS_ACCESSOR(uint16_t, uint16_t, flags)
LFS_DEF_SS_ACCESSOR(uint64_t, uint32_t, reclino)
LFS_DEF_SS_ACCESSOR(uint64_t, uint64_t, serial)
LFS_DEF_SS_ACCESSOR(uint64_t, uint64_t, create)
static __inline size_t
lfs_ss_getsumstart(STRUCT_LFS *fs)
{
/* These are actually all the same. */
if (fs->lfs_is64) {
return offsetof(SEGSUM64, ss_datasum);
} else /* if (lfs_sb_getversion(fs) > 1) */ {
return offsetof(SEGSUM32, ss_datasum);
} /* else {
return offsetof(SEGSUM_V1, ss_datasum);
} */
/*
* XXX ^^^ until this file is resorted lfs_sb_getversion isn't
* defined yet.
*/
}
static __inline uint32_t
lfs_ss_getocreate(STRUCT_LFS *fs, SEGSUM *ssp)
{
KASSERT(fs->lfs_is64 == 0);
/* XXX need to resort this file before we can do this */
//KASSERT(lfs_sb_getversion(fs) == 1);
return ssp->u_v1.ss_create;
}
static __inline void
lfs_ss_setocreate(STRUCT_LFS *fs, SEGSUM *ssp, uint32_t val)
{
KASSERT(fs->lfs_is64 == 0);
/* XXX need to resort this file before we can do this */
//KASSERT(lfs_sb_getversion(fs) == 1);
ssp->u_v1.ss_create = val;
}
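/*
 * Walking sketch (editor's illustration): given a segment summary
 * block ssp, the FINFO records that follow it are typically visited
 * with
 *
 *	FINFO *fip = SEGSUM_FINFOBASE(fs, ssp);
 *	unsigned i;
 *
 *	for (i = 0; i < lfs_ss_getnfinfo(fs, ssp); i++) {
 *		... lfs_fi_getino(fs, fip), lfs_fi_getnblocks(fs, fip) ...
 *		fip = NEXT_FINFO(fs, fip);
 *	}
 */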
/*
* Super block.
*/
/*
* Generate accessors for the on-disk superblock fields with cpp.
*/
#define LFS_DEF_SB_ACCESSOR_FULL(type, type32, field) \
static __inline type \
lfs_sb_get##field(STRUCT_LFS *fs) \
{ \
if (fs->lfs_is64) { \
return fs->lfs_dlfs_u.u_64.dlfs_##field; \
} else { \
return fs->lfs_dlfs_u.u_32.dlfs_##field; \
} \
} \
static __inline void \
lfs_sb_set##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
fs->lfs_dlfs_u.u_64.dlfs_##field = val; \
} else { \
fs->lfs_dlfs_u.u_32.dlfs_##field = val; \
} \
} \
static __inline void \
lfs_sb_add##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
type *p64 = &fs->lfs_dlfs_u.u_64.dlfs_##field; \
*p64 += val; \
} else { \
type32 *p32 = &fs->lfs_dlfs_u.u_32.dlfs_##field; \
*p32 += val; \
} \
} \
static __inline void \
lfs_sb_sub##field(STRUCT_LFS *fs, type val) \
{ \
if (fs->lfs_is64) { \
type *p64 = &fs->lfs_dlfs_u.u_64.dlfs_##field; \
*p64 -= val; \
} else { \
type32 *p32 = &fs->lfs_dlfs_u.u_32.dlfs_##field; \
*p32 -= val; \
} \
}
#define LFS_DEF_SB_ACCESSOR(t, f) LFS_DEF_SB_ACCESSOR_FULL(t, t, f)
#define LFS_DEF_SB_ACCESSOR_32ONLY(type, field, val64) \
static __inline type \
lfs_sb_get##field(STRUCT_LFS *fs) \
{ \
if (fs->lfs_is64) { \
return val64; \
} else { \
return fs->lfs_dlfs_u.u_32.dlfs_##field; \
} \
}
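/*
 * For example (editor's note), LFS_DEF_SB_ACCESSOR(uint32_t, bsize)
 * below generates lfs_sb_getbsize(), lfs_sb_setbsize(),
 * lfs_sb_addbsize() and lfs_sb_subbsize(); the _32ONLY variant
 * generates only a getter, which returns the supplied fixed value on
 * an lfs64 volume.
 */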
LFS_DEF_SB_ACCESSOR(uint32_t, version)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, size)
LFS_DEF_SB_ACCESSOR(uint32_t, ssize)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, dsize)
LFS_DEF_SB_ACCESSOR(uint32_t, bsize)
LFS_DEF_SB_ACCESSOR(uint32_t, fsize)
LFS_DEF_SB_ACCESSOR(uint32_t, frag)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, freehd)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, bfree)
LFS_DEF_SB_ACCESSOR_FULL(uint64_t, uint32_t, nfiles)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, avail)
LFS_DEF_SB_ACCESSOR(int32_t, uinodes)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, idaddr)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, ifile, LFS_IFILE_INUM)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, lastseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, nextseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, curseg)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, offset)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, lastpseg)
LFS_DEF_SB_ACCESSOR(uint32_t, inopf)
LFS_DEF_SB_ACCESSOR(uint32_t, minfree)
LFS_DEF_SB_ACCESSOR(uint64_t, maxfilesize)
LFS_DEF_SB_ACCESSOR(uint32_t, fsbpseg)
LFS_DEF_SB_ACCESSOR(uint32_t, inopb)
LFS_DEF_SB_ACCESSOR(uint32_t, ifpb)
LFS_DEF_SB_ACCESSOR(uint32_t, sepb)
LFS_DEF_SB_ACCESSOR(uint32_t, nindir)
LFS_DEF_SB_ACCESSOR(uint32_t, nseg)
LFS_DEF_SB_ACCESSOR(uint32_t, nspf)
LFS_DEF_SB_ACCESSOR(uint32_t, cleansz)
LFS_DEF_SB_ACCESSOR(uint32_t, segtabsz)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, segmask, 0)
LFS_DEF_SB_ACCESSOR_32ONLY(uint32_t, segshift, 0)
LFS_DEF_SB_ACCESSOR(uint64_t, bmask)
LFS_DEF_SB_ACCESSOR(uint32_t, bshift)
LFS_DEF_SB_ACCESSOR(uint64_t, ffmask)
LFS_DEF_SB_ACCESSOR(uint32_t, ffshift)
LFS_DEF_SB_ACCESSOR(uint64_t, fbmask)
LFS_DEF_SB_ACCESSOR(uint32_t, fbshift)
LFS_DEF_SB_ACCESSOR(uint32_t, blktodb)
LFS_DEF_SB_ACCESSOR(uint32_t, fsbtodb)
LFS_DEF_SB_ACCESSOR(uint32_t, sushift)
LFS_DEF_SB_ACCESSOR(int32_t, maxsymlinklen)
LFS_DEF_SB_ACCESSOR(uint32_t, cksum)
LFS_DEF_SB_ACCESSOR(uint16_t, pflags)
LFS_DEF_SB_ACCESSOR(uint32_t, nclean)
LFS_DEF_SB_ACCESSOR(int32_t, dmeta)
LFS_DEF_SB_ACCESSOR(uint32_t, minfreeseg)
LFS_DEF_SB_ACCESSOR(uint32_t, sumsize)
LFS_DEF_SB_ACCESSOR(uint64_t, serial)
LFS_DEF_SB_ACCESSOR(uint32_t, ibsize)
LFS_DEF_SB_ACCESSOR_FULL(int64_t, int32_t, s0addr)
LFS_DEF_SB_ACCESSOR(uint64_t, tstamp)
LFS_DEF_SB_ACCESSOR(uint32_t, inodefmt)
LFS_DEF_SB_ACCESSOR(uint32_t, interleave)
LFS_DEF_SB_ACCESSOR(uint32_t, ident)
LFS_DEF_SB_ACCESSOR(uint32_t, resvseg)
/* special-case accessors */
/*
* the v1 otstamp field lives in what's now dlfs_inopf
*/
#define lfs_sb_getotstamp(fs) lfs_sb_getinopf(fs)
#define lfs_sb_setotstamp(fs, val) lfs_sb_setinopf(fs, val)
/*
* lfs_sboffs is an array
*/
static __inline int32_t
lfs_sb_getsboff(STRUCT_LFS *fs, unsigned n)
{
#ifdef KASSERT /* ugh */
KASSERT(n < LFS_MAXNUMSB);
#endif
if (fs->lfs_is64) {
return fs->lfs_dlfs_u.u_64.dlfs_sboffs[n];
} else {
return fs->lfs_dlfs_u.u_32.dlfs_sboffs[n];
}
}
static __inline void
lfs_sb_setsboff(STRUCT_LFS *fs, unsigned n, int32_t val)
{
#ifdef KASSERT /* ugh */
KASSERT(n < LFS_MAXNUMSB);
#endif
if (fs->lfs_is64) {
fs->lfs_dlfs_u.u_64.dlfs_sboffs[n] = val;
} else {
fs->lfs_dlfs_u.u_32.dlfs_sboffs[n] = val;
}
}
/*
* lfs_fsmnt is a string
*/
static __inline const char *
lfs_sb_getfsmnt(STRUCT_LFS *fs)
{
if (fs->lfs_is64) {
return (const char *)fs->lfs_dlfs_u.u_64.dlfs_fsmnt;
} else {
return (const char *)fs->lfs_dlfs_u.u_32.dlfs_fsmnt;
}
}
static __inline void
lfs_sb_setfsmnt(STRUCT_LFS *fs, const char *str)
{
if (fs->lfs_is64) {
(void)strncpy((char *)fs->lfs_dlfs_u.u_64.dlfs_fsmnt, str,
sizeof(fs->lfs_dlfs_u.u_64.dlfs_fsmnt));
} else {
(void)strncpy((char *)fs->lfs_dlfs_u.u_32.dlfs_fsmnt, str,
sizeof(fs->lfs_dlfs_u.u_32.dlfs_fsmnt));
}
}
/* Highest addressable fsb */
#define LFS_MAX_DADDR(fs) \
((fs)->lfs_is64 ? 0x7fffffffffffffff : 0x7fffffff)
/* LFS_NINDIR is the number of indirects in a file system block. */
#define LFS_NINDIR(fs) (lfs_sb_getnindir(fs))
/* LFS_INOPB is the number of inodes in a secondary storage block. */
#define LFS_INOPB(fs) (lfs_sb_getinopb(fs))
/* LFS_INOPF is the number of inodes in a fragment. */
#define LFS_INOPF(fs) (lfs_sb_getinopf(fs))
#define lfs_blkoff(fs, loc) ((int)((loc) & lfs_sb_getbmask(fs)))
#define lfs_fragoff(fs, loc) /* calculates (loc % fs->lfs_fsize) */ \
((int)((loc) & lfs_sb_getffmask(fs)))
/* XXX: lowercase these as they're no longer macros */
/* Frags to diskblocks */
static __inline uint64_t
LFS_FSBTODB(STRUCT_LFS *fs, uint64_t b)
{
#if defined(_KERNEL)
return b << (lfs_sb_getffshift(fs) - DEV_BSHIFT);
#else
return b << lfs_sb_getfsbtodb(fs);
#endif
}
/* Diskblocks to frags */
static __inline uint64_t
LFS_DBTOFSB(STRUCT_LFS *fs, uint64_t b)
{
#if defined(_KERNEL)
return b >> (lfs_sb_getffshift(fs) - DEV_BSHIFT);
#else
return b >> lfs_sb_getfsbtodb(fs);
#endif
}
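/*
 * Illustrative example (assumed values, not taken from any particular
 * superblock): with 8KB frags lfs_ffshift is 13, and with a 512-byte
 * DEV_BSIZE, DEV_BSHIFT is 9, so in the kernel each frag maps to
 * 1 << (13 - 9) = 16 disk blocks; the stored lfs_fsbtodb field used
 * outside the kernel would be expected to hold that same shift of 4.
 */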
#define lfs_lblkno(fs, loc) ((loc) >> lfs_sb_getbshift(fs))
#define lfs_lblktosize(fs, blk) ((blk) << lfs_sb_getbshift(fs))
/* Frags to bytes */
static __inline uint64_t
lfs_fsbtob(STRUCT_LFS *fs, uint64_t b)
{
return b << lfs_sb_getffshift(fs);
}
/* Bytes to frags */
static __inline uint64_t
lfs_btofsb(STRUCT_LFS *fs, uint64_t b)
{
return b >> lfs_sb_getffshift(fs);
}
#define lfs_numfrags(fs, loc) /* calculates (loc / fs->lfs_fsize) */ \
((loc) >> lfs_sb_getffshift(fs))
#define lfs_blkroundup(fs, size)/* calculates roundup(size, lfs_sb_getbsize(fs)) */ \
((off_t)(((size) + lfs_sb_getbmask(fs)) & (~lfs_sb_getbmask(fs))))
#define lfs_fragroundup(fs, size)/* calculates roundup(size, fs->lfs_fsize) */ \
((off_t)(((size) + lfs_sb_getffmask(fs)) & (~lfs_sb_getffmask(fs))))
#define lfs_fragstoblks(fs, frags)/* calculates (frags / fs->fs_frag) */ \
((frags) >> lfs_sb_getfbshift(fs))
#define lfs_blkstofrags(fs, blks)/* calculates (blks * fs->fs_frag) */ \
((blks) << lfs_sb_getfbshift(fs))
#define lfs_fragnum(fs, fsb) /* calculates (fsb % fs->lfs_frag) */ \
((fsb) & ((fs)->lfs_frag - 1))
#define lfs_blknum(fs, fsb) /* calculates rounddown(fsb, fs->lfs_frag) */ \
((fsb) &~ ((fs)->lfs_frag - 1))
#define lfs_dblksize(fs, dp, lbn) \
(((lbn) >= ULFS_NDADDR || lfs_dino_getsize(fs, dp) >= ((lbn) + 1) << lfs_sb_getbshift(fs)) \
? lfs_sb_getbsize(fs) \
: (lfs_fragroundup(fs, lfs_blkoff(fs, lfs_dino_getsize(fs, dp)))))
#define lfs_segsize(fs) (lfs_sb_getversion(fs) == 1 ? \
lfs_lblktosize((fs), lfs_sb_getssize(fs)) : \
lfs_sb_getssize(fs))
/* XXX segtod produces a result in frags despite the 'd' */
#define lfs_segtod(fs, seg) (lfs_btofsb(fs, lfs_segsize(fs)) * (seg))
#define lfs_dtosn(fs, daddr) /* block address to segment number */ \
((uint32_t)(((daddr) - lfs_sb_gets0addr(fs)) / lfs_segtod((fs), 1)))
#define lfs_sntod(fs, sn) /* segment number to disk address */ \
((daddr_t)(lfs_segtod((fs), (sn)) + lfs_sb_gets0addr(fs)))
/* XXX, blah. make this appear only if struct inode is defined */
#ifdef _UFS_LFS_LFS_INODE_H_
static __inline uint32_t
lfs_blksize(STRUCT_LFS *fs, struct inode *ip, uint64_t lbn)
{
if (lbn >= ULFS_NDADDR || lfs_dino_getsize(fs, ip->i_din) >= (lbn + 1) << lfs_sb_getbshift(fs)) {
return lfs_sb_getbsize(fs);
} else {
return lfs_fragroundup(fs, lfs_blkoff(fs, lfs_dino_getsize(fs, ip->i_din)));
}
}
#endif
/*
* union lfs_blocks
*/
static __inline void
lfs_blocks_fromvoid(STRUCT_LFS *fs, union lfs_blocks *bp, void *p)
{
if (fs->lfs_is64) {
bp->b64 = p;
} else {
bp->b32 = p;
}
}
static __inline void
lfs_blocks_fromfinfo(STRUCT_LFS *fs, union lfs_blocks *bp, FINFO *fip)
{
void *firstblock;
firstblock = (char *)fip + FINFOSIZE(fs);
if (fs->lfs_is64) {
bp->b64 = (int64_t *)firstblock;
} else {
bp->b32 = (int32_t *)firstblock;
}
}
static __inline daddr_t
lfs_blocks_get(STRUCT_LFS *fs, union lfs_blocks *bp, unsigned idx)
{
if (fs->lfs_is64) {
return bp->b64[idx];
} else {
return bp->b32[idx];
}
}
static __inline void
lfs_blocks_set(STRUCT_LFS *fs, union lfs_blocks *bp, unsigned idx, daddr_t val)
{
if (fs->lfs_is64) {
bp->b64[idx] = val;
} else {
bp->b32[idx] = val;
}
}
static __inline void
lfs_blocks_inc(STRUCT_LFS *fs, union lfs_blocks *bp)
{
if (fs->lfs_is64) {
bp->b64++;
} else {
bp->b32++;
}
}
static __inline int
lfs_blocks_eq(STRUCT_LFS *fs, union lfs_blocks *bp1, union lfs_blocks *bp2)
{
if (fs->lfs_is64) {
return bp1->b64 == bp2->b64;
} else {
return bp1->b32 == bp2->b32;
}
}
static __inline int
lfs_blocks_sub(STRUCT_LFS *fs, union lfs_blocks *bp1, union lfs_blocks *bp2)
{
/* (remember that the pointers are typed) */
if (fs->lfs_is64) {
return bp1->b64 - bp2->b64;
} else {
return bp1->b32 - bp2->b32;
}
}
/*
* struct segment
*/
/*
* Macros for determining free space on the disk, with the variable metadata
* of segment summaries and inode blocks taken into account.
*/
/*
* Estimate number of clean blocks not available for writing because
* they will contain metadata or overhead. This is calculated as
*
* E = ((C * M / D) * D + (0) * (T - D)) / T
* or more simply
* E = (C * M) / T
*
* where
* C is the clean space,
* D is the dirty space,
* M is the dirty metadata, and
* T = C + D is the total space on disk.
*
* This approximates the old formula of E = C * M / D when D is close to T,
* but avoids falsely reporting "disk full" when the sample size (D) is small.
*/
#define LFS_EST_CMETA(F) (( \
(lfs_sb_getdmeta(F) * (int64_t)lfs_sb_getnclean(F)) / \
(lfs_sb_getnseg(F))))
/* Estimate total size of the disk not including metadata */
#define LFS_EST_NONMETA(F) (lfs_sb_getdsize(F) - lfs_sb_getdmeta(F) - LFS_EST_CMETA(F))
/* Estimate number of blocks actually available for writing */
#define LFS_EST_BFREE(F) (lfs_sb_getbfree(F) > LFS_EST_CMETA(F) ? \
lfs_sb_getbfree(F) - LFS_EST_CMETA(F) : 0)
/* Amount of non-meta space not available to mortal man */
#define LFS_EST_RSVD(F) ((LFS_EST_NONMETA(F) * \
(uint64_t)lfs_sb_getminfree(F)) / \
100)
/* Can credential C write BB blocks? XXX: kauth_cred_geteuid is abusive */
#define ISSPACE(F, BB, C) \
((((C) == NOCRED || kauth_cred_geteuid(C) == 0) && \
LFS_EST_BFREE(F) >= (BB)) || \
(kauth_cred_geteuid(C) != 0 && IS_FREESPACE(F, BB)))
/* Can an ordinary user write BB blocks */
#define IS_FREESPACE(F, BB) \
(LFS_EST_BFREE(F) >= (BB) + LFS_EST_RSVD(F))
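/*
 * Worked example of the estimate above (illustrative numbers only): with
 * lfs_dmeta = 100 dirty metadata blocks, lfs_nclean = 50 clean segments
 * and lfs_nseg = 200 segments in total, LFS_EST_CMETA comes to
 * (100 * 50) / 200 = 25 blocks presumed to end up holding metadata, so
 * LFS_EST_BFREE reports lfs_bfree - 25 (or 0 if lfs_bfree is smaller),
 * and IS_FREESPACE additionally keeps lfs_minfree percent of the
 * non-metadata space in reserve via LFS_EST_RSVD.
 */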
/*
* The minimum number of blocks to create a new inode. This is:
* directory direct block (1) + ULFS_NIADDR indirect blocks + inode block (1) +
* ifile direct block (1) + ULFS_NIADDR indirect blocks = 3 + 2 * ULFS_NIADDR blocks.
*/
#define LFS_NRESERVE(F) (lfs_btofsb((F), (2 * ULFS_NIADDR + 3) << lfs_sb_getbshift(F)))
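/*
 * For instance, with the usual ULFS_NIADDR of 3 indirect block pointers
 * this reserves (2 * 3 + 3) = 9 file system blocks' worth of frags; the
 * actual value of ULFS_NIADDR comes from the dinode headers, so treat
 * the number here as illustrative only.
 */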
/*
* Suppress spurious clang warnings
*/
#ifdef __GNUC__
#if defined(__clang__)
#pragma clang diagnostic pop
#elif __GNUC_PREREQ__(9,0)
#pragma GCC diagnostic pop
#endif
#endif
#endif /* _UFS_LFS_LFS_ACCESSORS_H_ */
/* $NetBSD: kern_mod_80.c,v 1.6 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* System calls relating to loadable modules.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_mod_80.c,v 1.6 2019/12/12 02:15:42 pgoyette Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#include "opt_modular.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/kobj.h>
#include <sys/module.h>
#include <sys/syscall.h>
#include <sys/syscallargs.h>
#include <sys/compat_stub.h>
#include <compat/sys/module.h>
#include <compat/common/compat_mod.h>
static int
compat_80_modstat(int cmd, struct iovec *iov, void *arg)
{
omodstat_t *oms, *omso;
modinfo_t *mi;
module_t *mod;
vaddr_t addr;
size_t size;
size_t omslen;
size_t used;
int error;
int omscnt;
bool stataddr;
const char *suffix = "...";
if (cmd != MODCTL_OSTAT)
return EINVAL;
error = copyin(arg, iov, sizeof(*iov));
if (error != 0) {
return error;
}
/* If not privileged, don't expose kernel addresses. */
error = kauth_authorize_system(kauth_cred_get(), KAUTH_SYSTEM_MODULE,
0, (void *)(uintptr_t)MODCTL_STAT, NULL, NULL);
stataddr = (error == 0);
kernconfig_lock();
omscnt = 0;
TAILQ_FOREACH(mod, &module_list, mod_chain) {
omscnt++;
mi = mod->mod_info;
}
TAILQ_FOREACH(mod, &module_builtins, mod_chain) {
omscnt++;
mi = mod->mod_info;
}
omslen = omscnt * sizeof(omodstat_t);
omso = kmem_zalloc(omslen, KM_SLEEP);
oms = omso;
TAILQ_FOREACH(mod, &module_list, mod_chain) {
mi = mod->mod_info;
strlcpy(oms->oms_name, mi->mi_name, sizeof(oms->oms_name));
if (mi->mi_required != NULL) {
used = strlcpy(oms->oms_required, mi->mi_required,
sizeof(oms->oms_required));
if (used >= sizeof(oms->oms_required)) {
oms->oms_required[sizeof(oms->oms_required) -
strlen(suffix) - 1] = '\0';
strlcat(oms->oms_required, suffix,
sizeof(oms->oms_required));
}
}
if (mod->mod_kobj != NULL && stataddr) {
kobj_stat(mod->mod_kobj, &addr, &size);
oms->oms_addr = addr;
oms->oms_size = size;
}
oms->oms_class = mi->mi_class;
oms->oms_refcnt = mod->mod_refcnt;
oms->oms_source = mod->mod_source;
oms->oms_flags = mod->mod_flags;
oms++;
}
TAILQ_FOREACH(mod, &module_builtins, mod_chain) {
mi = mod->mod_info;
strlcpy(oms->oms_name, mi->mi_name, sizeof(oms->oms_name));
if (mi->mi_required != NULL) {
used = strlcpy(oms->oms_required, mi->mi_required,
sizeof(oms->oms_required));
if (used >= sizeof(oms->oms_required)) {
oms->oms_required[sizeof(oms->oms_required) -
strlen(suffix) - 1] = '\0';
strlcat(oms->oms_required, suffix,
sizeof(oms->oms_required));
}
}
if (mod->mod_kobj != NULL && stataddr) {
kobj_stat(mod->mod_kobj, &addr, &size);
oms->oms_addr = addr;
oms->oms_size = size;
}
oms->oms_class = mi->mi_class;
oms->oms_refcnt = -1;
KASSERT(mod->mod_source == MODULE_SOURCE_KERNEL);
oms->oms_source = mod->mod_source;
oms++;
}
kernconfig_unlock();
error = copyout(omso, iov->iov_base, uimin(omslen, iov->iov_len));
kmem_free(omso, omslen);
	if (error == 0) {
		iov->iov_len = omslen;
error = copyout(iov, arg, sizeof(*iov));
}
return error;
}
void
kern_mod_80_init(void)
{
MODULE_HOOK_SET(compat_modstat_80_hook, compat_80_modstat);
}
void
kern_mod_80_fini(void)
{
MODULE_HOOK_UNSET(compat_modstat_80_hook);
}
/* $NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
* Daniel Sieger.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.366 2023/11/22 13:18:48 riastradh Exp $");
#include "opt_kstack.h"
#include "opt_ddb.h"
#include "opt_dtrace.h"
#define __MUTEX_PRIVATE
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/dtrace_bsd.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/lockdebug.h>
#include <sys/lwpctl.h>
#include <sys/proc.h>
#include <sys/pserialize.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/syscall_stats.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
#include <dev/lockstat.h>
int dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t dtrace_vtime_switch_func;
#ifdef DDB
#include <ddb/ddb.h>
#endif
static void sched_unsleep(struct lwp *, bool);
static void sched_changepri(struct lwp *, pri_t);
static void sched_lendpri(struct lwp *, pri_t);
syncobj_t sleep_syncobj = {
.sobj_name = "sleep",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
syncobj_t sched_syncobj = {
.sobj_name = "sched",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_USER,
.sobj_unsleep = sched_unsleep,
.sobj_changepri = sched_changepri,
.sobj_lendpri = sched_lendpri,
.sobj_owner = syncobj_noowner,
};
syncobj_t kpause_syncobj = {
.sobj_name = "kpause",
.sobj_flag = SOBJ_SLEEPQ_NULL,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
/* "Lightning bolt": once a second sleep address. */
kcondvar_t lbolt __cacheline_aligned;
u_int sched_pstats_ticks __cacheline_aligned;
/* Preemption event counters. */
static struct evcnt kpreempt_ev_crit __cacheline_aligned;
static struct evcnt kpreempt_ev_klock __cacheline_aligned;
static struct evcnt kpreempt_ev_immed __cacheline_aligned;
void
synch_init(void)
{
cv_init(&lbolt, "lbolt");
evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
"kpreempt", "defer: critical section");
evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
"kpreempt", "defer: kernel_lock");
evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
"kpreempt", "immediate");
}
/*
* OBSOLETE INTERFACE
*
* General sleep call. Suspends the current LWP until a wakeup is
* performed on the specified identifier. The LWP will then be made
* runnable with the specified priority. Sleeps at most timo/hz seconds (0
* means no timeout). If pri includes PCATCH flag, signals are checked
* before and after sleeping, else signals are not checked. Returns 0 if
* awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a
* signal needs to be delivered, ERESTART is returned if the current system
* call should be restarted if possible, and EINTR is returned if the system
* call should be interrupted by the signal.
*/
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
struct lwp *l = curlwp;
sleepq_t *sq;
kmutex_t *mp;
bool catch_p;
int nlocks;
	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);
//KASSERT(KERNEL_LOCKED_P());
if (sleepq_dontsleep(l)) {
(void)sleepq_abort(NULL, 0);
return 0;
}
catch_p = priority & PCATCH;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
return sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);
}
int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
kmutex_t *mtx)
{
struct lwp *l = curlwp;
sleepq_t *sq;
kmutex_t *mp;
bool catch_p;
int error, nlocks;
	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);
	if (sleepq_dontsleep(l)) {
(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
return 0;
}
catch_p = priority & PCATCH;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
nlocks = sleepq_enter(sq, l, mp);
sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
mutex_exit(mtx);
error = sleepq_block(timo, catch_p, &sleep_syncobj, nlocks);
	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);
return error;
}
/*
* General sleep call for situations where a wake-up is not expected.
*/
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
struct lwp *l = curlwp;
int error, nlocks;
KASSERTMSG(timo != 0 || intr, "wmesg=%s intr=%s timo=%d mtx=%p",
wmesg, intr ? "true" : "false", timo, mtx);
if (sleepq_dontsleep(l))
return sleepq_abort(NULL, 0);
	if (mtx != NULL)
		mutex_exit(mtx);
nlocks = sleepq_enter(NULL, l, NULL);
sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr);
error = sleepq_block(timo, intr, &kpause_syncobj, nlocks);
	if (mtx != NULL)
		mutex_enter(mtx);
return error;
}
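/*
 * Typical use is a plain delay with no matching wakeup, e.g. (sketch
 * only; the wmesg string is arbitrary):
 *
 *	(void)kpause("pause", false, mstohz(100), NULL);
 *
 * which sleeps the calling LWP for roughly 100ms.
 */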
/*
* OBSOLETE INTERFACE
*
* Make all LWPs sleeping on the specified identifier runnable.
*/
void
wakeup(wchan_t ident)
{
sleepq_t *sq;
kmutex_t *mp;
if (__predict_false(cold))
return;
sq = sleeptab_lookup(&sleeptab, ident, &mp);
sleepq_wake(sq, ident, (u_int)-1, mp);
}
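/*
 * A minimal sketch of the classic tsleep()/wakeup() pairing, assuming a
 * hypothetical softc flag sc_busy serialized by the caller (new code
 * should generally prefer condition variables):
 *
 *	while (sc->sc_busy) {
 *		error = tsleep(&sc->sc_busy, PRIBIO | PCATCH, "scbusy", 0);
 *		if (error)
 *			return error;
 *	}
 *	sc->sc_busy = 1;
 *	... do the work ...
 *	sc->sc_busy = 0;
 *	wakeup(&sc->sc_busy);
 */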
/*
* General yield call. Puts the current LWP back on its run queue and
* performs a context switch.
*/
void
yield(void)
{
struct lwp *l = curlwp;
int nlocks;
KERNEL_UNLOCK_ALL(l, &nlocks);
lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
spc_lock(l->l_cpu);
mi_switch(l);
KERNEL_LOCK(nlocks, l);
}
/*
* General preemption call. Puts the current LWP back on its run queue
* and performs an involuntary context switch. Different from yield()
* in that:
*
* - It's counted differently (involuntary vs. voluntary).
* - Realtime threads go to the head of their runqueue vs. tail for yield().
*/
void
preempt(void)
{
struct lwp *l = curlwp;
int nlocks;
KERNEL_UNLOCK_ALL(l, &nlocks);
lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
spc_lock(l->l_cpu);
l->l_pflag |= LP_PREEMPTING;
mi_switch(l);
KERNEL_LOCK(nlocks, l);
}
/*
* Return true if the current LWP should yield the processor. Intended to
* be used by long-running code in the kernel.
*/
inline bool
preempt_needed(void)
{
lwp_t *l = curlwp;
int needed;
KPREEMPT_DISABLE(l);
needed = l->l_cpu->ci_want_resched;
KPREEMPT_ENABLE(l);
return (needed != 0);
}
/*
* A breathing point for long-running code in the kernel.
*/
void
preempt_point(void)
{
	if (__predict_false(preempt_needed())) {
		preempt();
}
}
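/*
 * A long-running loop in the kernel would typically call this every so
 * often, e.g. (sketch only; do_one_unit() and the bound n are
 * hypothetical):
 *
 *	for (i = 0; i < n; i++) {
 *		do_one_unit(i);
 *		preempt_point();
 *	}
 */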
/*
* Handle a request made by another agent to preempt the current LWP
* in-kernel. Usually called when l_dopreempt may be non-zero.
*
* Character addresses for lockstat only.
*/
static char kpreempt_is_disabled;
static char kernel_lock_held;
static char is_softint_lwp;
static char spl_is_raised;
bool
kpreempt(uintptr_t where)
{
uintptr_t failed;
lwp_t *l;
int s, dop, lsflag;
l = curlwp;
failed = 0;
while ((dop = l->l_dopreempt) != 0) {
if (l->l_stat != LSONPROC) {
/*
* About to block (or die), let it happen.
* Doesn't really count as "preemption has
* been blocked", since we're going to
* context switch.
*/
atomic_swap_uint(&l->l_dopreempt, 0);
return true;
}
KASSERT((l->l_flag & LW_IDLE) == 0);
if (__predict_false(l->l_nopreempt != 0)) {
/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
}
failed = (uintptr_t)&kpreempt_is_disabled;
break;
}
if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
/* Can't preempt soft interrupts yet. */
atomic_swap_uint(&l->l_dopreempt, 0);
failed = (uintptr_t)&is_softint_lwp;
break;
}
s = splsched();
if (__predict_false(l->l_blcnt != 0 ||
curcpu()->ci_biglock_wanted != NULL)) {
/* Hold or want kernel_lock, code is not MT safe. */
splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
}
failed = (uintptr_t)&kernel_lock_held;
break;
}
if (__predict_false(!cpu_kpreempt_enter(where, s))) {
/*
* It may be that the IPL is too high.
* cpu_kpreempt_enter() can schedule an
* interrupt to retry later.
*/
splx(s);
failed = (uintptr_t)&spl_is_raised;
break;
}
/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
}
lwp_lock(l);
l->l_pflag |= LP_PREEMPTING;
spc_lock(l->l_cpu);
mi_switch(l);
l->l_nopreempt++;
splx(s);
/* Take care of any MD cleanup. */
cpu_kpreempt_exit(where);
l->l_nopreempt--;
}
if (__predict_true(!failed)) {
return false;
}
/* Record preemption failure for reporting via lockstat. */
atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
lsflag = 0;
LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
}
/* Preemption is on, might recurse, so make it atomic. */
if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
(void *)where) == NULL) {
LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
l->l_pfaillock = failed;
}
}
LOCKSTAT_EXIT(lsflag);
return true;
}
/*
* Return true if preemption is explicitly disabled.
*/
bool
kpreempt_disabled(void)
{
const lwp_t *l = curlwp;
	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 ||
	    cpu_kpreempt_disabled();
}
/*
* Disable kernel preemption.
*/
void
kpreempt_disable(void)
{
KPREEMPT_DISABLE(curlwp);
}
/*
* Reenable kernel preemption.
*/
void
kpreempt_enable(void)
{
	KPREEMPT_ENABLE(curlwp);
}
/*
* Compute the amount of time during which the current lwp was running.
*
* - update l_rtime unless it's an idle lwp.
*/
void
updatertime(lwp_t *l, const struct bintime *now)
{
static bool backwards = false;
if (__predict_false(l->l_flag & LW_IDLE))
return;
	if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) {
		char caller[128];
#ifdef DDB
db_symstr(caller, sizeof(caller),
(db_expr_t)(intptr_t)__builtin_return_address(0),
DB_STGY_PROC);
#else
snprintf(caller, sizeof(caller), "%p",
__builtin_return_address(0));
#endif
backwards = true;
printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:"
" timecounter went backwards"
" from (%jd + 0x%016"PRIx64"/2^64) sec"
" to (%jd + 0x%016"PRIx64"/2^64) sec"
" in %s\n",
(long)l->l_lid,
l->l_proc->p_comm,
l->l_name ? " " : "",
l->l_name ? l->l_name : "",
l->l_pflag,
(intmax_t)l->l_stime.sec, l->l_stime.frac,
(intmax_t)now->sec, now->frac,
caller);
}
/* rtime += now - stime */
bintime_add(&l->l_rtime, now);
bintime_sub(&l->l_rtime, &l->l_stime);
}
/*
* Select the next LWP to run on the current CPU.
*/
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
lwp_t *newl;
/*
* Let sched_nextlwp() select the LWP to run the CPU next.
* If no LWP is runnable, select the idle LWP.
*
* On arrival here LWPs on a run queue are locked by spc_mutex which
* is currently held. Idle LWPs are always locked by spc_lwplock,
* which may or may not be held here. On exit from this code block,
* in all cases newl is locked by spc_lwplock.
*/
newl = sched_nextlwp();
if (newl != NULL) {
sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
newl->l_stat = LSONPROC;
newl->l_pflag |= LP_RUNNING;
newl->l_boostpri = PRI_NONE;
spc->spc_curpriority = lwp_eprio(newl);
spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE);
lwp_setlock(newl, spc->spc_lwplock);
} else {
/*
* The idle LWP does not get set to LSONPROC, because
* otherwise it screws up the output from top(1) etc.
*/
newl = ci->ci_data.cpu_idlelwp;
newl->l_pflag |= LP_RUNNING;
spc->spc_curpriority = PRI_IDLE;
spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) |
SPCF_IDLE;
}
/*
* Only clear want_resched if there are no pending (slow) software
* interrupts. We can do this without an atomic, because no new
* LWPs can appear in the queue due to our hold on spc_mutex, and
* the update to ci_want_resched will become globally visible before
* the release of spc_mutex becomes globally visible.
*/
	if (ci->ci_data.cpu_softints == 0)
		ci->ci_want_resched = 0;
return newl;
}
/*
* The machine independent parts of context switch.
*
* NOTE: l->l_cpu is not changed in this routine, because an LWP never
* changes its own l_cpu (that would screw up curcpu on many ports and could
* cause all kinds of other evil stuff). l_cpu is always changed by some
* other actor, when it's known the LWP is not running (the LP_RUNNING flag
* is checked under lock).
*/
void
mi_switch(lwp_t *l)
{
struct cpu_info *ci;
struct schedstate_percpu *spc;
struct lwp *newl;
kmutex_t *lock;
int oldspl;
struct bintime bt;
bool returning;
	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex));
	KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked");
kstack_check_magic(l);
binuptime(&bt);
	KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN);
ci = curcpu();
spc = &ci->ci_schedstate;
returning = false;
newl = NULL;
/*
* If we have been asked to switch to a specific LWP, then there
* is no need to inspect the run queues. If a soft interrupt is
* blocking, then return to the interrupted thread without adjusting
* VM context or its start time: neither have been changed in order
* to take the interrupt.
*/
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
returning = true;
softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
}
newl = l->l_switchto;
l->l_switchto = NULL;
}
#ifndef __HAVE_FAST_SOFTINTS
else if (ci->ci_data.cpu_softints != 0) {
/* There are pending soft interrupts, so pick one. */
newl = softint_picklwp();
newl->l_stat = LSONPROC;
newl->l_pflag |= LP_RUNNING;
}
#endif /* !__HAVE_FAST_SOFTINTS */
/*
* If on the CPU and we have gotten this far, then we must yield.
*/
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		KASSERT((l->l_flag & LW_IDLE) == 0);
l->l_stat = LSRUN;
lwp_setlock(l, spc->spc_mutex);
sched_enqueue(l);
sched_preempted(l);
/*
* Handle migration. Note that "migrating LWP" may
* be reset here, if interrupt/preemption happens
* early in idle LWP.
*/
		if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) {
			KASSERT((l->l_pflag & LP_INTR) == 0);
spc->spc_migrating = l;
}
}
/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
}
/* Items that must be updated with the CPU locked. */
if (!returning) {
/* Count time spent in current system call */
SYSCALL_TIME_SLEEP(l);
updatertime(l, &bt);
/* Update the new LWP's start time. */
newl->l_stime = bt;
/*
* ci_curlwp changes when a fast soft interrupt occurs.
* We use ci_onproc to keep track of which kernel or
* user thread is running 'underneath' the software
* interrupt. This is important for time accounting,
* itimers and forcing user threads to preempt (aston).
*/
ci->ci_onproc = newl;
}
/*
* Preemption related tasks. Must be done holding spc_mutex. Clear
* l_dopreempt without an atomic - it's only ever set non-zero by
* sched_resched_cpu() which also holds spc_mutex, and only ever
* cleared by the LWP itself (us) with atomics when not under lock.
*/
l->l_dopreempt = 0;
if (__predict_false(l->l_pfailaddr != 0)) {
LOCKSTAT_FLAG(lsflag);
LOCKSTAT_ENTER(lsflag);
LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
1, l->l_pfailtime, l->l_pfailaddr);
LOCKSTAT_EXIT(lsflag);
l->l_pfailtime = 0;
l->l_pfaillock = 0;
l->l_pfailaddr = 0;
}
if (l != newl) {
struct lwp *prevlwp;
/* Release all locks, but leave the current LWP locked */
if (l->l_mutex == spc->spc_mutex) {
/*
* Drop spc_lwplock, if the current LWP has been moved
* to the run queue (it is now locked by spc_mutex).
*/
mutex_spin_exit(spc->spc_lwplock);
} else {
/*
* Otherwise, drop the spc_mutex, we are done with the
* run queues.
*/
mutex_spin_exit(spc->spc_mutex);
}
/* We're down to only one lock, so do debug checks. */
LOCKDEBUG_BARRIER(l->l_mutex, 1);
/* Count the context switch. */
CPU_COUNT(CPU_COUNT_NSWTCH, 1);
if ((l->l_pflag & LP_PREEMPTING) != 0) {
l->l_ru.ru_nivcsw++;
l->l_pflag &= ~LP_PREEMPTING;
} else {
l->l_ru.ru_nvcsw++;
}
/*
* Increase the count of spin-mutexes before the release
* of the last lock - we must remain at IPL_SCHED after
* releasing the lock.
*/
KASSERTMSG(ci->ci_mtx_count == -1,
"%s: cpu%u: ci_mtx_count (%d) != -1 "
"(block with spin-mutex held)",
__func__, cpu_index(ci), ci->ci_mtx_count);
oldspl = MUTEX_SPIN_OLDSPL(ci);
ci->ci_mtx_count = -2;
/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ?
LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE);
}
/*
* If curlwp is a soft interrupt LWP, there's nobody on the
* other side to unlock - we're returning into an assembly
* trampoline. Unlock now. This is safe because this is a
* kernel LWP and is bound to current CPU: the worst anyone
* else will do to it, is to put it back onto this CPU's run
* queue (and the CPU is busy here right now!).
*/
if (returning) {
/* Keep IPL_SCHED after this; MD code will fix up. */
l->l_pflag &= ~LP_RUNNING;
lwp_unlock(l);
} else {
/* A normal LWP: save old VM context. */
pmap_deactivate(l);
}
/*
* If DTrace has set the active vtime enum to anything
* other than INACTIVE (0), then it should have set the
* function to call.
*/
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
}
/*
* We must ensure not to come here from inside a read section.
*/
KASSERT(pserialize_not_in_read_section());
		/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
KASSERT(curlwp == ci->ci_curlwp);
#endif
KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
prevlwp = cpu_switchto(l, newl, returning);
ci = curcpu();
#ifdef MULTIPROCESSOR
KASSERT(curlwp == ci->ci_curlwp);
#endif
KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
l, curlwp, prevlwp);
		KASSERT(prevlwp != NULL);
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -2);
/*
* Immediately mark the previous LWP as no longer running
* and unlock (to keep lock wait times short as possible).
* We'll still be at IPL_SCHED afterwards. If a zombie,
* don't touch after clearing LP_RUNNING as it could be
* reaped by another CPU. Issue a memory barrier to ensure
* this.
*
* atomic_store_release matches atomic_load_acquire in
* lwp_free.
*/
KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0);
lock = prevlwp->l_mutex;
if (__predict_false(prevlwp->l_stat == LSZOMB)) {
atomic_store_release(&prevlwp->l_pflag,
prevlwp->l_pflag & ~LP_RUNNING);
} else {
prevlwp->l_pflag &= ~LP_RUNNING;
}
mutex_spin_exit(lock);
/*
* Switched away - we have new curlwp.
* Restore VM context and IPL.
*/
pmap_activate(l);
pcu_switchpoint(l);
/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
l->l_lwpctl->lc_pctr++;
}
/*
* Normalize the spin mutex count and restore the previous
* SPL. Note that, unless the caller disabled preemption,
* we can be preempted at any time after this splx().
*/
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -1);
ci->ci_mtx_count = 0;
splx(oldspl);
} else {
/* Nothing to do - just unlock and return. */
mutex_spin_exit(spc->spc_mutex);
l->l_pflag &= ~LP_PREEMPTING;
lwp_unlock(l);
}
	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0);
SYSCALL_TIME_WAKEUP(l);
LOCKDEBUG_BARRIER(NULL, 1);
}
/*
* setrunnable: change LWP state to be runnable, placing it on the run queue.
*
* Call with the process and LWP locked. Will return with the LWP unlocked.
*/
void
setrunnable(struct lwp *l)
{
struct proc *p = l->l_proc;
struct cpu_info *ci;
kmutex_t *oldlock;
	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);
switch (l->l_stat) {
case LSSTOP:
/*
* If we're being traced (possibly because someone attached us
* while we were stopped), check for a signal from the debugger.
*/
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
			signotify(l);
p->p_nrlwps++;
break;
case LSSUSPENDED:
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
l->l_flag &= ~LW_WSUSPEND;
p->p_nrlwps++;
cv_broadcast(&p->p_lwpcv);
break;
case LSSLEEP:
KASSERT(l->l_wchan != NULL);
break;
case LSIDL:
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
break;
default:
panic("setrunnable: lwp %p state was %d", l, l->l_stat);
}
/*
* If the LWP was sleeping, start it again.
*/
if (l->l_wchan != NULL) {
l->l_stat = LSSLEEP;
/* lwp_unsleep() will release the lock. */
lwp_unsleep(l, true);
return;
}
/*
* If the LWP is still on the CPU, mark it as LSONPROC. It may be
* about to call mi_switch(), in which case it will yield.
*/
if ((l->l_pflag & LP_RUNNING) != 0) {
l->l_stat = LSONPROC;
l->l_slptime = 0;
lwp_unlock(l);
return;
}
/*
* Look for a CPU to run.
* Set the LWP runnable.
*/
ci = sched_takecpu(l);
l->l_cpu = ci;
spc_lock(ci);
oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex);
sched_setrunnable(l);
l->l_stat = LSRUN;
l->l_slptime = 0;
sched_enqueue(l);
sched_resched_lwp(l, true);
/* SPC & LWP now unlocked. */
mutex_spin_exit(oldlock);
}
/*
* suspendsched:
*
* Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
*/
void
suspendsched(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct lwp *l;
struct proc *p;
/*
* We do this by process in order not to violate the locking rules.
*/
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
mutex_enter(p->p_lock);
if ((p->p_flag & PK_SYSTEM) != 0) {
mutex_exit(p->p_lock);
continue;
}
if (p->p_stat != SSTOP) {
if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
p->p_pptr->p_nstopchild++;
p->p_waited = 0;
}
p->p_stat = SSTOP;
}
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
if (l == curlwp)
continue;
lwp_lock(l);
/*
* Set L_WREBOOT so that the LWP will suspend itself
* when it tries to return to user mode. We want to
* try to get as many LWPs as possible to
* the user / kernel boundary, so that they will
* release any locks that they hold.
*/
l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);
if (l->l_stat == LSSLEEP &&
(l->l_flag & LW_SINTR) != 0) {
/* setrunnable() will release the lock. */
setrunnable(l);
continue;
}
lwp_unlock(l);
}
mutex_exit(p->p_lock);
}
mutex_exit(&proc_lock);
/*
* Kick all CPUs to make them preempt any LWPs running in user mode.
* They'll trap into the kernel and suspend themselves in userret().
*
* Unusually, we don't hold any other scheduler object locked, which
* would keep preemption off for sched_resched_cpu(), so disable it
* explicitly.
*/
kpreempt_disable();
for (CPU_INFO_FOREACH(cii, ci)) {
spc_lock(ci);
sched_resched_cpu(ci, PRI_KERNEL, true);
/* spc now unlocked */
}
kpreempt_enable();
}
/*
* sched_unsleep:
*
* This is called when the LWP has not been awoken normally but instead
* interrupted: for example, if the sleep timed out. Because of this,
* it's not a valid action for running or idle LWPs.
*/
static void
sched_unsleep(struct lwp *l, bool cleanup)
{
lwp_unlock(l);
panic("sched_unsleep");
}
static void
sched_changepri(struct lwp *l, pri_t pri)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
ci = l->l_cpu;
spc = &ci->ci_schedstate;
	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
sched_dequeue(l);
l->l_priority = pri;
sched_enqueue(l);
sched_resched_lwp(l, false);
} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
/* On priority drop, only evict realtime LWPs. */
KASSERT(lwp_locked(l, spc->spc_lwplock));
l->l_priority = pri;
spc_lock(ci);
sched_resched_cpu(ci, spc->spc_maxpriority, true);
/* spc now unlocked */
} else {
l->l_priority = pri;
}
}
static void
sched_lendpri(struct lwp *l, pri_t pri)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
ci = l->l_cpu;
spc = &ci->ci_schedstate;
	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
sched_dequeue(l);
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
sched_enqueue(l);
sched_resched_lwp(l, false);
} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
/* On priority drop, only evict realtime LWPs. */
KASSERT(lwp_locked(l, spc->spc_lwplock));
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
spc_lock(ci);
sched_resched_cpu(ci, spc->spc_maxpriority, true);
/* spc now unlocked */
} else {
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
}
}
struct lwp *
syncobj_noowner(wchan_t wchan)
{
return NULL;
}
/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
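/*
 * sched_pstats() below applies this factor roughly once per second (its
 * load-average block samples every fifth call, matching the 5-second
 * intervals noted next), so after 60 seconds the surviving fraction is
 * ccpu^60 = exp(-60/20) = exp(-3) ~= 0.05, i.e. about 95% of the old
 * value has decayed away, as advertised above.
 */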
/*
* Constants for averages over 1, 5 and 15 minutes when sampling at
* 5 second intervals.
*/
static const fixpt_t cexp[ ] = {
0.9200444146293232 * FSCALE, /* exp(-1/12) */
0.9834714538216174 * FSCALE, /* exp(-1/60) */
0.9944598480048967 * FSCALE, /* exp(-1/180) */
};
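/*
 * Each entry is exp(-5/T) for T = 60, 300 and 900 seconds, so the
 * fixed-point update in sched_pstats(),
 *
 *	ldavg = cexp * ldavg + nrun * (1 - cexp),
 *
 * behaves as an exponential moving average with the corresponding time
 * constant when the run-queue length is sampled every 5 seconds.
 */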
/*
* sched_pstats:
*
* => Update process statistics and check CPU resource allocation.
* => Call scheduler-specific hook to eventually adjust LWP priorities.
* => Compute load average of a quantity on 1, 5 and 15 minute intervals.
*/
void
sched_pstats(void)
{
struct loadavg *avg = &averunnable;
const int clkhz = (stathz != 0 ? stathz : hz);
static bool backwardslwp = false;
static bool backwardsproc = false;
static u_int lavg_count = 0;
struct proc *p;
int nrun;
sched_pstats_ticks++;
if (++lavg_count >= 5) {
lavg_count = 0;
nrun = 0;
}
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
struct lwp *l;
struct rlimit *rlim;
time_t runtm;
int sig;
/* Increment sleep time (if sleeping), ignore overflow. */
mutex_enter(p->p_lock);
runtm = p->p_rtime.sec;
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
fixpt_t lpctcpu;
u_int lcpticks;
if (__predict_false((l->l_flag & LW_IDLE) != 0))
continue;
lwp_lock(l);
if (__predict_false(l->l_rtime.sec < 0) &&
!backwardslwp) {
backwardslwp = true;
printf("WARNING: lwp %ld (%s%s%s): "
"negative runtime: "
"(%jd + 0x%016"PRIx64"/2^64) sec\n",
(long)l->l_lid,
l->l_proc->p_comm,
l->l_name ? " " : "",
l->l_name ? l->l_name : "",
(intmax_t)l->l_rtime.sec,
l->l_rtime.frac);
}
runtm += l->l_rtime.sec;
l->l_swtime++;
sched_lwp_stats(l);
/* For load average calculation. */
if (__predict_false(lavg_count == 0) &&
(l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
switch (l->l_stat) {
case LSSLEEP:
if (l->l_slptime > 1) {
break;
}
/* FALLTHROUGH */
case LSRUN:
case LSONPROC:
case LSIDL:
nrun++;
}
}
lwp_unlock(l);
l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
if (l->l_slptime != 0)
continue;
lpctcpu = l->l_pctcpu;
lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
lpctcpu += ((FSCALE - ccpu) *
(lcpticks * FSCALE / clkhz)) >> FSHIFT;
l->l_pctcpu = lpctcpu;
}
/* Calculating p_pctcpu only for ps(1) */
p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
if (__predict_false(runtm < 0)) {
if (!backwardsproc) {
backwardsproc = true;
printf("WARNING: pid %ld (%s): "
"negative runtime; "
"monotonic clock has gone backwards\n",
(long)p->p_pid, p->p_comm);
}
mutex_exit(p->p_lock);
continue;
}
/*
* Check if the process exceeds its CPU resource allocation.
* If over the hard limit, kill it with SIGKILL.
* If over the soft limit, send SIGXCPU and raise
* the soft limit a little.
*/
rlim = &p->p_rlimit[RLIMIT_CPU];
sig = 0;
if (__predict_false(runtm >= rlim->rlim_cur)) {
if (runtm >= rlim->rlim_max) {
sig = SIGKILL;
log(LOG_NOTICE,
"pid %d, command %s, is killed: %s\n",
p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
uprintf("pid %d, command %s, is killed: %s\n",
p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
} else {
sig = SIGXCPU;
if (rlim->rlim_cur < rlim->rlim_max)
rlim->rlim_cur += 5;
}
}
mutex_exit(p->p_lock);
if (__predict_false(sig)) {
KASSERT((p->p_flag & PK_SYSTEM) == 0);
psignal(p, sig);
}
}
/* Load average calculation. */
if (__predict_false(lavg_count == 0)) {
int i;
CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
for (i = 0; i < __arraycount(cexp); i++) {
avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
}
}
/* Lightning bolt. */
cv_broadcast(&lbolt);
mutex_exit(&proc_lock);
}
/* $NetBSD: ip6_input.c,v 1.227 2022/10/28 05:18:39 ozaki-r Exp $ */
/* $KAME: ip6_input.c,v 1.188 2001/03/29 05:34:31 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_input.c,v 1.227 2022/10/28 05:18:39 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_gateway.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/cprng.h>
#include <sys/percpu.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/if_dl.h>
#include <net/route.h>
#include <net/pktqueue.h>
#include <net/pfil.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#ifdef INET
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#endif /* INET */
#include <netinet/ip6.h>
#include <netinet/portalgo.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet/icmp6.h>
#include <netinet6/scope6_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet6/ip6protosw.h>
#include "faith.h"
extern struct domain inet6domain;
u_char ip6_protox[IPPROTO_MAX];
pktqueue_t *ip6_pktq __read_mostly;
pfil_head_t *inet6_pfil_hook;
percpu_t *ip6stat_percpu;
percpu_t *ip6_forward_rt_percpu __cacheline_aligned;
static void ip6intr(void *);
static void ip6_input(struct mbuf *, struct ifnet *);
static bool ip6_badaddr(struct ip6_hdr *);
static struct m_tag *ip6_setdstifaddr(struct mbuf *, const struct in6_ifaddr *);
static struct m_tag *ip6_addaux(struct mbuf *);
static struct m_tag *ip6_findaux(struct mbuf *);
static void ip6_delaux(struct mbuf *);
static int ip6_process_hopopts(struct mbuf *, u_int8_t *, int, u_int32_t *,
u_int32_t *);
static struct mbuf *ip6_pullexthdr(struct mbuf *, size_t, int);
static void sysctl_net_inet6_ip6_setup(struct sysctllog **);
#ifdef NET_MPSAFE
#define SOFTNET_LOCK() mutex_enter(softnet_lock)
#define SOFTNET_UNLOCK() mutex_exit(softnet_lock)
#else
#define SOFTNET_LOCK() KASSERT(mutex_owned(softnet_lock))
#define SOFTNET_UNLOCK() KASSERT(mutex_owned(softnet_lock))
#endif
/* Ensure that non-packed structures are the desired size. */
__CTASSERT(sizeof(struct ip6_hdr) == 40);
__CTASSERT(sizeof(struct ip6_ext) == 2);
__CTASSERT(sizeof(struct ip6_hbh) == 2);
__CTASSERT(sizeof(struct ip6_dest) == 2);
__CTASSERT(sizeof(struct ip6_opt) == 2);
__CTASSERT(sizeof(struct ip6_opt_jumbo) == 6);
__CTASSERT(sizeof(struct ip6_opt_nsap) == 4);
__CTASSERT(sizeof(struct ip6_opt_tunnel) == 3);
__CTASSERT(sizeof(struct ip6_opt_router) == 4);
__CTASSERT(sizeof(struct ip6_rthdr) == 4);
__CTASSERT(sizeof(struct ip6_rthdr0) == 8);
__CTASSERT(sizeof(struct ip6_frag) == 8);
/*
* IP6 initialization: fill in IP6 protocol switch table.
* All protocols not implemented in kernel go to raw IP6 protocol handler.
*/
void
ip6_init(void)
{
const struct ip6protosw *pr;
int i;
in6_init();
ip6_pktq = pktq_create(IFQ_MAXLEN, ip6intr, NULL);
KASSERT(ip6_pktq != NULL);
sysctl_net_inet6_ip6_setup(NULL);
pr = (const struct ip6protosw *)pffindproto(PF_INET6, IPPROTO_RAW, SOCK_RAW);
if (pr == 0)
panic("ip6_init");
for (i = 0; i < IPPROTO_MAX; i++)
ip6_protox[i] = pr - inet6sw;
for (pr = (const struct ip6protosw *)inet6domain.dom_protosw;
pr < (const struct ip6protosw *)inet6domain.dom_protoswNPROTOSW; pr++)
if (pr->pr_domain->dom_family == PF_INET6 &&
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
ip6_protox[pr->pr_protocol] = pr - inet6sw;
scope6_init();
addrsel_policy_init();
nd6_init();
frag6_init();
#ifdef GATEWAY
ip6flow_init(ip6_hashsize);
#endif
/* Register our Packet Filter hook. */
inet6_pfil_hook = pfil_head_create(PFIL_TYPE_AF, (void *)AF_INET6);
KASSERT(inet6_pfil_hook != NULL);
ip6stat_percpu = percpu_alloc(sizeof(uint64_t) * IP6_NSTATS);
ip6_forward_rt_percpu = rtcache_percpu_alloc();
}
/*
* IP6 input interrupt handling. Just pass the packet to ip6_input.
*/
static void
ip6intr(void *arg __unused)
{
struct mbuf *m;
SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
while ((m = pktq_dequeue(ip6_pktq)) != NULL) {
struct psref psref;
struct ifnet *rcvif = m_get_rcvif_psref(m, &psref);
if (rcvif == NULL) {
IP6_STATINC(IP6_STAT_IFDROP);
m_freem(m);
continue;
}
/*
* Drop the packet if IPv6 is disabled on the interface.
*/
if ((ND_IFINFO(rcvif)->flags & ND6_IFF_IFDISABLED)) {
m_put_rcvif_psref(rcvif, &psref);
IP6_STATINC(IP6_STAT_IFDROP);
m_freem(m);
continue;
}
ip6_input(m, rcvif);
m_put_rcvif_psref(rcvif, &psref);
}
SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
}
static void
ip6_input(struct mbuf *m, struct ifnet *rcvif)
{
struct ip6_hdr *ip6;
int hit, off = sizeof(struct ip6_hdr), nest;
u_int32_t plen;
u_int32_t rtalert = ~0;
int nxt, ours = 0, rh_present = 0, frg_present;
struct ifnet *deliverifp = NULL;
int srcrt = 0;
struct rtentry *rt = NULL;
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
struct route *ro;
KASSERT(rcvif != NULL);
/*
* make sure we don't have onion peering information in m_tag.
*/
ip6_delaux(m);
/*
* mbuf statistics
*/
if (m->m_flags & M_EXT) {
if (m->m_next)
IP6_STATINC(IP6_STAT_MEXT2M);
else
IP6_STATINC(IP6_STAT_MEXT1);
} else {
#define M2MMAX 32
if (m->m_next) {
if (m->m_flags & M_LOOP)
/*XXX*/ IP6_STATINC(IP6_STAT_M2M + lo0ifp->if_index);
else if (rcvif->if_index < M2MMAX)
IP6_STATINC(IP6_STAT_M2M + rcvif->if_index);
else
IP6_STATINC(IP6_STAT_M2M);
} else
IP6_STATINC(IP6_STAT_M1);
#undef M2MMAX
}
in6_ifstat_inc(rcvif, ifs6_in_receive);
IP6_STATINC(IP6_STAT_TOTAL);
/*
* If the IPv6 header is not aligned, slurp it up into a new
* mbuf with space for link headers, in the event we forward
* it. Otherwise, if it is aligned, make sure the entire base
* IPv6 header is in the first mbuf of the chain.
*/
if (M_GET_ALIGNED_HDR(&m, struct ip6_hdr, true) != 0) {
/* XXXJRT new stat, please */
IP6_STATINC(IP6_STAT_TOOSMALL);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
return;
}
ip6 = mtod(m, struct ip6_hdr *);
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
IP6_STATINC(IP6_STAT_BADVERS);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
goto bad;
}
if (ip6_badaddr(ip6)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(rcvif, ifs6_in_addrerr);
goto bad;
}
/*
* Assume that we can create a fast-forward IP flow entry
* based on this packet.
*/
m->m_flags |= M_CANFASTFWD;
/*
* Run through list of hooks for input packets. If there are any
* filters which require that additional packets in the flow are
* not fast-forwarded, they must clear the M_CANFASTFWD flag.
* Note that filters must _never_ set this flag, as another filter
* in the list may have previously cleared it.
*
* Don't call hooks if the packet has already been processed by
* IPsec (encapsulated, tunnel mode).
*/
#if defined(IPSEC)
if (!ipsec_used || !ipsec_skip_pfil(m))
#else
if (1)
#endif
{
struct in6_addr odst;
int error;
odst = ip6->ip6_dst;
error = pfil_run_hooks(inet6_pfil_hook, &m, rcvif, PFIL_IN);
if (error != 0 || m == NULL) {
IP6_STATINC(IP6_STAT_PFILDROP_IN);
return;
}
if (m->m_len < sizeof(struct ip6_hdr)) {
if ((m = m_pullup(m, sizeof(struct ip6_hdr))) == NULL) {
IP6_STATINC(IP6_STAT_TOOSMALL);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
return;
}
}
ip6 = mtod(m, struct ip6_hdr *);
srcrt = !IN6_ARE_ADDR_EQUAL(&odst, &ip6->ip6_dst);
}
IP6_STATINC(IP6_STAT_NXTHIST + ip6->ip6_nxt);
#ifdef ALTQ
if (altq_input != NULL) {
SOFTNET_LOCK();
if ((*altq_input)(m, AF_INET6) == 0) {
SOFTNET_UNLOCK();
/* packet is dropped by traffic conditioner */
return;
}
SOFTNET_UNLOCK();
}
#endif
/*
* Disambiguate address scope zones (if there is ambiguity).
* We first make sure that the original source or destination address
* is not in our internal form for scoped addresses. Such addresses
* are not necessarily invalid spec-wise, but we cannot accept them due
* to the usage conflict.
* in6_setscope() then also checks and rejects the cases where src or
* dst are the loopback address and the receiving interface
* is not loopback.
*/
if (__predict_false(
m_makewritable(&m, 0, sizeof(struct ip6_hdr), M_DONTWAIT))) {
IP6_STATINC(IP6_STAT_IDROPPED);
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
if (in6_clearscope(&ip6->ip6_src) || in6_clearscope(&ip6->ip6_dst)) {
IP6_STATINC(IP6_STAT_BADSCOPE); /* XXX */
goto bad;
}
if (in6_setscope(&ip6->ip6_src, rcvif, NULL) ||
in6_setscope(&ip6->ip6_dst, rcvif, NULL)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
goto bad;
}
ro = rtcache_percpu_getref(ip6_forward_rt_percpu);
/*
* Multicast check
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
bool ingroup;
in6_ifstat_inc(rcvif, ifs6_in_mcast);
/*
* See if we belong to the destination multicast group on the
* arrival interface.
*/
ingroup = in6_multi_group(&ip6->ip6_dst, rcvif);
if (ingroup) {
ours = 1;
} else if (!ip6_mrouter) {
uint64_t *ip6s = IP6_STAT_GETREF();
ip6s[IP6_STAT_NOTMEMBER]++;
ip6s[IP6_STAT_CANTFORWARD]++;
IP6_STAT_PUTREF();
in6_ifstat_inc(rcvif, ifs6_in_discard);
goto bad_unref;
}
deliverifp = rcvif;
goto hbhcheck;
}
sockaddr_in6_init(&u.dst6, &ip6->ip6_dst, 0, 0, 0);
/*
* Unicast check
*/
rt = rtcache_lookup2(ro, &u.dst, 1, &hit);
if (hit)
IP6_STATINC(IP6_STAT_FORWARD_CACHEHIT);
else
IP6_STATINC(IP6_STAT_FORWARD_CACHEMISS);
/*
* Accept the packet if the forwarding interface to the destination
* (according to the routing table) is the loopback interface,
* unless the associated route has a gateway.
*
* We don't explicitly match ip6_dst against an interface here. It
* is already done in rtcache_lookup2: rt->rt_ifp->if_type will be
* IFT_LOOP if the packet is for us.
*
* Note that this approach causes us to accept a packet if there is a
* route to the loopback interface for the destination of the packet.
* But we think it can even be useful in some situations, e.g. when using
* a special daemon which wants to intercept the packet.
*/
if (rt != NULL &&
(rt->rt_flags & (RTF_HOST|RTF_GATEWAY)) == RTF_HOST &&
rt->rt_ifp->if_type == IFT_LOOP) {
struct in6_ifaddr *ia6 = (struct in6_ifaddr *)rt->rt_ifa;
int addrok;
if (ia6->ia6_flags & IN6_IFF_ANYCAST)
m->m_flags |= M_ANYCAST6;
/*
* packets to a tentative, duplicated, or somehow invalid
* address must not be accepted.
*/
if (ia6->ia6_flags & IN6_IFF_NOTREADY)
addrok = 0;
else if (ia6->ia6_flags & IN6_IFF_DETACHED &&
!IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src))
{
/* Allow internal traffic to DETACHED addresses */
struct sockaddr_in6 sin6;
int s;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(sin6);
sin6.sin6_addr = ip6->ip6_src;
s = pserialize_read_enter();
addrok = (ifa_ifwithaddr(sin6tosa(&sin6)) != NULL);
pserialize_read_exit(s);
} else
addrok = 1;
if (addrok) {
/* this address is ready */
ours = 1;
deliverifp = ia6->ia_ifp; /* correct? */
goto hbhcheck;
} else {
/* address is not ready, so discard the packet. */
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
nd6log(LOG_INFO, "packet to an unready address %s->%s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
IP6_STATINC(IP6_STAT_IDROPPED);
goto bad_unref;
}
}
/*
* FAITH (Firewall Aided Internet Translator)
*/
#if defined(NFAITH) && 0 < NFAITH
if (ip6_keepfaith) {
if (rt != NULL && rt->rt_ifp != NULL &&
rt->rt_ifp->if_type == IFT_FAITH) {
/* XXX do we need more sanity checks? */
ours = 1;
deliverifp = rt->rt_ifp; /* faith */
goto hbhcheck;
}
}
#endif
/*
* Now there is no reason to process the packet if it's not our own
* and we're not a router.
*/
if (!ip6_forwarding) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
in6_ifstat_inc(rcvif, ifs6_in_discard);
goto bad_unref;
}
hbhcheck:
/*
* Record address information into m_tag, if we don't have one yet.
* Note that we are unable to record it if the address is not listed
* as our interface address (e.g. multicast addresses, addresses
* within FAITH prefixes and such).
*/
if (deliverifp && ip6_getdstifaddr(m) == NULL) {
struct in6_ifaddr *ia6;
int s = pserialize_read_enter();
ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst);
/* Depends on ip6_setdstifaddr never sleeping */
if (ia6 != NULL && ip6_setdstifaddr(m, ia6) == NULL) {
/*
* XXX maybe we should drop the packet here,
* as we could not provide enough information
* to the upper layers.
*/
}
pserialize_read_exit(s);
}
/*
* Process Hop-by-Hop options header if it's contained.
* m may be modified in ip6_hopopts_input().
* If a JumboPayload option is included, plen will also be modified.
*/
plen = (u_int32_t)ntohs(ip6->ip6_plen);
if (ip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
if (ip6_hopopts_input(&plen, &rtalert, &m, &off)) {
/* m already freed */
in6_ifstat_inc(rcvif, ifs6_in_discard);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
/* adjust pointer */
ip6 = mtod(m, struct ip6_hdr *);
/*
* if the payload length field is 0 and the next header field
* indicates Hop-by-Hop Options header, then a Jumbo Payload
* option MUST be included.
*/
if (ip6->ip6_plen == 0 && plen == 0) {
/*
* Note that if a valid jumbo payload option is
* contained, ip6_hopopts_input() must set a valid
* (non-zero) payload length to the variable plen.
*/
IP6_STATINC(IP6_STAT_BADOPTIONS);
in6_ifstat_inc(rcvif, ifs6_in_discard);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
(char *)&ip6->ip6_plen - (char *)ip6);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
sizeof(struct ip6_hbh));
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
return;
}
KASSERT(ACCESSIBLE_POINTER(hbh, struct ip6_hdr));
nxt = hbh->ip6h_nxt;
/*
* accept the packet if a router alert option is included
* and we act as an IPv6 router.
*/
if (rtalert != ~0 && ip6_forwarding)
ours = 1;
} else
nxt = ip6->ip6_nxt;
/*
* Check that the amount of data in the buffers is at least as much as
* the IPv6 header would have us expect. Trim mbufs if longer than we
* expect. Drop packet if shorter than we expect.
*/
if (m->m_pkthdr.len - sizeof(struct ip6_hdr) < plen) {
IP6_STATINC(IP6_STAT_TOOSHORT);
in6_ifstat_inc(rcvif, ifs6_in_truncated);
goto bad_unref;
}
if (m->m_pkthdr.len > sizeof(struct ip6_hdr) + plen) {
if (m->m_len == m->m_pkthdr.len) {
m->m_len = sizeof(struct ip6_hdr) + plen;
m->m_pkthdr.len = sizeof(struct ip6_hdr) + plen;
} else
m_adj(m, sizeof(struct ip6_hdr) + plen - m->m_pkthdr.len);
}
/*
* Forward if desirable.
*/
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
/*
* If we are acting as a multicast router, all
* incoming multicast packets are passed to the
* kernel-level multicast forwarding function.
* The packet is returned (relatively) intact; if
* ip6_mforward() returns a non-zero value, the packet
* must be discarded, else it may be accepted below.
*/
if (ip6_mrouter != NULL) {
int error;
SOFTNET_LOCK();
error = ip6_mforward(ip6, rcvif, m);
SOFTNET_UNLOCK();
if (error != 0) {
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
IP6_STATINC(IP6_STAT_CANTFORWARD);
goto bad;
}
}
if (!ours) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
goto bad_unref;
}
} else if (!ours) {
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
ip6_forward(m, srcrt, rcvif);
return;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Malicious party may be able to use IPv4 mapped addr to confuse
* tcp/udp stack and bypass security checks (act as if it was from
* 127.0.0.1 by using IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* For SIIT end node behavior, you may want to disable the check.
* However, you will become vulnerable to attacks using IPv4 mapped
* source.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
in6_ifstat_inc(rcvif, ifs6_in_addrerr);
goto bad_unref;
}
#ifdef IFA_STATS
if (deliverifp != NULL) {
struct in6_ifaddr *ia6;
int s = pserialize_read_enter();
ia6 = in6_ifawithifp(deliverifp, &ip6->ip6_dst);
if (ia6)
ia6->ia_ifa.ifa_data.ifad_inbytes += m->m_pkthdr.len;
pserialize_read_exit(s);
}
#endif
IP6_STATINC(IP6_STAT_DELIVERED);
in6_ifstat_inc(deliverifp, ifs6_in_deliver);
nest = 0;
if (rt != NULL) {
rtcache_unref(rt, ro);
rt = NULL;
}
rtcache_percpu_putref(ip6_forward_rt_percpu);
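/*
* At most one routing header and one fragment header are accepted
* per packet; duplicates are dropped in the loop below.
*/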
rh_present = 0;
frg_present = 0;
while (nxt != IPPROTO_DONE) {
if (ip6_hdrnestlimit && (++nest > ip6_hdrnestlimit)) {
IP6_STATINC(IP6_STAT_TOOMANYHDR);
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
goto bad;
}
M_VERIFY_PACKET(m);
/*
* protection against faulty packet - there should be
* more sanity checks in header chain processing.
*/
if (m->m_pkthdr.len < off) {
IP6_STATINC(IP6_STAT_TOOSHORT);
in6_ifstat_inc(rcvif, ifs6_in_truncated);
goto bad;
}
if (nxt == IPPROTO_ROUTING) {
if (rh_present++) {
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
IP6_STATINC(IP6_STAT_BADOPTIONS);
goto bad;
}
} else if (nxt == IPPROTO_FRAGMENT) {
if (frg_present++) {
in6_ifstat_inc(rcvif, ifs6_in_hdrerr);
IP6_STATINC(IP6_STAT_BADOPTIONS);
goto bad;
}
}
#ifdef IPSEC
if (ipsec_used) {
/*
* Enforce IPsec policy checking if we are seeing the last
* header. Note that we do not visit this for protocols
* with pcb layer code - like udp/tcp/raw ip.
*/
if ((inet6sw[ip6_protox[nxt]].pr_flags
& PR_LASTHDR) != 0) {
int error;
error = ipsec_ip_input_checkpolicy(m, false);
if (error) {
IP6_STATINC(IP6_STAT_IPSECDROP_IN);
goto bad;
}
}
}
#endif
nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
}
return;
bad_unref:
rtcache_unref(rt, ro);
rtcache_percpu_putref(ip6_forward_rt_percpu);
bad:
m_freem(m);
return;
}
static bool
ip6_badaddr(struct ip6_hdr *ip6)
{
/* Check against address spoofing/corruption. */
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_src) ||
IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_dst)) {
return true;
}
/*
* The following check is not documented in specs. A malicious
* party may be able to use IPv4 mapped addr to confuse tcp/udp stack
* and bypass security checks (act as if it was from 127.0.0.1 by using
* IPv6 src ::ffff:127.0.0.1). Be cautious.
*
* This check chokes if we are in an SIIT cloud. As none of the BSDs
* support IPv4-less kernel compilation, we cannot support an SIIT
* environment at all. So it makes more sense for us to reject any
* malicious packets in a non-SIIT environment than to attempt
* partial support for an SIIT environment.
*/
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
return true;
}
/*
* Reject packets with IPv4-compatible IPv6 addresses (RFC4291).
*/
if (IN6_IS_ADDR_V4COMPAT(&ip6->ip6_src) ||
IN6_IS_ADDR_V4COMPAT(&ip6->ip6_dst)) {
return true;
}
return false;
}
/*
* Set/grab the in6_ifaddr corresponding to the IPv6 destination address.
*/
static struct m_tag *
ip6_setdstifaddr(struct mbuf *m, const struct in6_ifaddr *ia)
{
struct m_tag *mtag;
struct ip6aux *ip6a;
mtag = ip6_addaux(m);
if (mtag == NULL)
return NULL;
ip6a = (struct ip6aux *)(mtag + 1);
if (in6_setscope(&ip6a->ip6a_src, ia->ia_ifp, &ip6a->ip6a_scope_id)) {
IP6_STATINC(IP6_STAT_BADSCOPE);
return NULL;
}
ip6a->ip6a_src = ia->ia_addr.sin6_addr;
ip6a->ip6a_flags = ia->ia6_flags;
return mtag;
}
const struct ip6aux *
ip6_getdstifaddr(struct mbuf *m)
{
struct m_tag *mtag;
mtag = ip6_findaux(m);
if (mtag != NULL)
return (struct ip6aux *)(mtag + 1);
else
return NULL;
}
/*
* Hop-by-Hop options header processing. If a valid jumbo payload option is
* included, the real payload length will be stored in plenp.
*
* rtalertp - XXX: should be stored in a smarter way
*/
int
ip6_hopopts_input(u_int32_t *plenp, u_int32_t *rtalertp,
struct mbuf **mp, int *offp)
{
struct mbuf *m = *mp;
int off = *offp, hbhlen;
struct ip6_hbh *hbh;
/* validation of the length of the header */
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m,
sizeof(struct ip6_hdr), sizeof(struct ip6_hbh));
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return -1;
}
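/* ip6h_len counts 8-octet units, not including the first 8 octets. */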
hbhlen = (hbh->ip6h_len + 1) << 3;
IP6_EXTHDR_GET(hbh, struct ip6_hbh *, m, sizeof(struct ip6_hdr),
hbhlen);
if (hbh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return -1;
}
KASSERT(ACCESSIBLE_POINTER(hbh, struct ip6_hdr));
off += hbhlen;
hbhlen -= sizeof(struct ip6_hbh);
if (ip6_process_hopopts(m, (u_int8_t *)hbh + sizeof(struct ip6_hbh),
hbhlen, rtalertp, plenp) < 0)
return -1;
*offp = off;
*mp = m;
return 0;
}
/*
* Search header for all Hop-by-hop options and process each option.
* This function is separate from ip6_hopopts_input() in order to
* handle the case where the sending node itself processes its hop-by-hop
* options header. In such a case, the function is called from ip6_output().
*
* The function assumes that the hbh header is located right after the IPv6
* header (RFC2460 p7), that opthead is a pointer into the data content of m,
* and that the region from opthead to opthead + hbhlen is in contiguous
* memory.
*/
static int
ip6_process_hopopts(struct mbuf *m, u_int8_t *opthead, int hbhlen,
u_int32_t *rtalertp, u_int32_t *plenp)
{
struct ip6_hdr *ip6;
int optlen = 0;
u_int8_t *opt = opthead;
u_int16_t rtalert_val;
u_int32_t jumboplen;
const int erroff = sizeof(struct ip6_hdr) + sizeof(struct ip6_hbh);
for (; hbhlen > 0; hbhlen -= optlen, opt += optlen) {
switch (*opt) {
case IP6OPT_PAD1:
optlen = 1;
break;
case IP6OPT_PADN:
if (hbhlen < IP6OPT_MINLEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
optlen = *(opt + 1) + 2;
break;
case IP6OPT_RTALERT:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_RTALERT_LEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
if (*(opt + 1) != IP6OPT_RTALERT_LEN - 2) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_RTALERT_LEN;
memcpy((void *)&rtalert_val, (void *)(opt + 2), 2);
*rtalertp = ntohs(rtalert_val);
break;
case IP6OPT_JUMBO:
/* XXX may need check for alignment */
if (hbhlen < IP6OPT_JUMBO_LEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
if (*(opt + 1) != IP6OPT_JUMBO_LEN - 2) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 1 - opthead);
return (-1);
}
optlen = IP6OPT_JUMBO_LEN;
/*
* IPv6 packets that have a non-zero payload length
* must not contain a jumbo payload option.
*/
ip6 = mtod(m, struct ip6_hdr *);
if (ip6->ip6_plen) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt - opthead);
return (-1);
}
/*
* We may see jumboplen in an unaligned location, so
* we need to use memcpy().
*/
memcpy(&jumboplen, opt + 2, sizeof(jumboplen));
jumboplen = (u_int32_t)htonl(jumboplen);
#if 1
/*
* if there are multiple jumbo payload options,
* *plenp will be non-zero and the packet will be
* rejected.
* the behavior may need some debate in ipngwg -
* multiple options do not make sense; however,
* there's no explicit mention in the specification.
*/
if (*plenp != 0) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
#endif
/*
* jumbo payload length must be larger than 65535.
*/
if (jumboplen <= IPV6_MAXPACKET) {
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_HEADER,
erroff + opt + 2 - opthead);
return (-1);
}
*plenp = jumboplen;
break;
default: /* unknown option */
if (hbhlen < IP6OPT_MINLEN) {
IP6_STATINC(IP6_STAT_TOOSMALL);
goto bad;
}
optlen = ip6_unknown_opt(opt, m,
erroff + opt - opthead);
if (optlen == -1)
return (-1);
optlen += 2;
break;
}
}
return (0);
bad:
m_freem(m);
return (-1);
}
/*
* Unknown option processing.
* The third argument `off' is the offset from the IPv6 header to the option,
* which is necessary to return an ICMPv6 error when the IPv6 header and the
* option header are not contiguous.
*/
int
ip6_unknown_opt(u_int8_t *optp, struct mbuf *m, int off)
{
struct ip6_hdr *ip6;
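/*
* The two high-order bits of the option type encode the action to take
* for an unrecognized option (RFC 2460, section 4.2).
*/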
switch (IP6OPT_TYPE(*optp)) {
case IP6OPT_TYPE_SKIP: /* ignore the option */
return ((int)*(optp + 1));
case IP6OPT_TYPE_DISCARD: /* silently discard */
m_freem(m);
return (-1);
case IP6OPT_TYPE_FORCEICMP: /* send ICMP even if multicasted */
IP6_STATINC(IP6_STAT_BADOPTIONS);
icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_OPTION, off);
return (-1);
case IP6OPT_TYPE_ICMP: /* send ICMP if not multicasted */
IP6_STATINC(IP6_STAT_BADOPTIONS);
ip6 = mtod(m, struct ip6_hdr *);
if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
(m->m_flags & (M_BCAST|M_MCAST)))
m_freem(m);
else
icmp6_error(m, ICMP6_PARAM_PROB,
ICMP6_PARAMPROB_OPTION, off);
return (-1);
}
m_freem(m); /* XXX: NOTREACHED */
return (-1);
}
void
ip6_savecontrol(struct inpcb *inp, struct mbuf **mp,
struct ip6_hdr *ip6, struct mbuf *m)
{
struct socket *so = inp->inp_socket;
#ifdef RFC2292
#define IS2292(x, y) ((inp->inp_flags & IN6P_RFC2292) ? (x) : (y))
#else
#define IS2292(x, y) (y)
#endif
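/*
* IS2292 selects the old RFC 2292 ancillary data type when the socket
* requested RFC 2292 semantics (IN6P_RFC2292); otherwise the RFC 3542
* type is used.
*/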
KASSERT(m->m_flags & M_PKTHDR);
if (SOOPT_TIMESTAMP(so->so_options))
mp = sbsavetimestamp(so->so_options, mp);
/* Some OSes call this logic with an IPv4 packet, for SO_TIMESTAMP */
if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION)
return;
/* RFC 2292 sec. 5 */
if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
struct in6_pktinfo pi6;
memcpy(&pi6.ipi6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
in6_clearscope(&pi6.ipi6_addr); /* XXX */
pi6.ipi6_ifindex = m->m_pkthdr.rcvif_index;
*mp = sbcreatecontrol(&pi6, sizeof(pi6),
IS2292(IPV6_2292PKTINFO, IPV6_PKTINFO), IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if (inp->inp_flags & IN6P_HOPLIMIT) {
int hlim = ip6->ip6_hlim & 0xff;
*mp = sbcreatecontrol(&hlim, sizeof(hlim),
IS2292(IPV6_2292HOPLIMIT, IPV6_HOPLIMIT), IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
if ((inp->inp_flags & IN6P_TCLASS) != 0) {
u_int32_t flowinfo;
int tclass;
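/*
* The flow word is version(4) | traffic class(8) | flow label(20);
* mask off the version and shift out the 20-bit flow label to
* extract the traffic class.
*/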
flowinfo = (u_int32_t)ntohl(ip6->ip6_flow & IPV6_FLOWINFO_MASK);
flowinfo >>= 20;
tclass = flowinfo & 0xff;
*mp = sbcreatecontrol(&tclass, sizeof(tclass),
IPV6_TCLASS, IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
}
/*
* IPV6_HOPOPTS socket option. Recall that we required super-user
* privilege for the option (see ip6_ctloutput), but it might be too
* strict, since there might be some hop-by-hop options which can be
* returned to normal user.
* See also RFC3542 section 8 (or RFC2292 section 6).
*/
if ((inp->inp_flags & IN6P_HOPOPTS) != 0) {
/*
* Check if a hop-by-hop options header is contained in the
* received packet, and if so, store the options as ancillary
* data. Note that a hop-by-hop options header must be
* just after the IPv6 header, a fact that is assured by
* the IPv6 input processing.
*/
struct ip6_hdr *xip6 = mtod(m, struct ip6_hdr *);
if (xip6->ip6_nxt == IPPROTO_HOPOPTS) {
struct ip6_hbh *hbh;
int hbhlen;
struct mbuf *ext;
ext = ip6_pullexthdr(m, sizeof(struct ip6_hdr),
xip6->ip6_nxt);
if (ext == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
hbh = mtod(ext, struct ip6_hbh *);
hbhlen = (hbh->ip6h_len + 1) << 3;
if (hbhlen != ext->m_len) {
m_freem(ext);
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
/*
* XXX: We copy the whole header even if a jumbo
* payload option is included, although RFC 2292
* requires that option to be removed before returning.
* Note: this constraint was removed in RFC 3542.
*/
*mp = sbcreatecontrol(hbh, hbhlen,
IS2292(IPV6_2292HOPOPTS, IPV6_HOPOPTS),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
m_freem(ext);
}
}
/* IPV6_DSTOPTS and IPV6_RTHDR socket options */
if (inp->inp_flags & (IN6P_DSTOPTS | IN6P_RTHDR)) {
struct ip6_hdr *xip6 = mtod(m, struct ip6_hdr *);
int nxt = xip6->ip6_nxt, off = sizeof(struct ip6_hdr);
/*
* Search for destination options headers or routing
* header(s) through the header chain, and store each
* header as ancillary data.
* Note that the order of the headers is preserved in
* the chain of ancillary data.
*/
for (;;) { /* is explicit loop prevention necessary? */
struct ip6_ext *ip6e = NULL;
int elen;
struct mbuf *ext = NULL;
/*
* if it is not an extension header, don't try to
* pull it from the chain.
*/
switch (nxt) {
case IPPROTO_DSTOPTS:
case IPPROTO_ROUTING:
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
goto loopend;
}
ext = ip6_pullexthdr(m, off, nxt);
if (ext == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
ip6e = mtod(ext, struct ip6_ext *);
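/*
* AH counts its length in 32-bit words minus two; the other
* extension headers count 8-octet units not including the
* first 8 octets.
*/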
if (nxt == IPPROTO_AH)
elen = (ip6e->ip6e_len + 2) << 2;
else
elen = (ip6e->ip6e_len + 1) << 3;
if (elen != ext->m_len) {
m_freem(ext);
IP6_STATINC(IP6_STAT_TOOSHORT);
return;
}
KASSERT(ACCESSIBLE_POINTER(ip6e, struct ip6_hdr));
switch (nxt) {
case IPPROTO_DSTOPTS:
if (!(inp->inp_flags & IN6P_DSTOPTS))
break;
*mp = sbcreatecontrol(ip6e, elen,
IS2292(IPV6_2292DSTOPTS, IPV6_DSTOPTS),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_ROUTING:
if (!(inp->inp_flags & IN6P_RTHDR))
break;
*mp = sbcreatecontrol(ip6e, elen,
IS2292(IPV6_2292RTHDR, IPV6_RTHDR),
IPPROTO_IPV6);
if (*mp)
mp = &(*mp)->m_next;
break;
case IPPROTO_HOPOPTS:
case IPPROTO_AH: /* is it possible? */
break;
default:
/*
* Other cases have been filtered in the switch above.
* None should reach this case; the code is here just
* in case (e.g. nxt was overwritten).
*/
m_freem(ext);
goto loopend;
}
/* proceed with the next header. */
off += elen;
nxt = ip6e->ip6e_nxt;
ip6e = NULL;
m_freem(ext);
ext = NULL;
}
loopend:
;
}
}
#undef IS2292
void
ip6_notify_pmtu(struct inpcb *inp, const struct sockaddr_in6 *dst,
uint32_t *mtu)
{
struct socket *so;
struct mbuf *m_mtu;
struct ip6_mtuinfo mtuctl;
so = inp->inp_socket;
if (mtu == NULL)
return;
KASSERT(so != NULL);
memset(&mtuctl, 0, sizeof(mtuctl)); /* zero-clear for safety */
mtuctl.ip6m_mtu = *mtu;
mtuctl.ip6m_addr = *dst;
if (sa6_recoverscope(&mtuctl.ip6m_addr))
return;
if ((m_mtu = sbcreatecontrol(&mtuctl, sizeof(mtuctl),
IPV6_PATHMTU, IPPROTO_IPV6)) == NULL)
return;
if (sbappendaddr(&so->so_rcv, (const struct sockaddr *)dst, NULL, m_mtu)
== 0) {
soroverflow(so);
m_freem(m_mtu);
} else
sorwakeup(so);
return;
}
/*
* Pull a single extension header from the mbuf chain. Returns a single
* mbuf that contains the result, or NULL on error.
*/
static struct mbuf *
ip6_pullexthdr(struct mbuf *m, size_t off, int nxt)
{
struct ip6_ext ip6e;
size_t elen;
struct mbuf *n;
if (off + sizeof(ip6e) > m->m_pkthdr.len)
return NULL;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxt == IPPROTO_AH)
elen = (ip6e.ip6e_len + 2) << 2;
else
elen = (ip6e.ip6e_len + 1) << 3;
if (off + elen > m->m_pkthdr.len)
return NULL;
MGET(n, M_DONTWAIT, MT_DATA);
if (n && elen >= MLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_free(n);
n = NULL;
}
}
if (!n)
return NULL;
n->m_len = 0;
if (elen >= M_TRAILINGSPACE(n)) {
m_free(n);
return NULL;
}
m_copydata(m, off, elen, mtod(n, void *));
n->m_len = elen;
return n;
}
/*
* Get the offset of the header that immediately precedes the header
* currently being processed.
*/
int
ip6_get_prevhdr(struct mbuf *m, int off)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
if (off == sizeof(struct ip6_hdr)) {
return offsetof(struct ip6_hdr, ip6_nxt);
} else if (off < sizeof(struct ip6_hdr)) {
panic("%s: off < sizeof(struct ip6_hdr)", __func__);
} else {
int len, nlen, nxt;
struct ip6_ext ip6e;
nxt = ip6->ip6_nxt;
len = sizeof(struct ip6_hdr);
nlen = 0;
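/*
* Walk the extension header chain; when the loop terminates,
* len == off and nlen is the length of the header immediately
* preceding off, so len - nlen is that header's offset.
*/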
while (len < off) {
m_copydata(m, len, sizeof(ip6e), &ip6e);
switch (nxt) {
case IPPROTO_FRAGMENT:
nlen = sizeof(struct ip6_frag);
break;
case IPPROTO_AH:
nlen = (ip6e.ip6e_len + 2) << 2;
break;
default:
nlen = (ip6e.ip6e_len + 1) << 3;
break;
}
len += nlen;
nxt = ip6e.ip6e_nxt;
}
return (len - nlen);
}
}
/*
* Get the offset of the next header. m will be retained.
*/
int
ip6_nexthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
struct ip6_hdr ip6;
struct ip6_ext ip6e;
struct ip6_frag fh;
/* just in case */
if (m == NULL)
panic("%s: m == NULL", __func__); if ((m->m_flags & M_PKTHDR) == 0 || m->m_pkthdr.len < off)
return -1;
switch (proto) {
case IPPROTO_IPV6:
/* do not chase beyond intermediate IPv6 headers */
if (off != 0)
return -1;
if (m->m_pkthdr.len < off + sizeof(ip6))
return -1;
m_copydata(m, off, sizeof(ip6), (void *)&ip6);
if (nxtp) *nxtp = ip6.ip6_nxt;
off += sizeof(ip6);
return off;
case IPPROTO_FRAGMENT:
/*
* Terminate parsing if it is not the first fragment;
* it does not make sense to parse through it.
*/
if (m->m_pkthdr.len < off + sizeof(fh))
return -1;
m_copydata(m, off, sizeof(fh), (void *)&fh);
if ((fh.ip6f_offlg & IP6F_OFF_MASK) != 0)
return -1;
if (nxtp) *nxtp = fh.ip6f_nxt;
off += sizeof(struct ip6_frag);
return off;
case IPPROTO_AH:
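/* The AH payload length field counts 32-bit words minus two. */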
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxtp) *nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 2) << 2;
if (m->m_pkthdr.len < off)
return -1;
return off;
case IPPROTO_HOPOPTS:
case IPPROTO_ROUTING:
case IPPROTO_DSTOPTS:
if (m->m_pkthdr.len < off + sizeof(ip6e))
return -1;
m_copydata(m, off, sizeof(ip6e), (void *)&ip6e);
if (nxtp) *nxtp = ip6e.ip6e_nxt;
off += (ip6e.ip6e_len + 1) << 3;
if (m->m_pkthdr.len < off)
return -1;
return off;
case IPPROTO_NONE:
case IPPROTO_ESP:
case IPPROTO_IPCOMP:
/* give up */
return -1;
default:
return -1;
}
}
/*
* Get the offset of the last header in the chain. m will be left untouched.
*/
int
ip6_lasthdr(struct mbuf *m, int off, int proto, int *nxtp)
{
int newoff;
int nxt;
if (!nxtp) {
nxt = -1;
nxtp = &nxt;
}
for (;;) {
newoff = ip6_nexthdr(m, off, proto, nxtp);
if (newoff < 0)
return off;
else if (newoff < off)
return -1; /* invalid */
else if (newoff == off)
return newoff;
off = newoff;
proto = *nxtp;
}
}
static struct m_tag *
ip6_addaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
if (!mtag) {
mtag = m_tag_get(PACKET_TAG_INET6, sizeof(struct ip6aux),
M_NOWAIT);
if (mtag) {
m_tag_prepend(m, mtag);
memset(mtag + 1, 0, sizeof(struct ip6aux));
}
}
return mtag;
}
static struct m_tag *
ip6_findaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
return mtag;
}
static void
ip6_delaux(struct mbuf *m)
{
struct m_tag *mtag;
mtag = m_tag_find(m, PACKET_TAG_INET6);
if (mtag)
m_tag_delete(m, mtag);
}
/*
* System control for IP6
*/
const u_char inet6ctlerrmap[PRC_NCMDS] = {
0, 0, 0, 0,
0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
EHOSTUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
EMSGSIZE, EHOSTUNREACH, 0, 0,
0, 0, 0, 0,
ENOPROTOOPT
};
extern int sysctl_net_inet6_addrctlpolicy(SYSCTLFN_ARGS);
static int
sysctl_net_inet6_ip6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(ip6stat_percpu, IP6_NSTATS));
}
static void
sysctl_net_inet6_ip6_setup(struct sysctllog **clog)
{
const struct sysctlnode *ip6_node;
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6",
SYSCTL_DESCR("PF_INET6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, &ip6_node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ip6",
SYSCTL_DESCR("IPv6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "forwarding",
SYSCTL_DESCR("Enable forwarding of INET6 datagrams"),
NULL, 0, &ip6_forwarding, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_FORWARDING, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "redirect",
SYSCTL_DESCR("Enable sending of ICMPv6 redirect messages"),
NULL, 0, &ip6_sendredirects, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_SENDREDIRECTS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hlim",
SYSCTL_DESCR("Hop limit for an INET6 datagram"),
NULL, 0, &ip6_defhlim, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DEFHLIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfragpackets",
SYSCTL_DESCR("Maximum number of fragments to buffer "
"for reassembly"),
NULL, 0, &ip6_maxfragpackets, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_MAXFRAGPACKETS, CTL_EOL);
pktq_sysctl_setup(ip6_pktq, clog, ip6_node, IPV6CTL_IFQ);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "keepfaith",
SYSCTL_DESCR("Activate faith interface"),
NULL, 0, &ip6_keepfaith, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_KEEPFAITH, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "log_interval",
SYSCTL_DESCR("Minimum interval between logging "
"unroutable packets"),
NULL, 0, &ip6_log_interval, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOG_INTERVAL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "hdrnestlimit",
SYSCTL_DESCR("Maximum number of nested IPv6 headers"),
NULL, 0, &ip6_hdrnestlimit, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_HDRNESTLIMIT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "dad_count",
SYSCTL_DESCR("Number of Duplicate Address Detection "
"probes to send"),
NULL, 0, &ip6_dad_count, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DAD_COUNT, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "auto_flowlabel",
SYSCTL_DESCR("Assign random IPv6 flow labels"),
NULL, 0, &ip6_auto_flowlabel, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_AUTO_FLOWLABEL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "defmcasthlim",
SYSCTL_DESCR("Default multicast hop limit"),
NULL, 0, &ip6_defmcasthlim, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_DEFMCASTHLIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "kame_version",
SYSCTL_DESCR("KAME Version"),
NULL, 0, __UNCONST(__KAME_VERSION), 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_KAME_VERSION, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "use_deprecated",
SYSCTL_DESCR("Allow use of deprecated addresses as "
"source addresses"),
NULL, 0, &ip6_use_deprecated, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_USE_DEPRECATED, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT
#ifndef INET6_BINDV6ONLY
|CTLFLAG_READWRITE,
#endif
CTLTYPE_INT, "v6only",
SYSCTL_DESCR("Disallow PF_INET6 sockets from connecting "
"to PF_INET sockets"),
NULL, 0, &ip6_v6only, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_V6ONLY, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "anonportmin",
SYSCTL_DESCR("Lowest ephemeral port number to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_anonportmin, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ANONPORTMIN, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "anonportmax",
SYSCTL_DESCR("Highest ephemeral port number to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_anonportmax, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ANONPORTMAX, CTL_EOL);
#ifndef IPNOPRIVPORTS
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "lowportmin",
SYSCTL_DESCR("Lowest privileged ephemeral port number "
"to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_lowportmin, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOWPORTMIN, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "lowportmax",
SYSCTL_DESCR("Highest privileged ephemeral port number "
"to assign"),
sysctl_net_inet_ip_ports, 0, &ip6_lowportmax, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_LOWPORTMAX, CTL_EOL);
#endif /* IPNOPRIVPORTS */
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "auto_linklocal",
SYSCTL_DESCR("Default value of per-interface flag for "
"adding an IPv6 link-local address to "
"interfaces when attached"),
NULL, 0, &ip6_auto_linklocal, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_AUTO_LINKLOCAL, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_STRUCT, "addctlpolicy",
SYSCTL_DESCR("Return the current address control"
" policy"),
sysctl_net_inet6_addrctlpolicy, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_ADDRCTLPOLICY, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "prefer_tempaddr",
SYSCTL_DESCR("Prefer temporary address as source "
"address"),
NULL, 0, &ip6_prefer_tempaddr, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfrags",
SYSCTL_DESCR("Maximum fragments in reassembly queue"),
NULL, 0, &ip6_maxfrags, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_MAXFRAGS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("IPv6 statistics"),
sysctl_net_inet6_ip6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_STATS, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "use_defaultzone",
SYSCTL_DESCR("Whether to use the default scope zones"),
NULL, 0, &ip6_use_defzone, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
IPV6CTL_USE_DEFAULTZONE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "mcast_pmtu",
SYSCTL_DESCR("Enable pMTU discovery for multicast packet"),
NULL, 0, &ip6_mcast_pmtu, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
/* anonportalgo RFC6056 subtree */
const struct sysctlnode *portalgo_node;
sysctl_createv(clog, 0, NULL, &portalgo_node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "anonportalgo",
SYSCTL_DESCR("Anonymous port algorithm selection (RFC 6056)"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6, CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "available",
SYSCTL_DESCR("available algorithms"),
sysctl_portalgo_available, 0, NULL, PORTALGO_MAXLEN,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRING, "selected",
SYSCTL_DESCR("selected algorithm"),
sysctl_portalgo_selected6, 0, NULL, PORTALGO_MAXLEN,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &portalgo_node, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_STRUCT, "reserve",
SYSCTL_DESCR("bitmap of reserved ports"),
sysctl_portalgo_reserve6, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "neighborgcthresh",
SYSCTL_DESCR("Maximum number of entries in neighbor"
" cache"),
NULL, 1, &ip6_neighborgcthresh, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxdynroutes",
SYSCTL_DESCR("Maximum number of routes created via"
" redirect"),
NULL, 1, &ip6_maxdynroutes, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "param_rt_msg",
SYSCTL_DESCR("How to send parameter changing"
" routing message"),
NULL, 0, &ip6_param_rt_msg, 0,
CTL_NET, PF_INET6, IPPROTO_IPV6,
CTL_CREATE, CTL_EOL);
}
void
ip6_statinc(u_int stat)
{
KASSERT(stat < IP6_NSTATS);
IP6_STATINC(stat);
}
/* $NetBSD: tmpfs.h,v 1.56 2020/05/17 19:39:15 ad Exp $ */
/*
* Copyright (c) 2005, 2006, 2007, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _FS_TMPFS_TMPFS_H_
#define _FS_TMPFS_TMPFS_H_
#if !defined(_KERNEL) && !defined(_KMEMUSER)
#error "not supposed to be exposed to userland"
#endif
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/vnode.h>
/*
* Internal representation of a tmpfs directory entry.
*
* All fields are protected by vnode lock.
*/
typedef struct tmpfs_dirent {
TAILQ_ENTRY(tmpfs_dirent) td_entries;
/* Pointer to the inode this entry refers to. */
struct tmpfs_node * td_node;
/* Sequence number, see tmpfs_dir_getseq(). */
uint32_t td_seq;
/* Name and its length. */
char * td_name;
uint16_t td_namelen;
} tmpfs_dirent_t;
TAILQ_HEAD(tmpfs_dir, tmpfs_dirent);
/*
* Internal representation of a tmpfs file system node -- inode.
*
* This structure is split into two parts: one holds attributes common
* to all file types and the other holds data that is only applicable to
* a particular type.
*
* All fields are protected by vnode lock. The vnode association itself
* is protected by vcache.
*/
typedef struct tmpfs_node {
LIST_ENTRY(tmpfs_node) tn_entries;
/*
* Each inode has a corresponding vnode. It is a bi-directional
* association. Whenever vnode is allocated, its v_data field is
* set to the inode it reference, and tmpfs_node_t::tn_vnode is
* set to point to the said vnode.
*
* Further attempts to allocate a vnode for this same node will
* result in returning a new reference to the value stored in
* tn_vnode. It may be NULL when the node is unused (that is,
* no vnode has been allocated or it has been reclaimed).
*/
vnode_t * tn_vnode;
/* Prevent node from being reclaimed. */
uint32_t tn_holdcount;
/* Directory entry. Only a hint, since a hard link can have multiple entries. */
tmpfs_dirent_t * tn_dirent_hint;
/* The inode type: VBLK, VCHR, VDIR, VFIFO, VLNK, VREG or VSOCK. */
enum vtype tn_type;
/* Inode identifier and generation number. */
ino_t tn_id;
uint32_t tn_gen;
/* The inode size. */
off_t tn_size;
/* Generic node attributes. */
uid_t tn_uid;
gid_t tn_gid;
mode_t tn_mode;
int tn_flags;
nlink_t tn_links;
unsigned tn_tflags;
struct timespec tn_atime;
struct timespec tn_mtime;
struct timespec tn_ctime;
struct timespec tn_birthtime;
kmutex_t tn_timelock;
/* Head of byte-level lock list (used by tmpfs_advlock). */
struct lockf * tn_lockf;
union {
/* Type case: VBLK or VCHR. */
struct {
dev_t tn_rdev;
} tn_dev;
/* Type case: VDIR. */
struct {
/* Parent directory (root inode points to itself). */
struct tmpfs_node * tn_parent;
/* List of directory entries. */
struct tmpfs_dir tn_dir;
/* Last given sequence number and its arena. */
uint32_t tn_next_seq;
void * tn_seq_arena;
/*
* Pointer to the last directory entry returned
* by the readdir(3) operation.
*/
struct tmpfs_dirent * tn_readdir_lastp;
} tn_dir;
/* Type case: VLNK. */
struct tn_lnk {
/* The link's target. */
char * tn_link;
} tn_lnk;
/* Type case: VREG. */
struct tn_reg {
/* Underlying UVM object to store contents. */
struct uvm_object * tn_aobj;
size_t tn_aobj_pages;
} tn_reg;
} tn_spec;
} tmpfs_node_t;
#if defined(_KERNEL)
VFS_PROTOS(tmpfs);
LIST_HEAD(tmpfs_node_list, tmpfs_node);
#define TMPFS_MAXNAMLEN 255
/* Validate maximum td_namelen length. */
CTASSERT(TMPFS_MAXNAMLEN < UINT16_MAX);
/*
* Reserved values for the virtual entries (the first must be 0) and EOF.
* The start/end of the incremental range, see tmpfs_dir_getseq().
*/
#define TMPFS_DIRSEQ_DOT 0
#define TMPFS_DIRSEQ_DOTDOT 1
#define TMPFS_DIRSEQ_EOF 2
#define TMPFS_DIRSEQ_START 3 /* inclusive */
#define TMPFS_DIRSEQ_END (1U << 30) /* exclusive */
/* Mark to indicate that the number is not set. */
#define TMPFS_DIRSEQ_NONE (1U << 31)
/* Flags: time update requests. */
#define TMPFS_UPDATE_ATIME 0x01
#define TMPFS_UPDATE_MTIME 0x02
#define TMPFS_UPDATE_CTIME 0x04
/*
* Bits indicating whiteout use for the directory.
* We abuse tmpfs_node_t::tn_gen for that.
*/
#define TMPFS_WHITEOUT_BIT (1U << 31)
#define TMPFS_NODE_GEN_MASK (TMPFS_WHITEOUT_BIT - 1)
#define TMPFS_NODE_GEN(node) \
((node)->tn_gen & TMPFS_NODE_GEN_MASK)
/* White-out inode indicator. */
#define TMPFS_NODE_WHITEOUT ((tmpfs_node_t *)-1)
/*
* Bit indicating this node must be reclaimed when holdcount reaches zero.
* Ored into tmpfs_node_t::tn_holdcount.
*/
#define TMPFS_NODE_RECLAIMED (1U << 30)
/*
* Internal representation of a tmpfs mount point.
*/
typedef struct tmpfs_mount {
/* Limit and number of bytes in use by the file system. */
uint64_t tm_mem_limit;
uint64_t tm_bytes_used;
kmutex_t tm_acc_lock;
/* Pointer to the root inode. */
tmpfs_node_t * tm_root;
/* Maximum number of possible nodes for this file system. */
unsigned int tm_nodes_max;
/* Number of nodes currently allocated. */
unsigned int tm_nodes_cnt;
/* List of inodes and the lock protecting it. */
kmutex_t tm_lock;
struct tmpfs_node_list tm_nodes;
} tmpfs_mount_t;
/*
* This structure maps a file identifier to a tmpfs node. Used by the
* NFS code.
*/
typedef struct tmpfs_fid {
uint16_t tf_len;
uint16_t tf_pad;
uint32_t tf_gen;
ino_t tf_id;
} tmpfs_fid_t;
/*
* Prototypes for tmpfs_subr.c.
*/
void tmpfs_free_node(tmpfs_mount_t *, tmpfs_node_t *);
int tmpfs_construct_node(vnode_t *, vnode_t **, struct vattr *,
struct componentname *, char *);
int tmpfs_alloc_dirent(tmpfs_mount_t *, const char *, uint16_t,
tmpfs_dirent_t **);
void tmpfs_free_dirent(tmpfs_mount_t *, tmpfs_dirent_t *);
void tmpfs_dir_attach(tmpfs_node_t *, tmpfs_dirent_t *, tmpfs_node_t *);
void tmpfs_dir_detach(tmpfs_node_t *, tmpfs_dirent_t *);
tmpfs_dirent_t *tmpfs_dir_lookup(tmpfs_node_t *, struct componentname *);
tmpfs_dirent_t *tmpfs_dir_cached(tmpfs_node_t *);
uint32_t tmpfs_dir_getseq(tmpfs_node_t *, tmpfs_dirent_t *);
tmpfs_dirent_t *tmpfs_dir_lookupbyseq(tmpfs_node_t *, off_t);
int tmpfs_dir_getdents(tmpfs_node_t *, struct uio *, off_t *);
int tmpfs_reg_resize(vnode_t *, off_t);
int tmpfs_chflags(vnode_t *, int, kauth_cred_t, lwp_t *);
int tmpfs_chmod(vnode_t *, mode_t, kauth_cred_t, lwp_t *);
int tmpfs_chown(vnode_t *, uid_t, gid_t, kauth_cred_t, lwp_t *);
int tmpfs_chsize(vnode_t *, u_quad_t, kauth_cred_t, lwp_t *);
int tmpfs_chtimes(vnode_t *, const struct timespec *,
const struct timespec *, const struct timespec *, int,
kauth_cred_t, lwp_t *);
void tmpfs_update(vnode_t *, unsigned);
void tmpfs_update_locked(vnode_t *, unsigned);
void tmpfs_update_lazily(vnode_t *, unsigned);
/*
* Prototypes for tmpfs_mem.c.
*/
void tmpfs_mntmem_init(tmpfs_mount_t *, uint64_t);
void tmpfs_mntmem_destroy(tmpfs_mount_t *);
int tmpfs_mntmem_set(tmpfs_mount_t *, uint64_t);
size_t tmpfs_mem_info(bool);
uint64_t tmpfs_bytes_max(tmpfs_mount_t *);
size_t tmpfs_pages_avail(tmpfs_mount_t *);
bool tmpfs_mem_incr(tmpfs_mount_t *, size_t);
void tmpfs_mem_decr(tmpfs_mount_t *, size_t);
tmpfs_dirent_t *tmpfs_dirent_get(tmpfs_mount_t *);
void tmpfs_dirent_put(tmpfs_mount_t *, tmpfs_dirent_t *);
tmpfs_node_t * tmpfs_node_get(tmpfs_mount_t *);
void tmpfs_node_put(tmpfs_mount_t *, tmpfs_node_t *);
char * tmpfs_strname_alloc(tmpfs_mount_t *, size_t);
void tmpfs_strname_free(tmpfs_mount_t *, char *, size_t);
bool tmpfs_strname_neqlen(struct componentname *, struct componentname *);
/*
* Ensures that the node pointed to by 'node' is a directory and that its
* contents are consistent with respect to directories.
*/
#define TMPFS_VALIDATE_DIR(node) \
KASSERT((node)->tn_vnode == NULL || VOP_ISLOCKED((node)->tn_vnode)); \
KASSERT((node)->tn_type == VDIR); \
KASSERT((node)->tn_size % sizeof(tmpfs_dirent_t) == 0);
/*
* Routines to convert VFS structures to tmpfs internal ones.
*/
static __inline tmpfs_mount_t *
VFS_TO_TMPFS(struct mount *mp)
{
tmpfs_mount_t *tmp = mp->mnt_data;
KASSERT(tmp != NULL);
return tmp;
}
static __inline tmpfs_node_t *
VP_TO_TMPFS_DIR(vnode_t *vp)
{
tmpfs_node_t *node = vp->v_data;
KASSERT(node != NULL);
TMPFS_VALIDATE_DIR(node);
return node;
}
#endif /* defined(_KERNEL) */
static __inline tmpfs_node_t *
VP_TO_TMPFS_NODE(vnode_t *vp)
{
tmpfs_node_t *node = vp->v_data;
#ifdef KASSERT
KASSERT(node != NULL);
#endif
return node;
}
#endif /* _FS_TMPFS_TMPFS_H_ */
/* $NetBSD: umap_vfsops.c,v 1.104 2022/11/04 11:20:40 hannken Exp $ */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* the UCLA Ficus project.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: @(#)null_vfsops.c 1.5 (Berkeley) 7/10/92
* @(#)umap_vfsops.c 8.8 (Berkeley) 5/14/95
*/
/*
* Umap Layer
* (See mount_umap(8) for a description of this layer.)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: umap_vfsops.c,v 1.104 2022/11/04 11:20:40 hannken Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/syslog.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/umapfs/umap.h>
#include <miscfs/genfs/layer_extern.h>
MODULE(MODULE_CLASS_VFS, umap, "layerfs");
VFS_PROTOS(umapfs);
/*
* Mount umap layer
*/
int
umapfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
struct pathbuf *pb;
struct nameidata nd;
struct umap_args *args = data;
struct vnode *lowerrootvp, *vp;
struct umap_mount *amp;
int error;
#ifdef UMAPFS_DIAGNOSTIC
int i;
#endif
fsid_t tfsid;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args) {
#ifdef UMAPFS_DIAGNOSTIC
printf("mount_umap: data len %d < args %d\n",
(int)*data_len, (int)(sizeof *args));
#endif
return EINVAL;
}
if (mp->mnt_flag & MNT_GETARGS) {
amp = MOUNTTOUMAPMOUNT(mp);
if (amp == NULL)
return EIO;
args->la.target = NULL;
args->nentries = amp->info_nentries;
args->gnentries = amp->info_gnentries;
*data_len = sizeof *args;
return 0;
}
/* only for root */
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_MOUNT,
KAUTH_REQ_SYSTEM_MOUNT_UMAP, NULL, NULL, NULL);
if (error)
return error;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_mount(mp = %p)\n", mp);
#endif
/*
* Update is not supported
*/
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
/*
* Find lower node
*/
error = pathbuf_copyin(args->umap_target, &pb);
if (error) {
return error;
}
NDINIT(&nd, LOOKUP, FOLLOW|LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
return error;
}
/*
* Sanity check on lower vnode
*/
lowerrootvp = nd.ni_vp;
pathbuf_destroy(pb);
#ifdef UMAPFS_DIAGNOSTIC
printf("vp = %p, check for VDIR...\n", lowerrootvp);
#endif
if (lowerrootvp->v_type != VDIR) {
vput(lowerrootvp);
return (EINVAL);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("mp = %p\n", mp);
#endif
amp = kmem_zalloc(sizeof(struct umap_mount), KM_SLEEP);
mp->mnt_data = amp;
/*
* Now copy in the number of entries and maps for umap mapping.
*/
if (args->nentries < 0 || args->nentries > MAPFILEENTRIES ||
    args->gnentries < 0 || args->gnentries > GMAPFILEENTRIES) {
vput(lowerrootvp);
return (EINVAL);
}
amp->info_nentries = args->nentries;
amp->info_gnentries = args->gnentries;
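/*
* Each map entry is a pair of u_longs: an id and the id it maps to.
*/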
error = copyin(args->mapdata, amp->info_mapdata,
2*sizeof(u_long)*args->nentries);
if (error) {
vput(lowerrootvp);
return (error);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("umap_mount:nentries %d\n",args->nentries);
for (i = 0; i < args->nentries; i++)
printf(" %ld maps to %ld\n", amp->info_mapdata[i][0],
amp->info_mapdata[i][1]);
#endif
error = copyin(args->gmapdata, amp->info_gmapdata,
2*sizeof(u_long)*args->gnentries);
if (error) {
vput(lowerrootvp);
return (error);
}
#ifdef UMAPFS_DIAGNOSTIC
printf("umap_mount:gnentries %d\n",args->gnentries);
for (i = 0; i < args->gnentries; i++)
printf("\tgroup %ld maps to %ld\n",
amp->info_gmapdata[i][0],
amp->info_gmapdata[i][1]);
#endif
/*
* Make sure the mount point is sufficiently initialized
* that the node create call will work.
*/
tfsid.__fsid_val[0] = (int32_t)args->fsid;
tfsid.__fsid_val[1] = makefstype(MOUNT_UMAP);
if (tfsid.__fsid_val[0] == 0) {
log(LOG_WARNING, "umapfs: fsid given as 0, ignoring\n");
vfs_getnewfsid(mp);
} else if (vfs_getvfs(&tfsid)) {
log(LOG_WARNING, "umapfs: fsid %x already mounted\n",
tfsid.__fsid_val[0]);
vfs_getnewfsid(mp);
} else {
mp->mnt_stat.f_fsidx.__fsid_val[0] = tfsid.__fsid_val[0];
mp->mnt_stat.f_fsidx.__fsid_val[1] = tfsid.__fsid_val[1];
mp->mnt_stat.f_fsid = tfsid.__fsid_val[0];
}
log(LOG_DEBUG, "umapfs: using fsid %x/%x\n",
mp->mnt_stat.f_fsidx.__fsid_val[0],
mp->mnt_stat.f_fsidx.__fsid_val[1]);
error = vfs_set_lowermount(mp, lowerrootvp->v_mount);
if (error) {
vput(lowerrootvp);
kmem_free(amp, sizeof(struct umap_mount));
return error;
}
amp->umapm_size = sizeof(struct umap_node);
amp->umapm_tag = VT_UMAP;
amp->umapm_bypass = umap_bypass;
amp->umapm_vnodeop_p = umap_vnodeop_p;
/*
* fix up umap node for root vnode.
*/
VOP_UNLOCK(lowerrootvp);
error = layer_node_create(mp, lowerrootvp, &vp);
/*
* Make sure the node alias worked
*/
if (error) {
vrele(lowerrootvp);
kmem_free(amp, sizeof(struct umap_mount));
return error;
}
/*
* Keep a held reference to the root vnode.
* It is vrele'd in umapfs_unmount.
*/
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vp->v_vflag |= VV_ROOT;
amp->umapm_rootvp = vp;
VOP_UNLOCK(vp);
error = set_statvfs_info(path, UIO_USERSPACE, args->umap_target,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error)
return error;
if (mp->mnt_lower->mnt_flag & MNT_LOCAL)
    mp->mnt_flag |= MNT_LOCAL;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_mount: lower %s, alias at %s\n",
mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
#endif
return 0;
}
/*
* Free reference to umap layer
*/
int
umapfs_unmount(struct mount *mp, int mntflags)
{
struct umap_mount *amp = MOUNTTOUMAPMOUNT(mp);
struct vnode *rtvp = amp->umapm_rootvp;
int error;
int flags = 0;
#ifdef UMAPFS_DIAGNOSTIC
printf("umapfs_unmount(mp = %p)\n", mp);
#endif
if (mntflags & MNT_FORCE)
flags |= FORCECLOSE;
if (vrefcnt(rtvp) > 1 && (mntflags & MNT_FORCE) == 0)
return (EBUSY);
if ((error = vflush(mp, rtvp, flags)) != 0)
return (error);
#ifdef UMAPFS_DIAGNOSTIC
vprint("alias root of lower", rtvp);
#endif
/*
* Blow it away for future re-use
*/
vgone(rtvp);
/*
* Finally, throw away the umap_mount structure
*/
kmem_free(amp, sizeof(struct umap_mount));
mp->mnt_data = NULL;
return 0;
}
extern const struct vnodeopv_desc umapfs_vnodeop_opv_desc;
const struct vnodeopv_desc * const umapfs_vnodeopv_descs[] = {
&umapfs_vnodeop_opv_desc,
NULL,
};
struct vfsops umapfs_vfsops = {
.vfs_name = MOUNT_UMAP,
.vfs_min_mount_data = sizeof (struct umap_args),
.vfs_mount = umapfs_mount,
.vfs_start = layerfs_start,
.vfs_unmount = umapfs_unmount,
.vfs_root = layerfs_root,
.vfs_quotactl = layerfs_quotactl,
.vfs_statvfs = layerfs_statvfs,
.vfs_sync = layerfs_sync,
.vfs_loadvnode = layerfs_loadvnode,
.vfs_vget = layerfs_vget,
.vfs_fhtovp = layerfs_fhtovp,
.vfs_vptofh = layerfs_vptofh,
.vfs_init = layerfs_init,
.vfs_done = layerfs_done,
.vfs_snapshot = layerfs_snapshot,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = layerfs_suspendctl,
.vfs_renamelock_enter = layerfs_renamelock_enter,
.vfs_renamelock_exit = layerfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = umapfs_vnodeopv_descs
};
SYSCTL_SETUP(umapfs_sysctl_setup, "umapfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "umap",
SYSCTL_DESCR("UID/GID remapping file system"),
NULL, 0, NULL, 0,
CTL_VFS, 10, CTL_EOL);
/*
* XXX the "10" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "10" is the order as taken from sys/mount.h
*/
}
static int
umap_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&umapfs_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&umapfs_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* NPF network interface handling module.
*/
#ifdef _KERNEL
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: npf_ifaddr.c,v 1.8 2022/02/13 19:20:11 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kmem.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#endif
#include "npf_impl.h"
static npf_table_t *
lookup_ifnet_table(npf_t *npf, ifnet_t *ifp)
{
const npf_ifops_t *ifops = npf->ifops;
char tname[NPF_TABLE_MAXNAMELEN];
const char *ifname;
npf_config_t *nc;
npf_table_t *t;
unsigned tid;
/* Get the interface name and prefix it. */
ifname = ifops->getname(npf, ifp);
snprintf(tname, sizeof(tname), ".ifnet-%s", ifname);
KERNEL_LOCK(1, NULL);
nc = npf_config_enter(npf);
/*
* Check whether this interface is of any interest to us.
*/
t = npf_tableset_getbyname(nc->tableset, tname);
if (!t) {
goto out;
}
tid = npf_table_getid(t);
/* Create a new, empty table reusing the same ID; it will later replace the old one. */
t = npf_table_create(tname, tid, NPF_TABLE_IFADDR, NULL, 0);
if (!t) {
goto out;
}
return t;
out:
npf_config_exit(npf);
KERNEL_UNLOCK_ONE(NULL);
return NULL;
}
static void
replace_ifnet_table(npf_t *npf, npf_table_t *newt)
{
npf_tableset_t *ts = atomic_load_relaxed(&npf->config)->tableset;
npf_table_t *oldt;
KASSERT(npf_config_locked_p(npf));
KERNEL_UNLOCK_ONE(NULL);
/*
* Finally, swap the tables and issue a sync barrier.
*/
oldt = npf_tableset_swap(ts, newt);
npf_config_sync(npf);
npf_config_exit(npf);
/* At this point, it is safe to destroy the old table. */
npf_table_destroy(oldt);
}
void
npf_ifaddr_sync(npf_t *npf, ifnet_t *ifp)
{
npf_table_t *t;
struct ifaddr *ifa;
/*
* First, check whether this interface is of any interest to us.
*
* => Acquires npf-config-lock and kernel-lock on success.
*/
t = lookup_ifnet_table(npf, ifp);
if (!t)
return;
/*
* Populate the table with the interface addresses.
* Note: currently, this list is protected by the kernel-lock.
*/
IFADDR_FOREACH(ifa, ifp) {
struct sockaddr *sa = ifa->ifa_addr;
const void *p = NULL;
int alen = 0;
if (sa->sa_family == AF_INET) {
const struct sockaddr_in *sin4 = satosin(sa);
alen = sizeof(struct in_addr);
p = &sin4->sin_addr;
}
if (sa->sa_family == AF_INET6) {
const struct sockaddr_in6 *sin6 = satosin6(sa);
alen = sizeof(struct in6_addr);
p = &sin6->sin6_addr;
}
if (alen) {
npf_addr_t addr;
memcpy(&addr, p, alen);
npf_table_insert(t, alen, &addr, NPF_NO_NETMASK);
}
}
/* Publish the new table. */
replace_ifnet_table(npf, t);
}
void
npf_ifaddr_flush(npf_t *npf, ifnet_t *ifp)
{
npf_table_t *t;
/*
* Flush: just load an empty table.
*/
t = lookup_ifnet_table(npf, ifp);
if (!t) {
return;
}
replace_ifnet_table(npf, t);
}
void
npf_ifaddr_syncall(npf_t *npf)
{
ifnet_t *ifp;
KERNEL_LOCK(1, NULL);
IFNET_GLOBAL_LOCK();
IFNET_WRITER_FOREACH(ifp) {
npf_ifaddr_sync(npf, ifp);
}
IFNET_GLOBAL_UNLOCK();
KERNEL_UNLOCK_ONE(NULL);
}
/* $NetBSD: subr_log.c,v 1.63 2022/10/26 23:28:30 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)subr_log.c 8.3 (Berkeley) 2/14/95
*/
/*
* Error log buffer for kernel printf's.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_log.c,v 1.63 2022/10/26 23:28:30 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/ioctl.h>
#include <sys/msgbuf.h>
#include <sys/file.h>
#include <sys/syslog.h>
#include <sys/conf.h>
#include <sys/select.h>
#include <sys/poll.h>
#include <sys/intr.h>
#include <sys/sysctl.h>
#include <sys/ktrace.h>
static int sysctl_msgbuf(SYSCTLFN_PROTO);
static void logsoftintr(void *);
static bool log_async;
static struct selinfo log_selp; /* process waiting on select call */
static pid_t log_pgid; /* process/group for async I/O */
static kcondvar_t log_cv;
static void *log_sih;
static kmutex_t log_lock;
int log_open; /* also used in log() */
int msgbufmapped; /* is the message buffer mapped */
int msgbufenabled; /* is logging to the buffer enabled */
struct kern_msgbuf *msgbufp; /* the mapped buffer, itself. */
void
initmsgbuf(void *bf, size_t bufsize)
{
struct kern_msgbuf *mbp;
long new_bufs;
/* Sanity-check the given size. */
if (bufsize < sizeof(struct kern_msgbuf))
return;
mbp = msgbufp = (struct kern_msgbuf *)bf;
new_bufs = bufsize - offsetof(struct kern_msgbuf, msg_bufc);
if ((mbp->msg_magic != MSG_MAGIC) || (mbp->msg_bufs != new_bufs) ||
(mbp->msg_bufr < 0) || (mbp->msg_bufr >= mbp->msg_bufs) ||
(mbp->msg_bufx < 0) || (mbp->msg_bufx >= mbp->msg_bufs)) {
/*
* If the buffer magic number is wrong, has changed
* size (which shouldn't happen often), or is
* internally inconsistent, initialize it.
*/
memset(bf, 0, bufsize);
mbp->msg_magic = MSG_MAGIC;
mbp->msg_bufs = new_bufs;
}
/* mark it as ready for use. */
msgbufmapped = msgbufenabled = 1;
}
void
loginit(void)
{
mutex_init(&log_lock, MUTEX_DEFAULT, IPL_VM);
selinit(&log_selp);
cv_init(&log_cv, "klog");
log_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
logsoftintr, NULL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "msgbufsize",
SYSCTL_DESCR("Size of the kernel message buffer"),
sysctl_msgbuf, 0, NULL, 0,
CTL_KERN, KERN_MSGBUFSIZE, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "msgbuf",
SYSCTL_DESCR("Kernel message buffer"),
sysctl_msgbuf, 0, NULL, 0,
CTL_KERN, KERN_MSGBUF, CTL_EOL);
}
/*ARGSUSED*/
static int
logopen(dev_t dev, int flags, int mode, struct lwp *l)
{
struct kern_msgbuf *mbp = msgbufp;
int error = 0;
mutex_spin_enter(&log_lock);
if (log_open) {
error = EBUSY;
} else {
log_open = 1;
log_pgid = l->l_proc->p_pid; /* signal process only */
/*
* The message buffer is initialized during system
* configuration. If it's been clobbered, note that
* and return an error. (This allows a user to read
* the buffer via /dev/kmem, and try to figure out
* what clobbered it.)
*/
if (mbp->msg_magic != MSG_MAGIC) {
msgbufenabled = 0;
error = ENXIO;
}
}
mutex_spin_exit(&log_lock);
return error;
}
/*ARGSUSED*/
static int
logclose(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_spin_enter(&log_lock);
log_pgid = 0;
log_open = 0;
log_async = 0;
mutex_spin_exit(&log_lock);
return 0;
}
/*ARGSUSED*/
static int
logread(dev_t dev, struct uio *uio, int flag)
{
struct kern_msgbuf *mbp = msgbufp;
long l;
int error = 0;
mutex_spin_enter(&log_lock);
while (mbp->msg_bufr == mbp->msg_bufx) {
if (flag & IO_NDELAY) {
mutex_spin_exit(&log_lock);
return EWOULDBLOCK;
}
error = cv_wait_sig(&log_cv, &log_lock);
if (error) {
mutex_spin_exit(&log_lock);
return error;
}
}
while (uio->uio_resid > 0) {
l = mbp->msg_bufx - mbp->msg_bufr;
if (l < 0)
l = mbp->msg_bufs - mbp->msg_bufr;
l = uimin(l, uio->uio_resid);
if (l == 0)
break;
mutex_spin_exit(&log_lock);
error = uiomove(&mbp->msg_bufc[mbp->msg_bufr], (int)l, uio);
mutex_spin_enter(&log_lock);
if (error)
break;
mbp->msg_bufr += l;
if (mbp->msg_bufr < 0 || mbp->msg_bufr >= mbp->msg_bufs)
mbp->msg_bufr = 0;
}
mutex_spin_exit(&log_lock);
return error;
}
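/*
* For illustration (values are hypothetical): with msg_bufs = 100,
* msg_bufr = 80 and msg_bufx = 20, the writer has wrapped and 40
* bytes are pending.  The loop above first copies the 20 bytes at
* offsets 80..99 (l = msg_bufs - msg_bufr), resets msg_bufr to 0,
* and then copies the 20 bytes at offsets 0..19 (l = msg_bufx -
* msg_bufr).  This matches the count that FIONREAD computes in
* logioctl() below: (msg_bufx - msg_bufr + msg_bufs) % msg_bufs = 40.
*/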
/*ARGSUSED*/
static int
logpoll(dev_t dev, int events, struct lwp *l)
{
int revents = 0;
if (events & (POLLIN | POLLRDNORM)) {
mutex_spin_enter(&log_lock);
if (msgbufp->msg_bufr != msgbufp->msg_bufx)
revents |= events & (POLLIN | POLLRDNORM);
else
selrecord(l, &log_selp);
mutex_spin_exit(&log_lock);
}
return revents;
}
static void
filt_logrdetach(struct knote *kn)
{
mutex_spin_enter(&log_lock);
selremove_knote(&log_selp, kn);
mutex_spin_exit(&log_lock);
}
static int
filt_logread(struct knote *kn, long hint)
{
int rv;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&log_lock);
if (msgbufp->msg_bufr == msgbufp->msg_bufx) {
rv = 0;
} else if (msgbufp->msg_bufr < msgbufp->msg_bufx) {
kn->kn_data = msgbufp->msg_bufx - msgbufp->msg_bufr;
rv = 1;
} else {
kn->kn_data = (msgbufp->msg_bufs - msgbufp->msg_bufr) +
msgbufp->msg_bufx;
rv = 1;
}
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&log_lock);
return rv;
}
static const struct filterops logread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_logrdetach,
.f_event = filt_logread,
};
static int
logkqfilter(dev_t dev, struct knote *kn)
{
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &logread_filtops;
mutex_spin_enter(&log_lock);
selrecord_knote(&log_selp, kn);
mutex_spin_exit(&log_lock);
break;
default:
return (EINVAL);
}
return (0);
}
void
logwakeup(void)
{
if (!cold && log_open) {
mutex_spin_enter(&log_lock);
selnotify(&log_selp, 0, NOTE_SUBMIT);
if (log_async)
softint_schedule(log_sih);
cv_broadcast(&log_cv);
mutex_spin_exit(&log_lock);
}
}
static void
logsoftintr(void *cookie)
{
pid_t pid;
if ((pid = log_pgid) != 0)
fownsignal(pid, SIGIO, 0, 0, NULL);
}
/*ARGSUSED*/
static int
logioctl(dev_t dev, u_long com, void *data, int flag, struct lwp *lwp)
{
long l;
switch (com) {
/* return number of characters immediately available */
case FIONREAD:
mutex_spin_enter(&log_lock);
l = msgbufp->msg_bufx - msgbufp->msg_bufr;
if (l < 0)
l += msgbufp->msg_bufs;
mutex_spin_exit(&log_lock);
*(int *)data = l;
break;
case FIONBIO:
break;
case FIOASYNC:
/* No locking needed, 'thread private'. */
log_async = (*((int *)data) != 0);
break;
case TIOCSPGRP:
case FIOSETOWN:
return fsetown(&log_pgid, com, data);
case TIOCGPGRP:
case FIOGETOWN:
return fgetown(log_pgid, com, data);
default:
return (EPASSTHROUGH);
}
return (0);
}
static void
logskip(struct kern_msgbuf *mbp)
{
/*
* Move the read pointer forward to the next line in the
* buffer.  Note that this is a ring buffer, so msg_bufr
* must be reset to 0 once it reaches msg_bufs.
*
* To avoid looping forever, give up if no newline is
* found within mbp->msg_bufs
*/
for (int i = 0; i < mbp->msg_bufs; i++) {
char c0 = mbp->msg_bufc[mbp->msg_bufr];
if (++mbp->msg_bufr >= mbp->msg_bufs)
mbp->msg_bufr = 0;
if (c0 == '\n')
break;
}
}
static void
logaddchar(struct kern_msgbuf *mbp, int c)
{
mbp->msg_bufc[mbp->msg_bufx++] = c;
if (mbp->msg_bufx < 0 || mbp->msg_bufx >= mbp->msg_bufs)
mbp->msg_bufx = 0;
/* If the buffer is full, keep the most recent data. */
if (mbp->msg_bufr == mbp->msg_bufx)
logskip(mbp);
}
void
logputchar(int c)
{
struct kern_msgbuf *mbp;
if (!cold)
mutex_spin_enter(&log_lock);
if (!msgbufenabled)
goto out;
mbp = msgbufp;
if (mbp->msg_magic != MSG_MAGIC) {
/*
* Arguably should panic or somehow notify the
* user... but how? Panic may be too drastic,
* and would obliterate the message being kicked
* out (maybe a panic itself), and printf
* would invoke us recursively. Silently punt
* for now. If syslog is running, it should
* notice.
*/
msgbufenabled = 0;
goto out;
}
logaddchar(mbp, c);
out:
if (!cold)
mutex_spin_exit(&log_lock);
}
/*
* sysctl helper routine for kern.msgbufsize and kern.msgbuf. For the
* former it merely checks that the message buffer is set up. For the latter,
* it also copies out the data if necessary.
*/
static int
sysctl_msgbuf(SYSCTLFN_ARGS)
{
char *where = oldp;
size_t len, maxlen;
long beg, end;
int error;
if (!logenabled(msgbufp)) {
msgbufenabled = 0;
return (ENXIO);
}
switch (rnode->sysctl_num) {
case KERN_MSGBUFSIZE: {
struct sysctlnode node = *rnode;
int msg_bufs = (int)msgbufp->msg_bufs;
node.sysctl_data = &msg_bufs;
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
case KERN_MSGBUF:
break;
default:
return (EOPNOTSUPP);
}
if (newp != NULL)
return (EPERM);
if (oldp == NULL) {
/* always return full buffer size */
*oldlenp = msgbufp->msg_bufs;
return (0);
}
sysctl_unlock();
/*
* First, copy from the write pointer to the end of
* message buffer.
*/
error = 0;
mutex_spin_enter(&log_lock);
maxlen = MIN(msgbufp->msg_bufs, *oldlenp);
beg = msgbufp->msg_bufx;
end = msgbufp->msg_bufs;
mutex_spin_exit(&log_lock);
while (maxlen > 0) {
len = MIN(end - beg, maxlen);
if (len == 0)
break;
/* XXX unlocked, but hardly matters. */
error = copyout(&msgbufp->msg_bufc[beg], where, len);
ktrmibio(-1, UIO_READ, where, len, error);
if (error)
break;
where += len;
maxlen -= len;
/*
* ... then, copy from the beginning of message buffer to
* the write pointer.
*/
beg = 0;
end = msgbufp->msg_bufx;
}
sysctl_relock();
return (error);
}
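/*
* Worked example (hypothetical values): with msg_bufs = 100 and
* msg_bufx = 30, the first pass above copies offsets 30..99 (the
* oldest data), space in *oldlenp permitting, and the second pass
* copies offsets 0..29 (the newest data), so userland receives the
* ring contents in chronological order.
*/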
const struct cdevsw log_cdevsw = {
.d_open = logopen,
.d_close = logclose,
.d_read = logread,
.d_write = nowrite,
.d_ioctl = logioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = logpoll,
.d_mmap = nommap,
.d_kqfilter = logkqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
/* $NetBSD: subr_disk_open.c,v 1.15 2020/02/29 14:44:44 mlelstv Exp $ */
/*-
* Copyright (c) 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_disk_open.c,v 1.15 2020/02/29 14:44:44 mlelstv Exp $");
#include <sys/param.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/fcntl.h>
#include <sys/kauth.h>
#include <sys/vnode.h>
#include <miscfs/specfs/specdev.h>
struct vnode *
opendisk(device_t dv)
{
devmajor_t bmajor;
int unit;
struct vnode *tmpvn;
int error;
dev_t dev;
/*
* Lookup major number for disk block device.
*/
bmajor = devsw_name2blk(device_xname(dv), NULL, 0);
if (bmajor == -1)
return NULL;
unit = device_unit(dv);
/*
* Fake a temporary vnode for the disk, open it, and read
* and hash the sectors.
*/
dev = device_is_a(dv, "dk") ? makedev(bmajor, unit) :
MAKEDISKDEV(bmajor, unit, RAW_PART);
if (bdevvp(dev, &tmpvn))
panic("%s: can't alloc vnode for %s", __func__,
device_xname(dv));
vn_lock(tmpvn, LK_EXCLUSIVE | LK_RETRY);
error = VOP_OPEN(tmpvn, FREAD | FSILENT, NOCRED);
if (error) {
/*
* Ignore errors caused by a missing device, partition, or
* medium, or by a busy device (presumably because a wedge covers it).
*/
switch (error) {
case ENXIO:
case ENODEV:
case EBUSY:
break;
default:
printf("%s: can't open dev %s (%d)\n",
__func__, device_xname(dv), error);
break;
}
vput(tmpvn);
return NULL;
}
return tmpvn;
}
int
getdisksize(struct vnode *vp, uint64_t *numsecp, unsigned int *secsizep)
{
struct partinfo pi;
struct dkwedge_info dkw;
struct disk *pdk;
unsigned int secsize;
uint64_t numsec;
int error;
/*
* We attempt to get the wedge information first if it exists,
* because the disklabel cannot describe very large disks.
*/
error = VOP_IOCTL(vp, DIOCGWEDGEINFO, &dkw, FREAD, NOCRED);
if (error == 0) {
pdk = disk_find(dkw.dkw_parent);
if (pdk != NULL) {
secsize = DEV_BSIZE << pdk->dk_blkshift;
numsec = dkw.dkw_size;
} else
error = ENODEV;
}
if (error) {
error = VOP_IOCTL(vp, DIOCGPARTINFO, &pi, FREAD, NOCRED);
if (error == 0) {
secsize = pi.pi_secsize;
numsec = pi.pi_size;
}
}
if (error == 0 &&
(secsize == 0 || secsize > MAXBSIZE || !powerof2(secsize) ||
numsec == 0)) {
#ifdef DIAGNOSTIC
printf("%s: %s returns invalid disksize values"
" (secsize = %u, numsec = %" PRIu64 ")\n",
__func__,
devsw_blk2name(major(vp->v_specnode->sn_rdev)),
secsize, numsec);
#endif
error = EINVAL;
}
if (error == 0) {
*secsizep = secsize;
*numsecp = numsec;
}
return error;
}
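/*
* A sketch of a typical call sequence (the locals here are
* hypothetical, not taken from this file):
*
*	struct vnode *vp = opendisk(dv);
*	uint64_t numsec;
*	unsigned int secsize;
*
*	if (vp != NULL) {
*		if (getdisksize(vp, &numsec, &secsize) == 0)
*			printf("%" PRIu64 " bytes\n", numsec * secsize);
*		VOP_CLOSE(vp, FREAD, NOCRED);
*		vput(vp);
*	}
*/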
int
getdiskinfo(struct vnode *vp, struct dkwedge_info *dkw)
{
struct partinfo pi;
int error;
dev_t dev = vp->v_specnode->sn_rdev;
if (VOP_IOCTL(vp, DIOCGWEDGEINFO, dkw, FREAD, NOCRED) == 0)
return 0;
if ((error = VOP_IOCTL(vp, DIOCGPARTINFO, &pi, FREAD, NOCRED)) != 0)
return error;
snprintf(dkw->dkw_devname, sizeof(dkw->dkw_devname), "%s%" PRId32 "%c",
devsw_blk2name(major(dev)), DISKUNIT(dev), (char)DISKPART(dev) +
'a');
dkw->dkw_wname[0] = '\0';
snprintf(dkw->dkw_parent, sizeof(dkw->dkw_parent), "%s%" PRId32,
devsw_blk2name(major(dev)), DISKUNIT(dev));
dkw->dkw_size = pi.pi_size;
dkw->dkw_offset = pi.pi_offset;
strlcpy(dkw->dkw_ptype, getfstypename(pi.pi_fstype),
sizeof(dkw->dkw_ptype));
return 0;
}
/* $NetBSD: kern_malloc.c,v 1.158 2019/11/14 16:23:52 maxv Exp $ */
/*
* Copyright (c) 1987, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1996 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95
*/
/*
* Wrapper interface for obsolete malloc(9).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_malloc.c,v 1.158 2019/11/14 16:23:52 maxv Exp $");
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/kmem.h>
#include <sys/asan.h>
#include <sys/msan.h>
/*
* Built-in malloc types. Note: ought to be removed.
*/
MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
MALLOC_DEFINE(M_DMAMAP, "DMA map", "bus_dma(9) structures");
MALLOC_DEFINE(M_FREE, "free", "should be on free list");
MALLOC_DEFINE(M_TEMP, "temp", "misc. temporary data buffers");
MALLOC_DEFINE(M_RTABLE, "routetbl", "routing tables");
MALLOC_DEFINE(M_FTABLE, "fragtbl", "fragment reassembly header");
MALLOC_DEFINE(M_UFSMNT, "UFS mount", "UFS mount structure");
MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
MALLOC_DEFINE(M_MRTABLE, "mrt", "multicast routing tables");
/*
* Header contains total size, including the header itself.
*/
struct malloc_header {
size_t mh_size;
#ifdef KASAN
size_t mh_rqsz;
#endif
} __aligned(ALIGNBYTES + 1);
void *
kern_malloc(unsigned long reqsize, int flags)
{
const int kmflags = (flags & M_NOWAIT) ? KM_NOSLEEP : KM_SLEEP;
#ifdef KASAN
const size_t origsize = reqsize;
#endif
size_t size = reqsize;
size_t allocsize, hdroffset;
struct malloc_header *mh;
void *p;
kasan_add_redzone(&size);
if (size >= PAGE_SIZE) {
if (size > (ULONG_MAX-PAGE_SIZE))
allocsize = ULONG_MAX; /* this will fail later */
else
allocsize = PAGE_SIZE + size; /* for page alignment */
hdroffset = PAGE_SIZE - sizeof(struct malloc_header);
} else {
allocsize = sizeof(struct malloc_header) + size;
hdroffset = 0;
}
p = kmem_intr_alloc(allocsize, kmflags);
if (p == NULL)
return NULL;
kmsan_mark(p, allocsize, KMSAN_STATE_UNINIT);
kmsan_orig(p, allocsize, KMSAN_TYPE_MALLOC, __RET_ADDR);
if ((flags & M_ZERO) != 0) {
memset(p, 0, allocsize);
}
mh = (void *)((char *)p + hdroffset);
mh->mh_size = allocsize - hdroffset;
#ifdef KASAN
mh->mh_rqsz = origsize;
#endif
mh++;
kasan_mark(mh, origsize, size, KASAN_MALLOC_REDZONE);
return mh;
}
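/*
* Layout sketch (sizes are illustrative, e.g. 4096-byte pages and an
* 8-byte header): a 100-byte request becomes a 108-byte kmem
* allocation with the header at offset 0 and the caller's pointer at
* offset 8.  An 8192-byte request becomes a 12288-byte allocation
* with the header at offset 4088, so the pointer handed back at
* offset 4096 is page aligned; mh_size is then 8200, which is how
* kern_free() below recognizes the large-allocation case.
*/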
void
kern_free(void *addr)
{
struct malloc_header *mh;
mh = addr;
mh--;
kasan_mark(addr, mh->mh_size - sizeof(struct malloc_header),
mh->mh_size - sizeof(struct malloc_header), KASAN_MALLOC_REDZONE);
if (mh->mh_size >= PAGE_SIZE + sizeof(struct malloc_header)) {
kmsan_mark((char *)addr - PAGE_SIZE,
mh->mh_size + PAGE_SIZE - sizeof(struct malloc_header),
KMSAN_STATE_INITED);
kmem_intr_free((char *)addr - PAGE_SIZE,
mh->mh_size + PAGE_SIZE - sizeof(struct malloc_header));
} else {
kmsan_mark(mh, mh->mh_size, KMSAN_STATE_INITED);
kmem_intr_free(mh, mh->mh_size);
}
}
void *
kern_realloc(void *curaddr, unsigned long newsize, int flags)
{
struct malloc_header *mh;
unsigned long cursize;
void *newaddr;
/*
* realloc() with a NULL pointer is the same as malloc().
*/
if (curaddr == NULL)
return kern_malloc(newsize, flags);
/*
* realloc() with zero size is the same as free().
*/
if (newsize == 0) {
kern_free(curaddr);
return NULL;
}
if ((flags & M_NOWAIT) == 0) {
ASSERT_SLEEPABLE();
}
mh = curaddr;
mh--;
#ifdef KASAN
cursize = mh->mh_rqsz;
#else
cursize = mh->mh_size - sizeof(struct malloc_header);
#endif
/*
* If we already actually have as much as they want, we're done.
*/
if (newsize <= cursize)
return curaddr;
/*
* Can't satisfy the allocation with the existing block.
* Allocate a new one and copy the data.
*/
newaddr = kern_malloc(newsize, flags);
if (__predict_false(newaddr == NULL)) {
/*
* malloc() failed, because flags included M_NOWAIT.
* Return NULL to indicate that failure. The old
* pointer is still valid.
*/
return NULL;
}
memcpy(newaddr, curaddr, cursize);
/*
* We were successful: free the old allocation and return
* the new one.
*/
kern_free(curaddr);
return newaddr;
}
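/*
* Caller-side sketch (names here are hypothetical): because a failed
* kern_realloc() with M_NOWAIT leaves the old block intact, the
* caller still owns it and must release it, e.g.
*
*	np = kern_realloc(p, newsize, M_NOWAIT);
*	if (np == NULL) {
*		kern_free(p);
*		return ENOMEM;
*	}
*	p = np;
*/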
/* $NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $ */
/*-
* Copyright (c) 1997, 1998, 1999, 2001, 2005, 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_congctl.c,v 1.28 2021/07/31 20:29:37 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_tcp_debug.h"
#include "opt_tcp_congctl.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/syslog.h>
#include <sys/pool.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet/icmp6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_congctl.h>
#ifdef TCP_DEBUG
#include <netinet/tcp_debug.h>
#endif
/*
* TODO:
* consider separating the actual implementations into a separate file.
*/
static void tcp_common_congestion_exp(struct tcpcb *, int, int);
static int tcp_reno_do_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static int tcp_reno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_slow_retransmit(struct tcpcb *);
static void tcp_reno_fast_retransmit_newack(struct tcpcb *,
const struct tcphdr *);
static void tcp_reno_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_reno_congestion_exp(struct tcpcb *tp);
static int tcp_newreno_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_newreno_fast_retransmit_newack(struct tcpcb *,
const struct tcphdr *);
static void tcp_newreno_newack(struct tcpcb *, const struct tcphdr *);
static int tcp_cubic_fast_retransmit(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_slow_retransmit(struct tcpcb *tp);
static void tcp_cubic_newack(struct tcpcb *, const struct tcphdr *);
static void tcp_cubic_congestion_exp(struct tcpcb *);
static void tcp_congctl_fillnames(void);
extern int tcprexmtthresh;
MALLOC_DEFINE(M_TCPCONGCTL, "tcpcongctl", "TCP congestion control structures");
/* currently selected global congestion control */
char tcp_congctl_global_name[TCPCC_MAXLEN];
/* available global congestion control algorithms */
char tcp_congctl_avail[10 * TCPCC_MAXLEN];
/*
* Used to list the available congestion control algorithms.
*/
TAILQ_HEAD(, tcp_congctlent) tcp_congctlhd =
TAILQ_HEAD_INITIALIZER(tcp_congctlhd);
static struct tcp_congctlent * tcp_congctl_global;
static kmutex_t tcp_congctl_mtx;
void
tcp_congctl_init(void)
{
int r __diagused;
mutex_init(&tcp_congctl_mtx, MUTEX_DEFAULT, IPL_NONE);
/* Base algorithms. */
r = tcp_congctl_register("reno", &tcp_reno_ctl);
KASSERT(r == 0);
r = tcp_congctl_register("newreno", &tcp_newreno_ctl);
KASSERT(r == 0);
r = tcp_congctl_register("cubic", &tcp_cubic_ctl);
KASSERT(r == 0);
/* NewReno is the default. */
#ifndef TCP_CONGCTL_DEFAULT
#define TCP_CONGCTL_DEFAULT "newreno"
#endif
r = tcp_congctl_select(NULL, TCP_CONGCTL_DEFAULT);
KASSERT(r == 0);
}
/*
* Register a congestion algorithm and select it if we have none.
*/
int
tcp_congctl_register(const char *name, const struct tcp_congctl *tcc)
{
struct tcp_congctlent *ntcc, *tccp;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
if (!strcmp(name, tccp->congctl_name)) {
/* name already registered */
return EEXIST;
}
ntcc = malloc(sizeof(*ntcc), M_TCPCONGCTL, M_WAITOK|M_ZERO);
strlcpy(ntcc->congctl_name, name, sizeof(ntcc->congctl_name) - 1);
ntcc->congctl_ctl = tcc;
TAILQ_INSERT_TAIL(&tcp_congctlhd, ntcc, congctl_ent);
tcp_congctl_fillnames();
if (TAILQ_FIRST(&tcp_congctlhd) == ntcc)
tcp_congctl_select(NULL, name);
return 0;
}
int
tcp_congctl_unregister(const char *name)
{
struct tcp_congctlent *tccp, *rtccp;
unsigned int size;
rtccp = NULL;
size = 0;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
if (!strcmp(name, tccp->congctl_name))
rtccp = tccp;
size++;
}
if (!rtccp)
return ENOENT;
if (size <= 1 || tcp_congctl_global == rtccp || rtccp->congctl_refcnt)
return EBUSY;
TAILQ_REMOVE(&tcp_congctlhd, rtccp, congctl_ent);
free(rtccp, M_TCPCONGCTL);
tcp_congctl_fillnames();
return 0;
}
/*
* Select a congestion algorithm by name.
*/
int
tcp_congctl_select(struct tcpcb *tp, const char *name)
{
struct tcp_congctlent *tccp, *old_tccp, *new_tccp;
bool old_found, new_found;
KASSERT(name);
old_found = (tp == NULL || tp->t_congctl == NULL);
old_tccp = NULL;
new_found = false;
new_tccp = NULL;
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
if (!old_found && tccp->congctl_ctl == tp->t_congctl) {
old_tccp = tccp;
old_found = true;
}
if (!new_found && !strcmp(name, tccp->congctl_name)) {
new_tccp = tccp;
new_found = true;
}
if (new_found && old_found) {
if (tp) {
mutex_enter(&tcp_congctl_mtx);
if (old_tccp)
old_tccp->congctl_refcnt--;
tp->t_congctl = new_tccp->congctl_ctl;
new_tccp->congctl_refcnt++;
mutex_exit(&tcp_congctl_mtx);
} else {
tcp_congctl_global = new_tccp;
strlcpy(tcp_congctl_global_name,
new_tccp->congctl_name,
sizeof(tcp_congctl_global_name) - 1);
}
return 0;
}
}
return EINVAL;
}
void
tcp_congctl_release(struct tcpcb *tp)
{
struct tcp_congctlent *tccp;
KASSERT(tp->t_congctl);
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
if (tccp->congctl_ctl == tp->t_congctl) {
tccp->congctl_refcnt--;
return;
}
}
}
/*
* Returns the name of a congestion algorithm.
*/
const char *
tcp_congctl_bystruct(const struct tcp_congctl *tcc)
{
struct tcp_congctlent *tccp;
KASSERT(tcc);
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent)
if (tccp->congctl_ctl == tcc)
return tccp->congctl_name;
return NULL;
}
static void
tcp_congctl_fillnames(void)
{
struct tcp_congctlent *tccp;
const char *delim = " ";
tcp_congctl_avail[0] = '\0';
TAILQ_FOREACH(tccp, &tcp_congctlhd, congctl_ent) {
strlcat(tcp_congctl_avail, tccp->congctl_name,
sizeof(tcp_congctl_avail) - 1);
if (TAILQ_NEXT(tccp, congctl_ent))
strlcat(tcp_congctl_avail, delim,
sizeof(tcp_congctl_avail) - 1);
}
}
/* ------------------------------------------------------------------------ */
/*
* Common stuff
*/
/* Window reduction (1-beta) for [New]Reno: 0.5 */
#define RENO_BETAA 1
#define RENO_BETAB 2
/* Window reduction (1-beta) for Cubic: 0.8 */
#define CUBIC_BETAA 4
#define CUBIC_BETAB 5
/* Draft Rhee Section 4.1 */
#define CUBIC_CA 4
#define CUBIC_CB 10
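/*
* For illustration: tcp_common_congestion_exp() below scales the
* window by betaa/betab, so RENO_BETAA/RENO_BETAB = 1/2 keeps half
* of the window (1-beta = 0.5), while CUBIC_BETAA/CUBIC_BETAB = 4/5
* keeps 80% of it (1-beta = 0.8).  CUBIC_CA/CUBIC_CB = 4/10 encodes
* the draft's cubic scaling constant C = 0.4.
*/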
static void
tcp_common_congestion_exp(struct tcpcb *tp, int betaa, int betab)
{
u_long win;
/*
* Reduce the congestion window and the slow start threshold.
*/
win = ulmin(tp->snd_wnd, tp->snd_cwnd) * betaa / betab / tp->t_segsz;
if (win < 2)
win = 2;
tp->snd_ssthresh = win * tp->t_segsz;
tp->snd_recover = tp->snd_max;
tp->snd_cwnd = tp->snd_ssthresh;
/*
* When using TCP ECN, notify the peer that
* we reduced the cwnd.
*/
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
/* ------------------------------------------------------------------------ */
/*
* TCP/Reno congestion control.
*/
static void
tcp_reno_congestion_exp(struct tcpcb *tp)
{
tcp_common_congestion_exp(tp, RENO_BETAA, RENO_BETAB);
}
static int
tcp_reno_do_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* Dup acks mean that packets have left the
* network (they're now cached at the receiver)
* so bump cwnd by the amount in the receiver
* to keep a constant cwnd packets in the
* network.
*
* If we are using TCP/SACK, then enter
* Fast Recovery if the receiver SACKs
* data that is tcprexmtthresh * MSS
* bytes past the last ACKed segment,
* irrespective of the number of DupAcks.
*/
tcp_seq onxt = tp->snd_nxt;
tp->t_partialacks = 0;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
if (TCP_SACK_ENABLED(tp)) {
tp->t_dupacks = tcprexmtthresh;
tp->sack_newdata = tp->snd_nxt;
tp->snd_cwnd = tp->t_segsz;
(void) tcp_output(tp);
return 0;
}
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_segsz;
(void) tcp_output(tp);
tp->snd_cwnd = tp->snd_ssthresh + tp->t_segsz * tp->t_dupacks;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
return 0;
}
static int
tcp_reno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* We know we're losing at the current
* window size so do congestion avoidance
* (set ssthresh to half the current window
* and pull our congestion window back to
* the new ssthresh).
*/
tcp_reno_congestion_exp(tp);
return tcp_reno_do_fast_retransmit(tp, th);
}
static void
tcp_reno_slow_retransmit(struct tcpcb *tp)
{
u_long win;
/*
* Close the congestion window down to one segment
* (we'll open it by one segment for each ack we get).
* Since we probably have a window's worth of unacked
* data accumulated, this "slow start" keeps us from
* dumping all that data as back-to-back packets (which
* might overwhelm an intermediate gateway).
*
* There are two phases to the opening: Initially we
* open by one mss on each ack. This makes the window
* size increase exponentially with time. If the
* window is larger than the path can handle, this
* exponential growth results in dropped packet(s)
* almost immediately. To get more time between
* drops but still "push" the network to take advantage
* of improving conditions, we switch from exponential
* to linear window opening at some threshold size.
* For a threshold, we use half the current window
* size, truncated to a multiple of the mss.
*
* (the minimum cwnd that will give us exponential
* growth is 2 mss. We don't allow the threshold
* to go below this.)
*/
win = ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_segsz;
if (win < 2)
win = 2;
/* Loss Window MUST be one segment. */
tp->snd_cwnd = tp->t_segsz;
tp->snd_ssthresh = win * tp->t_segsz;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
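/*
* Numerically (hypothetical values): with snd_cwnd = 46720,
* snd_wnd = 65535 and t_segsz = 1460, win = 46720 / 2 / 1460 = 16,
* so snd_ssthresh becomes 23360 (16 segments) and snd_cwnd drops to
* a single 1460-byte segment, from which slow start restarts.
*/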
static void
tcp_reno_fast_retransmit_newack(struct tcpcb *tp,
const struct tcphdr *th)
{
if (tp->t_partialacks < 0) {
/*
* We were not in fast recovery. Reset the duplicate ack
* counter.
*/
tp->t_dupacks = 0;
} else {
/*
* Clamp the congestion window to the crossover point and
* exit fast recovery.
*/
if (tp->snd_cwnd > tp->snd_ssthresh)
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
tp->snd_fack = th->th_ack;
}
}
static void
tcp_reno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* When new data is acked, open the congestion window.
*/
u_int cw = tp->snd_cwnd;
u_int incr = tp->t_segsz;
if (tcp_do_abc) {
/*
* RFC 3465 Appropriate Byte Counting (ABC)
*/
int acked = th->th_ack - tp->snd_una;
if (cw >= tp->snd_ssthresh) {
tp->t_bytes_acked += acked;
if (tp->t_bytes_acked >= cw) {
/* Time to increase the window. */
tp->t_bytes_acked -= cw;
} else {
/* No need to increase yet. */
incr = 0;
}
} else {
/*
* use 2*SMSS or 1*SMSS for the "L" param,
* depending on sysctl setting.
*
* (See RFC 3465 2.3 Choosing the Limit)
*/
u_int abc_lim;
abc_lim = (tcp_abc_aggressive == 0 ||
tp->snd_nxt != tp->snd_max) ? incr : incr * 2;
incr = uimin(acked, abc_lim);
}
} else {
/*
* If the window gives us less than ssthresh packets
* in flight, open exponentially (segsz per packet).
* Otherwise open linearly: segsz per window
* (segsz^2 / cwnd per packet).
*/
if (cw >= tp->snd_ssthresh) {
incr = incr * incr / cw;
}
}
tp->snd_cwnd = uimin(cw + incr, TCP_MAXWIN << tp->snd_scale);
}
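/*
* For example (hypothetical numbers, ABC disabled): with
* t_segsz = 1460 and snd_cwnd = 14600 (10 segments, at or above
* ssthresh), each ack adds 1460 * 1460 / 14600 = 146 bytes, so about
* a full window of 10 acks is needed to grow the window by one
* segment - the linear phase.  Below ssthresh each ack adds the full
* 1460 bytes, roughly doubling the window once per RTT.  With ABC
* enabled the same growth rates are obtained by counting acked bytes
* rather than acks.
*/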
const struct tcp_congctl tcp_reno_ctl = {
.fast_retransmit = tcp_reno_fast_retransmit,
.slow_retransmit = tcp_reno_slow_retransmit,
.fast_retransmit_newack = tcp_reno_fast_retransmit_newack,
.newack = tcp_reno_newack,
.cong_exp = tcp_reno_congestion_exp,
};
/*
* TCP/NewReno Congestion control.
*/
static int
tcp_newreno_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
if (SEQ_LT(th->th_ack, tp->snd_high)) {
/*
* False fast retransmit after timeout.
* Do not enter fast recovery
*/
tp->t_dupacks = 0;
return 1;
}
/*
* Fast retransmit is same as reno.
*/
return tcp_reno_fast_retransmit(tp, th);
}
/*
* Implement the NewReno response to a new ack, checking for partial acks in
* fast recovery.
*/
static void
tcp_newreno_fast_retransmit_newack(struct tcpcb *tp, const struct tcphdr *th)
{
if (tp->t_partialacks < 0) {
/*
* We were not in fast recovery. Reset the duplicate ack
* counter.
*/
tp->t_dupacks = 0;
} else if (SEQ_LT(th->th_ack, tp->snd_recover)) {
/*
* This is a partial ack. Retransmit the first unacknowledged
* segment and deflate the congestion window by the amount of
* acknowledged data. Do not exit fast recovery.
*/
tcp_seq onxt = tp->snd_nxt;
u_long ocwnd = tp->snd_cwnd;
int sack_num_segs = 1, sack_bytes_rxmt = 0;
/*
* snd_una has not yet been updated and the socket's send
* buffer has not yet drained off the ACK'd data, so we
* have to leave snd_una as it was to get the correct data
* offset in tcp_output().
*/
tp->t_partialacks++;
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
if (TCP_SACK_ENABLED(tp)) {
/*
* Partial ack handling within a sack recovery episode.
* Keeping this very simple for now. When a partial ack
* is received, force snd_cwnd to a value that will
* allow the sender to transmit no more than 2 segments.
* If necessary, a fancier scheme can be adopted at a
* later point, but for now, the goal is to prevent the
* sender from bursting a large amount of data in the
* midst of sack recovery.
*/
/*
* send one or 2 segments based on how much
* new data was acked
*/
if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2)
sack_num_segs = 2;
(void)tcp_sack_output(tp, &sack_bytes_rxmt);
tp->snd_cwnd = sack_bytes_rxmt +
(tp->snd_nxt - tp->sack_newdata) +
sack_num_segs * tp->t_segsz;
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
} else {
tp->snd_nxt = th->th_ack;
/*
* Set snd_cwnd to one segment beyond the ACK'd offset;
* snd_una is not yet updated when we're called.
*/
tp->snd_cwnd = tp->t_segsz + (th->th_ack - tp->snd_una);
(void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
if (SEQ_GT(onxt, tp->snd_nxt))
tp->snd_nxt = onxt;
/*
* Partial window deflation. Relies on fact that
* tp->snd_una not updated yet.
*/
tp->snd_cwnd -= (th->th_ack - tp->snd_una -
tp->t_segsz);
}
} else {
/*
* Complete ack. Inflate the congestion window to ssthresh
* and exit fast recovery.
*
* Window inflation should have left us with approx.
* snd_ssthresh outstanding data. But in case we
* would be inclined to send a burst, better to do
* it via the slow start mechanism.
*/
if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh)
tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack)
+ tp->t_segsz;
else
tp->snd_cwnd = tp->snd_ssthresh;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_SACK_ENABLED(tp) && SEQ_GT(th->th_ack, tp->snd_fack))
tp->snd_fack = th->th_ack;
}
}
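/*
* Partial-ack arithmetic, for illustration (hypothetical values): if
* ocwnd = 14600, t_segsz = 1460 and the partial ack covers 2920 bytes
* (th_ack - snd_una), the non-SACK path above retransmits with a
* temporary cwnd of 1460 + 2920 = 4380 and then deflates the restored
* window to 14600 - (2920 - 1460) = 13140: the window shrinks by the
* newly acked data but is re-inflated by one segment, so roughly one
* new segment can be sent per partial ack.
*/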
static void
tcp_newreno_newack(struct tcpcb *tp, const struct tcphdr *th)
{
/*
* If we are still in fast recovery (meaning we are using
* NewReno and we have only received partial acks), do not
* inflate the window yet.
*/
if (tp->t_partialacks < 0)
tcp_reno_newack(tp, th);
}
const struct tcp_congctl tcp_newreno_ctl = {
.fast_retransmit = tcp_newreno_fast_retransmit,
.slow_retransmit = tcp_reno_slow_retransmit,
.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
.newack = tcp_newreno_newack,
.cong_exp = tcp_reno_congestion_exp,
};
/*
* CUBIC - http://tools.ietf.org/html/draft-rhee-tcpm-cubic-02
*/
/* Cubic prototypes */
static void tcp_cubic_update_ctime(struct tcpcb *tp);
static uint32_t tcp_cubic_diff_ctime(struct tcpcb *);
static uint32_t tcp_cubic_cbrt(uint32_t);
static ulong tcp_cubic_getW(struct tcpcb *, uint32_t, uint32_t);
/* Cubic TIME functions - XXX I don't like using timevals and microuptime */
/*
* Set congestion timer to now
*/
static void
tcp_cubic_update_ctime(struct tcpcb *tp)
{
struct timeval now_timeval;
getmicrouptime(&now_timeval);
tp->snd_cubic_ctime = now_timeval.tv_sec * 1000 +
now_timeval.tv_usec / 1000;
}
/*
* milliseconds since the last congestion event
*/
static uint32_t
tcp_cubic_diff_ctime(struct tcpcb *tp)
{
struct timeval now_timeval;
getmicrouptime(&now_timeval);
return now_timeval.tv_sec * 1000 + now_timeval.tv_usec / 1000 -
tp->snd_cubic_ctime;
}
/*
* Approximate cube root
*/
#define CBRT_ROUNDS 30
static uint32_t
tcp_cubic_cbrt(uint32_t v)
{
int i, rounds = CBRT_ROUNDS;
uint64_t x = v / 3;
/* We fail to calculate correctly for small numbers */
if (v == 0)
return 0;
else if (v < 4)
return 1;
/*
* 2097151 is the largest x for which 2*x^3 + 3*x fits in 64 bits.
* Avoid the overflow at the cost of extra rounds.
*/
if (x > 2097151)
rounds += 10;
for (i = 0; i < rounds; i++)
if (rounds == CBRT_ROUNDS)
x = (v + 2 * x * x * x) / (3 * x * x);
else
/* Avoid overflow */
x = v / (3 * x * x) + 2 * x / 3;
return (uint32_t)x;
}
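/*
* Each round above is a Newton-Raphson step for the cube root,
* x' = (v + 2*x^3) / (3*x^2); for example tcp_cubic_cbrt(1728)
* converges to 12 well within the 30 rounds.  For large v the
* rearranged, overflow-safe step is used instead, at the cost of
* extra rounds and a slightly coarser result.
*/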
/* Draft Rhee Section 3.1 - get W(t+rtt) - Eq. 1 */
static ulong
tcp_cubic_getW(struct tcpcb *tp, uint32_t ms_elapsed, uint32_t rtt)
{
uint32_t K;
long tK3;
/* Section 3.1 Eq. 2 */
K = tcp_cubic_cbrt(tp->snd_cubic_wmax / CUBIC_BETAB *
CUBIC_CB / CUBIC_CA);
/* (t-K)^3 - it is not clear why the unit of measure matters here */
tK3 = (long)(ms_elapsed + rtt) - (long)K;
tK3 = tK3 * tK3 * tK3;
return CUBIC_CA * tK3 / CUBIC_CB + tp->snd_cubic_wmax;
}
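/*
* Worked example (illustrative values, in the code's own units): with
* snd_cubic_wmax = 2000, K = tcp_cubic_cbrt(2000 / 5 * 10 / 4) =
* tcp_cubic_cbrt(1000) = 10.  At ms_elapsed + rtt = 5 the function
* returns 4 * (5 - 10)^3 / 10 + 2000 = 1950, and at 15 it returns
* 4 * (15 - 10)^3 / 10 + 2000 = 2050: the window stays below Wmax
* before K and probes above it afterwards, which is the cubic shape
* described in the draft.
*/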
static void
tcp_cubic_congestion_exp(struct tcpcb *tp)
{
/*
* Congestion - Set WMax and shrink cwnd
*/
tcp_cubic_update_ctime(tp);
/* Section 3.6 - Fast Convergence */
if (tp->snd_cubic_wmax < tp->snd_cubic_wmax_last) {
tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
tp->snd_cubic_wmax = tp->snd_cubic_wmax / 2 +
tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB / 2;
} else {
tp->snd_cubic_wmax_last = tp->snd_cubic_wmax;
tp->snd_cubic_wmax = tp->snd_cwnd;
}
tp->snd_cubic_wmax = uimax(tp->t_segsz, tp->snd_cubic_wmax);
/* Shrink CWND */
tcp_common_congestion_exp(tp, CUBIC_BETAA, CUBIC_BETAB);
}
static int
tcp_cubic_fast_retransmit(struct tcpcb *tp, const struct tcphdr *th)
{
if (SEQ_LT(th->th_ack, tp->snd_high)) {
/* See newreno */
tp->t_dupacks = 0;
return 1;
}
/*
* mark WMax
*/
tcp_cubic_congestion_exp(tp);
/* Do fast retransmit */
return tcp_reno_do_fast_retransmit(tp, th);
}
static void
tcp_cubic_newack(struct tcpcb *tp, const struct tcphdr *th)
{
uint32_t ms_elapsed, rtt;
u_long w_tcp;
/* In congestion avoidance, not in fast recovery, and with a usable rtt */
if (tp->snd_cwnd > tp->snd_ssthresh && tp->t_partialacks < 0 &&
/*
* t_srtt is in units of 1/32 slow ticks;
* converting it to milliseconds gives
* (t_srtt >> 5) * 1000 / PR_SLOWHZ ~= (t_srtt << 5) / PR_SLOWHZ
*/
(rtt = (tp->t_srtt << 5) / PR_SLOWHZ) > 0) {
ms_elapsed = tcp_cubic_diff_ctime(tp);
/* Compute W_tcp(t) */
w_tcp = tp->snd_cubic_wmax * CUBIC_BETAA / CUBIC_BETAB +
ms_elapsed / rtt / 3;
if (tp->snd_cwnd > w_tcp) {
/* Not in TCP friendly mode */
tp->snd_cwnd += (tcp_cubic_getW(tp, ms_elapsed, rtt) -
tp->snd_cwnd) / tp->snd_cwnd;
} else {
/* TCP friendly mode */
tp->snd_cwnd = w_tcp;
}
/* Make sure we are within limits */
tp->snd_cwnd = uimax(tp->snd_cwnd, tp->t_segsz);
tp->snd_cwnd = uimin(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
} else {
/* Use New Reno */
tcp_newreno_newack(tp, th);
}
}
static void
tcp_cubic_slow_retransmit(struct tcpcb *tp)
{
/* Timeout - Mark new congestion */
tcp_cubic_congestion_exp(tp);
/* Loss Window MUST be one segment. */
tp->snd_cwnd = tp->t_segsz;
tp->t_partialacks = -1;
tp->t_dupacks = 0;
tp->t_bytes_acked = 0;
if (TCP_ECN_ALLOWED(tp))
tp->t_flags |= TF_ECN_SND_CWR;
}
const struct tcp_congctl tcp_cubic_ctl = {
.fast_retransmit = tcp_cubic_fast_retransmit,
.slow_retransmit = tcp_cubic_slow_retransmit,
.fast_retransmit_newack = tcp_newreno_fast_retransmit_newack,
.newack = tcp_cubic_newack,
.cong_exp = tcp_cubic_congestion_exp,
};
/* $NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95
*/
/*
* Copyright (c) 1993 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)procfs_vnops.c 8.18 (Berkeley) 5/21/95
*/
/*
* procfs vnode interface
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: procfs_vnops.c,v 1.230 2024/01/17 10:19:21 hannken Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/dirent.h>
#include <sys/resourcevar.h>
#include <sys/stat.h>
#include <sys/ptrace.h>
#include <sys/kauth.h>
#include <sys/exec.h>
#include <uvm/uvm_extern.h> /* for PAGE_SIZE */
#include <machine/reg.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/procfs/procfs.h>
/*
* Vnode Operations.
*
*/
static int procfs_validfile_linux(struct lwp *, struct mount *);
static int procfs_root_readdir_callback(struct proc *, void *);
static void procfs_dir(pfstype, struct lwp *, struct proc *, char **, char *,
size_t);
/*
* This is a list of the valid names in the
* process-specific sub-directories. It is
* used in procfs_lookup and procfs_readdir
*/
static const struct proc_target {
u_char pt_type;
u_char pt_namlen;
const char *pt_name;
pfstype pt_pfstype;
int (*pt_valid)(struct lwp *, struct mount *);
} proc_targets[] = {
#define N(s) sizeof(s)-1, s
/* name type validp */
{ DT_DIR, N("."), PFSproc, NULL },
{ DT_DIR, N(".."), PFSroot, NULL },
{ DT_DIR, N("fd"), PFSfd, NULL },
{ DT_DIR, N("task"), PFStask, procfs_validfile_linux },
{ DT_LNK, N("cwd"), PFScwd, NULL },
{ DT_REG, N("emul"), PFSemul, NULL },
{ DT_LNK, N("root"), PFSchroot, NULL },
{ DT_REG, N("auxv"), PFSauxv, procfs_validauxv },
{ DT_REG, N("cmdline"), PFScmdline, NULL },
{ DT_REG, N("environ"), PFSenviron, NULL },
{ DT_LNK, N("exe"), PFSexe, procfs_validfile },
{ DT_REG, N("file"), PFSfile, procfs_validfile },
{ DT_REG, N("fpregs"), PFSfpregs, procfs_validfpregs },
{ DT_REG, N("limit"), PFSlimit, NULL },
{ DT_REG, N("map"), PFSmap, procfs_validmap },
{ DT_REG, N("maps"), PFSmaps, procfs_validmap },
{ DT_REG, N("mem"), PFSmem, NULL },
{ DT_REG, N("note"), PFSnote, NULL },
{ DT_REG, N("notepg"), PFSnotepg, NULL },
{ DT_REG, N("regs"), PFSregs, procfs_validregs },
{ DT_REG, N("stat"), PFSstat, procfs_validfile_linux },
{ DT_REG, N("statm"), PFSstatm, procfs_validfile_linux },
{ DT_REG, N("status"), PFSstatus, NULL },
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_DEFNS
#endif
#undef N
};
static const int nproc_targets = sizeof(proc_targets) / sizeof(proc_targets[0]);
/*
* List of files in the root directory. Note: the validate function will
* be called with p == NULL for these ones.
*/
static const struct proc_target proc_root_targets[] = {
#define N(s) sizeof(s)-1, s
/* name type validp */
{ DT_REG, N("meminfo"), PFSmeminfo, procfs_validfile_linux },
{ DT_REG, N("cpuinfo"), PFScpuinfo, procfs_validfile_linux },
{ DT_REG, N("uptime"), PFSuptime, procfs_validfile_linux },
{ DT_REG, N("mounts"), PFSmounts, procfs_validfile_linux },
{ DT_REG, N("devices"), PFSdevices, procfs_validfile_linux },
{ DT_REG, N("stat"), PFScpustat, procfs_validfile_linux },
{ DT_REG, N("loadavg"), PFSloadavg, procfs_validfile_linux },
{ DT_REG, N("version"), PFSversion, procfs_validfile_linux },
#undef N
};
static const int nproc_root_targets =
sizeof(proc_root_targets) / sizeof(proc_root_targets[0]);
int procfs_lookup(void *);
int procfs_open(void *);
int procfs_close(void *);
int procfs_access(void *);
int procfs_getattr(void *);
int procfs_setattr(void *);
int procfs_readdir(void *);
int procfs_readlink(void *);
int procfs_inactive(void *);
int procfs_reclaim(void *);
int procfs_print(void *);
int procfs_pathconf(void *);
int procfs_getpages(void *);
static uint8_t fttodt(file_t *);
static int atoi(const char *, size_t);
/*
* procfs vnode operations.
*/
int (**procfs_vnodeop_p)(void *);
const struct vnodeopv_entry_desc procfs_vnodeop_entries[] = {
{ &vop_default_desc, vn_default_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, procfs_lookup }, /* lookup */
{ &vop_create_desc, genfs_eopnotsupp }, /* create */
{ &vop_mknod_desc, genfs_eopnotsupp }, /* mknod */
{ &vop_open_desc, procfs_open }, /* open */
{ &vop_close_desc, procfs_close }, /* close */
{ &vop_access_desc, procfs_access }, /* access */
{ &vop_accessx_desc, genfs_accessx }, /* accessx */
{ &vop_getattr_desc, procfs_getattr }, /* getattr */
{ &vop_setattr_desc, procfs_setattr }, /* setattr */
{ &vop_read_desc, procfs_rw }, /* read */
{ &vop_write_desc, procfs_rw }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, genfs_eopnotsupp }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, genfs_enoioctl }, /* ioctl */
{ &vop_poll_desc, genfs_poll }, /* poll */
{ &vop_kqfilter_desc, genfs_kqfilter }, /* kqfilter */
{ &vop_revoke_desc, genfs_revoke }, /* revoke */
{ &vop_fsync_desc, genfs_nullop }, /* fsync */
{ &vop_seek_desc, genfs_nullop }, /* seek */
{ &vop_remove_desc, genfs_eopnotsupp }, /* remove */
{ &vop_link_desc, genfs_erofs_link }, /* link */
{ &vop_rename_desc, genfs_eopnotsupp }, /* rename */
{ &vop_mkdir_desc, genfs_eopnotsupp }, /* mkdir */
{ &vop_rmdir_desc, genfs_eopnotsupp }, /* rmdir */
{ &vop_symlink_desc, genfs_erofs_symlink }, /* symlink */
{ &vop_readdir_desc, procfs_readdir }, /* readdir */
{ &vop_readlink_desc, procfs_readlink }, /* readlink */
{ &vop_abortop_desc, genfs_abortop }, /* abortop */
{ &vop_inactive_desc, procfs_inactive }, /* inactive */
{ &vop_reclaim_desc, procfs_reclaim }, /* reclaim */
{ &vop_lock_desc, genfs_lock }, /* lock */
{ &vop_unlock_desc, genfs_unlock }, /* unlock */
{ &vop_bmap_desc, genfs_eopnotsupp }, /* bmap */
{ &vop_strategy_desc, genfs_badop }, /* strategy */
{ &vop_print_desc, procfs_print }, /* print */
{ &vop_islocked_desc, genfs_islocked }, /* islocked */
{ &vop_pathconf_desc, procfs_pathconf }, /* pathconf */
{ &vop_advlock_desc, genfs_einval }, /* advlock */
{ &vop_getpages_desc, procfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_null_putpages }, /* putpages */
{ NULL, NULL }
};
const struct vnodeopv_desc procfs_vnodeop_opv_desc =
{ &procfs_vnodeop_p, procfs_vnodeop_entries };
/*
* set things up for doing i/o on
* the pfsnode (vp). (vp) is locked
* on entry, and should be left locked
* on exit.
*
* for procfs we don't need to do anything
* in particular for i/o. all that is done
* is to support exclusive open on process
* memory images.
*/
int
procfs_open(void *v)
{
struct vop_open_args /* {
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct lwp *l1;
struct proc *p2;
int error;
if ((error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p2, ENOENT)) != 0)
return error;
l1 = curlwp; /* tracer */
#define M2K(m) (((m) & FREAD) && ((m) & FWRITE) ? \
KAUTH_REQ_PROCESS_PROCFS_RW : \
(m) & FWRITE ? KAUTH_REQ_PROCESS_PROCFS_WRITE : \
KAUTH_REQ_PROCESS_PROCFS_READ)
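	/*
	 * For illustration: M2K(FREAD) maps to KAUTH_REQ_PROCESS_PROCFS_READ,
	 * M2K(FWRITE) to KAUTH_REQ_PROCESS_PROCFS_WRITE, and
	 * M2K(FREAD|FWRITE) to KAUTH_REQ_PROCESS_PROCFS_RW.
	 */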
mutex_enter(p2->p_lock);
error = kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_PROCFS,
p2, pfs, KAUTH_ARG(M2K(ap->a_mode)), NULL);
mutex_exit(p2->p_lock);
if (error) {
procfs_proc_unlock(p2);
return (error);
}
#undef M2K
switch (pfs->pfs_type) {
case PFSmem:
		if (((pfs->pfs_flags & FWRITE) && (ap->a_mode & O_EXCL)) ||
		    ((pfs->pfs_flags & O_EXCL) && (ap->a_mode & FWRITE))) {
error = EBUSY;
break;
}
if (!proc_isunder(p2, l1)) {
error = EPERM;
break;
}
		if (ap->a_mode & FWRITE)
			pfs->pfs_flags = ap->a_mode & (FWRITE|O_EXCL);
break;
case PFSregs:
case PFSfpregs:
if (!proc_isunder(p2, l1)) {
error = EPERM;
break;
}
break;
default:
break;
}
procfs_proc_unlock(p2);
return (error);
}
/*
* close the pfsnode (vp) after doing i/o.
* (vp) is not locked on entry or exit.
*
* nothing to do for procfs other than undo
* any exclusive open flag (see _open above).
*/
int
procfs_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct pfsnode *pfs = VTOPFS(ap->a_vp);
switch (pfs->pfs_type) {
case PFSmem:
if ((ap->a_fflag & FWRITE) && (pfs->pfs_flags & O_EXCL))
pfs->pfs_flags &= ~(FWRITE|O_EXCL);
break;
default:
break;
}
return (0);
}
/*
* _inactive is called when the pfsnode
* is vrele'd and the reference count goes
* to zero. (vp) will be on the vnode free
* list, so to get it back vget() must be
* used.
*
* (vp) is locked on entry, but must be unlocked on exit.
*/
int
procfs_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
bool *a_recycle;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
mutex_enter(&proc_lock);
*ap->a_recycle = (procfs_proc_find(vp->v_mount, pfs->pfs_pid) == NULL);
mutex_exit(&proc_lock);
return (0);
}
/*
* _reclaim is called when getnewvnode()
* wants to make use of an entry on the vnode
* free list. at this time the filesystem needs
* to free any private data and remove the node
* from any private lists.
*/
int
procfs_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
VOP_UNLOCK(vp);
/*
* To interlock with procfs_revoke_vnodes().
*/
mutex_enter(vp->v_interlock);
vp->v_data = NULL;
mutex_exit(vp->v_interlock);
procfs_hashrem(pfs);
kmem_free(pfs, sizeof(*pfs));
return 0;
}
/*
* Return POSIX pathconf information applicable to special devices.
*/
int
procfs_pathconf(void *v)
{
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
register_t *a_retval;
} */ *ap = v;
switch (ap->a_name) {
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
return (0);
case _PC_MAX_CANON:
*ap->a_retval = MAX_CANON;
return (0);
case _PC_MAX_INPUT:
*ap->a_retval = MAX_INPUT;
return (0);
case _PC_PIPE_BUF:
*ap->a_retval = PIPE_BUF;
return (0);
case _PC_CHOWN_RESTRICTED:
*ap->a_retval = 1;
return (0);
case _PC_VDISABLE:
*ap->a_retval = _POSIX_VDISABLE;
return (0);
case _PC_SYNC_IO:
*ap->a_retval = 1;
return (0);
default:
return genfs_pathconf(ap);
}
/* NOTREACHED */
}
/*
* _print is used for debugging.
* just print a readable description
* of (vp).
*/
int
procfs_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct pfsnode *pfs = VTOPFS(ap->a_vp);
printf("tag VT_PROCFS, type %d, pid %d, mode %x, flags %lx\n",
pfs->pfs_type, pfs->pfs_pid, pfs->pfs_mode, pfs->pfs_flags);
return 0;
}
/*
* Works out the path to the target process's current
* working directory or chroot. If the caller is in a chroot and
* can't "reach" the target's cwd or root (or some other error
* occurs), a "/" is returned for the path.
*/
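/*
 * Illustrative example (not exhaustive): for a target whose cwd is
 * /usr/src and a caller that can reach it, getcwd_common() builds the
 * string "/usr/src" backwards from the end of the supplied buffer and
 * *bpp is left pointing at it; any failure degrades to a plain "/".
 */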
static void
procfs_dir(pfstype t, struct lwp *caller, struct proc *target, char **bpp,
char *path, size_t len)
{
struct cwdinfo *cwdi;
struct vnode *vp, *rvp;
char *bp;
/*
* Lock target cwdi and take a reference to the vnode
* we are interested in to prevent it from disappearing
* before getcwd_common() below.
*/
rw_enter(&target->p_cwdi->cwdi_lock, RW_READER);
switch (t) {
case PFScwd:
vp = target->p_cwdi->cwdi_cdir;
break;
case PFSchroot:
vp = target->p_cwdi->cwdi_rdir;
break;
default:
rw_exit(&target->p_cwdi->cwdi_lock);
return;
}
if (vp != NULL)
vref(vp);
rw_exit(&target->p_cwdi->cwdi_lock);
cwdi = caller->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
rvp = cwdi->cwdi_rdir;
bp = bpp ? *bpp : NULL;
/*
* XXX: this horrible kludge avoids locking panics when
* attempting to lookup links that point to within procfs
*/
if (vp != NULL && vp->v_tag == VT_PROCFS) {
if (bpp) {
*--bp = '/';
*bpp = bp;
}
vrele(vp);
rw_exit(&cwdi->cwdi_lock);
return;
}
if (rvp == NULL)
rvp = rootvnode;
if (vp == NULL || getcwd_common(vp, rvp, bp ? &bp : NULL, path,
len / 2, 0, caller) != 0) {
if (bpp) {
bp = *bpp;
*--bp = '/';
}
}
if (bpp)
*bpp = bp;
if (vp != NULL)
vrele(vp);
rw_exit(&cwdi->cwdi_lock);
}
/*
* Invent attributes for pfsnode (vp) and store
* them in (vap).
 * Directory lengths are returned as zero since
* any real length would require the genuine size
* to be computed, and nothing cares anyway.
*
* this is relatively minimal for procfs.
*/
int
procfs_getattr(void *v)
{
struct vop_getattr_args /* {
struct vnode *a_vp;
struct vattr *a_vap;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct vattr *vap = ap->a_vap;
struct proc *procp;
char *path, *bp, bf[16];
int error;
/* first check the process still exists */
switch (pfs->pfs_type) {
case PFSroot:
case PFScurproc:
case PFSself:
procp = NULL;
break;
default:
error =
procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &procp, ENOENT);
if (error != 0)
return (error);
break;
}
switch (pfs->pfs_type) {
case PFStask:
if (pfs->pfs_fd == -1) {
path = NULL;
break;
}
/*FALLTHROUGH*/
case PFScwd:
case PFSchroot:
path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
		if (path == NULL && procp != NULL) {
			procfs_proc_unlock(procp);
return (ENOMEM);
}
break;
default:
path = NULL;
break;
}
if (procp != NULL) {
mutex_enter(procp->p_lock);
error = kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, procp,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL);
mutex_exit(procp->p_lock);
if (error != 0) {
procfs_proc_unlock(procp);
			if (path != NULL)
				free(path, M_TEMP);
return (ENOENT);
}
}
error = 0;
/* start by zeroing out the attributes */
vattr_null(vap);
/* next do all the common fields */
vap->va_type = ap->a_vp->v_type;
vap->va_mode = pfs->pfs_mode;
vap->va_fileid = pfs->pfs_fileno;
vap->va_flags = 0;
vap->va_blocksize = PAGE_SIZE;
/*
* Make all times be current TOD.
*
* It would be possible to get the process start
* time from the p_stats structure, but there's
* no "file creation" time stamp anyway, and the
* p_stats structure is not addressable if u. gets
* swapped out for that process.
*/
getnanotime(&vap->va_ctime);
vap->va_atime = vap->va_mtime = vap->va_ctime;
if (procp)
TIMEVAL_TO_TIMESPEC(&procp->p_stats->p_start,
&vap->va_birthtime);
else
		getnanotime(&vap->va_birthtime);
	switch (pfs->pfs_type) {
case PFSmem:
case PFSregs:
case PFSfpregs:
#if defined(__HAVE_PROCFS_MACHDEP) && defined(PROCFS_MACHDEP_PROTECT_CASES)
PROCFS_MACHDEP_PROTECT_CASES
#endif
/*
* If the process has exercised some setuid or setgid
* privilege, then rip away read/write permission so
* that only root can gain access.
*/
		if (procp->p_flag & PK_SUGID)
			vap->va_mode &= ~(S_IRUSR|S_IWUSR);
/* FALLTHROUGH */
case PFSstatus:
case PFSstat:
case PFSnote:
case PFSnotepg:
case PFScmdline:
case PFSenviron:
case PFSemul:
case PFSstatm:
case PFSmap:
case PFSmaps:
case PFSlimit:
case PFSauxv:
vap->va_nlink = 1;
vap->va_uid = kauth_cred_geteuid(procp->p_cred);
vap->va_gid = kauth_cred_getegid(procp->p_cred);
break;
case PFScwd:
case PFSchroot:
case PFSmeminfo:
case PFSdevices:
case PFScpuinfo:
case PFSuptime:
case PFSmounts:
case PFScpustat:
case PFSloadavg:
case PFSversion:
case PFSexe:
case PFSself:
case PFScurproc:
case PFSroot:
vap->va_nlink = 1;
vap->va_uid = vap->va_gid = 0;
break;
case PFSproc:
case PFStask:
case PFSfile:
case PFSfd:
break;
default:
panic("%s: %d/1", __func__, pfs->pfs_type);
}
/*
* now do the object specific fields
*
* The size could be set from struct reg, but it's hardly
* worth the trouble, and it puts some (potentially) machine
* dependent data into this machine-independent code. If it
* becomes important then this function should break out into
* a per-file stat function in the corresponding .c file.
*/
switch (pfs->pfs_type) {
case PFSroot:
vap->va_bytes = vap->va_size = DEV_BSIZE;
break;
case PFSself:
case PFScurproc:
vap->va_bytes = vap->va_size =
snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
break;
case PFStask:
		if (pfs->pfs_fd != -1) {
			vap->va_nlink = 1;
vap->va_uid = 0;
vap->va_gid = 0;
vap->va_bytes = vap->va_size =
snprintf(bf, sizeof(bf), "..");
break;
}
/*FALLTHROUGH*/
case PFSfd:
if (pfs->pfs_fd != -1) {
file_t *fp;
fp = fd_getfile2(procp, pfs->pfs_fd);
if (fp == NULL) {
error = EBADF;
break;
}
vap->va_nlink = 1;
vap->va_uid = kauth_cred_geteuid(fp->f_cred);
vap->va_gid = kauth_cred_getegid(fp->f_cred);
switch (fp->f_type) {
case DTYPE_VNODE:
vap->va_bytes = vap->va_size =
fp->f_vnode->v_size;
break;
default:
vap->va_bytes = vap->va_size = 0;
break;
}
closef(fp);
break;
}
/*FALLTHROUGH*/
case PFSproc:
vap->va_nlink = 2;
vap->va_uid = kauth_cred_geteuid(procp->p_cred);
vap->va_gid = kauth_cred_getegid(procp->p_cred);
vap->va_bytes = vap->va_size = DEV_BSIZE;
break;
case PFSfile:
error = EOPNOTSUPP;
break;
case PFSmem:
vap->va_bytes = vap->va_size =
ctob(procp->p_vmspace->vm_tsize +
procp->p_vmspace->vm_dsize +
procp->p_vmspace->vm_ssize);
break;
case PFSauxv:
vap->va_bytes = vap->va_size = procp->p_execsw->es_arglen;
break;
#if defined(PT_GETREGS) || defined(PT_SETREGS)
case PFSregs:
vap->va_bytes = vap->va_size = sizeof(struct reg);
break;
#endif
#if defined(PT_GETFPREGS) || defined(PT_SETFPREGS)
case PFSfpregs:
vap->va_bytes = vap->va_size = sizeof(struct fpreg);
break;
#endif
case PFSstatus:
case PFSstat:
case PFSnote:
case PFSnotepg:
case PFScmdline:
case PFSenviron:
case PFSmeminfo:
case PFSdevices:
case PFScpuinfo:
case PFSuptime:
case PFSmounts:
case PFScpustat:
case PFSloadavg:
case PFSstatm:
case PFSversion:
vap->va_bytes = vap->va_size = 0;
break;
case PFSlimit:
case PFSmap:
case PFSmaps:
/*
* Advise a larger blocksize for the map files, so that
* they may be read in one pass.
*/
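		/*
		 * (With 4 kB pages this advertises a 16 kB blocksize;
		 * illustrative figure only, PAGE_SIZE is machine-dependent.)
		 */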
vap->va_blocksize = 4 * PAGE_SIZE;
vap->va_bytes = vap->va_size = 0;
break;
case PFScwd:
case PFSchroot:
bp = path + MAXPATHLEN;
*--bp = '\0';
procfs_dir(pfs->pfs_type, curlwp, procp, &bp, path,
MAXPATHLEN);
vap->va_bytes = vap->va_size = strlen(bp);
break;
case PFSexe:
vap->va_bytes = vap->va_size = strlen(procp->p_path);
break;
case PFSemul:
vap->va_bytes = vap->va_size = strlen(procp->p_emul->e_name);
break;
#ifdef __HAVE_PROCFS_MACHDEP
PROCFS_MACHDEP_NODETYPE_CASES
error = procfs_machdep_getattr(ap->a_vp, vap, procp);
break;
#endif
default:
panic("%s: %d/2", __func__, pfs->pfs_type);
}
	if (procp != NULL)
		procfs_proc_unlock(procp);
	if (path != NULL)
		free(path, M_TEMP);
return (error);
}
/*ARGSUSED*/
int
procfs_setattr(void *v)
{
/*
	 * just fake out attribute setting;
	 * it's not good to generate an error
* return, otherwise things like creat()
* will fail when they try to set the
* file length to 0. worse, this means
* that echo $note > /proc/$pid/note will fail.
*/
return (0);
}
/*
* implement access checking.
*
* actually, the check for super-user is slightly
* broken since it will allow read access to write-only
* objects. this doesn't cause any particular trouble
* but does mean that the i/o entry points need to check
* that the operation really does make sense.
*/
int
procfs_access(void *v)
{
struct vop_access_args /* {
struct vnode *a_vp;
accmode_t a_accmode;
kauth_cred_t a_cred;
} */ *ap = v;
struct vattr va;
int error;
if ((error = VOP_GETATTR(ap->a_vp, &va, ap->a_cred)) != 0)
return (error);
return kauth_authorize_vnode(ap->a_cred,
KAUTH_ACCESS_ACTION(ap->a_accmode, ap->a_vp->v_type, va.va_mode),
ap->a_vp, NULL, genfs_can_access(ap->a_vp, ap->a_cred,
va.va_uid, va.va_gid, va.va_mode, NULL, ap->a_accmode));
}
/*
* lookup. this is incredibly complicated in the
* general case, however for most pseudo-filesystems
* very little needs to be done.
*
* Locking isn't hard here, just poorly documented.
*
* If we're looking up ".", just vref the parent & return it.
*
* If we're looking up "..", unlock the parent, and lock "..". If everything
* went ok, and we're on the last component and the caller requested the
* parent locked, try to re-lock the parent. We do this to prevent lock
* races.
*
* For anything else, get the needed node. Then unlock the parent if not
* the last component or not LOCKPARENT (i.e. if we wouldn't re-lock the
* parent in the .. case).
*
* We try to exit with the parent locked in error cases.
*/
int
procfs_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode * a_dvp;
struct vnode ** a_vpp;
struct componentname * a_cnp;
} */ *ap = v;
struct componentname *cnp = ap->a_cnp;
struct vnode **vpp = ap->a_vpp;
struct vnode *dvp = ap->a_dvp;
const char *pname = cnp->cn_nameptr;
const struct proc_target *pt = NULL;
struct vnode *fvp;
pid_t pid, vnpid;
struct pfsnode *pfs;
struct proc *p = NULL;
struct lwp *plwp;
int i, error;
pfstype type;
*vpp = NULL;
if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
return (error);
if (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)
return (EROFS);
	if (cnp->cn_namelen == 1 && *pname == '.') {
		*vpp = dvp;
vref(dvp);
return (0);
}
pfs = VTOPFS(dvp);
switch (pfs->pfs_type) {
case PFSroot:
/*
* Shouldn't get here with .. in the root node.
*/
if (cnp->cn_flags & ISDOTDOT)
return (EIO);
for (i = 0; i < nproc_root_targets; i++) {
pt = &proc_root_targets[i];
/*
* check for node match. proc is always NULL here,
* so call pt_valid with constant NULL lwp.
*/
			if (cnp->cn_namelen == pt->pt_namlen &&
			    memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
(pt->pt_valid == NULL ||
(*pt->pt_valid)(NULL, dvp->v_mount)))
break;
}
if (i != nproc_root_targets) {
error = procfs_allocvp(dvp->v_mount, vpp, 0,
pt->pt_pfstype, -1);
return (error);
}
if (CNEQ(cnp, "curproc", 7)) {
pid = curproc->p_pid;
vnpid = 0;
type = PFScurproc;
} else if (CNEQ(cnp, "self", 4)) {
pid = curproc->p_pid;
vnpid = 0;
type = PFSself;
} else {
pid = (pid_t)atoi(pname, cnp->cn_namelen);
vnpid = pid;
type = PFSproc;
}
if (procfs_proc_lock(dvp->v_mount, pid, &p, ESRCH) != 0)
break;
error = procfs_allocvp(dvp->v_mount, vpp, vnpid, type, -1);
procfs_proc_unlock(p);
return (error);
case PFSproc:
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, 0, PFSroot,
-1);
return (error);
}
if (procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ESRCH) != 0)
break;
mutex_enter(p->p_lock);
LIST_FOREACH(plwp, &p->p_lwps, l_sibling) {
if (plwp->l_stat != LSZOMB)
break;
}
		/* Process is exiting if it has no LWPs or all LWPs are LSZOMB */
if (plwp == NULL) {
mutex_exit(p->p_lock);
procfs_proc_unlock(p);
return ESRCH;
}
lwp_addref(plwp);
mutex_exit(p->p_lock);
for (pt = proc_targets, i = 0; i < nproc_targets; pt++, i++) {
int found;
			found = cnp->cn_namelen == pt->pt_namlen &&
			    memcmp(pt->pt_name, pname, cnp->cn_namelen) == 0 &&
(pt->pt_valid == NULL
|| (*pt->pt_valid)(plwp, dvp->v_mount));
if (found)
break;
}
lwp_delref(plwp);
if (i == nproc_targets) {
procfs_proc_unlock(p);
break;
}
if (pt->pt_pfstype == PFSfile) {
fvp = p->p_textvp;
/* We already checked that it exists. */
vref(fvp);
procfs_proc_unlock(p);
*vpp = fvp;
return (0);
}
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
pt->pt_pfstype, -1);
procfs_proc_unlock(p);
return (error);
case PFSfd: {
int fd;
file_t *fp;
if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ENOENT)) != 0)
return error;
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSproc, -1);
procfs_proc_unlock(p);
return (error);
}
fd = atoi(pname, cnp->cn_namelen);
fp = fd_getfile2(p, fd);
if (fp == NULL) {
procfs_proc_unlock(p);
return ENOENT;
}
fvp = fp->f_vnode;
/* Don't show directories */
if (fp->f_type == DTYPE_VNODE && fvp->v_type != VDIR &&
!procfs_proc_is_linux_compat()) {
vref(fvp);
closef(fp);
procfs_proc_unlock(p);
*vpp = fvp;
return 0;
}
closef(fp);
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSfd, fd);
procfs_proc_unlock(p);
return error;
}
case PFStask: {
int xpid;
if ((error = procfs_proc_lock(dvp->v_mount, pfs->pfs_pid, &p,
ENOENT)) != 0)
return error;
if (cnp->cn_flags & ISDOTDOT) {
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFSproc, -1);
procfs_proc_unlock(p);
return (error);
}
xpid = atoi(pname, cnp->cn_namelen);
if (xpid != pfs->pfs_pid) {
procfs_proc_unlock(p);
return ENOENT;
}
error = procfs_allocvp(dvp->v_mount, vpp, pfs->pfs_pid,
PFStask, 0);
procfs_proc_unlock(p);
return error;
}
default:
return (ENOTDIR);
}
return (cnp->cn_nameiop == LOOKUP ? ENOENT : EROFS);
}
int
procfs_validfile(struct lwp *l, struct mount *mp)
{
return l != NULL && l->l_proc != NULL && l->l_proc->p_textvp != NULL;
}
static int
procfs_validfile_linux(struct lwp *l, struct mount *mp)
{
	return procfs_use_linux_compat(mp) &&
	    (l == NULL || l->l_proc == NULL || procfs_validfile(l, mp));
}
struct procfs_root_readdir_ctx {
struct uio *uiop;
off_t *cookies;
int ncookies;
off_t off;
off_t startoff;
int error;
};
static int
procfs_root_readdir_callback(struct proc *p, void *arg)
{
struct procfs_root_readdir_ctx *ctxp = arg;
struct dirent d;
struct uio *uiop;
int error;
uiop = ctxp->uiop;
if (uiop->uio_resid < UIO_MX)
return -1; /* no space */
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY), NULL, NULL) != 0)
return 0;
if (ctxp->off < ctxp->startoff) {
ctxp->off++;
return 0;
}
memset(&d, 0, UIO_MX);
d.d_reclen = UIO_MX;
d.d_fileno = PROCFS_FILENO(p->p_pid, PFSproc, -1);
d.d_namlen = snprintf(d.d_name,
UIO_MX - offsetof(struct dirent, d_name), "%ld", (long)p->p_pid);
d.d_type = DT_DIR;
mutex_exit(&proc_lock);
error = uiomove(&d, UIO_MX, uiop);
mutex_enter(&proc_lock);
if (error) {
ctxp->error = error;
return -1;
}
ctxp->ncookies++;
	if (ctxp->cookies)
		*(ctxp->cookies)++ = ctxp->off + 1;
ctxp->off++;
return 0;
}
/*
* readdir returns directory entries from pfsnode (vp).
*
* the strategy here with procfs is to generate a single
* directory entry at a time (struct dirent) and then
* copy that out to userland using uiomove. a more efficient
* though more complex implementation, would try to minimize
* the number of calls to uiomove(). for procfs, this is
* hardly worth the added code complexity.
*
* this should just be done through read()
*/
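/*
 * Illustrative note: every record emitted here is a fixed UIO_MX bytes,
 * so a caller supplying a residual of R bytes receives at most R / UIO_MX
 * entries per call; the same bound is used to size the cookie arrays below.
 */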
int
procfs_readdir(void *v)
{
struct vop_readdir_args /* {
struct vnode *a_vp;
struct uio *a_uio;
kauth_cred_t a_cred;
int *a_eofflag;
off_t **a_cookies;
int *a_ncookies;
} */ *ap = v;
struct uio *uio = ap->a_uio;
struct dirent d;
struct pfsnode *pfs;
off_t i;
int error;
off_t *cookies = NULL;
int ncookies;
struct vnode *vp;
const struct proc_target *pt;
struct procfs_root_readdir_ctx ctx;
struct proc *p = NULL;
struct lwp *l;
int nfd;
int nc = 0;
vp = ap->a_vp;
pfs = VTOPFS(vp);
if (uio->uio_resid < UIO_MX)
return (EINVAL);
if (uio->uio_offset < 0)
return (EINVAL);
error = 0;
i = uio->uio_offset;
memset(&d, 0, UIO_MX);
d.d_reclen = UIO_MX;
ncookies = uio->uio_resid / UIO_MX;
switch (pfs->pfs_type) {
/*
* this is for the process-specific sub-directories.
	 * all that is needed is to copy out all the entries
	 * from the proc_targets[] table (top of this file).
*/
case PFSproc: {
if (i >= nproc_targets)
return 0;
if (procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH) != 0)
break;
		if (ap->a_ncookies) {
			ncookies = uimin(ncookies, (nproc_targets - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (pt = &proc_targets[i];
uio->uio_resid >= UIO_MX && i < nproc_targets; pt++, i++) {
if (pt->pt_valid) {
/* XXXSMP LWP can disappear */
mutex_enter(p->p_lock);
l = LIST_FIRST(&p->p_lwps);
KASSERT(l != NULL);
mutex_exit(p->p_lock);
if ((*pt->pt_valid)(l, vp->v_mount) == 0)
continue;
}
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
d.d_namlen = pt->pt_namlen;
memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
			if (cookies)
				*cookies++ = i + 1;
}
procfs_proc_unlock(p);
break;
}
case PFSfd: {
file_t *fp;
int lim;
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p,
ESRCH)) != 0)
return error;
/* XXX Should this be by file as well? */
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES), NULL,
NULL) != 0) {
procfs_proc_unlock(p);
return ESRCH;
}
nfd = atomic_load_consume(&p->p_fd->fd_dt)->dt_nfiles;
lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
if (i >= lim) {
procfs_proc_unlock(p);
return 0;
}
		if (ap->a_ncookies) {
			ncookies = uimin(ncookies, (nfd + 2 - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
pt = &proc_targets[i];
d.d_namlen = pt->pt_namlen;
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
(void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
			if (cookies)
				*cookies++ = i + 1;
nc++;
}
if (error)
goto out;
for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
/* check the descriptor exists */
if ((fp = fd_getfile2(p, i - 2)) == NULL)
continue;
closef(fp);
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFSfd, i - 2);
d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
"%lld", (long long)(i - 2));
			d.d_type = fttodt(fp);
			if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
			if (cookies)
				*cookies++ = i + 1;
nc++;
}
goto out;
}
case PFStask: {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p,
ESRCH)) != 0)
return error;
nfd = 3; /* ., .., pid */
		if (ap->a_ncookies) {
			ncookies = uimin(ncookies, (nfd + 2 - i));
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
for (; i < 2 && uio->uio_resid >= UIO_MX; i++) {
pt = &proc_targets[i];
d.d_namlen = pt->pt_namlen;
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid,
pt->pt_pfstype, -1);
(void)memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
			if (cookies)
				*cookies++ = i + 1;
nc++;
}
if (error)
goto out;
for (; uio->uio_resid >= UIO_MX && i < nfd; i++) {
/* check the descriptor exists */
d.d_fileno = PROCFS_FILENO(pfs->pfs_pid, PFStask,
i - 2);
d.d_namlen = snprintf(d.d_name, sizeof(d.d_name),
"%ld", (long)pfs->pfs_pid);
d.d_type = DT_LNK;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
			if (cookies)
				*cookies++ = i + 1;
nc++;
}
goto out;
}
/*
	 * this is for the root of the procfs filesystem;
* what is needed are special entries for "curproc"
* and "self" followed by an entry for each process
* on allproc.
*/
case PFSroot: {
if (ap->a_ncookies) {
/*
* XXX Potentially allocating too much space here,
* but I'm lazy. This loop needs some work.
*/
cookies = malloc(ncookies * sizeof (off_t),
M_TEMP, M_WAITOK);
*ap->a_cookies = cookies;
}
/* 0 ... 3 are static entries. */
		for (; i <= 3 && uio->uio_resid >= UIO_MX; i++) {
			switch (i) {
case 0: /* `.' */
case 1: /* `..' */
d.d_fileno = PROCFS_FILENO(0, PFSroot, -1);
d.d_namlen = i + 1;
memcpy(d.d_name, "..", d.d_namlen);
d.d_name[i + 1] = '\0';
d.d_type = DT_DIR;
break;
case 2:
d.d_fileno = PROCFS_FILENO(0, PFScurproc, -1);
d.d_namlen = sizeof("curproc") - 1;
memcpy(d.d_name, "curproc", sizeof("curproc"));
d.d_type = DT_LNK;
break;
case 3:
d.d_fileno = PROCFS_FILENO(0, PFSself, -1);
d.d_namlen = sizeof("self") - 1;
memcpy(d.d_name, "self", sizeof("self"));
d.d_type = DT_LNK;
break;
}
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
nc++;
			if (cookies)
				*cookies++ = i + 1;
}
if (error)
break;
/* 4 ... are process entries. */
ctx.uiop = uio;
ctx.error = 0;
ctx.off = 4;
ctx.startoff = i;
ctx.cookies = cookies;
ctx.ncookies = nc;
proclist_foreach_call(&allproc,
procfs_root_readdir_callback, &ctx);
cookies = ctx.cookies;
nc = ctx.ncookies;
error = ctx.error;
if (error)
break;
/* misc entries. */
if (i < ctx.off)
i = ctx.off;
if (i >= ctx.off + nproc_root_targets)
break;
error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &p, ESRCH);
if (error)
break;
for (pt = &proc_root_targets[i - ctx.off];
uio->uio_resid >= UIO_MX &&
pt < &proc_root_targets[nproc_root_targets];
pt++, i++) {
if (pt->pt_valid &&
(*pt->pt_valid)(NULL, vp->v_mount) == 0)
continue;
if (kauth_authorize_process(kauth_cred_get(),
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_ENTRY),
NULL, NULL) != 0)
continue;
d.d_fileno = PROCFS_FILENO(0, pt->pt_pfstype, -1);
d.d_namlen = pt->pt_namlen;
memcpy(d.d_name, pt->pt_name, pt->pt_namlen + 1);
d.d_type = pt->pt_type;
if ((error = uiomove(&d, UIO_MX, uio)) != 0)
break;
nc++;
			if (cookies)
				*cookies++ = i + 1;
}
out:
KASSERT(p != NULL);
ncookies = nc;
procfs_proc_unlock(p);
break;
}
default:
error = ENOTDIR;
break;
}
if (ap->a_ncookies) {
if (error) {
			if (cookies)
				free(*ap->a_cookies, M_TEMP);
*ap->a_ncookies = 0;
*ap->a_cookies = NULL;
} else
*ap->a_ncookies = ncookies;
}
uio->uio_offset = i;
return (error);
}
/*
* readlink reads the link of `curproc' and others
*/
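/*
 * For example (per the cases below), reading the "curproc" link yields
 * the caller's decimal PID, while "self" yields the string "curproc".
 */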
int
procfs_readlink(void *v)
{
struct vop_readlink_args *ap = v;
char bf[16]; /* should be enough */
char *bp = bf;
char *path = NULL;
int len = 0;
int error = 0;
struct vnode *vp = ap->a_vp;
struct pfsnode *pfs = VTOPFS(vp);
struct proc *pown = NULL;
if (pfs->pfs_fileno == PROCFS_FILENO(0, PFScurproc, -1))
len = snprintf(bf, sizeof(bf), "%ld", (long)curproc->p_pid);
else if (pfs->pfs_fileno == PROCFS_FILENO(0, PFSself, -1))
len = snprintf(bf, sizeof(bf), "%s", "curproc");
else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFStask, 0))
len = snprintf(bf, sizeof(bf), "..");
else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSexe, -1)) {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
bp = pown->p_path;
len = strlen(bp);
} else if (pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFScwd, -1) ||
pfs->pfs_fileno == PROCFS_FILENO(pfs->pfs_pid, PFSchroot, -1)) {
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
path = malloc(MAXPATHLEN + 4, M_TEMP, M_WAITOK);
if (path == NULL) {
procfs_proc_unlock(pown);
return (ENOMEM);
}
bp = path + MAXPATHLEN;
*--bp = '\0';
procfs_dir(PROCFS_TYPE(pfs->pfs_fileno), curlwp, pown,
&bp, path, MAXPATHLEN);
len = strlen(bp);
} else {
file_t *fp;
struct vnode *vxp;
if ((error = procfs_proc_lock(vp->v_mount, pfs->pfs_pid, &pown,
ESRCH)) != 0)
return error;
fp = fd_getfile2(pown, pfs->pfs_fd);
if (fp == NULL) {
procfs_proc_unlock(pown);
return EBADF;
}
switch (fp->f_type) {
case DTYPE_VNODE:
vxp = fp->f_vnode;
if (vxp->v_type != VDIR &&
!procfs_proc_is_linux_compat()) {
error = EINVAL;
break;
}
if ((path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK))
== NULL) {
error = ENOMEM;
break;
}
bp = path + MAXPATHLEN;
*--bp = '\0';
/*
* XXX: kludge to avoid locking against ourselves
* in getcwd()
*/
if (vxp->v_tag == VT_PROCFS) {
*--bp = '/';
} else {
rw_enter(&curproc->p_cwdi->cwdi_lock,
RW_READER);
vp = curproc->p_cwdi->cwdi_rdir;
if (vp == NULL)
vp = rootvnode;
error = getcwd_common(vxp, vp, &bp, path,
MAXPATHLEN / 2, 0, curlwp);
rw_exit(&curproc->p_cwdi->cwdi_lock);
}
if (error)
break;
len = strlen(bp);
break;
case DTYPE_MISC:
len = snprintf(bf, sizeof(bf), "%s", "[misc]");
break;
case DTYPE_KQUEUE:
len = snprintf(bf, sizeof(bf), "%s", "[kqueue]");
break;
case DTYPE_SEM:
len = snprintf(bf, sizeof(bf), "%s", "[ksem]");
break;
default:
error = EINVAL;
break;
}
closef(fp);
}
if (error == 0)
error = uiomove(bp, len, ap->a_uio);
if (pown)
procfs_proc_unlock(pown);
if (path)
free(path, M_TEMP);
return error;
}
int
procfs_getpages(void *v)
{
struct vop_getpages_args /* {
struct vnode *a_vp;
voff_t a_offset;
struct vm_page **a_m;
int *a_count;
int a_centeridx;
vm_prot_t a_access_type;
int a_advice;
int a_flags;
} */ *ap = v;
if ((ap->a_flags & PGO_LOCKED) == 0)
rw_exit(ap->a_vp->v_uobj.vmobjlock);
return (EFAULT);
}
/*
* convert decimal ascii to int
*/
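/*
 * For example, atoi("123", 3) returns 123, while atoi("12x", 3) returns
 * -1 because of the trailing non-digit.
 */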
static int
atoi(const char *b, size_t len)
{
int p = 0;
while (len--) {
char c = *b++;
if (c < '0' || c > '9')
return -1;
p = 10 * p + (c - '0');
}
return p;
}
/**
* convert DTYPE_XXX to corresponding DT_XXX
* matching what procfs_loadvnode() does.
*/
static uint8_t
fttodt(file_t *fp)
{
switch (fp->f_type) {
case DTYPE_VNODE:
switch (fp->f_vnode->v_type) {
case VREG: return DT_REG;
case VDIR: return DT_LNK; /* symlink */
case VBLK: return DT_BLK;
case VCHR: return DT_CHR;
case VLNK: return DT_LNK;
case VSOCK: return DT_SOCK;
case VFIFO: return DT_FIFO;
default: return DT_UNKNOWN;
}
case DTYPE_PIPE: return DT_FIFO;
case DTYPE_SOCKET: return DT_SOCK;
case DTYPE_KQUEUE: /*FALLTHROUGH*/
case DTYPE_MISC: /*FALLTHROUGH*/
case DTYPE_SEM: return DT_LNK; /* symlinks */
default: return DT_UNKNOWN;
}
}
/* $NetBSD: layer_vfsops.c,v 1.56 2022/12/09 10:33:18 hannken Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_vfsops.c,v 1.9 1992/05/30 10:26:24 jsp Exp
* from: @(#)lofs_vfsops.c 1.2 (Berkeley) 6/18/92
* @(#)null_vfsops.c 8.7 (Berkeley) 5/14/95
*/
/*
* Generic layer VFS operations.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_vfsops.c,v 1.56 2022/12/09 10:33:18 hannken Exp $");
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
SYSCTL_SETUP_PROTO(sysctl_vfs_layerfs_setup);
MODULE(MODULE_CLASS_MISC, layerfs, NULL);
static int
layerfs_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return 0;
case MODULE_CMD_FINI:
return 0;
default:
return ENOTTY;
}
return 0;
}
/*
* VFS start. Nothing needed here - the start routine on the underlying
* filesystem will have been called when that filesystem was mounted.
*/
int
layerfs_start(struct mount *mp, int flags)
{
#ifdef notyet
return VFS_START(mp->mnt_lower, flags);
#else
return 0;
#endif
}
int
layerfs_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct vnode *vp;
vp = MOUNTTOLAYERMOUNT(mp)->layerm_rootvp;
if (vp == NULL) {
*vpp = NULL;
return EINVAL;
}
/*
	 * Return the root vnode locked and with a reference held.
*/
vref(vp);
vn_lock(vp, lktype | LK_RETRY);
*vpp = vp;
return 0;
}
int
layerfs_quotactl(struct mount *mp, struct quotactl_args *args)
{
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_QUOTACTL(mp->mnt_lower, args);
vfs_unbusy(mp);
}
return error;
}
int
layerfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
struct statvfs *sbuf;
int error;
sbuf = kmem_zalloc(sizeof(*sbuf), KM_SLEEP);
error = vfs_busy(mp);
if (error == 0) {
error = VFS_STATVFS(mp->mnt_lower, sbuf);
vfs_unbusy(mp);
}
if (error) {
goto done;
}
/* Copy across the relevant data and fake the rest. */
sbp->f_flag = sbuf->f_flag;
sbp->f_bsize = sbuf->f_bsize;
sbp->f_frsize = sbuf->f_frsize;
sbp->f_iosize = sbuf->f_iosize;
sbp->f_blocks = sbuf->f_blocks;
sbp->f_bfree = sbuf->f_bfree;
sbp->f_bavail = sbuf->f_bavail;
sbp->f_bresvd = sbuf->f_bresvd;
sbp->f_files = sbuf->f_files;
sbp->f_ffree = sbuf->f_ffree;
sbp->f_favail = sbuf->f_favail;
sbp->f_fresvd = sbuf->f_fresvd;
sbp->f_namemax = sbuf->f_namemax;
copy_statvfs_info(sbp, mp);
done:
kmem_free(sbuf, sizeof(*sbuf));
return error;
}
int
layerfs_sync(struct mount *mp, int waitfor,
kauth_cred_t cred)
{
/*
* XXX - Assumes no data cached at layer.
*/
return 0;
}
int
layerfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct layer_mount *lmp = MOUNTTOLAYERMOUNT(mp);
struct vnode *lowervp;
struct layer_node *xp;
KASSERT(key_len == sizeof(struct vnode *));
memcpy(&lowervp, key, key_len);
xp = kmem_alloc(lmp->layerm_size, KM_SLEEP);
/* Share the interlock, vmobjlock, and klist with the lower node. */
vshareilock(vp, lowervp);
rw_obj_hold(lowervp->v_uobj.vmobjlock);
uvm_obj_setlock(&vp->v_uobj, lowervp->v_uobj.vmobjlock);
vshareklist(vp, lowervp);
vp->v_tag = lmp->layerm_tag;
vp->v_type = lowervp->v_type;
vp->v_op = lmp->layerm_vnodeop_p;
if (vp->v_type == VBLK || vp->v_type == VCHR)
spec_node_init(vp, lowervp->v_rdev);
vp->v_data = xp;
xp->layer_vnode = vp;
xp->layer_lowervp = lowervp;
xp->layer_flags = 0;
uvm_vnp_setsize(vp, 0);
/* Add a reference to the lower node. */
vref(lowervp);
*new_key = &xp->layer_lowervp;
return 0;
}
int
layerfs_vget(struct mount *mp, ino_t ino, int lktype, struct vnode **vpp)
{
struct vnode *vp;
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_VGET(mp->mnt_lower, ino, lktype, &vp);
vfs_unbusy(mp);
}
if (error) {
*vpp = NULL;
return error;
}
VOP_UNLOCK(vp);
error = layer_node_create(mp, vp, vpp);
if (error) {
vrele(vp);
*vpp = NULL;
return error;
}
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
int
layerfs_fhtovp(struct mount *mp, struct fid *fidp, int lktype,
struct vnode **vpp)
{
struct vnode *vp;
int error;
error = vfs_busy(mp);
if (error == 0) {
error = VFS_FHTOVP(mp->mnt_lower, fidp, lktype, &vp);
vfs_unbusy(mp);
}
if (error) {
*vpp = NULL;
return error;
}
VOP_UNLOCK(vp);
error = layer_node_create(mp, vp, vpp);
if (error) {
vput(vp);
*vpp = NULL;
return (error);
}
error = vn_lock(*vpp, lktype);
if (error) {
vrele(*vpp);
*vpp = NULL;
return error;
}
return 0;
}
int
layerfs_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
{
return VFS_VPTOFH(LAYERVPTOLOWERVP(vp), fhp, fh_size);
}
/*
* layerfs_snapshot - handle a snapshot through a layered file system
*
* At present, we do NOT support snapshotting through a layered file
* system as the ffs implementation changes v_vnlock of the snapshot
* vnodes to point to one common lock. As there is no way for us to
* absolutely pass this change up the stack, a layered file system
* would end up referencing the wrong lock.
*
 * This routine serves as a central resource for this behavior, so
 * individual layered file systems don't need to worry about the above. Also, if
* things get fixed, all layers get the benefit.
*/
int
layerfs_snapshot(struct mount *mp, struct vnode *vp,
struct timespec *ts)
{
return EOPNOTSUPP;
}
/*
* layerfs_suspendctl - suspend a layered file system
*
* Here we should suspend the lower file system(s) too. At present
* this will deadlock as we don't know which to suspend first.
*
 * This routine serves as a central resource for this behavior, so
 * individual layered file systems don't need to worry about the above. Also, if
* things get fixed, all layers get the benefit.
*/
int
layerfs_suspendctl(struct mount *mp, int cmd)
{
return genfs_suspendctl(mp, cmd);
}
SYSCTL_SETUP(sysctl_vfs_layerfs_setup, "sysctl vfs.layerfs subtree setup")
{
const struct sysctlnode *layerfs_node = NULL;
sysctl_createv(clog, 0, NULL, &layerfs_node,
#ifdef _MODULE
0,
#else
CTLFLAG_PERMANENT,
#endif
CTLTYPE_NODE, "layerfs",
SYSCTL_DESCR("Generic layered file system"),
NULL, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
#ifdef LAYERFS_DIAGNOSTIC
sysctl_createv(clog, 0, &layerfs_node, NULL,
#ifndef _MODULE
CTLFLAG_PERMANENT |
#endif
CTLFLAG_READWRITE,
CTLTYPE_INT,
"debug",
SYSCTL_DESCR("Verbose debugging messages"),
NULL, 0, &layerfs_debug, 0,
CTL_CREATE, CTL_EOL);
#endif
/*
* other subtrees should really be aliases to this, but since
* they can't tell if layerfs has been instantiated yet, they
* can't do that...not easily. not yet. :-)
*/
}
int
layerfs_renamelock_enter(struct mount *mp)
{
return VFS_RENAMELOCK_ENTER(mp->mnt_lower);
}
void
layerfs_renamelock_exit(struct mount *mp)
{
VFS_RENAMELOCK_EXIT(mp->mnt_lower);
}
/* $NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_pgflcache.c: page freelist cache.
*
* This implements a tiny per-CPU cache of pages that sits between the main
* page allocator and the freelists. By allocating and freeing pages in
* batch, it reduces freelist contention by an order of magnitude.
*
* The cache can be paused & resumed at runtime so that UVM_HOTPLUG,
* uvm_pglistalloc() and uvm_page_redim() can have a consistent view of the
 * world. On systems with one CPU per physical package (e.g. a uniprocessor)
* the cache is not enabled.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pgflcache.c,v 1.6 2020/10/18 18:31:31 chs Exp $");
#include "opt_uvm.h"
#include "opt_multiprocessor.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <uvm/uvm.h>
#include <uvm/uvm_pglist.h>
#include <uvm/uvm_pgflcache.h>
/* There is no point doing any of this on a uniprocessor. */
#ifdef MULTIPROCESSOR
/*
* MAXPGS - maximum pages per color, per bucket.
* FILLPGS - number of pages to allocate at once, per color, per bucket.
*
* Why the chosen values:
*
* (1) In 2019, an average Intel system has 4kB pages and 8x L2 cache
* colors. We make the assumption that most of the time allocation activity
* will be centered around one UVM freelist, so most of the time there will
* be no more than 224kB worth of cached pages per-CPU. That's tiny, but
* enough to hugely reduce contention on the freelist locks, and give us a
* small pool of pages which if we're very lucky may have some L1/L2 cache
* locality, and do so without subtracting too much from the L2/L3 cache
* benefits of having per-package free lists in the page allocator.
*
* (2) With the chosen values on _LP64, the data structure for each color
* takes up a single cache line (64 bytes) giving this very low overhead
* even in the "miss" case.
*
* (3) We don't want to cause too much pressure by hiding away memory that
* could otherwise be put to good use.
*/
#define MAXPGS 7
#define FILLPGS 6
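/*
 * Worked example using the 2019 figures above: MAXPGS (7) pages x
 * 8 colors x 4 kB per page = 224 kB of cached pages per CPU for the
 * busiest freelist, matching the estimate in (1).
 */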
/* Variable size, according to # colors. */
struct pgflcache {
struct pccolor {
intptr_t count;
struct vm_page *pages[MAXPGS];
} color[1];
};
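/*
 * Illustrative note: color[1] is only a placeholder; the cache is sized
 * as offsetof(struct pgflcache, color[uvmexp.ncolors]) bytes (see
 * uvm_pgflcache_init_cpu() and uvm_pgflcache_resume() below), so the
 * real array length is the number of page colors.
 */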
static kmutex_t uvm_pgflcache_lock;
static int uvm_pgflcache_sem;
/*
* uvm_pgflcache_fill: fill specified freelist/color from global list
*
* => must be called at IPL_VM
* => must be called with given bucket lock held
* => must only fill from the correct bucket for this CPU
*/
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
struct pgflbucket *pgb;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
struct vm_page *pg;
int count;
	KASSERT(mutex_owned(&uvm_freelist_locks[b].lock));
	KASSERT(ucpu->pgflbucket == b);
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return;
}
/* Fill only to the limit. */
pcc = &pc->color[c];
pgb = uvm.page_free[fl].pgfl_buckets[b];
head = &pgb->pgb_colors[c];
if (pcc->count >= FILLPGS) {
return;
}
/* Pull pages from the bucket until it's empty, or we are full. */
count = pcc->count;
pg = LIST_FIRST(head);
	while (__predict_true(pg != NULL && count < FILLPGS)) {
		KASSERT(pg->flags & PG_FREE);
		KASSERT(uvm_page_get_bucket(pg) == b);
pcc->pages[count++] = pg;
pg = LIST_NEXT(pg, pageq.list);
}
/* Violate LIST abstraction to remove all pages at once. */
head->lh_first = pg;
	if (__predict_true(pg != NULL)) {
		pg->pageq.list.le_prev = &head->lh_first;
}
pgb->pgb_nfree -= (count - pcc->count);
CPU_COUNT(CPU_COUNT_FREEPAGES, -(count - pcc->count));
pcc->count = count;
}
/*
* uvm_pgflcache_spill: spill specified freelist/color to global list
*
* => must be called at IPL_VM
* => mark __noinline so we don't pull it into uvm_pgflcache_free()
*/
static void __noinline
uvm_pgflcache_spill(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflbucket *pgb;
struct pgfreelist *pgfl;
struct pgflcache *pc;
struct pccolor *pcc;
struct pgflist *head;
kmutex_t *lock;
int b, adj;
pc = ucpu->pgflcache[fl];
pcc = &pc->color[c];
pgfl = &uvm.page_free[fl];
b = ucpu->pgflbucket;
pgb = pgfl->pgfl_buckets[b];
head = &pgb->pgb_colors[c];
lock = &uvm_freelist_locks[b].lock;
mutex_spin_enter(lock);
for (adj = pcc->count; pcc->count != 0;) {
pcc->count--;
		KASSERT(pcc->pages[pcc->count] != NULL);
		KASSERT(pcc->pages[pcc->count]->flags & PG_FREE);
		LIST_INSERT_HEAD(head, pcc->pages[pcc->count], pageq.list);
}
pgb->pgb_nfree += adj;
CPU_COUNT(CPU_COUNT_FREEPAGES, adj);
mutex_spin_exit(lock);
}
/*
* uvm_pgflcache_alloc: try to allocate a cached page.
*
* => must be called at IPL_VM
* => allocate only from the given freelist and given page color
*/
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
struct pgflcache *pc;
struct pccolor *pcc;
struct vm_page *pg;
/* If caching is off, then bail out. */
if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return NULL;
}
/* Very simple: if we have a page then return it. */
pcc = &pc->color[c];
if (__predict_false(pcc->count == 0)) {
return NULL;
}
pg = pcc->pages[--(pcc->count)];
	KASSERT(pg != NULL);
	KASSERT(pg->flags == PG_FREE);
	KASSERT(uvm_page_get_freelist(pg) == fl);
	KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
return pg;
}
/*
* uvm_pgflcache_free: cache a page, if possible.
*
* => must be called at IPL_VM
* => must only send pages for the correct bucket for this CPU
*/
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
struct pgflcache *pc;
struct pccolor *pcc;
int fl, c;
KASSERT(uvm_page_get_bucket(pg) == ucpu->pgflbucket);
/* If caching is off, then bail out. */
	fl = uvm_page_get_freelist(pg);
	if (__predict_false((pc = ucpu->pgflcache[fl]) == NULL)) {
return false;
}
/* If the array is full spill it first, then add page to array. */
c = VM_PGCOLOR(pg);
pcc = &pc->color[c];
	KASSERT((pg->flags & PG_FREE) == 0);
	if (__predict_false(pcc->count == MAXPGS)) {
		uvm_pgflcache_spill(ucpu, fl, c);
}
pg->flags = PG_FREE;
pcc->pages[pcc->count] = pg;
pcc->count++;
return true;
}
/*
* uvm_pgflcache_init: allocate and initialize per-CPU data structures for
* the free page cache. Don't set anything in motion - that's taken care
* of by uvm_pgflcache_resume().
*/
static void
uvm_pgflcache_init_cpu(struct cpu_info *ci)
{
struct uvm_cpu *ucpu;
size_t sz;
ucpu = ci->ci_data.cpu_uvm;
KASSERT(ucpu->pgflcachemem == NULL);
KASSERT(ucpu->pgflcache[0] == NULL);
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
ucpu->pgflcachememsz =
(roundup2(sz * VM_NFREELIST, coherency_unit) + coherency_unit - 1);
ucpu->pgflcachemem = kmem_zalloc(ucpu->pgflcachememsz, KM_SLEEP);
}
/*
* uvm_pgflcache_fini_cpu: dump all cached pages back to global free list
* and shut down caching on the CPU. Called on each CPU in the system via
* xcall.
*/
static void
uvm_pgflcache_fini_cpu(void *arg1 __unused, void *arg2 __unused)
{
struct uvm_cpu *ucpu;
int fl, color, s;
ucpu = curcpu()->ci_data.cpu_uvm;
for (fl = 0; fl < VM_NFREELIST; fl++) {
s = splvm();
for (color = 0; color < uvmexp.ncolors; color++) {
uvm_pgflcache_spill(ucpu, fl, color);
}
ucpu->pgflcache[fl] = NULL;
splx(s);
}
}
/*
* uvm_pgflcache_pause: pause operation of the caches
*/
void
uvm_pgflcache_pause(void)
{
uint64_t where;
/* First one in starts draining. Everyone else waits. */
mutex_enter(&uvm_pgflcache_lock);
if (uvm_pgflcache_sem++ == 0) {
where = xc_broadcast(XC_HIGHPRI, uvm_pgflcache_fini_cpu,
(void *)1, NULL);
xc_wait(where);
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_resume: resume operation of the caches
*/
void
uvm_pgflcache_resume(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
struct uvm_cpu *ucpu;
uintptr_t addr;
size_t sz;
int fl;
/* Last guy out takes care of business. */
mutex_enter(&uvm_pgflcache_lock);
KASSERT(uvm_pgflcache_sem > 0);
if (uvm_pgflcache_sem-- > 1) {
mutex_exit(&uvm_pgflcache_lock);
return;
}
/*
	 * Make sure dependent data structure updates are remotely visible.
* Essentially this functions as a global memory barrier.
*/
xc_barrier(XC_HIGHPRI);
/*
* Then set all of the pointers in place on each CPU. As soon as
* each pointer is set, caching is operational in that dimension.
*/
sz = offsetof(struct pgflcache, color[uvmexp.ncolors]);
for (CPU_INFO_FOREACH(cii, ci)) {
ucpu = ci->ci_data.cpu_uvm;
addr = roundup2((uintptr_t)ucpu->pgflcachemem, coherency_unit);
for (fl = 0; fl < VM_NFREELIST; fl++) {
ucpu->pgflcache[fl] = (struct pgflcache *)addr;
addr += sz;
}
}
mutex_exit(&uvm_pgflcache_lock);
}
/*
* uvm_pgflcache_start: start operation of the cache.
*
* => called once only, when init(8) is about to be started
*/
void
uvm_pgflcache_start(void)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(uvm_pgflcache_sem > 0);
/*
* There's not much point doing this if every CPU has its own
* bucket (and that includes the uniprocessor case).
*/
if (ncpu == uvm.bucketcount) {
return;
}
/* Create data structures for each CPU. */
for (CPU_INFO_FOREACH(cii, ci)) {
uvm_pgflcache_init_cpu(ci);
}
/* Kick it into action. */
uvm_pgflcache_resume();
}
/*
* uvm_pgflcache_init: set up data structures for the free page cache.
*/
void
uvm_pgflcache_init(void)
{
uvm_pgflcache_sem = 1;
mutex_init(&uvm_pgflcache_lock, MUTEX_DEFAULT, IPL_NONE);
}
#else /* MULTIPROCESSOR */
struct vm_page *
uvm_pgflcache_alloc(struct uvm_cpu *ucpu, int fl, int c)
{
return NULL;
}
bool
uvm_pgflcache_free(struct uvm_cpu *ucpu, struct vm_page *pg)
{
return false;
}
void
uvm_pgflcache_fill(struct uvm_cpu *ucpu, int fl, int b, int c)
{
}
void
uvm_pgflcache_pause(void)
{
}
void
uvm_pgflcache_resume(void)
{
}
void
uvm_pgflcache_start(void)
{
}
void
uvm_pgflcache_init(void)
{
}
#endif /* MULTIPROCESSOR */
/* $NetBSD: in4_cksum.c,v 1.20 2014/11/30 18:15:41 christos Exp $ */
/*-
* Copyright (c) 2008 Joerg Sonnenberger <joerg@NetBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in4_cksum.c,v 1.20 2014/11/30 18:15:41 christos Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
/*
* Checksum of the IPv4 pseudo header.
*
* off is supposed to be the skipped IPv4 header, len is the payload size.
*/
#ifdef DIAGNOSTIC
#define PANIC(a,...) panic(a, __VA_ARGS__)
#else
#define PANIC(a,...) do { \
printf(a, __VA_ARGS__); \
return -1; \
} while (/*CONSTCOND*/0)
#endif
int
in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len)
{
uint32_t sum;
uint16_t *w;
if (__predict_false(m->m_len < sizeof(struct ip)))
PANIC("%s: mbuf %d too short for IP header %zu", __func__,
m->m_len, sizeof(struct ip));
if (nxt == 0)
return cpu_in_cksum(m, len, off, 0);
if (__predict_false(off < sizeof(struct ip)))
PANIC("%s: offset %d too short for IP header %zu", __func__,
off, sizeof(struct ip));
/*
* Compute the equivalent of:
* struct ipovly ip;
*
* bzero(sizeof(*ip));
* ip.ih_pr = nxt;
* ip.ip_len = htons(len);
* ip.ih_src = mtod(m, struct ip *)->ip_src;
* ip.ih_dst = mtod(m, struct ip *)->ip_dst;
* sum = one_add(&ip);
*/
#if BYTE_ORDER == LITTLE_ENDIAN
sum = ((len & 0xffff) + nxt) << 8;
#else
sum = (len & 0xffff) + nxt;
#endif
w = (uint16_t *)(mtod(m, char *) + offsetof(struct ip, ip_src));
if (__predict_true((uintptr_t)w % 2 == 0)) {
sum += w[0];
sum += w[1];
sum += w[2];
sum += w[3];
} else {
uint32_t partial;
w = (void *)((uintptr_t)w - 1);
#if BYTE_ORDER == LITTLE_ENDIAN
partial = w[0] & 0xff00;
#else
partial = w[0] & 0x00ff;
#endif
partial += w[1];
partial += w[2];
partial += w[3];
#if BYTE_ORDER == LITTLE_ENDIAN
partial += w[4] & 0x00ff;
#else
partial += w[4] & 0xff00;
#endif
sum += partial << 8;
}
return cpu_in_cksum(m, len, off, sum);
}
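/*
 * Illustrative use (a sketch, not copied from an actual caller; "ip", "m",
 * "iphlen" and "tlen" are assumed to be set up as in a typical input path):
 * to verify the TCP checksum of a received packet whose IP header has
 * already been pulled up, pass the IP header length as "off" and the TCP
 * segment length as "len"; a non-zero result indicates a bad checksum.
 *
 *	int iphlen = ip->ip_hl << 2;
 *	int tlen = ntohs(ip->ip_len) - iphlen;
 *
 *	if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
 *		m_freem(m);		checksum failed; drop the packet
 *		return;
 *	}
 */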
/* $NetBSD: if_ether.h,v 1.91 2024/02/05 21:46:06 andvar Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_ether.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NET_IF_ETHER_H_
#define _NET_IF_ETHER_H_
#ifdef _KERNEL
#ifdef _KERNEL_OPT
#include "opt_mbuftrace.h"
#endif
#include <sys/mbuf.h>
#endif
#ifndef _STANDALONE
#include <net/if.h>
#endif
/*
* Some basic Ethernet constants.
*/
#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */
#define ETHER_TYPE_LEN 2 /* length of the Ethernet type field */
#define ETHER_CRC_LEN 4 /* length of the Ethernet CRC */
#define ETHER_HDR_LEN ((ETHER_ADDR_LEN * 2) + ETHER_TYPE_LEN)
#define ETHER_MIN_LEN 64 /* minimum frame length, including CRC */
#define ETHER_MAX_LEN 1518 /* maximum frame length, including CRC */
#define ETHER_MAX_LEN_JUMBO 9018 /* maximum jumbo frame len, including CRC */
/*
* Some Ethernet extensions.
*/
#define ETHER_VLAN_ENCAP_LEN 4 /* length of 802.1Q VLAN encapsulation */
#define EVL_VLANOFTAG(tag) ((tag) & 4095) /* VLAN ID */
#define EVL_PRIOFTAG(tag) (((tag) >> 13) & 7) /* Priority */
#define EVL_CFIOFTAG(tag) (((tag) >> 12) & 1) /* CFI */
#define ETHER_PPPOE_ENCAP_LEN 8 /* length of PPPoE encapsulation */
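/*
 * For example, a TCI of 0xe00a (an assumed value, for illustration) splits
 * into the three fields above as:
 *	EVL_PRIOFTAG(0xe00a) == 7	(priority, bits 15-13)
 *	EVL_CFIOFTAG(0xe00a) == 0	(CFI, bit 12)
 *	EVL_VLANOFTAG(0xe00a) == 10	(VLAN ID, bits 11-0)
 */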
/*
* Mbuf adjust factor to force 32-bit alignment of IP header.
* Drivers should do m_adj(m, ETHER_ALIGN) when setting up a
* receive so the upper layers get the IP header properly aligned
* past the 14-byte Ethernet header.
*/
#define ETHER_ALIGN 2 /* driver adjust for IP hdr alignment */
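/*
 * One common way to arrange this (a sketch; real drivers vary, and the mbuf
 * setup here is hypothetical) is to leave ETHER_ALIGN bytes of slack before
 * the frame is copied or DMA'd into the mbuf:
 *
 *	MGETHDR(m, M_DONTWAIT, MT_DATA);
 *	MCLGET(m, M_DONTWAIT);
 *	m->m_data += ETHER_ALIGN;	so the IP header following the
 *					14-byte Ethernet header ends up
 *					32-bit aligned
 *	... copy or DMA the received frame to mtod(m, void *) ...
 */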
/*
* Ethernet address - 6 octets
* this is only used by the ethers(3) functions.
*/
struct ether_addr {
uint8_t ether_addr_octet[ETHER_ADDR_LEN];
};
/*
* Structure of a 10Mb/s Ethernet header.
*/
struct ether_header {
uint8_t ether_dhost[ETHER_ADDR_LEN];
uint8_t ether_shost[ETHER_ADDR_LEN];
uint16_t ether_type;
};
#include <net/ethertypes.h>
#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */
#define ETHER_IS_LOCAL(addr) (*(addr) & 0x02) /* is address local? */
#define ETHERMTU_JUMBO (ETHER_MAX_LEN_JUMBO - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define ETHERMTU (ETHER_MAX_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN)
#define ETHERMIN (ETHER_MIN_LEN - ETHER_HDR_LEN - ETHER_CRC_LEN)
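/*
 * Worked example: with ETHER_MAX_LEN == 1518, ETHER_HDR_LEN == 14 and
 * ETHER_CRC_LEN == 4, ETHERMTU comes out to 1518 - 14 - 4 = 1500; likewise
 * ETHERMIN is 64 - 14 - 4 = 46 and ETHERMTU_JUMBO is 9018 - 14 - 4 = 9000.
 */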
/*
* Compute the maximum frame size based on ethertype (i.e. possible
* encapsulation) and whether or not an FCS is present.
*/
#define ETHER_MAX_FRAME(ifp, etype, hasfcs) \
((ifp)->if_mtu + ETHER_HDR_LEN + \
((hasfcs) ? ETHER_CRC_LEN : 0) + \
(((etype) == ETHERTYPE_VLAN) ? ETHER_VLAN_ENCAP_LEN : 0) + \
(((etype) == ETHERTYPE_PPPOE) ? ETHER_PPPOE_ENCAP_LEN : 0))
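/*
 * For example, an interface with if_mtu == 1500 (assumed) receiving
 * 802.1Q-tagged frames with the FCS left on yields
 * ETHER_MAX_FRAME(ifp, ETHERTYPE_VLAN, 1) == 1500 + 14 + 4 + 4 == 1522.
 */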
/*
* Ethernet CRC32 polynomials (big- and little-endian versions).
*/
#define ETHER_CRC_POLY_LE 0xedb88320
#define ETHER_CRC_POLY_BE 0x04c11db6
#ifndef _STANDALONE
/*
* Ethernet-specific mbuf flags.
*/
#define M_HASFCS M_LINK0 /* FCS included at end of frame */
#define M_PROMISC M_LINK1 /* this packet is not for us */
#ifdef _KERNEL
/*
* Macro to map an IP multicast address to an Ethernet multicast address.
* The high-order 25 bits of the Ethernet address are statically assigned,
* and the low-order 23 bits are taken from the low end of the IP address.
*/
#define ETHER_MAP_IP_MULTICAST(ipaddr, enaddr) \
/* const struct in_addr *ipaddr; */ \
/* uint8_t enaddr[ETHER_ADDR_LEN]; */ \
do { \
(enaddr)[0] = 0x01; \
(enaddr)[1] = 0x00; \
(enaddr)[2] = 0x5e; \
(enaddr)[3] = ((const uint8_t *)ipaddr)[1] & 0x7f; \
(enaddr)[4] = ((const uint8_t *)ipaddr)[2]; \
(enaddr)[5] = ((const uint8_t *)ipaddr)[3]; \
} while (/*CONSTCOND*/0)
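/*
 * For example, mapping the (hypothetical) group 224.1.2.3: the low 23 bits
 * of the IP address are 0x01, 0x02, 0x03, so the resulting Ethernet
 * address is 01:00:5e:01:02:03.
 */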
/*
* Macro to map an IP6 multicast address to an Ethernet multicast address.
* The high-order 16 bits of the Ethernet address are statically assigned,
* and the low-order 32 bits are taken from the low end of the IP6 address.
*/
#define ETHER_MAP_IPV6_MULTICAST(ip6addr, enaddr) \
/* struct in6_addr *ip6addr; */ \
/* uint8_t enaddr[ETHER_ADDR_LEN]; */ \
{ \
(enaddr)[0] = 0x33; \
(enaddr)[1] = 0x33; \
(enaddr)[2] = ((const uint8_t *)ip6addr)[12]; \
(enaddr)[3] = ((const uint8_t *)ip6addr)[13]; \
(enaddr)[4] = ((const uint8_t *)ip6addr)[14]; \
(enaddr)[5] = ((const uint8_t *)ip6addr)[15]; \
}
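/*
 * For example, the (hypothetical) solicited-node group ff02::1:ff12:3456
 * has low-order bytes 0xff, 0x12, 0x34, 0x56, so it maps to the Ethernet
 * address 33:33:ff:12:34:56.
 */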
#endif
struct mii_data;
struct ethercom;
typedef int (*ether_cb_t)(struct ethercom *);
typedef int (*ether_vlancb_t)(struct ethercom *, uint16_t, bool);
/*
* Structure shared between the ethernet driver modules and
* the multicast list code. For example, each ec_softc or il_softc
* begins with this structure.
*/
struct ethercom {
struct ifnet ec_if; /* network-visible interface */
LIST_HEAD(, ether_multi) ec_multiaddrs; /* list of ether multicast
addrs */
int ec_multicnt; /* length of ec_multiaddrs
list */
int ec_capabilities; /* capabilities, provided by
driver */
int ec_capenable; /* tells hardware which
capabilities to enable */
int ec_nvlans; /* # VLANs on this interface */
SIMPLEQ_HEAD(, vlanid_list) ec_vids; /* list of VLAN IDs */
/* The device handle for the MII bus child device. */
struct mii_data *ec_mii;
struct ifmedia *ec_ifmedia;
/*
* Called after a change to ec_if.if_flags.  Returns
* ENETRESET if the device should be reinitialized with
* ec_if.if_init, 0 on success, or another non-zero value on failure.
*/
ether_cb_t ec_ifflags_cb;
/*
* Called whenever a vlan interface is configured or unconfigured.
* Args include the vlan tag and a flag indicating whether the tag is
* being added or removed.
*/
ether_vlancb_t ec_vlan_cb;
/* Hooks called at the beginning of detach of this interface */
khook_list_t *ec_ifdetach_hooks;
kmutex_t *ec_lock;
/* Flags used only by the kernel */
int ec_flags;
#ifdef MBUFTRACE
struct mowner ec_rx_mowner; /* mbufs received */
struct mowner ec_tx_mowner; /* mbufs transmitted */
#endif
};
#define ETHERCAP_VLAN_MTU 0x00000001 /* VLAN-compatible MTU */
#define ETHERCAP_VLAN_HWTAGGING 0x00000002 /* hardware VLAN tag support */
#define ETHERCAP_JUMBO_MTU 0x00000004 /* 9000 byte MTU supported */
#define ETHERCAP_VLAN_HWFILTER 0x00000008 /* iface hw can filter vlan tag */
#define ETHERCAP_EEE 0x00000010 /* Energy Efficiency Ethernet */
#define ETHERCAP_MASK 0x0000001f
#define ECCAPBITS \
"\020" \
"\1VLAN_MTU" \
"\2VLAN_HWTAGGING" \
"\3JUMBO_MTU" \
"\4VLAN_HWFILTER" \
"\5EEE"
/* ioctl() for Ethernet capabilities */
struct eccapreq {
char eccr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
int eccr_capabilities; /* supported capabilities */
int eccr_capenable; /* capabilities enabled */
};
/* sysctl for Ethernet multicast addresses */
struct ether_multi_sysctl {
u_int enm_refcount;
uint8_t enm_addrlo[ETHER_ADDR_LEN];
uint8_t enm_addrhi[ETHER_ADDR_LEN];
};
#ifdef _KERNEL
/*
* Flags for ec_flags
*/
/* Store IFF_ALLMULTI in ec_flags instead of if_flags to avoid data races. */
#define ETHER_F_ALLMULTI __BIT(0)
extern const uint8_t etherbroadcastaddr[ETHER_ADDR_LEN];
extern const uint8_t ethermulticastaddr_slowprotocols[ETHER_ADDR_LEN];
extern const uint8_t ether_ipmulticast_min[ETHER_ADDR_LEN];
extern const uint8_t ether_ipmulticast_max[ETHER_ADDR_LEN];
void ether_set_ifflags_cb(struct ethercom *, ether_cb_t);
void ether_set_vlan_cb(struct ethercom *, ether_vlancb_t);
int ether_ioctl(struct ifnet *, u_long, void *);
int ether_addmulti(const struct sockaddr *, struct ethercom *);
int ether_delmulti(const struct sockaddr *, struct ethercom *);
int ether_multiaddr(const struct sockaddr *, uint8_t[ETHER_ADDR_LEN],
uint8_t[ETHER_ADDR_LEN]);
void ether_input(struct ifnet *, struct mbuf *);
/*
* Ethernet multicast address structure. There is one of these for each
* multicast address or range of multicast addresses that we are supposed
* to listen to on a particular interface. They are kept in a linked list,
* rooted in the interface's ethercom structure.
*/
struct ether_multi {
uint8_t enm_addrlo[ETHER_ADDR_LEN]; /* low or only address of range */
uint8_t enm_addrhi[ETHER_ADDR_LEN]; /* high or only address of range */
u_int enm_refcount; /* no. claims to this addr/range */
LIST_ENTRY(ether_multi) enm_list;
};
/*
* Structure used by macros below to remember position when stepping through
* all of the ether_multi records.
*/
struct ether_multistep {
struct ether_multi *e_enm;
};
/*
* Look up the ether_multi record for a given range of Ethernet
* multicast addresses connected to a given ethercom structure.
* If no matching record is found, NULL is returned.
*/
static __inline struct ether_multi *
ether_lookup_multi(const uint8_t *addrlo, const uint8_t *addrhi,
const struct ethercom *ec)
{
struct ether_multi *enm;
	LIST_FOREACH(enm, &ec->ec_multiaddrs, enm_list) {
		if (memcmp(enm->enm_addrlo, addrlo, ETHER_ADDR_LEN) != 0)
continue;
if (memcmp(enm->enm_addrhi, addrhi, ETHER_ADDR_LEN) != 0)
continue;
break;
}
return enm;
}
/*
* Step through all of the ether_multi records, one at a time.
* The current position is remembered in "step", which the caller must
* provide.  ether_first_multi(), below, must be called to initialize "step"
* and get the first record.  Both functions return NULL when there
* are no remaining records.
*/
static __inline struct ether_multi *
ether_next_multi(struct ether_multistep *step)
{
struct ether_multi *enm;
enm = step->e_enm;
if (enm != NULL)
step->e_enm = LIST_NEXT(enm, enm_list);
return enm;
}
#define ETHER_NEXT_MULTI(step, enm) \
/* struct ether_multistep step; */ \
/* struct ether_multi *enm; */ \
(enm) = ether_next_multi(&(step))
static __inline struct ether_multi *
ether_first_multi(struct ether_multistep *step, const struct ethercom *ec)
{
step->e_enm = LIST_FIRST(&ec->ec_multiaddrs);
return ether_next_multi(step);
}
#define ETHER_FIRST_MULTI(step, ec, enm) \
/* struct ether_multistep step; */ \
/* struct ethercom *ec; */ \
/* struct ether_multi *enm; */ \
(enm) = ether_first_multi(&(step), (ec))
#define ETHER_LOCK(ec) mutex_enter((ec)->ec_lock)
#define ETHER_UNLOCK(ec) mutex_exit((ec)->ec_lock)
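/*
 * A sketch of how a driver would typically walk the multicast list when
 * (re)programming its hardware filter ("sc" is a hypothetical softc whose
 * sc_ec member is its struct ethercom):
 *
 *	struct ether_multi *enm;
 *	struct ether_multistep step;
 *
 *	ETHER_LOCK(&sc->sc_ec);
 *	ETHER_FIRST_MULTI(step, &sc->sc_ec, enm);
 *	while (enm != NULL) {
 *		... add enm->enm_addrlo (or the whole range up to
 *		    enm->enm_addrhi) to the hardware filter ...
 *		ETHER_NEXT_MULTI(step, enm);
 *	}
 *	ETHER_UNLOCK(&sc->sc_ec);
 */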
/*
* Ethernet 802.1Q VLAN structures.
*/
/* for ethercom */
struct vlanid_list {
uint16_t vid;
SIMPLEQ_ENTRY(vlanid_list) vid_list;
};
/* add VLAN tag to input/received packet */
static __inline void
vlan_set_tag(struct mbuf *m, uint16_t vlantag)
{
/* VLAN tag contains priority, CFI and VLAN ID */
KASSERT((m->m_flags & M_PKTHDR) != 0);
m->m_pkthdr.ether_vtag = vlantag;
m->m_flags |= M_VLANTAG;
return;
}
/* extract VLAN ID value from a VLAN tag */
static __inline uint16_t
vlan_get_tag(struct mbuf *m)
{
KASSERT((m->m_flags & M_PKTHDR) != 0);
KASSERT(m->m_flags & M_VLANTAG);
return m->m_pkthdr.ether_vtag;
}
static __inline bool
vlan_has_tag(struct mbuf *m)
{
return (m->m_flags & M_VLANTAG) != 0;
}
static __inline bool
vlan_is_hwtag_enabled(struct ifnet *_ifp)
{
struct ethercom *ec = (void *)_ifp;
if (ec->ec_capenable & ETHERCAP_VLAN_HWTAGGING)
return true;
return false;
}
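/*
 * A sketch of the receive path for a device with hardware VLAN tag
 * extraction (the rxdesc_* names are assumed descriptor fields, for
 * illustration): the driver attaches the tag to the mbuf, and later
 * consumers can pick it up via vlan_has_tag()/vlan_get_tag().
 *
 *	if (rxdesc_has_vlan_tag)
 *		vlan_set_tag(m, rxdesc_vlan_tci);
 *	...
 *	if (vlan_has_tag(m))
 *		vid = EVL_VLANOFTAG(vlan_get_tag(m));
 */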
/* test if any VLAN is configured for this interface */
#define VLAN_ATTACHED(ec) ((ec)->ec_nvlans > 0)
void etherinit(void);
void ether_ifattach(struct ifnet *, const uint8_t *);
void ether_ifdetach(struct ifnet *);
int ether_mediachange(struct ifnet *);
void ether_mediastatus(struct ifnet *, struct ifmediareq *);
void * ether_ifdetachhook_establish(struct ifnet *,
void (*)(void *), void *arg);
void ether_ifdetachhook_disestablish(struct ifnet *,
void *, kmutex_t *);
char *ether_sprintf(const uint8_t *);
char *ether_snprintf(char *, size_t, const uint8_t *);
uint32_t ether_crc32_le(const uint8_t *, size_t);
uint32_t ether_crc32_be(const uint8_t *, size_t);
int ether_aton_r(u_char *, size_t, const char *);
int ether_enable_vlan_mtu(struct ifnet *);
int ether_disable_vlan_mtu(struct ifnet *);
int ether_add_vlantag(struct ifnet *, uint16_t, bool *);
int ether_del_vlantag(struct ifnet *, uint16_t);
int ether_inject_vlantag(struct mbuf **, uint16_t, uint16_t);
struct mbuf *
ether_strip_vlantag(struct mbuf *);
#else
/*
* Prototype ethers(3) functions.
*/
#include <sys/cdefs.h>
__BEGIN_DECLS
char * ether_ntoa(const struct ether_addr *);
struct ether_addr *
ether_aton(const char *);
int ether_ntohost(char *, const struct ether_addr *);
int ether_hostton(const char *, struct ether_addr *);
int ether_line(const char *, struct ether_addr *, char *);
__END_DECLS
#endif
#endif /* _STANDALONE */
#endif /* !_NET_IF_ETHER_H_ */
/* $NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1997, 1998, 2000, 2001, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_mbuftrace.h"
#endif
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/once.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/md5.h>
#include <sys/cprng.h>
#include <net/route.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6protosw.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_vtw.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_congctl.h>
#include <netinet/tcp_syncache.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#endif
struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
u_int32_t tcp_now; /* slow ticks, for RFC 1323 timestamps */
percpu_t *tcpstat_percpu;
/* patchable/settable parameters for tcp */
int tcp_mssdflt = TCP_MSS;
int tcp_minmss = TCP_MINMSS;
int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
int tcp_do_rfc1323 = 1; /* window scaling / timestamps (obsolete) */
int tcp_do_rfc1948 = 0; /* ISS by cryptographic hash */
int tcp_do_sack = 1; /* selective acknowledgement */
int tcp_do_win_scale = 1; /* RFC1323 window scaling */
int tcp_do_timestamps = 1; /* RFC1323 timestamps */
int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
int tcp_do_ecn = 0; /* Explicit Congestion Notification */
#ifndef TCP_INIT_WIN
#define TCP_INIT_WIN 4 /* initial slow start window */
#endif
#ifndef TCP_INIT_WIN_LOCAL
#define TCP_INIT_WIN_LOCAL 4 /* initial slow start window for local nets */
#endif
/*
* For iw up to 4 the cap scales gradually, reaching 3 * 1460; from iw == 5
* on it is iw * 1460.  This matches the previous behavior for iw == 4.
*/
int tcp_init_win_max[] = {
1 * 1460,
1 * 1460,
2 * 1460,
2 * 1460,
3 * 1460,
5 * 1460,
6 * 1460,
7 * 1460,
8 * 1460,
9 * 1460,
10 * 1460
};
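/*
 * For example, with tcp_init_win == 4 the initial-window byte limit taken
 * from this table is tcp_init_win_max[4] == 3 * 1460 == 4380 bytes, while
 * tcp_init_win == 10 allows up to 10 * 1460 == 14600 bytes.
 */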
int tcp_init_win = TCP_INIT_WIN;
int tcp_init_win_local = TCP_INIT_WIN_LOCAL;
int tcp_mss_ifmtu = 0;
int tcp_rst_ppslim = 100; /* 100pps */
int tcp_ackdrop_ppslim = 100; /* 100pps */
int tcp_do_loopback_cksum = 0;
int tcp_do_abc = 1; /* RFC3465 Appropriate byte counting. */
int tcp_abc_aggressive = 1; /* 1: L=2*SMSS 0: L=1*SMSS */
int tcp_sack_tp_maxholes = 32;
int tcp_sack_globalmaxholes = 1024;
int tcp_sack_globalholes = 0;
int tcp_ecn_maxretries = 1;
int tcp_msl_enable = 1; /* enable TIME_WAIT truncation */
int tcp_msl_loop = PR_SLOWHZ; /* MSL for loopback */
int tcp_msl_local = 5 * PR_SLOWHZ; /* MSL for 'local' */
int tcp_msl_remote = TCPTV_MSL; /* MSL otherwise */
int tcp_msl_remote_threshold = TCPTV_SRTTDFLT; /* RTT threshold */
int tcp_rttlocal = 0; /* Use RTT to decide who's 'local' */
int tcp4_vtw_enable = 0; /* 1 to enable */
int tcp6_vtw_enable = 0; /* 1 to enable */
int tcp_vtw_was_enabled = 0;
int tcp_vtw_entries = 1 << 4; /* 16 vestigial TIME_WAIT entries */
/* tcb hash */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 128
#endif
int tcbhashsize = TCBHASHSIZE;
int tcp_freeq(struct tcpcb *);
static int tcp_iss_secret_init(void);
static void tcp_mtudisc_callback(struct in_addr);
#ifdef INET6
static void tcp6_mtudisc(struct inpcb *, int);
#endif
static struct pool tcpcb_pool;
static int tcp_drainwanted;
#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>
struct evcnt tcp_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum bad");
struct evcnt tcp_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum ok");
struct evcnt tcp_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "hwcsum data");
struct evcnt tcp_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "swcsum");
EVCNT_ATTACH_STATIC(tcp_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp_swcsum);
#if defined(INET6)
struct evcnt tcp6_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum bad");
struct evcnt tcp6_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum ok");
struct evcnt tcp6_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "hwcsum data");
struct evcnt tcp6_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp6", "swcsum");
EVCNT_ATTACH_STATIC(tcp6_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp6_swcsum);
#endif /* defined(INET6) */
#endif /* TCP_CSUM_COUNTERS */
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>
struct evcnt tcp_output_bigheader = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output big header");
struct evcnt tcp_output_predict_hit = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output predict hit");
struct evcnt tcp_output_predict_miss = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output predict miss");
struct evcnt tcp_output_copysmall = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output copy small");
struct evcnt tcp_output_copybig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output copy big");
struct evcnt tcp_output_refbig = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp", "output reference big");
EVCNT_ATTACH_STATIC(tcp_output_bigheader);
EVCNT_ATTACH_STATIC(tcp_output_predict_hit);
EVCNT_ATTACH_STATIC(tcp_output_predict_miss);
EVCNT_ATTACH_STATIC(tcp_output_copysmall);
EVCNT_ATTACH_STATIC(tcp_output_copybig);
EVCNT_ATTACH_STATIC(tcp_output_refbig);
#endif /* TCP_OUTPUT_COUNTERS */
#ifdef TCP_REASS_COUNTERS
#include <sys/device.h>
struct evcnt tcp_reass_ = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "tcp_reass", "calls");
struct evcnt tcp_reass_empty = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert into empty queue");
struct evcnt tcp_reass_iteration[8] = {
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", ">7 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "1 iteration"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "2 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "3 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "4 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "5 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "6 iterations"),
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", "7 iterations"),
};
struct evcnt tcp_reass_prependfirst = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "prepend to first");
struct evcnt tcp_reass_prepend = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "prepend");
struct evcnt tcp_reass_insert = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert");
struct evcnt tcp_reass_inserttail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "insert at tail");
struct evcnt tcp_reass_append = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "append");
struct evcnt tcp_reass_appendtail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "append to tail fragment");
struct evcnt tcp_reass_overlaptail = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "overlap at end");
struct evcnt tcp_reass_overlapfront = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "overlap at start");
struct evcnt tcp_reass_segdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "duplicate segment");
struct evcnt tcp_reass_fragdup = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
&tcp_reass_, "tcp_reass", "duplicate fragment");
EVCNT_ATTACH_STATIC(tcp_reass_);
EVCNT_ATTACH_STATIC(tcp_reass_empty);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 0);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 1);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 2);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 3);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 4);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 5);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 6);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 7);
EVCNT_ATTACH_STATIC(tcp_reass_prependfirst);
EVCNT_ATTACH_STATIC(tcp_reass_prepend);
EVCNT_ATTACH_STATIC(tcp_reass_insert);
EVCNT_ATTACH_STATIC(tcp_reass_inserttail);
EVCNT_ATTACH_STATIC(tcp_reass_append);
EVCNT_ATTACH_STATIC(tcp_reass_appendtail);
EVCNT_ATTACH_STATIC(tcp_reass_overlaptail);
EVCNT_ATTACH_STATIC(tcp_reass_overlapfront);
EVCNT_ATTACH_STATIC(tcp_reass_segdup);
EVCNT_ATTACH_STATIC(tcp_reass_fragdup);
#endif /* TCP_REASS_COUNTERS */
#ifdef MBUFTRACE
struct mowner tcp_mowner = MOWNER_INIT("tcp", "");
struct mowner tcp_rx_mowner = MOWNER_INIT("tcp", "rx");
struct mowner tcp_tx_mowner = MOWNER_INIT("tcp", "tx");
struct mowner tcp_sock_mowner = MOWNER_INIT("tcp", "sock");
struct mowner tcp_sock_rx_mowner = MOWNER_INIT("tcp", "sock rx");
struct mowner tcp_sock_tx_mowner = MOWNER_INIT("tcp", "sock tx");
#endif
static int
do_tcpinit(void)
{
inpcb_init(&tcbtable, tcbhashsize, tcbhashsize);
pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl",
NULL, IPL_SOFTNET);
tcp_usrreq_init();
/* Initialize timer state. */
tcp_timer_init();
/* Initialize the compressed state engine. */
syn_cache_init();
/* Initialize the congestion control algorithms. */
tcp_congctl_init();
/* Initialize the TCPCB template. */
tcp_tcpcb_template();
/* Initialize reassembly queue */
tcpipqent_init();
/* SACK */
tcp_sack_init();
MOWNER_ATTACH(&tcp_tx_mowner);
MOWNER_ATTACH(&tcp_rx_mowner);
MOWNER_ATTACH(&tcp_reass_mowner);
MOWNER_ATTACH(&tcp_sock_mowner);
MOWNER_ATTACH(&tcp_sock_tx_mowner);
MOWNER_ATTACH(&tcp_sock_rx_mowner);
MOWNER_ATTACH(&tcp_mowner);
tcpstat_percpu = percpu_alloc(sizeof(uint64_t) * TCP_NSTATS);
vtw_earlyinit();
tcp_slowtimo_init();
return 0;
}
void
tcp_init_common(unsigned basehlen)
{
static ONCE_DECL(dotcpinit);
unsigned hlen = basehlen + sizeof(struct tcphdr);
unsigned oldhlen;
if (max_linkhdr + hlen > MHLEN)
panic("tcp_init");
while ((oldhlen = max_protohdr) < hlen)
atomic_cas_uint(&max_protohdr, oldhlen, hlen);
RUN_ONCE(&dotcpinit, do_tcpinit);
}
/*
* Tcp initialization
*/
void
tcp_init(void)
{
icmp_mtudisc_callback_register(tcp_mtudisc_callback);
tcp_init_common(sizeof(struct ip));
}
/*
* Create template to be used to send tcp packets on a connection.
* Called after the host entry is created; allocates an mbuf and fills
* in a skeletal TCP/IP header, minimizing the amount of work
* necessary when the connection is used.
*/
struct mbuf *
tcp_template(struct tcpcb *tp)
{
struct inpcb *inp = tp->t_inpcb;
struct tcphdr *n;
struct mbuf *m;
int hlen;
switch (tp->t_family) {
case AF_INET:
hlen = sizeof(struct ip);
if (inp->inp_af == AF_INET)
break;
#ifdef INET6
if (inp->inp_af == AF_INET6) {
/* mapped addr case */
		if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
		    IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)))
break;
}
#endif
return NULL; /*EINVAL*/
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
if (inp != NULL) {
/* more sanity checks? */
break;
}
return NULL; /*EINVAL*/
#endif
default:
return NULL; /*EAFNOSUPPORT*/
}
KASSERT(hlen + sizeof(struct tcphdr) <= MCLBYTES);
m = tp->t_template;
if (m && m->m_len == hlen + sizeof(struct tcphdr)) {
;
} else {
if (m)
m_freem(m);
m = tp->t_template = NULL;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m && hlen + sizeof(struct tcphdr) > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return NULL;
MCLAIM(m, &tcp_mowner);
m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr);
}
memset(mtod(m, void *), 0, m->m_len);
n = (struct tcphdr *)(mtod(m, char *) + hlen);
switch (tp->t_family) {
case AF_INET:
{
struct ipovly *ipov;
mtod(m, struct ip *)->ip_v = 4;
mtod(m, struct ip *)->ip_hl = hlen >> 2;
ipov = mtod(m, struct ipovly *);
ipov->ih_pr = IPPROTO_TCP;
ipov->ih_len = htons(sizeof(struct tcphdr));
		if (inp->inp_af == AF_INET) {
			ipov->ih_src = in4p_laddr(inp);
ipov->ih_dst = in4p_faddr(inp);
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
/* mapped addr case */
bcopy(&in6p_laddr(inp).s6_addr32[3], &ipov->ih_src,
sizeof(ipov->ih_src));
bcopy(&in6p_faddr(inp).s6_addr32[3], &ipov->ih_dst,
sizeof(ipov->ih_dst));
}
#endif
/*
* Compute the pseudo-header portion of the checksum
* now. We incrementally add in the TCP option and
* payload lengths later, and then compute the TCP
* checksum right before the packet is sent off onto
* the wire.
*/
n->th_sum = in_cksum_phdr(ipov->ih_src.s_addr,
ipov->ih_dst.s_addr,
htons(sizeof(struct tcphdr) + IPPROTO_TCP));
break;
}
#ifdef INET6
case AF_INET6:
{
struct ip6_hdr *ip6;
mtod(m, struct ip *)->ip_v = 6;
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_plen = htons(sizeof(struct tcphdr));
ip6->ip6_src = in6p_laddr(inp);
ip6->ip6_dst = in6p_faddr(inp);
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
		if (ip6_auto_flowlabel) {
			ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
ip6->ip6_flow |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
}
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/*
* Compute the pseudo-header portion of the checksum
* now. We incrementally add in the TCP option and
* payload lengths later, and then compute the TCP
* checksum right before the packet is sent off onto
* the wire.
*/
n->th_sum = in6_cksum_phdr(&in6p_laddr(inp),
&in6p_faddr(inp), htonl(sizeof(struct tcphdr)),
htonl(IPPROTO_TCP));
break;
}
#endif
}
n->th_sport = inp->inp_lport;
n->th_dport = inp->inp_fport;
n->th_seq = 0;
n->th_ack = 0;
n->th_x2 = 0;
n->th_off = 5;
n->th_flags = 0;
n->th_win = 0;
n->th_urp = 0;
return m;
}
/*
* Send a single message to the TCP at address specified by
* the given TCP/IP header. If m == 0, then we make a copy
* of the tcpiphdr at ti and send directly to the addressed host.
* This is used to force keep alive messages out using the TCP
* template for a connection tp->t_template. If flags are given
* then we send a message back to the TCP which originated the
* segment ti, and discard the mbuf containing it and any other
* attached mbufs.
*
* In any case the ack and sequence number of the transmitted
* segment are as specified by the parameters.
*/
int
tcp_respond(struct tcpcb *tp, struct mbuf *mtemplate, struct mbuf *m,
struct tcphdr *th0, tcp_seq ack, tcp_seq seq, int flags)
{
struct route *ro;
int error, tlen, win = 0;
int hlen;
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
int family; /* family on packet, not inpcb! */
struct tcphdr *th;
if (tp != NULL && (flags & TH_RST) == 0) {
KASSERT(tp->t_inpcb != NULL);
win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
}
th = NULL; /* Quell uninitialized warning */
ip = NULL;
#ifdef INET6
ip6 = NULL;
#endif
if (m == NULL) {
if (!mtemplate)
return EINVAL;
/* get family information from template */
switch (mtod(mtemplate, struct ip *)->ip_v) {
case 4:
family = AF_INET;
hlen = sizeof(struct ip);
break;
#ifdef INET6
case 6:
family = AF_INET6;
hlen = sizeof(struct ip6_hdr);
break;
#endif
default:
return EAFNOSUPPORT;
}
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m) {
MCLAIM(m, &tcp_tx_mowner);
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return ENOBUFS;
tlen = 0;
m->m_data += max_linkhdr;
bcopy(mtod(mtemplate, void *), mtod(m, void *),
mtemplate->m_len);
switch (family) {
case AF_INET:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
break;
#endif
}
flags = TH_ACK;
} else {
if ((m->m_flags & M_PKTHDR) == 0) {
m_freem(m);
return EINVAL;
}
KASSERT(th0 != NULL);
/* get family information from m */
switch (mtod(m, struct ip *)->ip_v) {
case 4:
family = AF_INET;
hlen = sizeof(struct ip);
ip = mtod(m, struct ip *);
break;
#ifdef INET6
case 6:
family = AF_INET6;
hlen = sizeof(struct ip6_hdr);
ip6 = mtod(m, struct ip6_hdr *);
break;
#endif
default:
m_freem(m);
return EAFNOSUPPORT;
}
/* clear h/w csum flags inherited from rx packet */
m->m_pkthdr.csum_flags = 0;
if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2))
tlen = sizeof(*th0);
else
tlen = th0->th_off << 2;
if (m->m_len > hlen + tlen && (m->m_flags & M_EXT) == 0 &&
mtod(m, char *) + hlen == (char *)th0) {
m->m_len = hlen + tlen;
m_freem(m->m_next);
m->m_next = NULL;
} else {
struct mbuf *n;
KASSERT(max_linkhdr + hlen + tlen <= MCLBYTES);
MGETHDR(n, M_DONTWAIT, MT_HEADER);
if (n && max_linkhdr + hlen + tlen > MHLEN) {
MCLGET(n, M_DONTWAIT);
if ((n->m_flags & M_EXT) == 0) {
m_freem(n);
n = NULL;
}
}
if (!n) {
m_freem(m);
return ENOBUFS;
}
MCLAIM(n, &tcp_tx_mowner);
n->m_data += max_linkhdr;
n->m_len = hlen + tlen;
m_copyback(n, 0, hlen, mtod(m, void *));
m_copyback(n, hlen, tlen, (void *)th0);
m_freem(m);
m = n;
n = NULL;
}
#define xchg(a,b,type) { type t; t=a; a=b; b=t; }
switch (family) {
case AF_INET:
ip = mtod(m, struct ip *);
th = (struct tcphdr *)(ip + 1);
ip->ip_p = IPPROTO_TCP;
xchg(ip->ip_dst, ip->ip_src, struct in_addr);
ip->ip_p = IPPROTO_TCP;
break;
#ifdef INET6
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
ip6->ip6_nxt = IPPROTO_TCP;
xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
ip6->ip6_nxt = IPPROTO_TCP;
break;
#endif
}
xchg(th->th_dport, th->th_sport, u_int16_t);
#undef xchg
tlen = 0; /*be friendly with the following code*/
}
th->th_seq = htonl(seq);
th->th_ack = htonl(ack);
th->th_x2 = 0;
if ((flags & TH_SYN) == 0) {
if (tp)
win >>= tp->rcv_scale;
if (win > TCP_MAXWIN)
win = TCP_MAXWIN;
th->th_win = htons((u_int16_t)win);
th->th_off = sizeof (struct tcphdr) >> 2;
tlen += sizeof(*th);
} else {
tlen += th->th_off << 2;
}
m->m_len = hlen + tlen;
m->m_pkthdr.len = hlen + tlen;
m_reset_rcvif(m);
th->th_flags = flags;
th->th_urp = 0;
switch (family) {
case AF_INET:
{
struct ipovly *ipov = (struct ipovly *)ip;
memset(ipov->ih_x1, 0, sizeof ipov->ih_x1);
ipov->ih_len = htons((u_int16_t)tlen);
th->th_sum = 0;
th->th_sum = in_cksum(m, hlen + tlen);
ip->ip_len = htons(hlen + tlen);
ip->ip_ttl = ip_defttl;
break;
}
#ifdef INET6
case AF_INET6:
{
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
tlen);
ip6->ip6_plen = htons(tlen);
if (tp && tp->t_inpcb->inp_af == AF_INET6)
ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb);
else
ip6->ip6_hlim = ip6_defhlim;
ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
if (ip6_auto_flowlabel) {
ip6->ip6_flow |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
}
break;
}
#endif
}
if (tp != NULL && tp->t_inpcb->inp_af == AF_INET) {
ro = &tp->t_inpcb->inp_route;
KASSERT(family == AF_INET);
KASSERT(in_hosteq(ip->ip_dst, in4p_faddr(tp->t_inpcb)));
}
#ifdef INET6
else if (tp != NULL && tp->t_inpcb->inp_af == AF_INET6) {
ro = (struct route *)&tp->t_inpcb->inp_route;
#ifdef DIAGNOSTIC
if (family == AF_INET) {
if (!IN6_IS_ADDR_V4MAPPED(&in6p_faddr(tp->t_inpcb)))
panic("tcp_respond: not mapped addr");
if (memcmp(&ip->ip_dst,
&in6p_faddr(tp->t_inpcb).s6_addr32[3],
sizeof(ip->ip_dst)) != 0) {
panic("tcp_respond: ip_dst != in6p_faddr");
}
} else if (family == AF_INET6) {
if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&in6p_faddr(tp->t_inpcb)))
panic("tcp_respond: ip6_dst != in6p_faddr");
} else
panic("tcp_respond: address family mismatch");
#endif
}
#endif
else
ro = NULL;
switch (family) {
case AF_INET:
error = ip_output(m, NULL, ro,
(tp && tp->t_mtudisc ? IP_MTUDISC : 0), NULL,
tp ? tp->t_inpcb : NULL);
break;
#ifdef INET6
case AF_INET6:
error = ip6_output(m, NULL, ro, 0, NULL,
tp ? tp->t_inpcb : NULL, NULL);
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
return error;
}
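/*
 * Illustrative call (a sketch of the keep-alive probe described in the
 * comment above tcp_respond(); the exact arguments are shown for
 * illustration only): probe the peer using the connection's template,
 * acknowledging rcv_nxt and sending sequence snd_una - 1 so the peer is
 * forced to answer.
 *
 *	(void)tcp_respond(tp, tp->t_template, NULL, NULL,
 *	    tp->rcv_nxt, tp->snd_una - 1, 0);
 */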
/*
* Template TCPCB. Rather than zeroing a new TCPCB and initializing
* a bunch of members individually, we maintain this template for the
* static and mostly-static components of the TCPCB, and copy it into
* the new TCPCB instead.
*/
static struct tcpcb tcpcb_template = {
.t_srtt = TCPTV_SRTTBASE,
.t_rttmin = TCPTV_MIN,
.snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT,
.snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT,
.snd_numholes = 0,
.snd_cubic_wmax = 0,
.snd_cubic_wmax_last = 0,
.snd_cubic_ctime = 0,
.t_partialacks = -1,
.t_bytes_acked = 0,
.t_sndrexmitpack = 0,
.t_rcvoopack = 0,
.t_sndzerowin = 0,
};
/*
* Updates the TCPCB template whenever a parameter that would affect
* the template is changed.
*/
void
tcp_tcpcb_template(void)
{
struct tcpcb *tp = &tcpcb_template;
int flags;
tp->t_peermss = tcp_mssdflt;
tp->t_ourmss = tcp_mssdflt;
tp->t_segsz = tcp_mssdflt;
flags = 0;
if (tcp_do_rfc1323 && tcp_do_win_scale)
flags |= TF_REQ_SCALE;
if (tcp_do_rfc1323 && tcp_do_timestamps)
flags |= TF_REQ_TSTMP;
tp->t_flags = flags;
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate.  Set rttvar so that srtt + 2 * rttvar gives a
* reasonable initial retransmit time.
*/
tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
TCPTV_MIN, TCPTV_REXMTMAX);
/* Keep Alive */
tp->t_keepinit = MIN(tcp_keepinit, TCP_TIMER_MAXTICKS);
tp->t_keepidle = MIN(tcp_keepidle, TCP_TIMER_MAXTICKS);
tp->t_keepintvl = MIN(tcp_keepintvl, TCP_TIMER_MAXTICKS);
tp->t_keepcnt = MAX(1, MIN(tcp_keepcnt, TCP_TIMER_MAXTICKS));
tp->t_maxidle = tp->t_keepcnt * MIN(tp->t_keepintvl,
TCP_TIMER_MAXTICKS/tp->t_keepcnt);
/* MSL */
tp->t_msl = TCPTV_MSL;
}
/*
* Create a new TCP control block, making an
* empty reassembly queue and hooking it to the argument
* protocol control block.
*/
struct tcpcb *
tcp_newtcpcb(int family, struct inpcb *inp)
{
struct tcpcb *tp;
int i;
/* XXX Consider using a pool_cache for speed. */
tp = pool_get(&tcpcb_pool, PR_NOWAIT); /* splsoftnet via tcp_usrreq */
if (tp == NULL)
return NULL;
memcpy(tp, &tcpcb_template, sizeof(*tp));
TAILQ_INIT(&tp->segq);
TAILQ_INIT(&tp->timeq);
tp->t_family = family; /* may be overridden later on */
TAILQ_INIT(&tp->snd_holes);
LIST_INIT(&tp->t_sc); /* XXX can template this */
/* Don't sweat this loop; hopefully the compiler will unroll it. */
for (i = 0; i < TCPT_NTIMERS; i++) {
callout_init(&tp->t_timer[i], CALLOUT_MPSAFE);
TCP_TIMER_INIT(tp, i);
}
callout_init(&tp->t_delack_ch, CALLOUT_MPSAFE);
switch (family) {
case AF_INET:
in4p_ip(inp).ip_ttl = ip_defttl;
inp->inp_ppcb = (void *)tp;
tp->t_inpcb = inp;
tp->t_mtudisc = ip_mtudisc;
break;
#ifdef INET6
case AF_INET6:
in6p_ip6(inp).ip6_hlim = in6pcb_selecthlim_rt(inp);
inp->inp_ppcb = (void *)tp;
tp->t_inpcb = inp;
/* for IPv6, always try to run path MTU discovery */
tp->t_mtudisc = 1;
break;
#endif /* INET6 */
default:
for (i = 0; i < TCPT_NTIMERS; i++)
callout_destroy(&tp->t_timer[i]);
callout_destroy(&tp->t_delack_ch);
pool_put(&tcpcb_pool, tp); /* splsoftnet via tcp_usrreq */
return NULL;
}
/*
* Initialize our timebase. When we send timestamps, we take
* the delta from tcp_now -- this means each connection always
* gets a timebase of 1, which makes it, among other things,
* more difficult to determine how long a system has been up,
* and thus how many TCP sequence increments have occurred.
*
* We start with 1, because 0 doesn't work with linux, which
* considers timestamp 0 in a SYN packet as a bug and disables
* timestamps.
*/
tp->ts_timebase = tcp_now - 1;
tcp_congctl_select(tp, tcp_congctl_global_name);
return tp;
}
/*
* Drop a TCP connection, reporting
* the specified error. If connection is synchronized,
* then send a RST to peer.
*/
struct tcpcb *
tcp_drop(struct tcpcb *tp, int errno)
{
struct socket *so;
KASSERT(tp->t_inpcb != NULL);
so = tp->t_inpcb->inp_socket;
if (so == NULL)
return NULL;
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_state = TCPS_CLOSED;
(void) tcp_output(tp);
TCP_STATINC(TCP_STAT_DROPS);
} else
TCP_STATINC(TCP_STAT_CONNDROPS);
if (errno == ETIMEDOUT && tp->t_softerror)
errno = tp->t_softerror;
so->so_error = errno;
return (tcp_close(tp));
}
/*
* Close a TCP control block:
* discard all space held by the tcp
* discard internet protocol block
* wake up any sleepers
*/
struct tcpcb *
tcp_close(struct tcpcb *tp)
{
struct inpcb *inp;
struct socket *so;
#ifdef RTV_RTT
struct rtentry *rt = NULL;
#endif
struct route *ro;
int j;
inp = tp->t_inpcb;
so = inp->inp_socket;
ro = &inp->inp_route;
#ifdef RTV_RTT
/*
* If we sent enough data to get some meaningful characteristics,
* save them in the routing entry. 'Enough' is arbitrarily
* defined as the sendpipesize (default 4K) * 16. This would
* give us 16 rtt samples assuming we only get one sample per
* window (the usual case on a long haul net). 16 samples is
* enough for the srtt filter to converge to within 5% of the correct
* value; fewer samples and we could save a very bogus rtt.
*
* Don't update the default route's characteristics and don't
* update anything that the user "locked".
*/
	if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
	    ro && (rt = rtcache_validate(ro)) != NULL &&
	    !in_nullhost(satocsin(rt_getkey(rt))->sin_addr)) {
u_long i = 0;
		if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
			i = tp->t_srtt *
			    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
if (rt->rt_rmx.rmx_rtt && i)
/*
* filter this update to half the old & half
* the new values, converting scale.
* See route.h and tcp_var.h for a
* description of the scaling constants.
*/
rt->rt_rmx.rmx_rtt =
(rt->rt_rmx.rmx_rtt + i) / 2;
else
rt->rt_rmx.rmx_rtt = i;
}
		if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
			i = tp->t_rttvar *
			    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
if (rt->rt_rmx.rmx_rttvar && i)
rt->rt_rmx.rmx_rttvar =
(rt->rt_rmx.rmx_rttvar + i) / 2;
else
rt->rt_rmx.rmx_rttvar = i;
}
/*
* update the pipelimit (ssthresh) if it has been updated
* already or if a pipesize was specified & the threshold
* got below half the pipesize. I.e., wait for bad news
* before we start updating, then update on both good
* and bad news.
*/
		if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
		    (i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
		    i < (rt->rt_rmx.rmx_sendpipe / 2)) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
i = (i + tp->t_segsz / 2) / tp->t_segsz;
if (i < 2)
i = 2;
i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr));
if (rt->rt_rmx.rmx_ssthresh)
rt->rt_rmx.rmx_ssthresh =
(rt->rt_rmx.rmx_ssthresh + i) / 2;
else
rt->rt_rmx.rmx_ssthresh = i;
}
}
rtcache_unref(rt, ro);
#endif /* RTV_RTT */
/* free the reassembly queue, if any */
TCP_REASS_LOCK(tp);
(void) tcp_freeq(tp);
TCP_REASS_UNLOCK(tp);
/* free the SACK holes list. */
tcp_free_sackholes(tp);
tcp_congctl_release(tp);
syn_cache_cleanup(tp);
	if (tp->t_template) {
		m_free(tp->t_template);
tp->t_template = NULL;
}
/*
* Detaching the pcb will unlock the socket/tcpcb, and stopping
* the timers can also drop the lock. We need to prevent access
* to the tcpcb as it's half torn down. Flag the pcb as dead
* (prevents access by timers) and only then detach it.
*/
tp->t_flags |= TF_DEAD;
inp->inp_ppcb = NULL;
soisdisconnected(so);
inpcb_destroy(inp);
/*
* pcb is no longer visible elsewhere, so we can safely release
* the lock in callout_halt() if needed.
*/
TCP_STATINC(TCP_STAT_CLOSED);
for (j = 0; j < TCPT_NTIMERS; j++) {
callout_halt(&tp->t_timer[j], softnet_lock);
callout_destroy(&tp->t_timer[j]);
}
callout_halt(&tp->t_delack_ch, softnet_lock);
callout_destroy(&tp->t_delack_ch);
pool_put(&tcpcb_pool, tp);
return NULL;
}
int
tcp_freeq(struct tcpcb *tp)
{
struct ipqent *qe;
int rv = 0;
	TCP_REASS_LOCK_CHECK(tp);
	while ((qe = TAILQ_FIRST(&tp->segq)) != NULL) {
		TAILQ_REMOVE(&tp->segq, qe, ipqe_q);
		TAILQ_REMOVE(&tp->timeq, qe, ipqe_timeq);
m_freem(qe->ipqe_m);
tcpipqent_free(qe);
rv = 1;
}
tp->t_segqlen = 0;
KASSERT(TAILQ_EMPTY(&tp->timeq));
return (rv);
}
void
tcp_fasttimo(void)
{
if (tcp_drainwanted) {
tcp_drain();
tcp_drainwanted = 0;
}
}
void
tcp_drainstub(void)
{
tcp_drainwanted = 1;
}
/*
* Protocol drain routine. Called when memory is in short supply.
* Called from pr_fasttimo thus a callout context.
*/
void
tcp_drain(void)
{
struct inpcb *inp;
struct tcpcb *tp;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
/*
* Free the sequence queue of all TCP connections.
*/
TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) {
tp = intotcpcb(inp);
if (tp != NULL) {
/*
* If the tcpcb is already busy,
* just bail out now.
*/
if (tcp_reass_lock_try(tp) == 0)
continue;
if (tcp_freeq(tp))
TCP_STATINC(TCP_STAT_CONNSDRAINED);
TCP_REASS_UNLOCK(tp);
}
}
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Notify a tcp user of an asynchronous error;
* store error as soft error, but wake up user
* (for now, this won't do anything until we can select for soft errors).
*/
void
tcp_notify(struct inpcb *inp, int error)
{
struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
struct socket *so = inp->inp_socket;
/*
* Ignore some errors if we are hooked up.
* If connection hasn't completed, has retransmitted several times,
* and receives a second error, give up now. This is better
* than waiting a long time to establish a connection that
* can never complete.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
(error == EHOSTUNREACH || error == ENETUNREACH ||
error == EHOSTDOWN)) {
return;
} else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
tp->t_rxtshift > 3 && tp->t_softerror)
so->so_error = error;
else
tp->t_softerror = error;
cv_broadcast(&so->so_cv);
sorwakeup(so);
sowwakeup(so);
}
#ifdef INET6
void *
tcp6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct tcphdr th;
void (*notify)(struct inpcb *, int) = tcp_notify;
int nmatch;
struct ip6_hdr *ip6;
const struct sockaddr_in6 *sa6_src = NULL;
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
struct mbuf *m;
int off;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
else if (cmd == PRC_QUENCH) {
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return NULL;
} else if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
sa6_src = &sa6_any;
off = 0;
}
if (ip6) {
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(th)) {
			if (cmd == PRC_MSGSIZE)
				icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
return NULL;
}
memset(&th, 0, sizeof(th));
m_copydata(m, off, sizeof(th), (void *)&th);
if (cmd == PRC_MSGSIZE) {
int valid = 0;
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMPv6 message
* payload.
*/
if (in6pcb_lookup(&tcbtable, &sa6->sin6_addr,
th.th_dport,
(const struct in6_addr *)&sa6_src->sin6_addr,
th.th_sport, 0, 0))
valid++;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* no need to call in6pcb_notify, it should have been
* called via callback if necessary
*/
return NULL;
}
nmatch = in6pcb_notify(&tcbtable, sa, th.th_dport,
(const struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify);
		if (nmatch == 0 && syn_cache_count &&
		    (inet6ctlerrmap[cmd] == EHOSTUNREACH ||
inet6ctlerrmap[cmd] == ENETUNREACH ||
inet6ctlerrmap[cmd] == EHOSTDOWN))
syn_cache_unreach((const struct sockaddr *)sa6_src,
sa, &th);
} else {
(void) in6pcb_notify(&tcbtable, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
}
return NULL;
}
#endif
/* assumes that ip header and tcp header are contiguous on mbuf */
void *
tcp_ctlinput(int cmd, const struct sockaddr *sa, void *v)
{
struct ip *ip = v;
struct tcphdr *th;
struct icmp *icp;
extern const int inetctlerrmap[];
void (*notify)(struct inpcb *, int) = tcp_notify;
int errno;
int nmatch;
struct tcpcb *tp;
u_int mtu;
tcp_seq seq;
struct inpcb *inp;
#ifdef INET6
struct in6_addr src6, dst6;
#endif
if (sa->sa_family != AF_INET ||
sa->sa_len != sizeof(struct sockaddr_in))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
errno = inetctlerrmap[cmd];
if (cmd == PRC_QUENCH)
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return NULL;
else if (PRC_IS_REDIRECT(cmd))
notify = inpcb_rtchange, ip = 0;
else if (cmd == PRC_MSGSIZE && ip && ip->ip_v == 4) {
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMP message
* payload.
*
* Boundary check is made in icmp_input(), with ICMP_ADVLENMIN.
*/
th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
#ifdef INET6
in6_in_2_v4mapin6(&ip->ip_src, &src6);
in6_in_2_v4mapin6(&ip->ip_dst, &dst6);
#endif
if ((inp = inpcb_lookup(&tcbtable, ip->ip_dst,
th->th_dport, ip->ip_src, th->th_sport, 0)) != NULL)
;
#ifdef INET6
else if ((inp = in6pcb_lookup(&tcbtable, &dst6,
th->th_dport, &src6, th->th_sport, 0, 0)) != NULL)
;
#endif
else
return NULL;
/*
* Now that we've validated that we are actually communicating
* with the host indicated in the ICMP message, locate the
* ICMP header, recalculate the new MTU, and create the
* corresponding routing entry.
*/
icp = (struct icmp *)((char *)ip -
offsetof(struct icmp, icmp_ip));
tp = intotcpcb(inp);
if (tp == NULL)
return NULL;
seq = ntohl(th->th_seq);
if (SEQ_LT(seq, tp->snd_una) || SEQ_GT(seq, tp->snd_max))
return NULL;
/*
* If the ICMP message advertises a Next-Hop MTU
* equal or larger than the maximum packet size we have
* ever sent, drop the message.
*/
mtu = (u_int)ntohs(icp->icmp_nextmtu);
if (mtu >= tp->t_pmtud_mtu_sent)
return NULL;
if (mtu >= tcp_hdrsz(tp) + tp->t_pmtud_mss_acked) {
/*
* Calculate new MTU, and create corresponding
* route (traditional PMTUD).
*/
tp->t_flags &= ~TF_PMTUD_PEND;
icmp_mtudisc(icp, ip->ip_dst);
} else {
/*
* Record the information got in the ICMP
* message; act on it later.
* If we had already recorded an ICMP message,
* replace the old one only if the new message
* refers to an older TCP segment
*/
if (tp->t_flags & TF_PMTUD_PEND) {
if (SEQ_LT(tp->t_pmtud_th_seq, seq))
return NULL;
} else
tp->t_flags |= TF_PMTUD_PEND;
tp->t_pmtud_th_seq = seq;
tp->t_pmtud_nextmtu = icp->icmp_nextmtu;
tp->t_pmtud_ip_len = icp->icmp_ip.ip_len;
tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl;
}
return NULL;
} else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if (errno == 0)
return NULL;
if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) {
th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
nmatch = inpcb_notify(&tcbtable, satocsin(sa)->sin_addr,
th->th_dport, ip->ip_src, th->th_sport, errno, notify);
		if (nmatch == 0 && syn_cache_count &&
		    (inetctlerrmap[cmd] == EHOSTUNREACH ||
inetctlerrmap[cmd] == ENETUNREACH ||
inetctlerrmap[cmd] == EHOSTDOWN)) {
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_port = th->th_sport;
sin.sin_addr = ip->ip_src;
syn_cache_unreach((struct sockaddr *)&sin, sa, th);
}
/* XXX mapped address case */
} else
inpcb_notifyall(&tcbtable, satocsin(sa)->sin_addr, errno,
notify);
return NULL;
}
/*
* When a source quench is received, we are being notified of congestion.
* Close the congestion window down to the Loss Window (one segment).
* We will gradually open it again as we proceed.
*/
void
tcp_quench(struct inpcb *inp)
{
struct tcpcb *tp = intotcpcb(inp);
if (tp) {
tp->snd_cwnd = tp->t_segsz;
tp->t_bytes_acked = 0;
}
}
/*
* Path MTU Discovery handlers.
*/
void
tcp_mtudisc_callback(struct in_addr faddr)
{
#ifdef INET6
struct in6_addr in6;
#endif
inpcb_notifyall(&tcbtable, faddr, EMSGSIZE, tcp_mtudisc);
#ifdef INET6
in6_in_2_v4mapin6(&faddr, &in6);
tcp6_mtudisc_callback(&in6);
#endif
}
/*
* On receipt of path MTU corrections, flush old route and replace it
* with the new one. Retransmit all unacknowledged packets, to ensure
* that all packets will be received.
*/
void
tcp_mtudisc(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
if (tp == NULL)
return;
rt = inpcb_rtentry(inp);
if (rt != NULL) {
/*
* If this was not a host route, remove and realloc.
*/
if ((rt->rt_flags & RTF_HOST) == 0) {
inpcb_rtentry_unref(rt, inp);
inpcb_rtchange(inp, errno);
if ((rt = inpcb_rtentry(inp)) == NULL)
return;
}
/*
* Slow start out of the error condition. We
* use the MTU because we know it's smaller
* than the previously transmitted segment.
*
* Note: This is more conservative than the
* suggestion in draft-floyd-incr-init-win-03.
*/
if (rt->rt_rmx.rmx_mtu != 0)
tp->snd_cwnd =
TCP_INITIAL_WINDOW(tcp_init_win,
rt->rt_rmx.rmx_mtu);
inpcb_rtentry_unref(rt, inp);
}
/*
* Resend unacknowledged packets.
*/
tp->snd_nxt = tp->sack_newdata = tp->snd_una;
tcp_output(tp);
}
#ifdef INET6
/*
* Path MTU Discovery handlers.
*/
void
tcp6_mtudisc_callback(struct in6_addr *faddr)
{
struct sockaddr_in6 sin6;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_addr = *faddr;
(void) in6pcb_notify(&tcbtable, (struct sockaddr *)&sin6, 0,
(const struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp6_mtudisc);
}
void
tcp6_mtudisc(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
if (tp == NULL)
return;
rt = in6pcb_rtentry(inp);
if (rt != NULL) {
/*
* If this was not a host route, remove and realloc.
*/
if ((rt->rt_flags & RTF_HOST) == 0) {
in6pcb_rtentry_unref(rt, inp);
in6pcb_rtchange(inp, errno);
rt = in6pcb_rtentry(inp);
if (rt == NULL)
return;
}
/*
* Slow start out of the error condition. We
* use the MTU because we know it's smaller
* than the previously transmitted segment.
*
* Note: This is more conservative than the
* suggestion in draft-floyd-incr-init-win-03.
*/
if (rt->rt_rmx.rmx_mtu != 0) {
tp->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win,
rt->rt_rmx.rmx_mtu);
}
in6pcb_rtentry_unref(rt, inp);
}
/*
* Resend unacknowledged packets.
*/
tp->snd_nxt = tp->sack_newdata = tp->snd_una;
tcp_output(tp);
}
#endif /* INET6 */
/*
* Compute the MSS to advertise to the peer. Called only during
* the 3-way handshake. If we are the server (peer initiated
* connection), we are called with a pointer to the interface
* on which the SYN packet arrived. If we are the client (we
* initiated connection), we are called with a pointer to the
* interface out which this connection should go.
*
* NOTE: Do not subtract IP option/extension header size nor IPsec
* header size from MSS advertisement. MSS option must hold the maximum
* segment size we can accept, so it must always be:
* max(if mtu) - ip header - tcp header
*/
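/*
 * For example, on a standard Ethernet interface (MTU 1500) this works
 * out to 1500 - 20 - 20 = 1460 for IPv4, or 1500 - 40 - 20 = 1440 for
 * IPv6, before the defaulting against tcp_mssdflt below.
 */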
u_long
tcp_mss_to_advertise(const struct ifnet *ifp, int af)
{
extern u_long in_maxmtu;
u_long mss = 0;
u_long hdrsiz;
/*
* In order to avoid defeating path MTU discovery on the peer,
* we advertise the max MTU of all attached networks as our MSS,
* per RFC 1191, section 3.1.
*
* We provide the option to advertise just the MTU of
* the interface on which we hope this connection will
* be receiving. If we are responding to a SYN, we
* will have a pretty good idea about this, but when
* initiating a connection there is a bit more doubt.
*
* We also need to ensure that loopback has a large enough
* MSS, as the loopback MTU is never included in in_maxmtu.
*/
if (ifp != NULL) switch (af) {
#ifdef INET6
case AF_INET6: /* FALLTHROUGH */
#endif
case AF_INET:
mss = ifp->if_mtu;
break;
}
if (tcp_mss_ifmtu == 0) switch (af) {
#ifdef INET6
case AF_INET6: /* FALLTHROUGH */
#endif
case AF_INET:
mss = uimax(in_maxmtu, mss);
break;
}
switch (af) {
case AF_INET:
hdrsiz = sizeof(struct ip);
break;
#ifdef INET6
case AF_INET6:
hdrsiz = sizeof(struct ip6_hdr);
break;
#endif
default:
hdrsiz = 0;
break;
}
hdrsiz += sizeof(struct tcphdr);
if (mss > hdrsiz)
mss -= hdrsiz;
mss = uimax(tcp_mssdflt, mss);
return (mss);
}
/*
* Set connection variables based on the peer's advertised MSS.
* We are passed the TCPCB for the actual connection. If we
* are the server, we are called by the compressed state engine
* when the 3-way handshake is complete. If we are the client,
* we are called when we receive the SYN,ACK from the server.
*
* NOTE: Our advertised MSS value must be initialized in the TCPCB
* before this routine is called!
*/
void
tcp_mss_from_peer(struct tcpcb *tp, int offer)
{
struct socket *so;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
struct rtentry *rt;
#endif
u_long bufsize;
int mss;
KASSERT(tp->t_inpcb != NULL);
so = NULL;
rt = NULL;
so = tp->t_inpcb->inp_socket;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
rt = inpcb_rtentry(tp->t_inpcb);
#endif
/*
* As per RFC1122, use the default MSS value, unless they
* sent us an offer. Do not accept offers less than 256 bytes.
*/
mss = tcp_mssdflt;
if (offer)
mss = offer;
mss = uimax(mss, 256); /* sanity */
tp->t_peermss = mss;
mss -= tcp_optlen(tp);
if (tp->t_inpcb->inp_af == AF_INET)
mss -= ip_optlen(tp->t_inpcb);
#ifdef INET6
if (tp->t_inpcb->inp_af == AF_INET6)
mss -= ip6_optlen(tp->t_inpcb);
#endif
/*
* XXX XXX What if mss goes negative or zero? This can happen if a
* socket has large IPv6 options. We crash below.
*/
/*
* If there's a pipesize, change the socket buffer to that size.
* Make the socket buffer an integral number of MSS units. If
* the MSS is larger than the socket buffer, artificially decrease
* the MSS.
*/
#ifdef RTV_SPIPE
if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0)
bufsize = rt->rt_rmx.rmx_sendpipe;
else
#endif
{
KASSERT(so != NULL);
bufsize = so->so_snd.sb_hiwat;
}
if (bufsize < mss)
mss = bufsize;
else {
bufsize = roundup(bufsize, mss);
if (bufsize > sb_max)
bufsize = sb_max;
(void) sbreserve(&so->so_snd, bufsize, so);
}
tp->t_segsz = mss;
#ifdef RTV_SSTHRESH
if (rt != NULL && rt->rt_rmx.rmx_ssthresh) {
/*
* There's some sort of gateway or interface buffer
* limit on the path. Use this to set the slow
* start threshold, but set the threshold to no less
* than 2 * MSS.
*/
tp->snd_ssthresh = uimax(2 * mss, rt->rt_rmx.rmx_ssthresh);
}
#endif
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
/*
* Processing necessary when a TCP connection is established.
*/
void
tcp_established(struct tcpcb *tp)
{
struct socket *so;
#ifdef RTV_RPIPE
struct rtentry *rt;
#endif
u_long bufsize;
KASSERT(tp->t_inpcb != NULL);
so = NULL;
rt = NULL;
/* This is a while() to reduce the dreadful stairstepping below */
while (tp->t_inpcb->inp_af == AF_INET) {
so = tp->t_inpcb->inp_socket;
#if defined(RTV_RPIPE)
rt = inpcb_rtentry(tp->t_inpcb);
#endif
if (__predict_true(tcp_msl_enable)) {
if (in4p_laddr(tp->t_inpcb).s_addr == INADDR_LOOPBACK) {
tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
break;
}
if (__predict_false(tcp_rttlocal)) {
/* This may be adjusted by tcp_input */
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
if (in_localaddr(in4p_faddr(tp->t_inpcb))) {
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
}
tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
break;
}
/* Clamp to a reasonable range. */
tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
#ifdef INET6
while (tp->t_inpcb->inp_af == AF_INET6) {
so = tp->t_inpcb->inp_socket;
#if defined(RTV_RPIPE)
rt = in6pcb_rtentry(tp->t_inpcb);
#endif
if (__predict_true(tcp_msl_enable)) {
extern const struct in6_addr in6addr_loopback;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(tp->t_inpcb),
&in6addr_loopback)) {
tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
break;
}
if (__predict_false(tcp_rttlocal)) {
/* This may be adjusted by tcp_input */
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
if (in6_localaddr(&in6p_faddr(tp->t_inpcb))) {
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
}
tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
break;
}
/* Clamp to a reasonable range. */
tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
#endif
tp->t_state = TCPS_ESTABLISHED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
#ifdef RTV_RPIPE
if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
bufsize = rt->rt_rmx.rmx_recvpipe;
else
#endif
{
KASSERT(so != NULL);
bufsize = so->so_rcv.sb_hiwat;
}
if (bufsize > tp->t_ourmss) {
bufsize = roundup(bufsize, tp->t_ourmss);
if (bufsize > sb_max)
bufsize = sb_max;
(void) sbreserve(&so->so_rcv, bufsize, so);
}
#ifdef RTV_RPIPE
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
/*
* Check if there's an initial rtt or rttvar. Convert from the
* route-table units to scaled multiples of the slow timeout timer.
* Called only during the 3-way handshake.
*/
void
tcp_rmx_rtt(struct tcpcb *tp)
{
#ifdef RTV_RTT
struct rtentry *rt = NULL;
int rtt;
KASSERT(tp->t_inpcb != NULL);
rt = inpcb_rtentry(tp->t_inpcb);
if (rt == NULL)
return;
if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
/*
* XXX The lock bit for RTT indicates that the value
* is also a minimum value; this is subject to time.
*/
if (rt->rt_rmx.rmx_locks & RTV_RTT)
TCPT_RANGESET(tp->t_rttmin,
rtt / (RTM_RTTUNIT / PR_SLOWHZ),
TCPTV_MIN, TCPTV_REXMTMAX);
tp->t_srtt = rtt /
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
if (rt->rt_rmx.rmx_rttvar) {
tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
((RTM_RTTUNIT / PR_SLOWHZ) >>
(TCP_RTTVAR_SHIFT + 2));
} else {
/* Default variation is +- 1 rtt */
tp->t_rttvar =
tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT);
}
TCPT_RANGESET(tp->t_rxtcur,
((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2),
tp->t_rttmin, TCPTV_REXMTMAX);
}
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
tcp_seq tcp_iss_seq = 0; /* tcp initial seq # */
/*
* Get a new sequence value given a tcp control block
*/
tcp_seq
tcp_new_iss(struct tcpcb *tp)
{
if (tp->t_inpcb->inp_af == AF_INET) {
return tcp_new_iss1(&in4p_laddr(tp->t_inpcb),
&in4p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport,
tp->t_inpcb->inp_fport, sizeof(in4p_laddr(tp->t_inpcb)));
}
#ifdef INET6
if (tp->t_inpcb->inp_af == AF_INET6) {
return tcp_new_iss1(&in6p_laddr(tp->t_inpcb),
&in6p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport,
tp->t_inpcb->inp_fport, sizeof(in6p_laddr(tp->t_inpcb)));
}
#endif
panic("tcp_new_iss: unreachable");
}
static u_int8_t tcp_iss_secret[16]; /* 128 bits; should be plenty */
/*
* Initialize RFC 1948 ISS Secret
*/
static int
tcp_iss_secret_init(void)
{
cprng_strong(kern_cprng,
tcp_iss_secret, sizeof(tcp_iss_secret), 0);
return 0;
}
/*
* This routine actually generates a new TCP initial sequence number.
*/
tcp_seq
tcp_new_iss1(void *laddr, void *faddr, u_int16_t lport, u_int16_t fport,
size_t addrsz)
{
tcp_seq tcp_iss;
if (tcp_do_rfc1948) {
MD5_CTX ctx;
u_int8_t hash[16]; /* XXX MD5 knowledge */
static ONCE_DECL(tcp_iss_secret_control);
/*
* If we haven't been here before, initialize our cryptographic
* hash secret.
*/
RUN_ONCE(&tcp_iss_secret_control, tcp_iss_secret_init);
/*
* Compute the base value of the ISS. It is a hash
* of (saddr, sport, daddr, dport, secret).
*/
MD5Init(&ctx);
MD5Update(&ctx, (u_char *) laddr, addrsz);
MD5Update(&ctx, (u_char *) &lport, sizeof(lport));
MD5Update(&ctx, (u_char *) faddr, addrsz);
MD5Update(&ctx, (u_char *) &fport, sizeof(fport));
MD5Update(&ctx, tcp_iss_secret, sizeof(tcp_iss_secret));
MD5Final(hash, &ctx);
memcpy(&tcp_iss, hash, sizeof(tcp_iss));
#ifdef TCPISS_DEBUG
printf("ISS hash 0x%08x, ", tcp_iss);
#endif
} else {
/*
* Randomize.
*/
tcp_iss = cprng_fast32() & TCP_ISS_RANDOM_MASK;
#ifdef TCPISS_DEBUG
printf("ISS random 0x%08x, ", tcp_iss);
#endif
}
/*
* Add the offset in to the computed value.
*/
tcp_iss += tcp_iss_seq;
#ifdef TCPISS_DEBUG
printf("ISS %08x\n", tcp_iss);
#endif
return tcp_iss;
}
#if defined(IPSEC)
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
ipsec4_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
size_t hdrsiz;
/* XXX mapped addr case (tp->t_inpcb) */
if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
return 0;
switch (tp->t_family) {
case AF_INET:
/* XXX: should use correct direction. */
hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
break;
default:
hdrsiz = 0;
break;
}
return hdrsiz;
}
#ifdef INET6
size_t
ipsec6_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
size_t hdrsiz;
if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
return 0;
switch (tp->t_family) {
case AF_INET6:
/* XXX: should use correct direction. */
hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
break;
case AF_INET:
/* mapped address case - tricky */
default:
hdrsiz = 0;
break;
}
return hdrsiz;
}
#endif
#endif /*IPSEC*/
/*
* Determine the length of the TCP options for this connection.
*
* XXX: What do we do for SACK, when we add that? Just reserve
* all of the space? Otherwise we can't exactly be incrementing
* cwnd by an amount that varies depending on the amount we last
* had to SACK!
*/
u_int
tcp_optlen(struct tcpcb *tp)
{
u_int optlen;
optlen = 0;
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
(TF_REQ_TSTMP | TF_RCVD_TSTMP))
optlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
optlen += TCPOLEN_SIGLEN;
#endif
return optlen;
}
u_int
tcp_hdrsz(struct tcpcb *tp)
{
u_int hlen;
switch (tp->t_family) {
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
break;
#endif
case AF_INET:
hlen = sizeof(struct ip);
break;
default:
hlen = 0;
break;
}
hlen += sizeof(struct tcphdr);
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
hlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
hlen += TCPOLEN_SIGLEN;
#endif
return hlen;
}
void
tcp_statinc(u_int stat)
{
KASSERT(stat < TCP_NSTATS);
TCP_STATINC(stat);
}
void
tcp_statadd(u_int stat, uint64_t val)
{
KASSERT(stat < TCP_NSTATS);
TCP_STATADD(stat, val);
}
/* $NetBSD: subr_ipi.c,v 1.11 2023/02/24 11:02:27 riastradh Exp $ */
/*-
* Copyright (c) 2014 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Inter-processor interrupt (IPI) interface: asynchronous IPIs to
* invoke functions with a constant argument and synchronous IPIs
* with the cross-call support.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_ipi.c,v 1.11 2023/02/24 11:02:27 riastradh Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/evcnt.h>
#include <sys/cpu.h>
#include <sys/ipi.h>
#include <sys/intr.h>
#include <sys/kcpuset.h>
#include <sys/kmem.h>
#include <sys/lock.h>
#include <sys/mutex.h>
/*
* An array of the IPI handlers used for asynchronous invocation.
* The lock protects the slot allocation.
*/
typedef struct {
ipi_func_t func;
void * arg;
} ipi_intr_t;
static kmutex_t ipi_mngmt_lock;
static ipi_intr_t ipi_intrs[IPI_MAXREG] __cacheline_aligned;
/*
* Per-CPU mailbox for IPI messages: it is a single cache line storing
* up to IPI_MSG_MAX messages. This interface is built on top of the
* synchronous IPIs.
*/
#define IPI_MSG_SLOTS (CACHE_LINE_SIZE / sizeof(ipi_msg_t *))
#define IPI_MSG_MAX IPI_MSG_SLOTS
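/*
 * For example, with 64-byte cache lines and 8-byte pointers this gives
 * 8 message slots per CPU; the exact figure is machine-dependent.
 */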
typedef struct {
ipi_msg_t * msg[IPI_MSG_SLOTS];
} ipi_mbox_t;
/* Mailboxes for the synchronous IPIs. */
static ipi_mbox_t * ipi_mboxes __read_mostly;
static struct evcnt ipi_mboxfull_ev __cacheline_aligned;
static void ipi_msg_cpu_handler(void *);
/* Handler for the synchronous IPIs - it must be zero. */
#define IPI_SYNCH_ID 0
#ifndef MULTIPROCESSOR
#define cpu_ipi(ci) KASSERT(ci == NULL)
#endif
void
ipi_sysinit(void)
{
mutex_init(&ipi_mngmt_lock, MUTEX_DEFAULT, IPL_NONE);
memset(ipi_intrs, 0, sizeof(ipi_intrs));
/*
* Register the handler for synchronous IPIs. This mechanism
* is built on top of the asynchronous interface. Slot zero is
* reserved permanently; it is also handy to use zero as the failure
* value for other registrations (as that is less error-prone).
*/
ipi_intrs[IPI_SYNCH_ID].func = ipi_msg_cpu_handler;
evcnt_attach_dynamic(&ipi_mboxfull_ev, EVCNT_TYPE_MISC, NULL,
"ipi", "full");
}
void
ipi_percpu_init(void)
{
const size_t len = ncpu * sizeof(ipi_mbox_t);
/* Initialise the per-CPU bit fields. */
for (u_int i = 0; i < ncpu; i++) {
struct cpu_info *ci = cpu_lookup(i);
memset(&ci->ci_ipipend, 0, sizeof(ci->ci_ipipend));
}
/* Allocate per-CPU IPI mailboxes. */
ipi_mboxes = kmem_zalloc(len, KM_SLEEP);
KASSERT(ipi_mboxes != NULL);
}
/*
* ipi_register: register an asynchronous IPI handler.
*
* => Returns IPI ID which is greater than zero; on failure - zero.
*/
u_int
ipi_register(ipi_func_t func, void *arg)
{
mutex_enter(&ipi_mngmt_lock);
for (u_int i = 0; i < IPI_MAXREG; i++) {
if (ipi_intrs[i].func == NULL) {
/* Register the function. */
ipi_intrs[i].func = func;
ipi_intrs[i].arg = arg;
mutex_exit(&ipi_mngmt_lock);
KASSERT(i != IPI_SYNCH_ID);
return i;
}
}
mutex_exit(&ipi_mngmt_lock);
printf("WARNING: ipi_register: table full, increase IPI_MAXREG\n");
return 0;
}
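/*
 * Illustrative use of the asynchronous interface (handler and argument
 * names are hypothetical).  The handler runs on the target CPU from
 * interrupt context:
 *
 *	u_int id = ipi_register(my_handler, my_arg);
 *	...
 *	kpreempt_disable();
 *	ipi_trigger(id, ci);
 *	kpreempt_enable();
 */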
/*
* ipi_unregister: release the IPI handler given the ID.
*/
void
ipi_unregister(u_int ipi_id)
{
ipi_msg_t ipimsg = { .func = __FPTRCAST(ipi_func_t, nullop) };
KASSERT(ipi_id != IPI_SYNCH_ID);
KASSERT(ipi_id < IPI_MAXREG);
/* Release the slot. */
mutex_enter(&ipi_mngmt_lock);
KASSERT(ipi_intrs[ipi_id].func != NULL);
ipi_intrs[ipi_id].func = NULL;
/* Ensure that there are no IPIs in flight. */
kpreempt_disable();
ipi_broadcast(&ipimsg, false);
ipi_wait(&ipimsg);
kpreempt_enable();
mutex_exit(&ipi_mngmt_lock);
}
/*
* ipi_mark_pending: internal routine to mark an IPI pending on the
* specified CPU (which might be curcpu()).
*/
static bool
ipi_mark_pending(u_int ipi_id, struct cpu_info *ci)
{
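/*
 * The ci_ipipend[] words form a bitmap indexed by IPI ID: the ID
 * selects a 32-bit word (ipi_id >> IPI_BITW_SHIFT) and a bit within
 * that word (ipi_id & IPI_BITW_MASK).
 */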
const u_int i = ipi_id >> IPI_BITW_SHIFT;
const uint32_t bitm = 1U << (ipi_id & IPI_BITW_MASK);
KASSERT(ipi_id < IPI_MAXREG);
KASSERT(kpreempt_disabled());
/* Mark as pending and return true if not previously marked. */
if ((atomic_load_acquire(&ci->ci_ipipend[i]) & bitm) == 0) {
membar_release();
atomic_or_32(&ci->ci_ipipend[i], bitm);
return true;
}
return false;
}
/*
* ipi_trigger: asynchronously send an IPI to the specified CPU.
*/
void
ipi_trigger(u_int ipi_id, struct cpu_info *ci)
{
KASSERT(curcpu() != ci);
if (ipi_mark_pending(ipi_id, ci)) {
cpu_ipi(ci);
}
}
/*
* ipi_trigger_multi_internal: the guts of ipi_trigger_multi() and
* ipi_trigger_broadcast().
*/
static void
ipi_trigger_multi_internal(u_int ipi_id, const kcpuset_t *target,
bool skip_self)
{
const cpuid_t selfid = cpu_index(curcpu());
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(kpreempt_disabled());
KASSERT(target != NULL);
for (CPU_INFO_FOREACH(cii, ci)) {
const cpuid_t cpuid = cpu_index(ci);
if (!kcpuset_isset(target, cpuid) || cpuid == selfid) {
continue;
}
ipi_trigger(ipi_id, ci);
}
if (!skip_self && kcpuset_isset(target, selfid)) {
ipi_mark_pending(ipi_id, curcpu());
int s = splhigh();
ipi_cpu_handler();
splx(s);
}
}
/*
* ipi_trigger_multi: same as ipi_trigger() but sends to the multiple
* CPUs given the target CPU set.
*/
void
ipi_trigger_multi(u_int ipi_id, const kcpuset_t *target)
{
ipi_trigger_multi_internal(ipi_id, target, false);
}
/*
* ipi_trigger_broadcast: same as ipi_trigger_multi(), but targets
* kcpuset_attached and optionally skips the sending CPU.
*/
void
ipi_trigger_broadcast(u_int ipi_id, bool skip_self)
{
ipi_trigger_multi_internal(ipi_id, kcpuset_attached, skip_self);
}
/*
* put_msg: insert message into the mailbox.
*
* Caller is responsible for issuing membar_release first.
*/
static inline void
put_msg(ipi_mbox_t *mbox, ipi_msg_t *msg)
{
int count = SPINLOCK_BACKOFF_MIN;
again:
for (u_int i = 0; i < IPI_MSG_MAX; i++) {
if (atomic_cas_ptr(&mbox->msg[i], NULL, msg) == NULL) {
return;
}
}
/* All slots are full: we have to spin-wait. */
ipi_mboxfull_ev.ev_count++;
SPINLOCK_BACKOFF(count);
goto again;
}
/*
* ipi_cpu_handler: the IPI handler.
*/
void
ipi_cpu_handler(void)
{
struct cpu_info * const ci = curcpu();
/*
* Handle asynchronous IPIs: inspect per-CPU bit field, extract
* IPI ID numbers and execute functions in those slots.
*/
for (u_int i = 0; i < IPI_BITWORDS; i++) {
uint32_t pending, bit;
if (atomic_load_relaxed(&ci->ci_ipipend[i]) == 0) {
continue;
}
pending = atomic_swap_32(&ci->ci_ipipend[i], 0);
membar_acquire();
while ((bit = ffs(pending)) != 0) {
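/* ffs() is 1-based, hence the pre-decrement to get a bit index. */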
const u_int ipi_id = (i << IPI_BITW_SHIFT) | --bit;
ipi_intr_t *ipi_hdl = &ipi_intrs[ipi_id];
pending &= ~(1U << bit);
KASSERT(ipi_hdl->func != NULL);
ipi_hdl->func(ipi_hdl->arg);
}
}
}
/*
* ipi_msg_cpu_handler: handle synchronous IPIs - iterate mailbox,
* execute the passed functions and acknowledge the messages.
*/
static void
ipi_msg_cpu_handler(void *arg __unused)
{
const struct cpu_info * const ci = curcpu();
ipi_mbox_t *mbox = &ipi_mboxes[cpu_index(ci)];
for (u_int i = 0; i < IPI_MSG_MAX; i++) {
ipi_msg_t *msg;
/* Get the message. */
if ((msg = atomic_load_acquire(&mbox->msg[i])) == NULL) {
continue;
}
atomic_store_relaxed(&mbox->msg[i], NULL);
/* Execute the handler. */
KASSERT(msg->func);
msg->func(msg->arg);
/* Ack the request. */
membar_release();
atomic_dec_uint(&msg->_pending);
}
}
/*
* ipi_unicast: send an IPI to a single CPU.
*
* => The CPU must be remote; must not be local.
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_unicast(ipi_msg_t *msg, struct cpu_info *ci)
{
const cpuid_t id = cpu_index(ci);
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
KASSERT(curcpu() != ci);
msg->_pending = 1;
membar_release();
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
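/*
 * Illustrative use (function and argument names are hypothetical):
 *
 *	ipi_msg_t msg = { .func = my_func, .arg = my_arg };
 *
 *	kpreempt_disable();
 *	ipi_unicast(&msg, ci);
 *	ipi_wait(&msg);
 *	kpreempt_enable();
 */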
/*
* ipi_multicast: send an IPI to each CPU in the specified set.
*
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_multicast(ipi_msg_t *msg, const kcpuset_t *target)
{
const struct cpu_info * const self = curcpu();
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
u_int local;
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
local = !!kcpuset_isset(target, cpu_index(self));
msg->_pending = kcpuset_countset(target) - local;
membar_release();
for (CPU_INFO_FOREACH(cii, ci)) {
cpuid_t id;
if (__predict_false(ci == self)) {
continue;
}
id = cpu_index(ci);
if (!kcpuset_isset(target, id)) {
continue;
}
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
if (local) {
msg->func(msg->arg);
}
}
/*
* ipi_broadcast: send an IPI to all CPUs.
*
* => The caller must ipi_wait() on the message for completion.
*/
void
ipi_broadcast(ipi_msg_t *msg, bool skip_self)
{
const struct cpu_info * const self = curcpu();
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
KASSERT(msg->func != NULL);
KASSERT(kpreempt_disabled());
msg->_pending = ncpu - 1;
membar_release();
/* Broadcast IPIs for remote CPUs. */
for (CPU_INFO_FOREACH(cii, ci)) {
cpuid_t id;
if (__predict_false(ci == self)) {
continue;
}
id = cpu_index(ci);
put_msg(&ipi_mboxes[id], msg);
ipi_trigger(IPI_SYNCH_ID, ci);
}
if (!skip_self) {
/* Finally, execute locally. */
msg->func(msg->arg);
}
}
/*
* ipi_wait: spin-wait until the message is processed.
*/
void
ipi_wait(ipi_msg_t *msg)
{
int count = SPINLOCK_BACKOFF_MIN;
while (atomic_load_acquire(&msg->_pending)) {
KASSERT(atomic_load_relaxed(&msg->_pending) < ncpu);
SPINLOCK_BACKOFF(count);
}
}
/* $NetBSD: if_media_80.c,v 1.5 2022/08/03 01:38:51 riastradh Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997
* Jonathan Stone and Jason R. Thorpe. All rights reserved.
*
* This software is derived from information provided by Matt Thomas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Jonathan Stone
* and Jason R. Thorpe for the NetBSD Project.
* 4. The names of the authors may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_media_80.c,v 1.5 2022/08/03 01:38:51 riastradh Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/syscallargs.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/compat_stub.h>
#include <net/if.h>
#include <net/if_media.h>
#include <compat/sys/sockio.h>
#include <compat/common/compat_mod.h>
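/*
 * Convert an ifmedia word from the current format to the old one:
 * subtypes the old ABI does not know about (greater than IFM_OTHER)
 * are reported as IFM_OTHER with the extended Ethernet type bits
 * cleared.
 */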
static void
ifmword_n2o(int *oldwd, int *newwd)
{
if (IFM_SUBTYPE(*newwd) > IFM_OTHER)
*oldwd = (*newwd & ~(_IFM_ETH_XTMASK | IFM_TMASK)) | IFM_OTHER;
else
*oldwd = *newwd;
}
/*ARGSUSED*/
static int
compat_ifmediareq_pre(struct ifreq *ifr, u_long *cmd, bool *do_post)
{
struct ifmediareq *ifmr = (struct ifmediareq *)ifr;
switch (*cmd) {
case SIOCSIFMEDIA_80:
*cmd = SIOCSIFMEDIA; /* Convert to new one */
if ((IFM_TYPE(ifr->ifr_media) == IFM_ETHER) &&
IFM_SUBTYPE(ifr->ifr_media) > IFM_OTHER) {
/* Clear unused bits so we do not switch to the wrong media */
ifr->ifr_media &= ~_IFM_ETH_XTMASK;
}
return 0;
case SIOCGIFMEDIA_80:
*cmd = SIOCGIFMEDIA; /* Convert to new one */
if (ifmr->ifm_count != 0) {
/*
* Tell the upper layer to try to convert each ifmedia
* entry in the post process.
*/
*do_post = true;
}
return 0;
default:
return 0;
}
}
/*ARGSUSED*/
static int
compat_ifmediareq_post(struct ifreq *ifr, u_long cmd)
{
struct ifmediareq *ifmr = (struct ifmediareq *)ifr;
size_t minwords;
size_t count;
int error, *kptr;
switch (cmd) {
case SIOCSIFMEDIA:
return 0;
case SIOCGIFMEDIA:
if (ifmr->ifm_count < 0)
return EINVAL;
/*
* ifmr->ifm_count was already adjusted in ifmedia_ioctl(), so
* it is safe to trust ifm_count here.
*/
minwords = ifmr->ifm_count;
kptr = malloc(minwords * sizeof(*kptr), M_TEMP,
M_WAITOK|M_ZERO);
if (kptr == NULL)
return ENOMEM;
/*
* Convert ifm_current and ifm_active.
* It's not required to convert ifm_mask.
*/
ifmword_n2o(&ifmr->ifm_current, &ifmr->ifm_current);
ifmword_n2o(&ifmr->ifm_active, &ifmr->ifm_active);
/* Convert ifm_ulist array */
for (count = 0; count < minwords; count++) {
int oldmwd;
error = ufetch_int(&ifmr->ifm_ulist[count], &oldmwd);
if (error != 0)
goto out;
ifmword_n2o(&kptr[count], &oldmwd);
}
/* Copy to userland in old format */
error = copyout(kptr, ifmr->ifm_ulist,
minwords * sizeof(*kptr));
out:
free(kptr, M_TEMP);
return error;
default:
return 0;
}
}
void
ifmedia_80_init(void)
{
MODULE_HOOK_SET(ifmedia_80_pre_hook, compat_ifmediareq_pre);
MODULE_HOOK_SET(ifmedia_80_post_hook, compat_ifmediareq_post);
}
void
ifmedia_80_fini(void)
{
MODULE_HOOK_UNSET(ifmedia_80_post_hook);
MODULE_HOOK_UNSET(ifmedia_80_pre_hook);
}
/* $NetBSD: raw_usrreq.c,v 1.65 2022/09/02 23:48:11 thorpej Exp $ */
/*
* Copyright (c) 1980, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_usrreq.c 8.1 (Berkeley) 6/10/93
*/
/*
* Raw protocol interface.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: raw_usrreq.c,v 1.65 2022/09/02 23:48:11 thorpej Exp $");
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
static inline int
equal(const struct sockaddr *a1, const struct sockaddr *a2)
{
return memcmp(a1, a2, a1->sa_len) == 0;
}
/*
* raw_input: find the socket associated with the packet and move it over.
* If nothing exists for this packet, drop it.
*/
void
raw_input(struct mbuf *m0, struct sockproto *proto, struct sockaddr *src,
struct sockaddr *dst, struct rawcbhead *rawcbhead)
{
struct rawcb *rp;
struct mbuf *m = m0;
struct socket *last;
last = NULL;
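/*
 * Every matching socket except the last gets a copy of the packet;
 * the final match receives the original mbuf, saving one copy.
 */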
LIST_FOREACH(rp, rawcbhead, rcb_list) {
if (rp->rcb_proto.sp_family != proto->sp_family)
continue;
if (rp->rcb_proto.sp_protocol &&
rp->rcb_proto.sp_protocol != proto->sp_protocol)
continue;
/*
* We assume the lower level routines have
* placed the address in a canonical format
* suitable for a structure comparison.
*
* Note that if the lengths are not the same
* the comparison will fail at the first byte.
*/
if (rp->rcb_laddr && !equal(rp->rcb_laddr, dst))
continue;
if (rp->rcb_faddr && !equal(rp->rcb_faddr, src))
continue;
/* Run any filtering that may have been installed. */
if (rp->rcb_filter != NULL && rp->rcb_filter(m, proto, rp) != 0)
continue;
if (last != NULL) {
struct mbuf *n;
if ((n = m_copypacket(m, M_DONTWAIT)) == NULL ||
sbappendaddr(&last->so_rcv, src, n, NULL) == 0)
{
if (n != NULL)
m_freem(n);
soroverflow(last);
} else
sorwakeup(last);
}
last = rp->rcb_socket;
}
if (last != NULL) {
if (sbappendaddr(&last->so_rcv, src, m, NULL) == 0) {
m_freem(m);
soroverflow(last);
} else
sorwakeup(last);
} else {
m_freem(m);
}
}
void *
raw_ctlinput(int cmd, const struct sockaddr *arg, void *d)
{
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
return NULL;
/* INCOMPLETE */
}
void
raw_setsockaddr(struct rawcb *rp, struct sockaddr *nam)
{
memcpy(nam, rp->rcb_laddr, rp->rcb_laddr->sa_len);
}
void
raw_setpeeraddr(struct rawcb *rp, struct sockaddr *nam)
{
memcpy(nam, rp->rcb_faddr, rp->rcb_faddr->sa_len);
}
int
raw_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l,
int (*output)(struct mbuf *, struct socket *))
{
struct rawcb *rp = sotorawcb(so);
int error = 0;
KASSERT(rp != NULL);
/*
* Ship a packet out. The appropriate raw output
* routine handles any massaging necessary.
*/
if (control && control->m_len) {
m_freem(control);
m_freem(m);
return EINVAL;
}
if (nam) {
if ((so->so_state & SS_ISCONNECTED) != 0) {
error = EISCONN;
goto die;
}
error = (*so->so_proto->pr_usrreqs->pr_connect)(so, nam, l);
if (error) {
die:
m_freem(m);
return error;
}
} else {
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
goto die;
}
}
error = (*output)(m, so);
if (nam)
raw_disconnect(rp);
return error;
}
int
raw_usrreq(struct socket *so, int req, struct mbuf *m, struct mbuf *nam,
struct mbuf *control, struct lwp *l)
{
KASSERT(req != PRU_ATTACH);
KASSERT(req != PRU_DETACH);
KASSERT(req != PRU_ACCEPT);
KASSERT(req != PRU_BIND);
KASSERT(req != PRU_LISTEN);
KASSERT(req != PRU_CONNECT);
KASSERT(req != PRU_CONNECT2);
KASSERT(req != PRU_DISCONNECT);
KASSERT(req != PRU_SHUTDOWN);
KASSERT(req != PRU_ABORT);
KASSERT(req != PRU_CONTROL);
KASSERT(req != PRU_SENSE);
KASSERT(req != PRU_PEERADDR);
KASSERT(req != PRU_SOCKADDR);
KASSERT(req != PRU_RCVD);
KASSERT(req != PRU_RCVOOB);
KASSERT(req != PRU_SEND);
KASSERT(req != PRU_SENDOOB);
KASSERT(req != PRU_PURGEIF);
if (sotorawcb(so) == NULL)
return EINVAL;
panic("raw_usrreq");
return 0;
}
/* $NetBSD: uvm_page.c,v 1.256 2024/03/05 14:33:50 thorpej Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_page.c 8.3 (Berkeley) 3/21/94
* from: Id: uvm_page.c,v 1.1.2.18 1998/02/06 05:24:42 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_page.c: page ops.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_page.c,v 1.256 2024/03/05 14:33:50 thorpej Exp $");
#include "opt_ddb.h"
#include "opt_uvm.h"
#include "opt_uvmhist.h"
#include "opt_readahead.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sched.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/radixtree.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <ddb/db_active.h>
#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pgflcache.h>
/*
* number of pages per-CPU to reserve for the kernel.
*/
#ifndef UVM_RESERVED_PAGES_PER_CPU
#define UVM_RESERVED_PAGES_PER_CPU 5
#endif
int vm_page_reserve_kernel = UVM_RESERVED_PAGES_PER_CPU;
/*
* physical memory size.
*/
psize_t physmem;
/*
* local variables
*/
/*
* these variables record the values returned by vm_page_bootstrap,
* for debugging purposes. The implementation of uvm_pageboot_alloc
* and pmap_startup here also uses them internally.
*/
static vaddr_t virtual_space_start;
static vaddr_t virtual_space_end;
/*
* we allocate an initial number of page colors in uvm_page_init(),
* and remember them. We may re-color pages as cache sizes are
* discovered during the autoconfiguration phase. But we can never
* free the initial set of buckets, since they are allocated using
* uvm_pageboot_alloc().
*/
static size_t recolored_pages_memsize /* = 0 */;
static char *recolored_pages_mem;
/*
* freelist locks - one per bucket.
*/
union uvm_freelist_lock uvm_freelist_locks[PGFL_MAX_BUCKETS]
__cacheline_aligned;
/*
* basic NUMA information.
*/
static struct uvm_page_numa_region {
struct uvm_page_numa_region *next;
paddr_t start;
paddr_t size;
u_int numa_id;
} *uvm_page_numa_region;
#ifdef DEBUG
kmutex_t uvm_zerochecklock __cacheline_aligned;
vaddr_t uvm_zerocheckkva;
#endif /* DEBUG */
/*
* These functions are reserved for uvm(9) internal use and are not
* exported in the header file uvm_physseg.h
*
* Thus they are redefined here.
*/
void uvm_physseg_init_seg(uvm_physseg_t, struct vm_page *);
void uvm_physseg_seg_chomp_slab(uvm_physseg_t, struct vm_page *, size_t);
/* returns a pgs array */
struct vm_page *uvm_physseg_seg_alloc_from_slab(uvm_physseg_t, size_t);
/*
* inline functions
*/
/*
* uvm_pageinsert: insert a page in the object.
*
* => caller must lock object
* => call should have already set pg's object and offset pointers
* and bumped the version counter
*/
static inline void
uvm_pageinsert_object(struct uvm_object *uobj, struct vm_page *pg)
{
KASSERT(uobj == pg->uobject);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT((pg->flags & PG_TABLED) == 0);
if ((pg->flags & PG_STAT) != 0) {
/* Cannot use uvm_pagegetdirty(): not yet in radix tree. */
const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
if ((pg->flags & PG_FILE) != 0) {
if (uobj->uo_npages == 0) {
struct vnode *vp = (struct vnode *)uobj;
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_PAGES) == 0);
vp->v_iflag |= VI_PAGES;
vholdl(vp);
mutex_exit(vp->v_interlock);
}
if (UVM_OBJ_IS_VTEXT(uobj)) {
cpu_count(CPU_COUNT_EXECPAGES, 1);
}
cpu_count(CPU_COUNT_FILEUNKNOWN + status, 1);
} else {
cpu_count(CPU_COUNT_ANONUNKNOWN + status, 1);
}
}
pg->flags |= PG_TABLED;
uobj->uo_npages++;
}
static inline int
uvm_pageinsert_tree(struct uvm_object *uobj, struct vm_page *pg)
{
const uint64_t idx = pg->offset >> PAGE_SHIFT;
int error;
KASSERT(rw_write_held(uobj->vmobjlock));
error = radix_tree_insert_node(&uobj->uo_pages, idx, pg);
if (error != 0) {
return error;
}
if ((pg->flags & PG_CLEAN) == 0) {
uvm_obj_page_set_dirty(pg);
}
KASSERT(((pg->flags & PG_CLEAN) == 0) ==
uvm_obj_page_dirty_p(pg));
return 0;
}
/*
* uvm_page_remove: remove page from object.
*
* => caller must lock object
*/
static inline void
uvm_pageremove_object(struct uvm_object *uobj, struct vm_page *pg)
{
KASSERT(uobj == pg->uobject);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT(pg->flags & PG_TABLED);
if ((pg->flags & PG_STAT) != 0) {
/* Cannot use uvm_pagegetdirty(): no longer in radix tree. */
const unsigned int status = pg->flags & (PG_CLEAN | PG_DIRTY);
if ((pg->flags & PG_FILE) != 0) {
if (uobj->uo_npages == 1) {
struct vnode *vp = (struct vnode *)uobj;
mutex_enter(vp->v_interlock);
KASSERT((vp->v_iflag & VI_PAGES) != 0);
vp->v_iflag &= ~VI_PAGES;
holdrelel(vp);
mutex_exit(vp->v_interlock);
}
if (UVM_OBJ_IS_VTEXT(uobj)) {
cpu_count(CPU_COUNT_EXECPAGES, -1);
}
cpu_count(CPU_COUNT_FILEUNKNOWN + status, -1);
} else {
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
}
uobj->uo_npages--;
pg->flags &= ~PG_TABLED;
pg->uobject = NULL;
}
static inline void
uvm_pageremove_tree(struct uvm_object *uobj, struct vm_page *pg)
{
struct vm_page *opg __unused;
KASSERT(rw_write_held(uobj->vmobjlock));
opg = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
KASSERT(pg == opg);
}
static void
uvm_page_init_bucket(struct pgfreelist *pgfl, struct pgflbucket *pgb, int num)
{
int i;
pgb->pgb_nfree = 0;
for (i = 0; i < uvmexp.ncolors; i++) {
LIST_INIT(&pgb->pgb_colors[i]);
}
pgfl->pgfl_buckets[num] = pgb;
}
/*
* uvm_page_init: init the page system. called from uvm_init().
*
* => we return the range of kernel virtual memory in kvm_startp/kvm_endp
*/
void
uvm_page_init(vaddr_t *kvm_startp, vaddr_t *kvm_endp)
{
static struct uvm_cpu uvm_boot_cpu __cacheline_aligned;
psize_t freepages, pagecount, bucketsize, n;
struct pgflbucket *pgb;
struct vm_page *pagearray;
char *bucketarray;
uvm_physseg_t bank;
int fl, b;
KASSERT(ncpu <= 1);
/*
* init the page queues and free page queue locks, except the
* free list; we allocate that later (with the initial vm_page
* structures).
*/
curcpu()->ci_data.cpu_uvm = &uvm_boot_cpu;
uvmpdpol_init();
for (b = 0; b < __arraycount(uvm_freelist_locks); b++) {
mutex_init(&uvm_freelist_locks[b].lock, MUTEX_DEFAULT, IPL_VM);
}
/*
* allocate vm_page structures.
*/
/*
* sanity check:
* before calling this function the MD code is expected to register
* some free RAM with the uvm_page_physload() function. our job
* now is to allocate vm_page structures for this memory.
*/
if (uvm_physseg_get_last() == UVM_PHYSSEG_TYPE_INVALID)
panic("uvm_page_bootstrap: no memory pre-allocated");
/*
* first calculate the number of free pages...
*
* note that we use start/end rather than avail_start/avail_end.
* this allows us to allocate extra vm_page structures in case we
* want to return some memory to the pool after booting.
*/
freepages = 0;
for (bank = uvm_physseg_get_first();
uvm_physseg_valid_p(bank) ;
bank = uvm_physseg_get_next(bank)) {
freepages += (uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank));
}
/*
* Let MD code initialize the number of colors, or default
* to 1 color if MD code doesn't care.
*/
if (uvmexp.ncolors == 0)
uvmexp.ncolors = 1;
uvmexp.colormask = uvmexp.ncolors - 1;
KASSERT((uvmexp.colormask & uvmexp.ncolors) == 0);
/* We always start with only 1 bucket. */
uvm.bucketcount = 1;
/*
* we now know we have (PAGE_SIZE * freepages) bytes of memory we can
* use. for each page of memory we use we need a vm_page structure.
* thus, the total number of pages we can use is the total size of
* the memory divided by the PAGE_SIZE plus the size of the vm_page
* structure. we add one to freepages as a fudge factor to avoid
* truncation errors (since we can only allocate in terms of whole
* pages).
*/
pagecount = ((freepages + 1) << PAGE_SHIFT) /
(PAGE_SIZE + sizeof(struct vm_page));
bucketsize = offsetof(struct pgflbucket, pgb_colors[uvmexp.ncolors]);
bucketsize = roundup2(bucketsize, coherency_unit);
bucketarray = (void *)uvm_pageboot_alloc(
bucketsize * VM_NFREELIST +
pagecount * sizeof(struct vm_page));
pagearray = (struct vm_page *)
(bucketarray + bucketsize * VM_NFREELIST);
for (fl = 0; fl < VM_NFREELIST; fl++) {
pgb = (struct pgflbucket *)(bucketarray + bucketsize * fl);
uvm_page_init_bucket(&uvm.page_free[fl], pgb, 0);
}
memset(pagearray, 0, pagecount * sizeof(struct vm_page));
/*
* init the freelist cache in the disabled state.
*/
uvm_pgflcache_init();
/*
* init the vm_page structures and put them in the correct place.
*/
/* First init the extent */
for (bank = uvm_physseg_get_first(),
uvm_physseg_seg_chomp_slab(bank, pagearray, pagecount);
uvm_physseg_valid_p(bank);
bank = uvm_physseg_get_next(bank)) {
n = uvm_physseg_get_end(bank) - uvm_physseg_get_start(bank);
uvm_physseg_seg_alloc_from_slab(bank, n);
uvm_physseg_init_seg(bank, pagearray);
/* set up page array pointers */
pagearray += n;
pagecount -= n;
}
/*
* pass up the values of virtual_space_start and
* virtual_space_end (obtained by uvm_pageboot_alloc) to the upper
* layers of the VM.
*/
*kvm_startp = round_page(virtual_space_start);
*kvm_endp = trunc_page(virtual_space_end);
/*
* init various thresholds.
*/
uvmexp.reserve_pagedaemon = 1;
uvmexp.reserve_kernel = vm_page_reserve_kernel;
/*
* done!
*/
uvm.page_init_done = true;
}
/*
* uvm_pgfl_lock: lock all freelist buckets
*/
void
uvm_pgfl_lock(void)
{
int i;
for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
mutex_spin_enter(&uvm_freelist_locks[i].lock);
}
}
/*
* uvm_pgfl_unlock: unlock all freelist buckets
*/
void
uvm_pgfl_unlock(void)
{
int i;
for (i = 0; i < __arraycount(uvm_freelist_locks); i++) {
mutex_spin_exit(&uvm_freelist_locks[i].lock);
}
}
/*
* uvm_setpagesize: set the page size
*
* => sets page_shift and page_mask from uvmexp.pagesize.
*/
void
uvm_setpagesize(void)
{
/*
* If uvmexp.pagesize is 0 at this point, we expect PAGE_SIZE
* to be a constant (indicated by being a non-zero value).
*/
if (uvmexp.pagesize == 0) {
if (PAGE_SIZE == 0)
panic("uvm_setpagesize: uvmexp.pagesize not set");
uvmexp.pagesize = PAGE_SIZE;
}
uvmexp.pagemask = uvmexp.pagesize - 1;
if ((uvmexp.pagemask & uvmexp.pagesize) != 0)
panic("uvm_setpagesize: page size %u (%#x) not a power of two",
uvmexp.pagesize, uvmexp.pagesize);
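/* Compute pageshift such that (1 << pageshift) == pagesize. */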
for (uvmexp.pageshift = 0; ; uvmexp.pageshift++)
if ((1 << uvmexp.pageshift) == uvmexp.pagesize)
break;
}
/*
* uvm_pageboot_alloc: steal memory from physmem for bootstrapping
*/
vaddr_t
uvm_pageboot_alloc(vsize_t size)
{
static bool initialized = false;
vaddr_t addr;
#if !defined(PMAP_STEAL_MEMORY)
vaddr_t vaddr;
paddr_t paddr;
#endif
/*
* on first call to this function, initialize ourselves.
*/
if (initialized == false) {
pmap_virtual_space(&virtual_space_start, &virtual_space_end);
/* round it the way we like it */
virtual_space_start = round_page(virtual_space_start);
virtual_space_end = trunc_page(virtual_space_end);
initialized = true;
}
/* round to page size */
size = round_page(size);
uvmexp.bootpages += atop(size);
#if defined(PMAP_STEAL_MEMORY)
/*
* defer bootstrap allocation to MD code (it may want to allocate
* from a direct-mapped segment). pmap_steal_memory should adjust
* virtual_space_start/virtual_space_end if necessary.
*/
addr = pmap_steal_memory(size, &virtual_space_start,
&virtual_space_end);
return addr;
#else /* !PMAP_STEAL_MEMORY */
/*
* allocate virtual memory for this request
*/
if (virtual_space_start == virtual_space_end ||
(virtual_space_end - virtual_space_start) < size)
panic("uvm_pageboot_alloc: out of virtual space");
addr = virtual_space_start;
#ifdef PMAP_GROWKERNEL
/*
* If the kernel pmap can't map the requested space,
* then allocate more resources for it.
*/
if (uvm_maxkaddr < (addr + size)) {
uvm_maxkaddr = pmap_growkernel(addr + size);
if (uvm_maxkaddr < (addr + size))
panic("uvm_pageboot_alloc: pmap_growkernel() failed");
}
#endif
virtual_space_start += size;
/*
* allocate and mapin physical pages to back new virtual pages
*/
for (vaddr = round_page(addr) ; vaddr < addr + size ;
vaddr += PAGE_SIZE) {
if (!uvm_page_physget(&paddr))
panic("uvm_pageboot_alloc: out of memory");
/*
* Note this memory is no longer managed, so using
* pmap_kenter is safe.
*/
pmap_kenter_pa(vaddr, paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
return addr;
#endif /* PMAP_STEAL_MEMORY */
}
#if !defined(PMAP_STEAL_MEMORY)
/*
* uvm_page_physget: "steal" one page from the vm_physmem structure.
*
* => attempt to allocate it off the end of a segment in which the "avail"
* values match the start/end values. if we can't do that, then we
* will advance both values (making them equal, and removing some
* vm_page structures from the non-avail area).
* => return false if out of memory.
*/
/* subroutine: try to allocate from memory chunks on the specified freelist */
static bool uvm_page_physget_freelist(paddr_t *, int);
static bool
uvm_page_physget_freelist(paddr_t *paddrp, int freelist)
{
uvm_physseg_t lcv;
/* pass 1: try allocating from a matching end */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
#else
for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
#endif
{
if (uvm.page_init_done == true)
panic("uvm_page_physget: called _after_ bootstrap");
/* Try to match at front or back on unused segment */
if (uvm_page_physunload(lcv, freelist, paddrp))
return true;
}
/* pass2: forget about matching ends, just allocate something */
#if (VM_PHYSSEG_STRAT == VM_PSTRAT_BIGFIRST)
for (lcv = uvm_physseg_get_last(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_prev(lcv))
#else
for (lcv = uvm_physseg_get_first(); uvm_physseg_valid_p(lcv); lcv = uvm_physseg_get_next(lcv))
#endif
{
/* Try the front regardless. */
if (uvm_page_physunload_force(lcv, freelist, paddrp))
return true;
}
return false;
}
bool
uvm_page_physget(paddr_t *paddrp)
{
int i;
/* try in the order of freelist preference */
for (i = 0; i < VM_NFREELIST; i++)
if (uvm_page_physget_freelist(paddrp, i) == true)
return (true);
return (false);
}
#endif /* PMAP_STEAL_MEMORY */
paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{
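/*
 * The page is always page-aligned; the low-order bits of phys_addr
 * are reused for per-page bookkeeping, so mask them off here.
 */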
return pg->phys_addr & ~(PAGE_SIZE - 1);
}
/*
* uvm_page_numa_load: load NUMA range description.
*/
void
uvm_page_numa_load(paddr_t start, paddr_t size, u_int numa_id)
{
struct uvm_page_numa_region *d;
KASSERT(numa_id < PGFL_MAX_BUCKETS);
d = kmem_alloc(sizeof(*d), KM_SLEEP);
d->start = start;
d->size = size;
d->numa_id = numa_id;
d->next = uvm_page_numa_region;
uvm_page_numa_region = d;
}
/*
* uvm_page_numa_lookup: lookup NUMA node for the given page.
*/
static u_int
uvm_page_numa_lookup(struct vm_page *pg)
{
struct uvm_page_numa_region *d;
static bool warned;
paddr_t pa;
KASSERT(uvm_page_numa_region != NULL);
pa = VM_PAGE_TO_PHYS(pg);
for (d = uvm_page_numa_region; d != NULL; d = d->next) {
if (pa >= d->start && pa < d->start + d->size) {
return d->numa_id;
}
}
if (!warned) {
printf("uvm_page_numa_lookup: failed, first pg=%p pa=%#"
PRIxPADDR "\n", pg, VM_PAGE_TO_PHYS(pg));
warned = true;
}
return 0;
}
/*
* uvm_page_redim: adjust freelist dimensions if they have changed.
*/
static void
uvm_page_redim(int newncolors, int newnbuckets)
{
struct pgfreelist npgfl;
struct pgflbucket *opgb, *npgb;
struct pgflist *ohead, *nhead;
struct vm_page *pg;
size_t bucketsize, bucketmemsize, oldbucketmemsize;
int fl, ob, oc, nb, nc, obuckets, ocolors;
char *bucketarray, *oldbucketmem, *bucketmem;
KASSERT(((newncolors - 1) & newncolors) == 0);
/* Anything to do? */
if (newncolors <= uvmexp.ncolors &&
newnbuckets == uvm.bucketcount) {
return;
}
if (uvm.page_init_done == false) {
uvmexp.ncolors = newncolors;
return;
}
bucketsize = offsetof(struct pgflbucket, pgb_colors[newncolors]);
bucketsize = roundup2(bucketsize, coherency_unit);
bucketmemsize = bucketsize * newnbuckets * VM_NFREELIST +
coherency_unit - 1;
bucketmem = kmem_zalloc(bucketmemsize, KM_SLEEP);
bucketarray = (char *)roundup2((uintptr_t)bucketmem, coherency_unit);
ocolors = uvmexp.ncolors;
obuckets = uvm.bucketcount;
/* Freelist cache mustn't be enabled. */
uvm_pgflcache_pause();
/* Make sure we should still do this. */
uvm_pgfl_lock();
if (newncolors <= uvmexp.ncolors &&
newnbuckets == uvm.bucketcount) {
uvm_pgfl_unlock();
uvm_pgflcache_resume();
kmem_free(bucketmem, bucketmemsize);
return;
}
uvmexp.ncolors = newncolors;
uvmexp.colormask = uvmexp.ncolors - 1;
uvm.bucketcount = newnbuckets;
for (fl = 0; fl < VM_NFREELIST; fl++) {
/* Init new buckets in new freelist. */
memset(&npgfl, 0, sizeof(npgfl));
for (nb = 0; nb < newnbuckets; nb++) {
npgb = (struct pgflbucket *)bucketarray;
uvm_page_init_bucket(&npgfl, npgb, nb);
bucketarray += bucketsize;
}
/* Now transfer pages from the old freelist. */
for (nb = ob = 0; ob < obuckets; ob++) {
opgb = uvm.page_free[fl].pgfl_buckets[ob];
for (oc = 0; oc < ocolors; oc++) {
ohead = &opgb->pgb_colors[oc];
while ((pg = LIST_FIRST(ohead)) != NULL) {
LIST_REMOVE(pg, pageq.list);
/*
* Here we decide on the NEW color &
* bucket for the page. For NUMA
* we'll use the info that the
* hardware gave us. For non-NUMA
* we take the physical page frame
* number and cache color into
* account. We do this to try and
* avoid defeating any memory
* interleaving in the hardware.
*/
KASSERT(
uvm_page_get_bucket(pg) == ob);
KASSERT(fl ==
uvm_page_get_freelist(pg));
if (uvm_page_numa_region != NULL) {
nb = uvm_page_numa_lookup(pg);
} else {
nb = atop(VM_PAGE_TO_PHYS(pg))
/ uvmexp.ncolors / 8
% newnbuckets;
}
uvm_page_set_bucket(pg, nb);
npgb = npgfl.pgfl_buckets[nb];
npgb->pgb_nfree++;
nc = VM_PGCOLOR(pg);
nhead = &npgb->pgb_colors[nc];
LIST_INSERT_HEAD(nhead, pg, pageq.list);
}
}
}
/* Install the new freelist. */
memcpy(&uvm.page_free[fl], &npgfl, sizeof(npgfl));
}
/* Unlock and free the old memory. */
oldbucketmemsize = recolored_pages_memsize;
oldbucketmem = recolored_pages_mem;
recolored_pages_memsize = bucketmemsize;
recolored_pages_mem = bucketmem;
uvm_pgfl_unlock();
uvm_pgflcache_resume();
if (oldbucketmemsize) {
kmem_free(oldbucketmem, oldbucketmemsize);
}
/*
* this calls uvm_km_alloc() which may want to hold
* uvm_freelist_lock.
*/
uvm_pager_realloc_emerg();
}
/*
* uvm_page_recolor: Recolor the pages if the new color count is
* larger than the old one.
*/
void
uvm_page_recolor(int newncolors)
{
uvm_page_redim(newncolors, uvm.bucketcount);
}
/*
* uvm_page_rebucket: Determine a bucket structure and redim the free
* lists to match.
*/
void
uvm_page_rebucket(void)
{
u_int min_numa, max_numa, npackage, shift;
struct cpu_info *ci, *ci2, *ci3;
CPU_INFO_ITERATOR cii;
/*
* If we have more than one NUMA node, and the maximum NUMA node ID
* is less than PGFL_MAX_BUCKETS, then we'll use NUMA distribution
* for free pages.
*/
min_numa = (u_int)-1;
max_numa = 0;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_numa_id < min_numa) {
min_numa = ci->ci_numa_id;
}
if (ci->ci_numa_id > max_numa) {
max_numa = ci->ci_numa_id;
}
}
if (min_numa != max_numa && max_numa < PGFL_MAX_BUCKETS) {
aprint_debug("UVM: using NUMA allocation scheme\n");
for (CPU_INFO_FOREACH(cii, ci)) {
ci->ci_data.cpu_uvm->pgflbucket = ci->ci_numa_id;
}
uvm_page_redim(uvmexp.ncolors, max_numa + 1);
return;
}
/*
* Otherwise we'll go with a scheme to maximise L2/L3 cache locality
* and minimise lock contention. Count the total number of CPU
* packages, and then try to distribute the buckets among CPU
* packages evenly.
*/
npackage = curcpu()->ci_nsibling[CPUREL_PACKAGE1ST];
/*
* Figure out how to arrange the packages & buckets, and the total
* number of buckets we need. XXX 2 may not be the best factor.
*/
for (shift = 0; npackage > PGFL_MAX_BUCKETS; shift++) {
npackage >>= 1;
}
uvm_page_redim(uvmexp.ncolors, npackage);
/*
* Now tell each CPU which bucket to use. In the outer loop, scroll
* through all CPU packages.
*/
npackage = 0;
ci = curcpu();
ci2 = ci->ci_sibling[CPUREL_PACKAGE1ST];
do {
/*
* In the inner loop, scroll through all CPUs in the package
* and assign the same bucket ID.
*/
ci3 = ci2;
do {
ci3->ci_data.cpu_uvm->pgflbucket = npackage >> shift;
ci3 = ci3->ci_sibling[CPUREL_PACKAGE];
} while (ci3 != ci2);
npackage++;
ci2 = ci2->ci_sibling[CPUREL_PACKAGE1ST];
} while (ci2 != ci->ci_sibling[CPUREL_PACKAGE1ST]);
aprint_debug("UVM: using package allocation scheme, "
"%d package(s) per bucket\n", 1 << shift);
}
/*
* uvm_cpu_attach: initialize per-CPU data structures.
*/
void
uvm_cpu_attach(struct cpu_info *ci)
{
struct uvm_cpu *ucpu;
/* Already done in uvm_page_init(). */
if (!CPU_IS_PRIMARY(ci)) {
/* Add more reserve pages for this CPU. */
uvmexp.reserve_kernel += vm_page_reserve_kernel;
/* Allocate per-CPU data structures. */
ucpu = kmem_zalloc(sizeof(struct uvm_cpu) + coherency_unit - 1,
KM_SLEEP);
ucpu = (struct uvm_cpu *)roundup2((uintptr_t)ucpu,
coherency_unit);
ci->ci_data.cpu_uvm = ucpu;
} else {
ucpu = ci->ci_data.cpu_uvm;
}
uvmpdpol_init_cpu(ucpu);
}
/*
* uvm_availmem: fetch the total amount of free memory in pages. this can
* have a detrimental effect on performance due to false sharing; don't call
* unless needed.
*
* some users can request the amount of free memory so often that it begins
* to impact upon performance. if calling frequently and an inexact value
* is okay, call with cached = true.
*/
int
uvm_availmem(bool cached)
{
int64_t fp;
cpu_count_sync(cached);
if ((fp = cpu_count_get(CPU_COUNT_FREEPAGES)) < 0) {
/*
* XXXAD could briefly go negative because it's impossible
* to get a clean snapshot. address this for other counters
* used as running totals before NetBSD 10 although less
* important for those.
*/
fp = 0;
}
return (int)fp;
}
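/*
 * Illustrative sketch (not part of UVM, guarded out): a hypothetical
 * caller of uvm_availmem().  A frequent poller that only needs a rough
 * number passes cached = true; a one-off decision that needs an exact
 * count pays for the counter sync by passing false.
 */
#if 0
static bool
example_memory_low(int threshold_pages)
{

	/* Cheap, possibly slightly stale snapshot; fine for polling. */
	return uvm_availmem(true) < threshold_pages;
}
#endif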
/*
* uvm_pagealloc_pgb: helper routine that tries to allocate any color from a
* specific freelist and specific bucket only.
*
* => must be at IPL_VM or higher to protect per-CPU data structures.
*/
static struct vm_page *
uvm_pagealloc_pgb(struct uvm_cpu *ucpu, int f, int b, int *trycolorp, int flags)
{
int c, trycolor, colormask;
struct pgflbucket *pgb;
struct vm_page *pg;
kmutex_t *lock;
bool fill;
/*
* Skip the bucket if empty, no lock needed. There could be many
* empty freelists/buckets.
*/
pgb = uvm.page_free[f].pgfl_buckets[b];
if (pgb->pgb_nfree == 0) {
return NULL;
}
/* Skip bucket if low on memory. */
lock = &uvm_freelist_locks[b].lock;
mutex_spin_enter(lock);
	if (__predict_false(pgb->pgb_nfree <= uvmexp.reserve_kernel)) {
		if ((flags & UVM_PGA_USERESERVE) == 0 ||
		    (pgb->pgb_nfree <= uvmexp.reserve_pagedaemon &&
		     curlwp != uvm.pagedaemon_lwp)) {
mutex_spin_exit(lock);
return NULL;
}
fill = false;
} else {
fill = true;
}
/* Try all page colors as needed. */
c = trycolor = *trycolorp;
colormask = uvmexp.colormask;
do {
pg = LIST_FIRST(&pgb->pgb_colors[c]);
if (__predict_true(pg != NULL)) {
/*
* Got a free page! PG_FREE must be cleared under
* lock because of uvm_pglistalloc().
*/
			LIST_REMOVE(pg, pageq.list);
			KASSERT(pg->flags == PG_FREE);
pg->flags = PG_BUSY | PG_CLEAN | PG_FAKE;
pgb->pgb_nfree--;
CPU_COUNT(CPU_COUNT_FREEPAGES, -1);
/*
* While we have the bucket locked and our data
* structures fresh in L1 cache, we have an ideal
* opportunity to grab some pages for the freelist
* cache without causing extra contention. Only do
* so if we found pages in this CPU's preferred
* bucket.
*/
			if (__predict_true(b == ucpu->pgflbucket && fill)) {
				uvm_pgflcache_fill(ucpu, f, b, c);
}
mutex_spin_exit(lock);
			KASSERT(uvm_page_get_bucket(pg) == b);
			CPU_COUNT(c == trycolor ?
CPU_COUNT_COLORHIT : CPU_COUNT_COLORMISS, 1);
CPU_COUNT(CPU_COUNT_CPUMISS, 1);
*trycolorp = c;
return pg;
}
c = (c + 1) & colormask;
	} while (c != trycolor);

	mutex_spin_exit(lock);
return NULL;
}
/*
* uvm_pagealloc_pgfl: helper routine for uvm_pagealloc_strat that allocates
* any color from any bucket, in a specific freelist.
*
* => must be at IPL_VM or higher to protect per-CPU data structures.
*/
static struct vm_page *
uvm_pagealloc_pgfl(struct uvm_cpu *ucpu, int f, int *trycolorp, int flags)
{
int b, trybucket, bucketcount;
struct vm_page *pg;
/* Try for the exact thing in the per-CPU cache. */
if ((pg = uvm_pgflcache_alloc(ucpu, f, *trycolorp)) != NULL) {
		CPU_COUNT(CPU_COUNT_CPUHIT, 1);
		CPU_COUNT(CPU_COUNT_COLORHIT, 1);
return pg;
}
/* Walk through all buckets, trying our preferred bucket first. */
trybucket = ucpu->pgflbucket;
b = trybucket;
bucketcount = uvm.bucketcount;
do {
pg = uvm_pagealloc_pgb(ucpu, f, b, trycolorp, flags);
if (pg != NULL) {
return pg;
}
b = (b + 1 == bucketcount ? 0 : b + 1);
} while (b != trybucket);
return NULL;
}
/*
* uvm_pagealloc_strat: allocate vm_page from a particular free list.
*
* => return null if no pages free
* => wake up pagedaemon if number of free pages drops below low water mark
* => if obj != NULL, obj must be locked (to put in obj's tree)
* => if anon != NULL, anon must be locked (to put in anon)
* => only one of obj or anon can be non-null
* => caller must activate/deactivate page if it is not wired.
* => free_list is ignored if strat == UVM_PGA_STRAT_NORMAL.
* => policy decision: it is more important to pull a page off of the
* appropriate priority free list than it is to get a page from the
* correct bucket or color bin. This is because we live with the
* consequences of a bad free list decision for the entire
* lifetime of the page, e.g. if the page comes from memory that
* is slower to access.
*/
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *obj, voff_t off, struct vm_anon *anon,
int flags, int strat, int free_list)
{
int color, lcv, error, s;
struct uvm_cpu *ucpu;
struct vm_page *pg;
lwp_t *l;
	KASSERT(obj == NULL || anon == NULL);
	KASSERT(anon == NULL || (flags & UVM_FLAG_COLORMATCH) || off == 0);
	KASSERT(off == trunc_page(off));
	KASSERT(obj == NULL || rw_write_held(obj->vmobjlock));
	KASSERT(anon == NULL || anon->an_lock == NULL ||
rw_write_held(anon->an_lock));
/*
* This implements a global round-robin page coloring
* algorithm.
*/
s = splvm();
ucpu = curcpu()->ci_data.cpu_uvm;
if (flags & UVM_FLAG_COLORMATCH) {
color = atop(off) & uvmexp.colormask;
} else {
color = ucpu->pgflcolor;
}
/*
* fail if any of these conditions is true:
* [1] there really are no free pages, or
* [2] only kernel "reserved" pages remain and
* reserved pages have not been requested.
* [3] only pagedaemon "reserved" pages remain and
* the requestor isn't the pagedaemon.
* we make kernel reserve pages available if called by a
* kernel thread.
*/
l = curlwp;
if (__predict_true(l != NULL) && (l->l_flag & LW_SYSTEM) != 0) {
flags |= UVM_PGA_USERESERVE;
}
again:
switch (strat) {
case UVM_PGA_STRAT_NORMAL:
/* Check freelists: descending priority (ascending id) order. */
for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
pg = uvm_pagealloc_pgfl(ucpu, lcv, &color, flags);
if (pg != NULL) {
goto gotit;
}
}
/* No pages free! Have pagedaemon free some memory. */
splx(s);
uvm_kick_pdaemon();
return NULL;
case UVM_PGA_STRAT_ONLY:
case UVM_PGA_STRAT_FALLBACK:
/* Attempt to allocate from the specified free list. */
		KASSERT(free_list >= 0);
		KASSERT(free_list < VM_NFREELIST);
pg = uvm_pagealloc_pgfl(ucpu, free_list, &color, flags);
if (pg != NULL) {
goto gotit;
}
/* Fall back, if possible. */
if (strat == UVM_PGA_STRAT_FALLBACK) {
strat = UVM_PGA_STRAT_NORMAL;
goto again;
}
/* No pages free! Have pagedaemon free some memory. */
splx(s);
uvm_kick_pdaemon();
return NULL;
case UVM_PGA_STRAT_NUMA:
/*
* NUMA strategy (experimental): allocating from the correct
* bucket is more important than observing freelist
* priority. Look only to the current NUMA node; if that
* fails, we need to look to other NUMA nodes, so retry with
* the normal strategy.
*/
for (lcv = 0; lcv < VM_NFREELIST; lcv++) {
pg = uvm_pgflcache_alloc(ucpu, lcv, color);
			if (pg != NULL) {
				CPU_COUNT(CPU_COUNT_CPUHIT, 1);
				CPU_COUNT(CPU_COUNT_COLORHIT, 1);
goto gotit;
}
pg = uvm_pagealloc_pgb(ucpu, lcv,
ucpu->pgflbucket, &color, flags);
if (pg != NULL) {
goto gotit;
}
}
strat = UVM_PGA_STRAT_NORMAL;
goto again;
default:
panic("uvm_pagealloc_strat: bad strat %d", strat);
/* NOTREACHED */
}
gotit:
/*
* We now know which color we actually allocated from; set
* the next color accordingly.
*/
ucpu->pgflcolor = (color + 1) & uvmexp.colormask;
/*
* while still at IPL_VM, update allocation statistics.
*/
	if (anon) {
		CPU_COUNT(CPU_COUNT_ANONCLEAN, 1);
}
splx(s);
KASSERT(pg->flags == (PG_BUSY|PG_CLEAN|PG_FAKE));
/*
* assign the page to the object. as the page was free, we know
* that pg->uobject and pg->uanon are NULL. we only need to take
* the page's interlock if we are changing the values.
*/
	if (anon != NULL || obj != NULL) {
		mutex_enter(&pg->interlock);
}
pg->offset = off;
pg->uobject = obj;
pg->uanon = anon;
KASSERT(uvm_page_owner_locked_p(pg, true));
if (anon) {
anon->an_page = pg;
pg->flags |= PG_ANON;
mutex_exit(&pg->interlock);
} else if (obj) {
/*
* set PG_FILE|PG_AOBJ before the first uvm_pageinsert.
*/
if (UVM_OBJ_IS_VNODE(obj)) {
pg->flags |= PG_FILE;
} else if (UVM_OBJ_IS_AOBJ(obj)) {
pg->flags |= PG_AOBJ;
}
uvm_pageinsert_object(obj, pg);
mutex_exit(&pg->interlock);
error = uvm_pageinsert_tree(obj, pg);
if (error != 0) {
mutex_enter(&pg->interlock);
uvm_pageremove_object(obj, pg);
mutex_exit(&pg->interlock);
uvm_pagefree(pg);
return NULL;
}
}
#if defined(UVM_PAGE_TRKOWN)
pg->owner_tag = NULL;
#endif
UVM_PAGE_OWN(pg, "new alloc");
if (flags & UVM_PGA_ZERO) {
/* A zero'd page is not clean. */
		if (obj != NULL || anon != NULL) {
			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
}
pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}
return(pg);
}
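/*
 * Illustrative sketch (not part of UVM, guarded out): a hypothetical
 * caller of uvm_pagealloc_strat() that allocates one zeroed page for
 * "uobj" at "off", waiting for the pagedaemon when memory is tight.
 * It assumes the rules above are followed: uobj->vmobjlock is write
 * locked and off is page aligned.
 */
#if 0
static struct vm_page *
example_alloc_zeroed(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;

	KASSERT(rw_write_held(uobj->vmobjlock));
	for (;;) {
		pg = uvm_pagealloc_strat(uobj, off, NULL, UVM_PGA_ZERO,
		    UVM_PGA_STRAT_NORMAL, 0);
		if (pg != NULL) {
			return pg;
		}
		/* Out of memory: drop the lock and let the pagedaemon run. */
		rw_exit(uobj->vmobjlock);
		uvm_wait("exalloc");
		rw_enter(uobj->vmobjlock, RW_WRITER);
		/* A real caller would re-check the object after relocking. */
	}
}
#endif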
/*
* uvm_pagereplace: replace a page with another
*
* => object must be locked
* => page interlocks must be held
*/
void
uvm_pagereplace(struct vm_page *oldpg, struct vm_page *newpg)
{
struct uvm_object *uobj = oldpg->uobject;
struct vm_page *pg __diagused;
uint64_t idx;
KASSERT((oldpg->flags & PG_TABLED) != 0);
KASSERT(uobj != NULL);
KASSERT((newpg->flags & PG_TABLED) == 0);
KASSERT(newpg->uobject == NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
KASSERT(mutex_owned(&oldpg->interlock));
KASSERT(mutex_owned(&newpg->interlock));
newpg->uobject = uobj;
newpg->offset = oldpg->offset;
idx = newpg->offset >> PAGE_SHIFT;
pg = radix_tree_replace_node(&uobj->uo_pages, idx, newpg);
KASSERT(pg == oldpg);
if (((oldpg->flags ^ newpg->flags) & PG_CLEAN) != 0) {
if ((newpg->flags & PG_CLEAN) != 0) {
uvm_obj_page_clear_dirty(newpg);
} else {
uvm_obj_page_set_dirty(newpg);
}
}
/*
* oldpg's PG_STAT is stable. newpg is not reachable by others yet.
*/
	newpg->flags = (newpg->flags & ~PG_STAT) | (oldpg->flags & PG_STAT);
uvm_pageinsert_object(uobj, newpg);
uvm_pageremove_object(uobj, oldpg);
}
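/*
 * Illustrative sketch (not part of UVM, guarded out): the locking a
 * caller of uvm_pagereplace() needs, using uvm_pagelock2() (defined
 * later in this file) to take both page interlocks in a safe order.
 */
#if 0
static void
example_replace(struct uvm_object *uobj, struct vm_page *oldpg,
    struct vm_page *newpg)
{

	KASSERT(rw_write_held(uobj->vmobjlock));
	uvm_pagelock2(oldpg, newpg);
	uvm_pagereplace(oldpg, newpg);
	uvm_pageunlock2(oldpg, newpg);
}
#endif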
/*
* uvm_pagerealloc: reallocate a page from one object to another
*
* => both objects must be locked
*/
int
uvm_pagerealloc(struct vm_page *pg, struct uvm_object *newobj, voff_t newoff)
{
int error = 0;
/*
* remove it from the old object
*/
	if (pg->uobject) {
		uvm_pageremove_tree(pg->uobject, pg);
uvm_pageremove_object(pg->uobject, pg);
}
/*
* put it in the new object
*/
if (newobj) {
mutex_enter(&pg->interlock);
pg->uobject = newobj;
pg->offset = newoff;
if (UVM_OBJ_IS_VNODE(newobj)) {
pg->flags |= PG_FILE;
} else if (UVM_OBJ_IS_AOBJ(newobj)) {
pg->flags |= PG_AOBJ;
}
uvm_pageinsert_object(newobj, pg);
mutex_exit(&pg->interlock);
error = uvm_pageinsert_tree(newobj, pg);
if (error != 0) {
mutex_enter(&pg->interlock);
uvm_pageremove_object(newobj, pg);
mutex_exit(&pg->interlock);
}
}
return error;
}
/*
* uvm_pagefree: free page
*
* => erase page's identity (i.e. remove from object)
* => put page on free list
* => caller must lock owning object (either anon or uvm_object)
* => assumes all valid mappings of pg are gone
*/
void
uvm_pagefree(struct vm_page *pg)
{
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
struct uvm_cpu *ucpu;
kmutex_t *lock;
int bucket, s;
bool locked;
#ifdef DEBUG
if (pg->uobject == (void *)0xdeadbeef &&
pg->uanon == (void *)0xdeadbeef) {
panic("uvm_pagefree: freeing free page %p", pg);
}
#endif /* DEBUG */
	KASSERT((pg->flags & PG_PAGEOUT) == 0);
	KASSERT(!(pg->flags & PG_FREE));
	KASSERT(pg->uobject == NULL || rw_write_held(pg->uobject->vmobjlock));
	KASSERT(pg->uobject != NULL || pg->uanon == NULL ||
rw_write_held(pg->uanon->an_lock));
/*
* remove the page from the object's tree before acquiring any page
* interlocks: this can acquire locks to free radixtree nodes.
*/
	if (pg->uobject != NULL) {
		uvm_pageremove_tree(pg->uobject, pg);
}
/*
* if the page is loaned, resolve the loan instead of freeing.
*/
if (pg->loan_count) {
KASSERT(pg->wire_count == 0);
/*
* if the page is owned by an anon then we just want to
* drop anon ownership. the kernel will free the page when
* it is done with it. if the page is owned by an object,
* remove it from the object and mark it dirty for the benefit
* of possible anon owners.
*
* regardless of previous ownership, wakeup any waiters,
* unbusy the page, and we're done.
*/
uvm_pagelock(pg);
locked = true;
if (pg->uobject != NULL) {
uvm_pageremove_object(pg->uobject, pg);
pg->flags &= ~(PG_FILE|PG_AOBJ);
} else if (pg->uanon != NULL) {
if ((pg->flags & PG_ANON) == 0) {
pg->loan_count--;
} else {
const unsigned status = uvm_pagegetdirty(pg);
pg->flags &= ~PG_ANON;
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
pg->uanon->an_page = NULL;
pg->uanon = NULL;
}
		if (pg->pqflags & PQ_WANTED) {
			wakeup(pg);
}
pg->pqflags &= ~PQ_WANTED;
pg->flags &= ~(PG_BUSY|PG_RELEASED|PG_PAGER1);
#ifdef UVM_PAGE_TRKOWN
pg->owner_tag = NULL;
#endif
		KASSERT((pg->flags & PG_STAT) == 0);
		if (pg->loan_count) {
			KASSERT(pg->uobject == NULL);
			if (pg->uanon == NULL) {
				uvm_pagedequeue(pg);
}
uvm_pageunlock(pg);
return;
}
} else if (pg->uobject != NULL || pg->uanon != NULL ||
pg->wire_count != 0) {
uvm_pagelock(pg);
locked = true;
} else {
locked = false;
}
/*
* remove page from its object or anon.
*/
if (pg->uobject != NULL) {
		uvm_pageremove_object(pg->uobject, pg);
	} else if (pg->uanon != NULL) {
		const unsigned int status = uvm_pagegetdirty(pg);
pg->uanon->an_page = NULL;
pg->uanon = NULL;
cpu_count(CPU_COUNT_ANONUNKNOWN + status, -1);
}
/*
* if the page was wired, unwire it now.
*/
	if (pg->wire_count) {
		pg->wire_count = 0;
atomic_dec_uint(&uvmexp.wired);
}
if (locked) {
/*
* wake anyone waiting on the page.
*/
		if ((pg->pqflags & PQ_WANTED) != 0) {
			pg->pqflags &= ~PQ_WANTED;
wakeup(pg);
}
/*
* now remove the page from the queues.
*/
uvm_pagedequeue(pg);
uvm_pageunlock(pg);
} else {
KASSERT(!uvmpdpol_pageisqueued_p(pg));
}
/*
* and put on free queue
*/
#ifdef DEBUG
pg->uobject = (void *)0xdeadbeef;
pg->uanon = (void *)0xdeadbeef;
#endif /* DEBUG */
/* Try to send the page to the per-CPU cache. */
s = splvm();
ucpu = curcpu()->ci_data.cpu_uvm;
bucket = uvm_page_get_bucket(pg);
	if (bucket == ucpu->pgflbucket && uvm_pgflcache_free(ucpu, pg)) {
		splx(s);
return;
}
/* Didn't work. Never mind, send it to a global bucket. */
pgfl = &uvm.page_free[uvm_page_get_freelist(pg)];
pgb = pgfl->pgfl_buckets[bucket];
lock = &uvm_freelist_locks[bucket].lock;
mutex_spin_enter(lock);
/* PG_FREE must be set under lock because of uvm_pglistalloc(). */
pg->flags = PG_FREE;
LIST_INSERT_HEAD(&pgb->pgb_colors[VM_PGCOLOR(pg)], pg, pageq.list);
pgb->pgb_nfree++;
CPU_COUNT(CPU_COUNT_FREEPAGES, 1);
mutex_spin_exit(lock);
splx(s);
}
/*
* uvm_page_unbusy: unbusy an array of pages.
*
* => pages must either all belong to the same object, or all belong to anons.
* => if pages are object-owned, object must be locked.
* => if pages are anon-owned, anons must be locked.
* => caller must make sure that anon-owned pages are not PG_RELEASED.
*/
void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
struct vm_page *pg;
int i, pageout_done;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
pageout_done = 0;
for (i = 0; i < npgs; i++) {
pg = pgs[i];
if (pg == NULL || pg == PGO_DONTCARE) {
continue;
}
		KASSERT(uvm_page_owner_locked_p(pg, true));
		KASSERT(pg->flags & PG_BUSY);
		if (pg->flags & PG_PAGEOUT) {
			pg->flags &= ~PG_PAGEOUT;
pg->flags |= PG_RELEASED;
pageout_done++;
atomic_inc_uint(&uvmexp.pdfreed);
}
if (pg->flags & PG_RELEASED) {
UVMHIST_LOG(ubchist, "releasing pg %#jx",
(uintptr_t)pg, 0, 0, 0);
KASSERT(pg->uobject != NULL ||
(pg->uanon != NULL && pg->uanon->an_ref > 0));
pg->flags &= ~PG_RELEASED;
uvm_pagefree(pg);
} else {
UVMHIST_LOG(ubchist, "unbusying pg %#jx",
(uintptr_t)pg, 0, 0, 0);
KASSERT((pg->flags & PG_FAKE) == 0);
pg->flags &= ~PG_BUSY;
uvm_pagelock(pg);
			uvm_pagewakeup(pg);
			uvm_pageunlock(pg);
UVM_PAGE_OWN(pg, NULL);
}
}
	if (pageout_done != 0) {
		uvm_pageout_done(pageout_done);
}
}
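/*
 * Illustrative sketch (not part of UVM, guarded out): a hypothetical
 * completion path that unbusies an array of object-owned, PG_BUSY
 * pages once I/O on them is done, under the object lock as required.
 */
#if 0
static void
example_io_done(struct uvm_object *uobj, struct vm_page **pgs, int npgs)
{

	rw_enter(uobj->vmobjlock, RW_WRITER);
	uvm_page_unbusy(pgs, npgs);
	rw_exit(uobj->vmobjlock);
}
#endif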
/*
* uvm_pagewait: wait for a busy page
*
* => page must be known PG_BUSY
* => object must be read or write locked
* => object will be unlocked on return
*/
void
uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
{

	KASSERT(rw_lock_held(lock));
	KASSERT((pg->flags & PG_BUSY) != 0);
	KASSERT(uvm_page_owner_locked_p(pg, false));
mutex_enter(&pg->interlock);
pg->pqflags |= PQ_WANTED;
rw_exit(lock);
UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
}
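/*
 * Illustrative sketch (not part of UVM, guarded out): the classic
 * lookup-and-wait loop built on uvm_pagelookup() and uvm_pagewait().
 * Since uvm_pagewait() drops the object lock, the lookup is retried
 * from scratch after each sleep.  Names here are hypothetical.
 */
#if 0
static struct vm_page *
example_lookup_wait(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;

	rw_enter(uobj->vmobjlock, RW_WRITER);
	for (;;) {
		pg = uvm_pagelookup(uobj, off);
		if (pg == NULL || (pg->flags & PG_BUSY) == 0) {
			break;
		}
		/* Drops the object lock; retry the lookup afterwards. */
		uvm_pagewait(pg, uobj->vmobjlock, "explkup");
		rw_enter(uobj->vmobjlock, RW_WRITER);
	}
	/* Returns with uobj->vmobjlock still held. */
	return pg;
}
#endif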
/*
* uvm_pagewakeup: wake anyone waiting on a page
*
* => page interlock must be held
*/
void
uvm_pagewakeup(struct vm_page *pg)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(ubchist);
KASSERT(mutex_owned(&pg->interlock));
UVMHIST_LOG(ubchist, "waking pg %#jx", (uintptr_t)pg, 0, 0, 0);
	if ((pg->pqflags & PQ_WANTED) != 0) {
		wakeup(pg);
pg->pqflags &= ~PQ_WANTED;
}
}
/*
* uvm_pagewanted_p: return true if someone is waiting on the page
*
* => object must be write locked (lock out all concurrent access)
*/
bool
uvm_pagewanted_p(struct vm_page *pg)
{
KASSERT(uvm_page_owner_locked_p(pg, true));
return (atomic_load_relaxed(&pg->pqflags) & PQ_WANTED) != 0;
}
#if defined(UVM_PAGE_TRKOWN)
/*
* uvm_page_own: set or release page ownership
*
* => this is a debugging function that keeps track of who sets PG_BUSY
 *	and where they do it.  it can be used to track down problems
 *	such as a process setting "PG_BUSY" and never releasing it.
* => page's object [if any] must be locked
* => if "tag" is NULL then we are releasing page ownership
*/
void
uvm_page_own(struct vm_page *pg, const char *tag)
{
KASSERT((pg->flags & (PG_PAGEOUT|PG_RELEASED)) == 0);
KASSERT(uvm_page_owner_locked_p(pg, true));
/* gain ownership? */
if (tag) {
KASSERT((pg->flags & PG_BUSY) != 0);
if (pg->owner_tag) {
printf("uvm_page_own: page %p already owned "
"by proc %d.%d [%s]\n", pg,
pg->owner, pg->lowner, pg->owner_tag);
panic("uvm_page_own");
}
pg->owner = curproc->p_pid;
pg->lowner = curlwp->l_lid;
pg->owner_tag = tag;
return;
}
/* drop ownership */
KASSERT((pg->flags & PG_BUSY) == 0);
if (pg->owner_tag == NULL) {
printf("uvm_page_own: dropping ownership of an non-owned "
"page (%p)\n", pg);
panic("uvm_page_own");
}
pg->owner_tag = NULL;
}
#endif
/*
* uvm_pagelookup: look up a page
*
* => caller should lock object to keep someone from pulling the page
* out from under it
*/
struct vm_page *
uvm_pagelookup(struct uvm_object *obj, voff_t off)
{
struct vm_page *pg;
KASSERT(db_active || rw_lock_held(obj->vmobjlock));
pg = radix_tree_lookup_node(&obj->uo_pages, off >> PAGE_SHIFT);
	KASSERT(pg == NULL || obj->uo_npages != 0);
	KASSERT(pg == NULL || (pg->flags & (PG_RELEASED|PG_PAGEOUT)) == 0 ||
(pg->flags & PG_BUSY) != 0);
return pg;
}
/*
* uvm_pagewire: wire the page, thus removing it from the daemon's grasp
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pagewire(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
if ((pg->flags & PG_READAHEAD) != 0) {
uvm_ra_hit.ev_count++;
pg->flags &= ~PG_READAHEAD;
}
#endif /* defined(READAHEAD_STATS) */
	if (pg->wire_count == 0) {
		uvm_pagedequeue(pg);
atomic_inc_uint(&uvmexp.wired);
}
pg->wire_count++;
KASSERT(pg->wire_count > 0); /* detect wraparound */
}
/*
* uvm_pageunwire: unwire the page.
*
* => activate if wire count goes to zero.
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageunwire(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(pg->wire_count != 0);
	KASSERT(!uvmpdpol_pageisqueued_p(pg));
	KASSERT(mutex_owned(&pg->interlock));
pg->wire_count--;
if (pg->wire_count == 0) {
uvm_pageactivate(pg);
KASSERT(uvmexp.wired != 0);
atomic_dec_uint(&uvmexp.wired);
}
}
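/*
 * Illustrative sketch (not part of UVM, guarded out): wiring and later
 * unwiring a single page.  Both calls require the owning object (or
 * anon) lock plus the page interlock, per the comments above.
 */
#if 0
static void
example_wire_unwire(struct vm_page *pg)
{

	/* caller holds pg's owner lock (vmobjlock or an_lock), write held */
	uvm_pagelock(pg);
	uvm_pagewire(pg);
	uvm_pageunlock(pg);

	/* ... the page is now out of the pagedaemon's reach ... */

	uvm_pagelock(pg);
	uvm_pageunwire(pg);
	uvm_pageunlock(pg);
}
#endif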
/*
* uvm_pagedeactivate: deactivate page
*
* => caller must lock objects
* => caller must check to make sure page is not wired
* => object that page belongs to must be locked (so we can adjust pg->flags)
* => caller must clear the reference on the page before calling
* => caller must hold pg->interlock
*/
void
uvm_pagedeactivate(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));
	if (pg->wire_count == 0) {
		KASSERT(uvmpdpol_pageisqueued_p(pg));
uvmpdpol_pagedeactivate(pg);
}
}
/*
* uvm_pageactivate: activate page
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageactivate(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));
#if defined(READAHEAD_STATS)
if ((pg->flags & PG_READAHEAD) != 0) {
uvm_ra_hit.ev_count++;
pg->flags &= ~PG_READAHEAD;
}
#endif /* defined(READAHEAD_STATS) */
	if (pg->wire_count == 0) {
		uvmpdpol_pageactivate(pg);
}
}
/*
* uvm_pagedequeue: remove a page from any paging queue
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pagedequeue(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(mutex_owned(&pg->interlock));
	if (uvmpdpol_pageisqueued_p(pg)) {
		uvmpdpol_pagedequeue(pg);
}
}
/*
* uvm_pageenqueue: add a page to a paging queue without activating.
* used where a page is not really demanded (yet). eg. read-ahead
*
* => caller must lock objects
* => caller must hold pg->interlock
*/
void
uvm_pageenqueue(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));
	if (pg->wire_count == 0 && !uvmpdpol_pageisqueued_p(pg)) {
		uvmpdpol_pageenqueue(pg);
}
}
/*
* uvm_pagelock: acquire page interlock
*/
void
uvm_pagelock(struct vm_page *pg)
{
mutex_enter(&pg->interlock);
}
/*
* uvm_pagelock2: acquire two page interlocks
*/
void
uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
{
if (pg1 < pg2) {
mutex_enter(&pg1->interlock);
mutex_enter(&pg2->interlock);
} else {
mutex_enter(&pg2->interlock);
mutex_enter(&pg1->interlock);
}
}
/*
* uvm_pageunlock: release page interlock, and if a page replacement intent
* is set on the page, pass it to uvmpdpol to make real.
*
* => caller must hold pg->interlock
*/
void
uvm_pageunlock(struct vm_page *pg)
{
if ((pg->pqflags & PQ_INTENT_SET) == 0 ||
(pg->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg->interlock);
return;
}
pg->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg->interlock);
uvmpdpol_pagerealize(pg);
}
/*
* uvm_pageunlock2: release two page interlocks, and for both pages if a
* page replacement intent is set on the page, pass it to uvmpdpol to make
* real.
*
* => caller must hold pg->interlock
*/
void
uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
{
if ((pg1->pqflags & PQ_INTENT_SET) == 0 ||
(pg1->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg1->interlock);
pg1 = NULL;
} else {
pg1->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg1->interlock);
}
if ((pg2->pqflags & PQ_INTENT_SET) == 0 ||
(pg2->pqflags & PQ_INTENT_QUEUED) != 0) {
mutex_exit(&pg2->interlock);
pg2 = NULL;
} else {
pg2->pqflags |= PQ_INTENT_QUEUED;
mutex_exit(&pg2->interlock);
}
if (pg1 != NULL) {
uvmpdpol_pagerealize(pg1);
}
if (pg2 != NULL) {
uvmpdpol_pagerealize(pg2);
}
}
/*
* uvm_pagezero: zero fill a page
*
* => if page is part of an object then the object should be locked
* to protect pg->flags.
*/
void
uvm_pagezero(struct vm_page *pg)
{
uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
pmap_zero_page(VM_PAGE_TO_PHYS(pg));
}
/*
* uvm_pagecopy: copy a page
*
* => if page is part of an object then the object should be locked
* to protect pg->flags.
*/
void
uvm_pagecopy(struct vm_page *src, struct vm_page *dst)
{
uvm_pagemarkdirty(dst, UVM_PAGE_STATUS_DIRTY);
pmap_copy_page(VM_PAGE_TO_PHYS(src), VM_PAGE_TO_PHYS(dst));
}
/*
 * uvm_pageismanaged: test whether a page (specified by PA) is managed.
*/
bool
uvm_pageismanaged(paddr_t pa)
{
return (uvm_physseg_find(atop(pa), NULL) != UVM_PHYSSEG_TYPE_INVALID);
}
/*
* uvm_page_lookup_freelist: look up the free list for the specified page
*/
int
uvm_page_lookup_freelist(struct vm_page *pg)
{
uvm_physseg_t upm;
upm = uvm_physseg_find(atop(VM_PAGE_TO_PHYS(pg)), NULL);
KASSERT(upm != UVM_PHYSSEG_TYPE_INVALID);
return uvm_physseg_get_free_list(upm);
}
/*
* uvm_page_owner_locked_p: return true if object associated with page is
* locked. this is a weak check for runtime assertions only.
*/
bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{
if (pg->uobject != NULL) {
return exclusive
		    ? rw_write_held(pg->uobject->vmobjlock)
		    : rw_lock_held(pg->uobject->vmobjlock);
}
if (pg->uanon != NULL) {
return exclusive
		    ? rw_write_held(pg->uanon->an_lock)
		    : rw_lock_held(pg->uanon->an_lock);
}
return true;
}
/*
* uvm_pagereadonly_p: return if the page should be mapped read-only
*/
bool
uvm_pagereadonly_p(struct vm_page *pg)
{
struct uvm_object * const uobj = pg->uobject;
	KASSERT(uobj == NULL || rw_lock_held(uobj->vmobjlock));
	KASSERT(uobj != NULL || rw_lock_held(pg->uanon->an_lock));

	if ((pg->flags & PG_RDONLY) != 0) {
return true;
}
if (uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
return true;
}
if (uobj == NULL) {
return false;
}
return UVM_OBJ_NEEDS_WRITEFAULT(uobj);
}
#ifdef PMAP_DIRECT
/*
 * Call pmap to translate a physical address into a virtual one and run a
 * callback on it.  This avoids actually mapping the pages; the pmap most
 * likely uses a direct map or equivalent.
*/
int
uvm_direct_process(struct vm_page **pgs, u_int npages, voff_t off, vsize_t len,
int (*process)(void *, size_t, void *), void *arg)
{
int error = 0;
paddr_t pa;
size_t todo;
voff_t pgoff = (off & PAGE_MASK);
struct vm_page *pg;
KASSERT(npages > 0);
KASSERT(len > 0);
for (int i = 0; i < npages; i++) {
pg = pgs[i];
KASSERT(len > 0);
/*
* Caller is responsible for ensuring all the pages are
* available.
*/
KASSERT(pg != NULL);
KASSERT(pg != PGO_DONTCARE);
pa = VM_PAGE_TO_PHYS(pg);
todo = MIN(len, PAGE_SIZE - pgoff);
error = pmap_direct_process(pa, pgoff, todo, process, arg);
if (error)
break;
pgoff = 0;
len -= todo;
}
KASSERTMSG(error != 0 || len == 0, "len %lu != 0 for non-error", len);
return error;
}
#endif /* PMAP_DIRECT */
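/*
 * Illustrative sketch (not part of UVM, guarded out): a hypothetical
 * callback for uvm_direct_process() that zeroes each directly mapped
 * chunk it is handed; "arg" is unused here.
 */
#if 0
static int
example_zero_cb(void *va, size_t len, void *arg)
{

	memset(va, 0, len);
	return 0;
}

/*
 * A caller would then zero "len" bytes starting at byte offset "off"
 * within the page array with:
 *
 *	error = uvm_direct_process(pgs, npages, off, len,
 *	    example_zero_cb, NULL);
 */
#endif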
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_page_printit: actually print the page
*/
static const char page_flagbits[] = UVM_PGFLAGBITS;
static const char page_pqflagbits[] = UVM_PQFLAGBITS;
void
uvm_page_printit(struct vm_page *pg, bool full,
void (*pr)(const char *, ...))
{
struct vm_page *tpg;
struct uvm_object *uobj;
struct pgflbucket *pgb;
struct pgflist *pgl;
char pgbuf[128];
(*pr)("PAGE %p:\n", pg);
snprintb(pgbuf, sizeof(pgbuf), page_flagbits, pg->flags);
(*pr)(" flags=%s\n", pgbuf);
snprintb(pgbuf, sizeof(pgbuf), page_pqflagbits, pg->pqflags);
(*pr)(" pqflags=%s\n", pgbuf);
(*pr)(" uobject=%p, uanon=%p, offset=0x%llx\n",
pg->uobject, pg->uanon, (long long)pg->offset);
(*pr)(" loan_count=%d wire_count=%d bucket=%d freelist=%d\n",
pg->loan_count, pg->wire_count, uvm_page_get_bucket(pg),
uvm_page_get_freelist(pg));
(*pr)(" pa=0x%lx\n", (long)VM_PAGE_TO_PHYS(pg));
#if defined(UVM_PAGE_TRKOWN)
if (pg->flags & PG_BUSY)
(*pr)(" owning process = %d.%d, tag=%s\n",
pg->owner, pg->lowner, pg->owner_tag);
else
(*pr)(" page not busy, no owner\n");
#else
(*pr)(" [page ownership tracking disabled]\n");
#endif
if (!full)
return;
/* cross-verify object/anon */
if ((pg->flags & PG_FREE) == 0) {
if (pg->flags & PG_ANON) {
if (pg->uanon == NULL || pg->uanon->an_page != pg)
(*pr)(" >>> ANON DOES NOT POINT HERE <<< (%p)\n",
(pg->uanon) ? pg->uanon->an_page : NULL);
else
(*pr)(" anon backpointer is OK\n");
} else {
uobj = pg->uobject;
if (uobj) {
(*pr)(" checking object list\n");
tpg = uvm_pagelookup(uobj, pg->offset);
if (tpg)
(*pr)(" page found on object list\n");
else
(*pr)(" >>> PAGE NOT FOUND ON OBJECT LIST! <<<\n");
}
}
}
/* cross-verify page queue */
if (pg->flags & PG_FREE) {
int fl = uvm_page_get_freelist(pg);
int b = uvm_page_get_bucket(pg);
pgb = uvm.page_free[fl].pgfl_buckets[b];
pgl = &pgb->pgb_colors[VM_PGCOLOR(pg)];
(*pr)(" checking pageq list\n");
LIST_FOREACH(tpg, pgl, pageq.list) {
if (tpg == pg) {
break;
}
}
if (tpg)
(*pr)(" page found on pageq list\n");
else
(*pr)(" >>> PAGE NOT FOUND ON PAGEQ LIST! <<<\n");
}
}
/*
* uvm_page_printall - print a summary of all managed pages
*/
void
uvm_page_printall(void (*pr)(const char *, ...))
{
uvm_physseg_t i;
paddr_t pfn;
struct vm_page *pg;
(*pr)("%18s %4s %4s %18s %18s"
#ifdef UVM_PAGE_TRKOWN
" OWNER"
#endif
"\n", "PAGE", "FLAG", "PQ", "UOBJECT", "UANON");
for (i = uvm_physseg_get_first();
uvm_physseg_valid_p(i);
i = uvm_physseg_get_next(i)) {
for (pfn = uvm_physseg_get_start(i);
pfn < uvm_physseg_get_end(i);
pfn++) {
pg = PHYS_TO_VM_PAGE(ptoa(pfn));
(*pr)("%18p %04x %08x %18p %18p",
pg, pg->flags, pg->pqflags, pg->uobject,
pg->uanon);
#ifdef UVM_PAGE_TRKOWN
if (pg->flags & PG_BUSY)
(*pr)(" %d [%s]", pg->owner, pg->owner_tag);
#endif
(*pr)("\n");
}
}
}
/*
 * uvm_page_print_freelists - print a summary of the freelists
*/
void
uvm_page_print_freelists(void (*pr)(const char *, ...))
{
struct pgfreelist *pgfl;
struct pgflbucket *pgb;
int fl, b, c;
(*pr)("There are %d freelists with %d buckets of %d colors.\n\n",
VM_NFREELIST, uvm.bucketcount, uvmexp.ncolors);
for (fl = 0; fl < VM_NFREELIST; fl++) {
pgfl = &uvm.page_free[fl];
(*pr)("freelist(%d) @ %p\n", fl, pgfl);
for (b = 0; b < uvm.bucketcount; b++) {
pgb = uvm.page_free[fl].pgfl_buckets[b];
(*pr)(" bucket(%d) @ %p, nfree = %d, lock @ %p:\n",
b, pgb, pgb->pgb_nfree,
&uvm_freelist_locks[b].lock);
for (c = 0; c < uvmexp.ncolors; c++) {
(*pr)(" color(%d) @ %p, ", c,
&pgb->pgb_colors[c]);
(*pr)("first page = %p\n",
LIST_FIRST(&pgb->pgb_colors[c]));
}
}
}
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $ */
/*
* Copyright (c) 2008, 2010, 2016, 2017, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran, and by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2007 Manuel Bouyer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Copyright 2001 (c) Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.426 2023/10/04 20:28:06 ad Exp $");
#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
#include "opt_svs.h"
#include "opt_kaslr.h"
#include "opt_efi.h"
#define __MUTEX_PRIVATE /* for assertions */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/xcall.h>
#include <sys/kcore.h>
#include <sys/kmem.h>
#include <sys/asan.h>
#include <sys/msan.h>
#include <sys/entropy.h>
#include <uvm/uvm.h>
#include <uvm/pmap/pmap_pvt.h>
#include <dev/isa/isareg.h>
#include <machine/specialreg.h>
#include <machine/gdt.h>
#include <machine/isa_machdep.h>
#include <machine/cpuvar.h>
#include <machine/cputypes.h>
#include <machine/pmap_private.h>
#include <x86/bootspace.h>
#include <x86/pat.h>
#include <x86/pmap_pv.h>
#include <x86/i82489reg.h>
#include <x86/i82489var.h>
#ifdef XEN
#include <xen/include/public/xen.h>
#include <xen/hypervisor.h>
#include <xen/xenpmap.h>
#endif
#ifdef __HAVE_DIRECT_MAP
#include <crypto/nist_hash_drbg/nist_hash_drbg.h>
#endif
/*
* general info:
*
* - for an explanation of how the x86 MMU hardware works see
* the comments in <machine/pte.h>.
*
* - for an explanation of the general memory structure used by
* this pmap (including the recursive mapping), see the comments
* in <machine/pmap.h>.
*
* this file contains the code for the "pmap module." the module's
* job is to manage the hardware's virtual to physical address mappings.
* note that there are two levels of mapping in the VM system:
*
* [1] the upper layer of the VM system uses vm_map's and vm_map_entry's
* to map ranges of virtual address space to objects/files. for
* example, the vm_map may say: "map VA 0x1000 to 0x22000 read-only
* to the file /bin/ls starting at offset zero." note that
* the upper layer mapping is not concerned with how individual
* vm_pages are mapped.
*
* [2] the lower layer of the VM system (the pmap) maintains the mappings
* from virtual addresses. it is concerned with which vm_page is
* mapped where. for example, when you run /bin/ls and start
* at page 0x1000 the fault routine may lookup the correct page
* of the /bin/ls file and then ask the pmap layer to establish
* a mapping for it.
*
* note that information in the lower layer of the VM system can be
* thrown away since it can easily be reconstructed from the info
* in the upper layer.
*
* data structures we use include:
*
* - struct pmap: describes the address space of one thread
* - struct pmap_page: describes one pv-tracked page, without
* necessarily a corresponding vm_page
* - struct pv_entry: describes one <PMAP,VA> mapping of a PA
* - pmap_page::pp_pvlist: there is one list per pv-tracked page of
* physical memory. the pp_pvlist points to a list of pv_entry
* structures which describe all the <PMAP,VA> pairs that this
* page is mapped in. this is critical for page based operations
* such as pmap_page_protect() [change protection on _all_ mappings
* of a page]
*/
/*
* Locking
*
* We have the following locks that we must deal with, listed in the order
* that they are acquired:
*
* pg->uobject->vmobjlock, pg->uanon->an_lock
*
* For managed pages, these per-object locks are taken by the VM system
* before calling into the pmap module - either a read or write hold.
* The lock hold prevent pages from changing identity while the pmap is
* operating on them. For example, the same lock is held across a call
* to pmap_remove() and the following call to pmap_update(), so that a
* page does not gain a new identity while its TLB visibility is stale.
*
* pmap->pm_lock
*
* This lock protects the fields in the pmap structure including the
* non-kernel PDEs in the PDP, the PTEs, and PTPs and connected data
* structures. For modifying unmanaged kernel PTEs it is not needed as
* kernel PDEs are never freed, and the kernel is expected to be self
* consistent (and the lock can't be taken for unmanaged kernel PTEs,
* because they can be modified from interrupt context).
*
* pmaps_lock
*
* This lock protects the list of active pmaps (headed by "pmaps").
* It's acquired when adding or removing pmaps or adjusting kernel PDEs.
*
* pp_lock
*
* This per-page lock protects PV entry lists and the embedded PV entry
* in each vm_page, allowing for concurrent operation on pages by
* different pmaps. This is a spin mutex at IPL_VM, because at the
* points it is taken context switching is usually not tolerable, and
* spin mutexes must block out interrupts that could take kernel_lock.
*/
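/*
 * Illustrative sketch (not part of this pmap, guarded out) of the
 * ordering described above: the object lock is held across both the
 * pmap_remove() and the matching pmap_update(), so the pages in the
 * range cannot change identity while stale TLB entries may still
 * exist.  Names here are hypothetical.
 */
#if 0
static void
example_unmap_object_range(struct uvm_object *uobj, struct pmap *pm,
    vaddr_t sva, vaddr_t eva)
{

	rw_enter(uobj->vmobjlock, RW_WRITER);
	pmap_remove(pm, sva, eva);
	pmap_update(pm);
	rw_exit(uobj->vmobjlock);
}
#endif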
/* uvm_object is abused here to index pmap_pages; make assertions happy. */
#ifdef DIAGNOSTIC
#define PMAP_DUMMY_LOCK(pm) rw_enter(&(pm)->pm_dummy_lock, RW_WRITER)
#define PMAP_DUMMY_UNLOCK(pm) rw_exit(&(pm)->pm_dummy_lock)
#else
#define PMAP_DUMMY_LOCK(pm)
#define PMAP_DUMMY_UNLOCK(pm)
#endif
static const struct uvm_pagerops pmap_pager = {
/* nothing */
};
/*
* pl_i(va, X) == plX_i(va) <= pl_i_roundup(va, X)
*/
#define pl_i(va, lvl) \
(((VA_SIGN_POS(va)) & ptp_frames[(lvl)-1]) >> ptp_shifts[(lvl)-1])
#define pl_i_roundup(va, lvl) pl_i((va)+ ~ptp_frames[(lvl)-1], (lvl))
/*
* PTP macros:
* a PTP's index is the PD index of the PDE that points to it
* a PTP's offset is the byte-offset in the PTE space that this PTP is at
* a PTP's VA is the first VA mapped by that PTP
*/
#define ptp_va2o(va, lvl) (pl_i(va, (lvl)+1) * PAGE_SIZE)
const vaddr_t ptp_masks[] = PTP_MASK_INITIALIZER;
const vaddr_t ptp_frames[] = PTP_FRAME_INITIALIZER;
const int ptp_shifts[] = PTP_SHIFT_INITIALIZER;
const long nkptpmax[] = NKPTPMAX_INITIALIZER;
const long nbpd[] = NBPD_INITIALIZER;
#ifdef i386
pd_entry_t * const normal_pdes[] = PDES_INITIALIZER;
#else
pd_entry_t *normal_pdes[3];
#endif
long nkptp[] = NKPTP_INITIALIZER;
struct pmap_head pmaps;
kmutex_t pmaps_lock __cacheline_aligned;
struct pcpu_area *pcpuarea __read_mostly;
static vaddr_t pmap_maxkvaddr;
/*
* Misc. event counters.
*/
struct evcnt pmap_iobmp_evcnt;
struct evcnt pmap_ldt_evcnt;
/*
* PAT
*/
static bool cpu_pat_enabled __read_mostly = false;
/*
* Global data structures
*/
static struct pmap kernel_pmap_store __cacheline_aligned; /* kernel's pmap */
struct pmap *const kernel_pmap_ptr = &kernel_pmap_store;
static rb_tree_t pmap_kernel_rb __cacheline_aligned;
struct bootspace bootspace __read_mostly;
struct slotspace slotspace __read_mostly;
/* Set to PTE_NX if supported. */
pd_entry_t pmap_pg_nx __read_mostly = 0;
/* Set to PTE_G if supported. */
pd_entry_t pmap_pg_g __read_mostly = 0;
/* Set to true if large pages are supported. */
int pmap_largepages __read_mostly = 0;
paddr_t lowmem_rsvd __read_mostly;
paddr_t avail_start __read_mostly; /* PA of first available physical page */
paddr_t avail_end __read_mostly; /* PA of last available physical page */
#ifdef XENPV
paddr_t pmap_pa_start; /* PA of first physical page for this domain */
paddr_t pmap_pa_end; /* PA of last physical page for this domain */
#endif
#define VM_PAGE_TO_PP(pg) (&(pg)->mdpage.mp_pp)
#define PMAP_CHECK_PP(pp) \
KASSERTMSG((pp)->pp_lock.mtx_ipl._ipl == IPL_VM, "bad pmap_page %p", pp)
#define PAGE_ALIGNED(pp) \
__builtin_assume_aligned((void *)(pp), PAGE_SIZE)
/*
* Other data structures
*/
static pt_entry_t protection_codes[8] __read_mostly;
static bool pmap_initialized __read_mostly = false; /* pmap_init done yet? */
/*
* The following two vaddr_t's are used during system startup to keep track of
* how much of the kernel's VM space we have used. Once the system is started,
* the management of the remaining kernel VM space is turned over to the
* kernel_map vm_map.
*/
static vaddr_t virtual_avail __read_mostly; /* VA of first free KVA */
static vaddr_t virtual_end __read_mostly; /* VA of last free KVA */
#ifndef XENPV
/*
* LAPIC virtual address, and fake physical address.
*/
volatile vaddr_t local_apic_va __read_mostly;
paddr_t local_apic_pa __read_mostly;
#endif
/*
* pool that pmap structures are allocated from
*/
struct pool_cache pmap_cache;
static int pmap_ctor(void *, void *, int);
static void pmap_dtor(void *, void *);
/*
* pv_page cache
*/
static struct pool_cache pmap_pvp_cache;
#ifdef __HAVE_DIRECT_MAP
vaddr_t pmap_direct_base __read_mostly;
vaddr_t pmap_direct_end __read_mostly;
#endif
#ifndef __HAVE_DIRECT_MAP
/*
* Special VAs and the PTEs that map them
*/
static pt_entry_t *early_zero_pte;
static void pmap_vpage_cpualloc(struct cpu_info *);
#ifdef XENPV
char *early_zerop; /* also referenced from xen_locore() */
#else
static char *early_zerop;
#endif
#endif
int pmap_enter_default(pmap_t, vaddr_t, paddr_t, vm_prot_t, u_int);
/* PDP pool and its callbacks */
static struct pool pmap_pdp_pool;
static void pmap_pdp_init(pd_entry_t *);
static void pmap_pdp_fini(pd_entry_t *);
#ifdef PAE
/* need to allocate items of 4 pages */
static void *pmap_pdp_alloc(struct pool *, int);
static void pmap_pdp_free(struct pool *, void *);
static struct pool_allocator pmap_pdp_allocator = {
.pa_alloc = pmap_pdp_alloc,
.pa_free = pmap_pdp_free,
.pa_pagesz = PAGE_SIZE * PDP_SIZE,
};
#endif
extern vaddr_t idt_vaddr;
extern paddr_t idt_paddr;
extern vaddr_t gdt_vaddr;
extern paddr_t gdt_paddr;
extern vaddr_t ldt_vaddr;
extern paddr_t ldt_paddr;
#ifdef i386
/* stuff to fix the pentium f00f bug */
extern vaddr_t pentium_idt_vaddr;
#endif
/* Array of freshly allocated PTPs, for pmap_get_ptp(). */
struct pmap_ptparray {
struct vm_page *pg[PTP_LEVELS + 1];
bool alloced[PTP_LEVELS + 1];
};
/*
* PV entries are allocated in page-sized chunks and cached per-pmap to
* avoid intense pressure on memory allocators.
*/
struct pv_page {
LIST_HEAD(, pv_entry) pvp_pves;
LIST_ENTRY(pv_page) pvp_list;
long pvp_nfree;
struct pmap *pvp_pmap;
};
#define PVE_PER_PVP ((PAGE_SIZE / sizeof(struct pv_entry)) - 1)
/*
* PV tree prototypes
*/
static int pmap_compare_key(void *, const void *, const void *);
static int pmap_compare_nodes(void *, const void *, const void *);
/* Red-black tree */
static const rb_tree_ops_t pmap_rbtree_ops = {
.rbto_compare_nodes = pmap_compare_nodes,
.rbto_compare_key = pmap_compare_key,
.rbto_node_offset = offsetof(struct pv_entry, pve_rb),
.rbto_context = NULL
};
/*
* Local prototypes
*/
#ifdef __HAVE_PCPU_AREA
static void pmap_init_pcpu(void);
#endif
#ifdef __HAVE_DIRECT_MAP
static void pmap_init_directmap(struct pmap *);
#endif
#if !defined(XENPV)
static void pmap_remap_global(void);
#endif
#ifndef XENPV
static void pmap_init_lapic(void);
static void pmap_remap_largepages(void);
#endif
static int pmap_get_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t, int,
struct vm_page **);
static void pmap_unget_ptp(struct pmap *, struct pmap_ptparray *);
static void pmap_install_ptp(struct pmap *, struct pmap_ptparray *, vaddr_t,
pd_entry_t * const *);
static struct vm_page *pmap_find_ptp(struct pmap *, vaddr_t, int);
static void pmap_freepage(struct pmap *, struct vm_page *, int);
static void pmap_free_ptp(struct pmap *, struct vm_page *, vaddr_t,
pt_entry_t *, pd_entry_t * const *);
static bool pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
vaddr_t);
static void pmap_remove_ptes(struct pmap *, struct vm_page *, vaddr_t, vaddr_t,
vaddr_t);
static int pmap_pvp_ctor(void *, void *, int);
static void pmap_pvp_dtor(void *, void *);
static struct pv_entry *pmap_alloc_pv(struct pmap *);
static void pmap_free_pv(struct pmap *, struct pv_entry *);
static void pmap_drain_pv(struct pmap *);
static void pmap_alloc_level(struct pmap *, vaddr_t, long *);
static void pmap_load1(struct lwp *, struct pmap *, struct pmap *);
static void pmap_reactivate(struct pmap *);
long
pmap_resident_count(struct pmap *pmap)
{
return pmap->pm_stats.resident_count;
}
long
pmap_wired_count(struct pmap *pmap)
{
return pmap->pm_stats.wired_count;
}
/*
* p m a p h e l p e r f u n c t i o n s
*/
static inline void
pmap_stats_update(struct pmap *pmap, int resid_diff, int wired_diff)
{
KASSERT(cold || mutex_owned(&pmap->pm_lock));
pmap->pm_stats.resident_count += resid_diff;
pmap->pm_stats.wired_count += wired_diff;
}
static inline void
pmap_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
int resid_diff = ((npte & PTE_P) ? 1 : 0) - ((opte & PTE_P) ? 1 : 0);
	int wired_diff = ((npte & PTE_WIRED) ? 1 : 0) -
	    ((opte & PTE_WIRED) ? 1 : 0);

	KASSERT((npte & (PTE_P | PTE_WIRED)) != PTE_WIRED);
	KASSERT((opte & (PTE_P | PTE_WIRED)) != PTE_WIRED);

	pmap_stats_update(pmap, resid_diff, wired_diff);
}
/*
* ptp_to_pmap: lookup pmap by ptp
*/
static inline struct pmap *
ptp_to_pmap(struct vm_page *ptp)
{
struct pmap *pmap;
if (ptp == NULL) {
return pmap_kernel();
}
pmap = (struct pmap *)ptp->uobject;
	KASSERT(pmap != NULL);
	KASSERT(&pmap->pm_obj[0] == ptp->uobject);
return pmap;
}
static inline struct pv_pte *
pve_to_pvpte(struct pv_entry *pve)
{
if (pve == NULL)
return NULL;
KASSERT((void *)&pve->pve_pte == (void *)pve);
return &pve->pve_pte;
}
static inline struct pv_entry *
pvpte_to_pve(struct pv_pte *pvpte)
{
struct pv_entry *pve = (void *)pvpte;
KASSERT(pve_to_pvpte(pve) == pvpte);
return pve;
}
/*
* Return true if the pmap page has an embedded PV entry.
*/
static inline bool
pv_pte_embedded(struct pmap_page *pp)
{
KASSERT(mutex_owned(&pp->pp_lock));
return (bool)((vaddr_t)pp->pp_pte.pte_ptp | pp->pp_pte.pte_va);
}
/*
* pv_pte_first, pv_pte_next: PV list iterator.
*/
static inline struct pv_pte *
pv_pte_first(struct pmap_page *pp)
{
	KASSERT(mutex_owned(&pp->pp_lock));
	if (pv_pte_embedded(pp)) {
		return &pp->pp_pte;
}
return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
}
static inline struct pv_pte *
pv_pte_next(struct pmap_page *pp, struct pv_pte *pvpte)
{
KASSERT(mutex_owned(&pp->pp_lock));
KASSERT(pvpte != NULL);
if (pvpte == &pp->pp_pte) {
return pve_to_pvpte(LIST_FIRST(&pp->pp_pvlist));
}
return pve_to_pvpte(LIST_NEXT(pvpte_to_pve(pvpte), pve_list));
}
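/*
 * Illustrative sketch (not part of this pmap, guarded out): walking
 * all <PMAP,VA> mappings of a pmap page with the iterator pair above.
 * pp_lock must be held throughout, as the iterators assert.
 */
#if 0
static void
example_walk_pvlist(struct pmap_page *pp)
{
	struct pv_pte *pvpte;

	mutex_spin_enter(&pp->pp_lock);
	for (pvpte = pv_pte_first(pp); pvpte != NULL;
	    pvpte = pv_pte_next(pp, pvpte)) {
		/* examine pvpte->pte_ptp / pvpte->pte_va here */
	}
	mutex_spin_exit(&pp->pp_lock);
}
#endif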
static inline uint8_t
pmap_pte_to_pp_attrs(pt_entry_t pte)
{
uint8_t ret = 0;
if (pte & PTE_D)
ret |= PP_ATTRS_D;
if (pte & PTE_A)
ret |= PP_ATTRS_A;
if (pte & PTE_W)
ret |= PP_ATTRS_W;
return ret;
}
static inline pt_entry_t
pmap_pp_attrs_to_pte(uint8_t attrs)
{
pt_entry_t pte = 0;
if (attrs & PP_ATTRS_D)
pte |= PTE_D;
if (attrs & PP_ATTRS_A)
pte |= PTE_A;
if (attrs & PP_ATTRS_W)
pte |= PTE_W;
return pte;
}
/*
* pmap_is_curpmap: is this pmap the one currently loaded [in %cr3]?
* of course the kernel is always loaded
*/
bool
pmap_is_curpmap(struct pmap *pmap)
{
return ((pmap == pmap_kernel()) || (pmap == curcpu()->ci_pmap));
}
inline void
pmap_reference(struct pmap *pmap)
{
atomic_inc_uint(&pmap->pm_obj[0].uo_refs);
}
/*
* rbtree: compare two nodes.
*/
static int
pmap_compare_nodes(void *context, const void *n1, const void *n2)
{
const struct pv_entry *pve1 = n1;
const struct pv_entry *pve2 = n2;
KASSERT(pve1->pve_pte.pte_ptp == pve2->pve_pte.pte_ptp);
if (pve1->pve_pte.pte_va < pve2->pve_pte.pte_va) {
return -1;
}
if (pve1->pve_pte.pte_va > pve2->pve_pte.pte_va) {
return 1;
}
return 0;
}
/*
* rbtree: compare a node and a key.
*/
static int
pmap_compare_key(void *context, const void *n, const void *k)
{
const struct pv_entry *pve = n;
const vaddr_t key = (vaddr_t)k;
if (pve->pve_pte.pte_va < key) {
return -1;
}
if (pve->pve_pte.pte_va > key) {
return 1;
}
return 0;
}
/*
* pmap_ptp_range_set: abuse ptp->uanon to record minimum VA of PTE
*/
static inline void
pmap_ptp_range_set(struct vm_page *ptp, vaddr_t va)
{
vaddr_t *min = (vaddr_t *)&ptp->uanon;
	if (va < *min) {
		*min = va;
}
}
/*
* pmap_ptp_range_clip: abuse ptp->uanon to clip range of PTEs to remove
*/
static inline void
pmap_ptp_range_clip(struct vm_page *ptp, vaddr_t *startva, pt_entry_t **pte)
{
vaddr_t sclip;
if (ptp == NULL) {
return;
}
sclip = (vaddr_t)ptp->uanon;
sclip = (*startva < sclip ? sclip : *startva);
*pte += (sclip - *startva) / PAGE_SIZE;
*startva = sclip;
}
/*
* pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
*
* there are several pmaps involved. some or all of them might be same.
*
* - the pmap given by the first argument
* our caller wants to access this pmap's PTEs.
*
* - pmap_kernel()
* the kernel pmap. note that it only contains the kernel part
* of the address space which is shared by any pmap. ie. any
* pmap can be used instead of pmap_kernel() for our purpose.
*
* - ci->ci_pmap
* pmap currently loaded on the cpu.
*
* - vm_map_pmap(&curproc->p_vmspace->vm_map)
* current process' pmap.
*
* => caller must lock pmap first (if not the kernel pmap)
* => must be undone with pmap_unmap_ptes before returning
* => disables kernel preemption
*/
void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2, pd_entry_t **ptepp,
pd_entry_t * const **pdeppp)
{
struct pmap *curpmap;
struct cpu_info *ci;
lwp_t *l;
kpreempt_disable();
/* The kernel's pmap is always accessible. */
if (pmap == pmap_kernel()) {
*pmap2 = NULL;
*ptepp = PTE_BASE;
*pdeppp = normal_pdes;
return;
}
KASSERT(mutex_owned(&pmap->pm_lock));
l = curlwp;
ci = l->l_cpu;
curpmap = ci->ci_pmap;
if (pmap == curpmap) {
/*
* Already on the CPU: make it valid. This is very
* often the case during exit(), when we have switched
* to the kernel pmap in order to destroy a user pmap.
*/
		if (__predict_false(ci->ci_tlbstate != TLBSTATE_VALID)) {
			pmap_reactivate(pmap);
}
*pmap2 = NULL;
} else {
/*
* Toss current pmap from CPU and install new pmap, but keep
* a reference to the old one. Dropping the reference can
		 * block as it needs to take locks, so defer that to
* pmap_unmap_ptes().
*/
pmap_reference(pmap);
pmap_load1(l, pmap, curpmap);
*pmap2 = curpmap;
}
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
#ifdef DIAGNOSTIC
pmap->pm_pctr = lwp_pctr();
#endif
*ptepp = PTE_BASE;
#if defined(XENPV) && defined(__x86_64__)
KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] == L4_BASE);
ci->ci_normal_pdes[PTP_LEVELS - 2] = pmap->pm_pdir;
*pdeppp = ci->ci_normal_pdes;
#else
*pdeppp = normal_pdes;
#endif
}
/*
* pmap_unmap_ptes: unlock the PTE mapping of "pmap"
*
* => we cannot tolerate context switches while mapped in: assert this.
* => reenables kernel preemption.
* => does not unlock pmap.
*/
void
pmap_unmap_ptes(struct pmap *pmap, struct pmap * pmap2)
{
struct cpu_info *ci;
struct pmap *mypmap;
struct lwp *l;
KASSERT(kpreempt_disabled());
/* The kernel's pmap is always accessible. */
if (pmap == pmap_kernel()) {
kpreempt_enable();
return;
}
l = curlwp;
ci = l->l_cpu;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(pmap->pm_pctr == lwp_pctr());
#if defined(XENPV) && defined(__x86_64__)
KASSERT(ci->ci_normal_pdes[PTP_LEVELS - 2] != L4_BASE);
ci->ci_normal_pdes[PTP_LEVELS - 2] = L4_BASE;
#endif
/* If not our own pmap, mark whatever's on the CPU now as lazy. */
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
mypmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
	if (ci->ci_pmap == mypmap) {
ci->ci_want_pmapload = 0;
} else {
ci->ci_want_pmapload = (mypmap != pmap_kernel());
ci->ci_tlbstate = TLBSTATE_LAZY;
}
/* Now safe to re-enable preemption. */
kpreempt_enable();
/* Toss reference to other pmap taken earlier. */
	if (pmap2 != NULL) {
		pmap_destroy(pmap2);
}
}
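/*
 * Illustrative sketch (not part of this pmap, guarded out) of the
 * protocol documented above: lock the pmap, map its PTEs, inspect,
 * then unmap and unlock.  pl_i() is the macro defined earlier in this
 * file; the "example_*" name is hypothetical.
 */
#if 0
static pd_entry_t
example_peek_pte(struct pmap *pmap, vaddr_t va)
{
	struct pmap *pmap2;
	pd_entry_t *ptes, pte;
	pd_entry_t * const *pdes;	/* unused in this sketch */

	mutex_enter(&pmap->pm_lock);
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
	pte = ptes[pl_i(va, 1)];	/* level 1 PTE covering va */
	pmap_unmap_ptes(pmap, pmap2);
	mutex_exit(&pmap->pm_lock);
	return pte;
}
#endif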
inline static void
pmap_exec_account(struct pmap *pm, vaddr_t va, pt_entry_t opte, pt_entry_t npte)
{
#if !defined(__x86_64__)
if (curproc == NULL || curproc->p_vmspace == NULL ||
pm != vm_map_pmap(&curproc->p_vmspace->vm_map))
return;
if ((opte ^ npte) & PTE_X)
pmap_update_pg(va);
/*
* Executability was removed on the last executable change.
* Reset the code segment to something conservative and
* let the trap handler deal with setting the right limit.
	 * We can't compute the right limit here because of locking
	 * constraints on the vm map.
*/
if ((opte & PTE_X) && (npte & PTE_X) == 0 && va == pm->pm_hiexec) {
struct trapframe *tf = curlwp->l_md.md_regs;
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
pm->pm_hiexec = I386_MAX_EXE_ADDR;
}
#endif /* !defined(__x86_64__) */
}
#if !defined(__x86_64__)
/*
* Fixup the code segment to cover all potential executable mappings.
* returns 0 if no changes to the code segment were made.
*/
int
pmap_exec_fixup(struct vm_map *map, struct trapframe *tf, struct pcb *pcb)
{
struct vm_map_entry *ent;
struct pmap *pm = vm_map_pmap(map);
vaddr_t va = 0;
vm_map_lock_read(map);
for (ent = (&map->header)->next; ent != &map->header; ent = ent->next) {
/*
* This entry has greater va than the entries before.
* We need to make it point to the last page, not past it.
*/
if (ent->protection & VM_PROT_EXECUTE)
va = trunc_page(ent->end) - PAGE_SIZE;
}
vm_map_unlock_read(map);
if (va == pm->pm_hiexec && tf->tf_cs == GSEL(GUCODEBIG_SEL, SEL_UPL))
return 0;
pm->pm_hiexec = va;
if (pm->pm_hiexec > I386_MAX_EXE_ADDR) {
tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
} else {
tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
return 0;
}
return 1;
}
#endif /* !defined(__x86_64__) */
void
pat_init(struct cpu_info *ci)
{
#ifndef XENPV
uint64_t pat;
if (!(ci->ci_feat_val[0] & CPUID_PAT))
return;
/* We change WT to WC. Leave all other entries the default values. */
pat = PATENTRY(0, PAT_WB) | PATENTRY(1, PAT_WC) |
PATENTRY(2, PAT_UCMINUS) | PATENTRY(3, PAT_UC) |
PATENTRY(4, PAT_WB) | PATENTRY(5, PAT_WC) |
PATENTRY(6, PAT_UCMINUS) | PATENTRY(7, PAT_UC);
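	/*
	 * With the value above, the eight PAT entries become:
	 *   0:WB 1:WC 2:UC- 3:UC 4:WB 5:WC 6:UC- 7:UC
	 * i.e. only the power-on WT entries (1 and 5) are changed.
	 */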
wrmsr(MSR_CR_PAT, pat);
cpu_pat_enabled = true;
#endif
}
static pt_entry_t
pmap_pat_flags(u_int flags)
{
u_int cacheflags = (flags & PMAP_CACHE_MASK);
if (!cpu_pat_enabled) {
switch (cacheflags) {
case PMAP_NOCACHE:
case PMAP_NOCACHE_OVR:
			/*
			 * Results in PGC_UCMINUS on CPUs which have
			 * the CPUID PAT bit but PAT "disabled".
			 */
return PTE_PCD;
default:
return 0;
}
}
switch (cacheflags) {
case PMAP_NOCACHE:
return PGC_UC;
case PMAP_WRITE_COMBINE:
return PGC_WC;
case PMAP_WRITE_BACK:
return PGC_WB;
case PMAP_NOCACHE_OVR:
return PGC_UCMINUS;
}
return 0;
}
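/*
 * For example, pmap_kenter_pa(va, pa, prot, PMAP_WRITE_COMBINE) below
 * gets PGC_WC merged into its PTE when the PAT has been enabled; if the
 * PAT is not enabled, PMAP_WRITE_COMBINE is silently dropped and only
 * PMAP_NOCACHE/PMAP_NOCACHE_OVR still take effect (via PTE_PCD).
 */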
/*
* p m a p k e n t e r f u n c t i o n s
*
* functions to quickly enter/remove pages from the kernel address
* space. pmap_kremove is exported to MI kernel. we make use of
* the recursive PTE mappings.
*/
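/*
 * Thanks to the recursive PTE slot (PDIR_SLOT_PTE, cf. pmap_pdp_init()
 * below), the page tables themselves appear as a linear array of PTEs
 * at PTE_BASE, so the PTE mapping a normal page at "va" lives at
 * PTE_BASE[pl1_i(va)]; the vtopte()/kvtopte() calls used below resolve
 * into this window.
 */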
/*
* pmap_kenter_pa: enter a kernel mapping without R/M (pv_entry) tracking
*
* => no need to lock anything, assume va is already allocated
* => should be faster than normal pmap enter function
*/
void
pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
pt_entry_t *pte, opte, npte;
KASSERT(!(prot & ~VM_PROT_ALL));
if (va < VM_MIN_KERNEL_ADDRESS)
pte = vtopte(va);
else
pte = kvtopte(va);
#if defined(XENPV) && defined(DOM0OPS)
if (pa < pmap_pa_start || pa >= pmap_pa_end) {
#ifdef DEBUG
printf_nolog("%s: pa %#" PRIxPADDR " for va %#" PRIxVADDR
" outside range\n", __func__, pa, va);
#endif /* DEBUG */
npte = pa;
} else
#endif /* XENPV && DOM0OPS */
npte = pmap_pa2pte(pa);
npte |= protection_codes[prot] | PTE_P | pmap_pg_g;
npte |= pmap_pat_flags(flags);
opte = pmap_pte_testset(pte, npte); /* zap! */
/*
* XXX: make sure we are not dealing with a large page, since the only
* large pages created are for the kernel image, and they should never
* be kentered.
*/
	KASSERTMSG(!(opte & PTE_PS), "PTE_PS va=%#"PRIxVADDR, va);
	if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A)) {
/* This should not happen. */
printf_nolog("%s: mapping already present\n", __func__);
kpreempt_disable();
pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
kpreempt_enable();
}
}
__strict_weak_alias(pmap_kenter_ma, pmap_kenter_pa);
#if defined(__x86_64__)
/*
* Change protection for a virtual address. Local for a CPU only, don't
* care about TLB shootdowns.
*
* => must be called with preemption disabled
*/
void
pmap_changeprot_local(vaddr_t va, vm_prot_t prot)
{
pt_entry_t *pte, opte, npte;
KASSERT(kpreempt_disabled());
if (va < VM_MIN_KERNEL_ADDRESS)
pte = vtopte(va);
else
pte = kvtopte(va);
npte = opte = *pte;
if ((prot & VM_PROT_WRITE) != 0)
npte |= PTE_W;
else
npte &= ~(PTE_W|PTE_D);
if (opte != npte) {
pmap_pte_set(pte, npte);
pmap_pte_flush();
invlpg(va);
}
}
#endif /* defined(__x86_64__) */
/*
* pmap_kremove: remove a kernel mapping(s) without R/M (pv_entry) tracking
*
* => no need to lock anything
* => caller must dispose of any vm_page mapped in the va range
* => note: not an inline function
* => we assume the va is page aligned and the len is a multiple of PAGE_SIZE
* => we assume kernel only unmaps valid addresses and thus don't bother
* checking the valid bit before doing TLB flushing
* => must be followed by call to pmap_update() before reuse of page
*/
static void
pmap_kremove1(vaddr_t sva, vsize_t len, bool localonly)
{
pt_entry_t *pte, opte;
vaddr_t va, eva;
eva = sva + len;
kpreempt_disable();
	for (va = sva; va < eva; va += PAGE_SIZE) {
		pte = kvtopte(va);
opte = pmap_pte_testset(pte, 0); /* zap! */
		if ((opte & (PTE_P | PTE_A)) == (PTE_P | PTE_A) &&
		    !localonly) {
			pmap_tlb_shootdown(pmap_kernel(), va, opte,
			    TLBSHOOT_KREMOVE);
		}
KASSERTMSG((opte & PTE_PS) == 0,
"va %#" PRIxVADDR " is a large page", va);
KASSERTMSG((opte & PTE_PVLIST) == 0,
"va %#" PRIxVADDR " is a pv tracked page", va);
}
	if (localonly) {
		tlbflushg();
	}
kpreempt_enable();
}
void
pmap_kremove(vaddr_t sva, vsize_t len)
{
pmap_kremove1(sva, len, false);
}
/*
* pmap_kremove_local: like pmap_kremove(), but only worry about
* TLB invalidations on the current CPU. this is only intended
* for use while writing kernel crash dumps, either after panic
* or via reboot -d.
*/
void
pmap_kremove_local(vaddr_t sva, vsize_t len)
{
pmap_kremove1(sva, len, true);
}
/*
* p m a p i n i t f u n c t i o n s
*
* pmap_bootstrap and pmap_init are called during system startup
* to init the pmap module. pmap_bootstrap() does a low level
* init just to get things rolling. pmap_init() finishes the job.
*/
/*
* pmap_bootstrap_valloc: allocate a virtual address in the bootstrap area.
* This function is to be used before any VM system has been set up.
*
* The va is taken from virtual_avail.
*/
static vaddr_t
pmap_bootstrap_valloc(size_t npages)
{
vaddr_t va = virtual_avail;
virtual_avail += npages * PAGE_SIZE;
return va;
}
/*
* pmap_bootstrap_palloc: allocate a physical address in the bootstrap area.
* This function is to be used before any VM system has been set up.
*
* The pa is taken from avail_start.
*/
static paddr_t
pmap_bootstrap_palloc(size_t npages)
{
paddr_t pa = avail_start;
avail_start += npages * PAGE_SIZE;
return pa;
}
/*
* pmap_bootstrap: get the system in a state where it can run with VM properly
* enabled (called before main()). The VM system is fully init'd later.
*
* => on i386, locore.S has already enabled the MMU by allocating a PDP for the
* kernel, and nkpde PTP's for the kernel.
* => kva_start is the first free virtual address in kernel space.
*/
void
pmap_bootstrap(vaddr_t kva_start)
{
struct pmap *kpm;
int i;
vaddr_t kva;
pmap_pg_nx = (cpu_feature[2] & CPUID_NOX ? PTE_NX : 0);
/*
* Set up our local static global vars that keep track of the usage of
* KVM before kernel_map is set up.
*/
virtual_avail = kva_start; /* first free KVA */
virtual_end = VM_MAX_KERNEL_ADDRESS; /* last KVA */
/*
* Set up protection_codes: we need to be able to convert from a MI
* protection code (some combo of VM_PROT...) to something we can jam
* into a x86 PTE.
*/
protection_codes[VM_PROT_NONE] = pmap_pg_nx;
protection_codes[VM_PROT_EXECUTE] = PTE_X;
protection_codes[VM_PROT_READ] = pmap_pg_nx;
protection_codes[VM_PROT_READ|VM_PROT_EXECUTE] = PTE_X;
protection_codes[VM_PROT_WRITE] = PTE_W | pmap_pg_nx;
protection_codes[VM_PROT_WRITE|VM_PROT_EXECUTE] = PTE_W | PTE_X;
protection_codes[VM_PROT_WRITE|VM_PROT_READ] = PTE_W | pmap_pg_nx;
protection_codes[VM_PROT_ALL] = PTE_W | PTE_X;
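	/*
	 * For example, protection_codes[VM_PROT_READ | VM_PROT_WRITE]
	 * is PTE_W plus the NX bit when the CPU supports it; PTE_P is
	 * OR'ed in separately by the enter functions themselves (see
	 * pmap_kenter_pa() above).
	 */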
/*
* Now we init the kernel's pmap.
*
* The kernel pmap's pm_obj is not used for much. However, in user pmaps
* the pm_obj contains the list of active PTPs.
*/
kpm = pmap_kernel();
mutex_init(&kpm->pm_lock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&kpm->pm_dummy_lock);
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&kpm->pm_obj[i], &pmap_pager, false, 1);
uvm_obj_setlock(&kpm->pm_obj[i], &kpm->pm_dummy_lock);
kpm->pm_ptphint[i] = NULL;
}
memset(&kpm->pm_list, 0, sizeof(kpm->pm_list)); /* pm_list not used */
kpm->pm_pdir = (pd_entry_t *)bootspace.pdir;
for (i = 0; i < PDP_SIZE; i++)
kpm->pm_pdirpa[i] = PDPpaddr + PAGE_SIZE * i;
kpm->pm_stats.wired_count = kpm->pm_stats.resident_count =
x86_btop(kva_start - VM_MIN_KERNEL_ADDRESS);
kcpuset_create(&kpm->pm_cpus, true);
kcpuset_create(&kpm->pm_kernel_cpus, true);
kpm->pm_ldt = NULL;
kpm->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
/*
* the above is just a rough estimate and not critical to the proper
* operation of the system.
*/
#if !defined(XENPV)
/*
* Begin to enable global TLB entries if they are supported: add PTE_G
* attribute to already mapped kernel pages. Do that only if SVS is
* disabled.
*
* The G bit has no effect until the CR4_PGE bit is set in CR4, which
* happens later in cpu_init().
*/
#ifdef SVS
if (!svs_enabled && (cpu_feature[0] & CPUID_PGE)) {
#else
if (cpu_feature[0] & CPUID_PGE) {
#endif
pmap_pg_g = PTE_G;
pmap_remap_global();
}
#endif
#ifndef XENPV
/*
* Enable large pages if they are supported.
*/
if (cpu_feature[0] & CPUID_PSE) {
lcr4(rcr4() | CR4_PSE); /* enable hardware (via %cr4) */
pmap_largepages = 1; /* enable software */
/*
* The TLB must be flushed after enabling large pages on Pentium
* CPUs, according to section 3.6.2.2 of "Intel Architecture
* Software Developer's Manual, Volume 3: System Programming".
*/
tlbflushg();
/* Remap the kernel. */
pmap_remap_largepages();
}
pmap_init_lapic();
#endif /* !XENPV */
#ifdef __HAVE_PCPU_AREA
pmap_init_pcpu();
#endif
#ifdef __HAVE_DIRECT_MAP
pmap_init_directmap(kpm);
#else
pmap_vpage_cpualloc(&cpu_info_primary);
if (VM_MIN_KERNEL_ADDRESS == KERNBASE) { /* i386 */
early_zerop = (void *)cpu_info_primary.vpage[VPAGE_ZER];
early_zero_pte = cpu_info_primary.vpage_pte[VPAGE_ZER];
} else { /* amd64 */
/*
* zero_pte is stuck at the end of mapped space for the kernel
* image (disjunct from kva space). This is done so that it
* can safely be used in pmap_growkernel (pmap_get_physpage),
* when it's called for the first time.
* XXXfvdl fix this for MULTIPROCESSOR later.
*/
#ifdef XENPV
/* early_zerop initialized in xen_locore() */
#else
early_zerop = (void *)bootspace.spareva;
#endif
early_zero_pte = PTE_BASE + pl1_i((vaddr_t)early_zerop);
}
#endif
#if defined(XENPV) && defined(__x86_64__)
extern vaddr_t xen_dummy_page;
paddr_t xen_dummy_user_pgd;
/*
* We want a dummy page directory for Xen: when deactivating a pmap,
* Xen will still consider it active. So we set user PGD to this one
* to lift all protection on the now inactive page tables set.
*/
xen_dummy_user_pgd = xen_dummy_page - KERNBASE;
	/* Zero-fill it; the fewer checks Xen has to do, the better. */
memset(PAGE_ALIGNED(xen_dummy_user_pgd + KERNBASE), 0, PAGE_SIZE);
/* Mark read-only */
HYPERVISOR_update_va_mapping(xen_dummy_user_pgd + KERNBASE,
pmap_pa2pte(xen_dummy_user_pgd) | PTE_P | pmap_pg_nx,
UVMF_INVLPG);
/* Pin as L4 */
xpq_queue_pin_l4_table(xpmap_ptom_masked(xen_dummy_user_pgd));
#endif
/*
* Allocate space for the Interrupt Descriptor Table (IDT),
* Global Descriptor Table (GDT), and Local Descriptor Table
* (LDT).
*
* Currently there is an initial temporary GDT allocated on the
* stack by the caller of init386/init_x86_64, which is (among
* other things) needed on i386 for %fs-relative addressing for
* CPU-local data (CPUVAR(...), curcpu(), curlwp). This
* initial temporary GDT will be popped off the stack before we
* can enter main, so we need to make sure there is space for a
* second temporary GDT to continue existing when we enter main
* before we allocate space for the permanent GDT with
* uvm_km(9) in gdt_init via cpu_startup and switch to that.
*/
idt_vaddr = pmap_bootstrap_valloc(1);
idt_paddr = pmap_bootstrap_palloc(1);
gdt_vaddr = pmap_bootstrap_valloc(1);
gdt_paddr = pmap_bootstrap_palloc(1);
#ifdef __HAVE_PCPU_AREA
ldt_vaddr = (vaddr_t)&pcpuarea->ldt;
#else
ldt_vaddr = pmap_bootstrap_valloc(1);
#endif
ldt_paddr = pmap_bootstrap_palloc(1);
#if !defined(__x86_64__)
/* pentium f00f bug stuff */
pentium_idt_vaddr = pmap_bootstrap_valloc(1);
#endif
#if defined(XENPVHVM)
/* XXX: move to hypervisor.c with appropriate API adjustments */
extern paddr_t HYPERVISOR_shared_info_pa;
extern volatile struct xencons_interface *xencons_interface; /* XXX */
extern struct xenstore_domain_interface *xenstore_interface; /* XXX */
if (vm_guest != VM_GUEST_XENPVH) {
HYPERVISOR_shared_info = (void *) pmap_bootstrap_valloc(1);
HYPERVISOR_shared_info_pa = pmap_bootstrap_palloc(1);
}
xencons_interface = (void *) pmap_bootstrap_valloc(1);
xenstore_interface = (void *) pmap_bootstrap_valloc(1);
#endif
/*
* Now we reserve some VM for mapping pages when doing a crash dump.
*/
virtual_avail = reserve_dumppages(virtual_avail);
/*
* Init the global lock and global list.
*/
mutex_init(&pmaps_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&pmaps);
/*
* Ensure the TLB is sync'd with reality by flushing it...
*/
tlbflushg();
/*
* Calculate pmap_maxkvaddr from nkptp[].
*/
kva = VM_MIN_KERNEL_ADDRESS;
for (i = PTP_LEVELS - 1; i >= 1; i--) {
kva += nkptp[i] * nbpd[i];
}
pmap_maxkvaddr = kva;
}
#ifndef XENPV
static void
pmap_init_lapic(void)
{
/*
* On CPUs that have no LAPIC, local_apic_va is never kentered. But our
* x86 implementation relies a lot on this address to be valid; so just
* allocate a fake physical page that will be kentered into
* local_apic_va by machdep.
*
* If the LAPIC is present, the va will be remapped somewhere else
* later in lapic_map.
*/
local_apic_va = pmap_bootstrap_valloc(1);
local_apic_pa = pmap_bootstrap_palloc(1);
}
#endif
#ifdef __x86_64__
static size_t
pmap_pagetree_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
{
size_t npages;
npages = (roundup(endva, pgsz) / pgsz) -
(rounddown(startva, pgsz) / pgsz);
return npages;
}
#endif
#if defined(__HAVE_DIRECT_MAP) || defined(KASAN) || defined(KMSAN)
static inline void
slotspace_copy(int type, pd_entry_t *dst, pd_entry_t *src)
{
size_t sslot = slotspace.area[type].sslot;
size_t nslot = slotspace.area[type].nslot;
memcpy(&dst[sslot], &src[sslot], nslot * sizeof(pd_entry_t));
}
#endif
#ifdef __x86_64__
/*
* Randomize the location of an area. We count the holes in the VM space. We
* randomly select one hole, and then randomly select an area within that hole.
* Finally we update the associated entry in the slotspace structure.
*/
vaddr_t
slotspace_rand(int type, size_t sz, size_t align, size_t randhole,
vaddr_t randva)
{
struct {
int start;
int end;
} holes[SLSPACE_NAREAS+1];
size_t i, nholes, hole;
size_t startsl, endsl, nslots, winsize;
vaddr_t startva, va;
sz = roundup(sz, align);
/*
* Take one more slot with +NBPD_L4, because we may end up choosing
* an area that crosses slots:
* +------+------+------+
* | Slot | Slot | Slot |
* +------+------+------+
* [Chosen Area]
* And in that case we must take into account the additional slot
* consumed.
*/
nslots = roundup(sz+NBPD_L4, NBPD_L4) / NBPD_L4;
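	/*
	 * For example, an area of exactly one slot (sz == NBPD_L4)
	 * yields nslots == 2 here, since an unaligned placement may
	 * straddle a slot boundary as drawn above.
	 */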
/* Get the holes. */
nholes = 0;
size_t curslot = 0 + 256; /* end of SLAREA_USER */
while (1) {
/*
* Find the first occupied slot after the current one.
* The area between the two is a hole.
*/
size_t minsslot = 512;
size_t minnslot = 0;
for (i = 0; i < SLSPACE_NAREAS; i++) {
if (!slotspace.area[i].active)
continue;
if (slotspace.area[i].sslot >= curslot &&
slotspace.area[i].sslot < minsslot) {
minsslot = slotspace.area[i].sslot;
minnslot = slotspace.area[i].nslot;
}
}
/* No hole anymore, stop here. */
if (minsslot == 512) {
break;
}
/* Register the hole. */
if (minsslot - curslot >= nslots) {
holes[nholes].start = curslot;
holes[nholes].end = minsslot;
nholes++;
}
/* Skip that hole, and iterate again. */
curslot = minsslot + minnslot;
}
if (nholes == 0) {
panic("%s: impossible", __func__);
}
/* Select a hole. */
hole = randhole;
#ifdef NO_X86_ASLR
hole = 0;
#endif
hole %= nholes;
startsl = holes[hole].start;
endsl = holes[hole].end;
startva = VA_SIGN_NEG(startsl * NBPD_L4);
/* Select an area within the hole. */
va = randva;
#ifdef NO_X86_ASLR
va = 0;
#endif
winsize = ((endsl - startsl) * NBPD_L4) - sz;
va %= winsize;
va = rounddown(va, align);
va += startva;
/* Update the entry. */
slotspace.area[type].sslot = pl4_i(va);
slotspace.area[type].nslot =
pmap_pagetree_nentries_range(va, va+sz, NBPD_L4);
slotspace.area[type].active = true;
return va;
}
#endif
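/*
 * Example use (see pmap_init_directmap() below): the direct map picks
 * its randomized base with
 *
 *	startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
 *	    randhole, randva);
 *
 * which reserves an NBPD_L2-aligned window large enough for all of
 * physical memory and records it in the slotspace.
 */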
#ifdef __HAVE_PCPU_AREA
static void
pmap_init_pcpu(void)
{
const vaddr_t startva = PMAP_PCPU_BASE;
size_t nL4e, nL3e, nL2e, nL1e;
size_t L4e_idx, L3e_idx, L2e_idx, L1e_idx __diagused;
paddr_t pa;
vaddr_t endva;
vaddr_t tmpva;
pt_entry_t *pte;
size_t size;
int i;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
size = sizeof(struct pcpu_area);
endva = startva + size;
/* We will use this temporary va. */
tmpva = bootspace.spareva;
pte = PTE_BASE + pl1_i(tmpva);
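	/*
	 * Each page-table page allocated below is mapped at tmpva just
	 * long enough to be zeroed, then hooked into the paging tree;
	 * the temporary mapping is torn down again at the end.
	 */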
/* Build L4 */
L4e_idx = pl4_i(startva);
nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
KASSERT(nL4e == 1);
for (i = 0; i < nL4e; i++) {
KASSERT(L4_BASE[L4e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L3 */
L3e_idx = pl3_i(startva);
nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
for (i = 0; i < nL3e; i++) {
KASSERT(L3_BASE[L3e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L2 */
L2e_idx = pl2_i(startva);
nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
for (i = 0; i < nL2e; i++) {
KASSERT(L2_BASE[L2e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L1 */
L1e_idx = pl1_i(startva);
nL1e = pmap_pagetree_nentries_range(startva, endva, NBPD_L1);
for (i = 0; i < nL1e; i++) {
/*
* Nothing to do, the PTEs will be entered via
* pmap_kenter_pa.
*/
KASSERT(L1_BASE[L1e_idx+i] == 0);
}
*pte = 0;
pmap_update_pg(tmpva);
pcpuarea = (struct pcpu_area *)startva;
tlbflush();
}
#endif
#ifdef __HAVE_DIRECT_MAP
static void
randomize_hole(size_t *randholep, vaddr_t *randvap)
{
struct nist_hash_drbg drbg;
uint8_t seed[NIST_HASH_DRBG_SEEDLEN_BYTES];
const char p[] = "x86/directmap";
int error;
entropy_extract(seed, sizeof(seed), 0);
error = nist_hash_drbg_instantiate(&drbg, seed, sizeof(seed),
/*nonce*/NULL, 0,
/*personalization*/p, strlen(p));
KASSERTMSG(error == 0, "error=%d", error);
error = nist_hash_drbg_generate(&drbg, randholep, sizeof(*randholep),
/*additional*/NULL, 0);
KASSERTMSG(error == 0, "error=%d", error);
error = nist_hash_drbg_generate(&drbg, randvap, sizeof(*randvap),
/*additional*/NULL, 0);
KASSERTMSG(error == 0, "error=%d", error);
explicit_memset(seed, 0, sizeof(seed));
explicit_memset(&drbg, 0, sizeof(drbg));
}
/*
* Create the amd64 direct map. Called only once at boot time. We map all of
* the physical memory contiguously using 2MB large pages, with RW permissions.
* However there is a hole: the kernel is mapped with RO permissions.
*/
static void
pmap_init_directmap(struct pmap *kpm)
{
extern phys_ram_seg_t mem_clusters[];
extern int mem_cluster_cnt;
vaddr_t startva;
size_t nL4e, nL3e, nL2e;
size_t L4e_idx, L3e_idx, L2e_idx;
size_t spahole, epahole;
paddr_t lastpa, pa;
vaddr_t endva;
vaddr_t tmpva;
pt_entry_t *pte;
phys_ram_seg_t *mc;
int i;
size_t randhole;
vaddr_t randva;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx;
const pd_entry_t holepteflags = PTE_P | pmap_pg_nx;
CTASSERT(NL4_SLOT_DIRECT * NBPD_L4 == MAXPHYSMEM);
spahole = roundup(bootspace.head.pa, NBPD_L2);
epahole = rounddown(bootspace.boot.pa, NBPD_L2);
/* Get the last physical address available */
lastpa = 0;
for (i = 0; i < mem_cluster_cnt; i++) {
mc = &mem_clusters[i];
lastpa = MAX(lastpa, mc->start + mc->size);
}
/*
* x86_add_cluster should have truncated the memory to MAXPHYSMEM.
*/
if (lastpa > MAXPHYSMEM) {
panic("pmap_init_directmap: lastpa incorrect");
}
randomize_hole(&randhole, &randva);
startva = slotspace_rand(SLAREA_DMAP, lastpa, NBPD_L2,
randhole, randva);
endva = startva + lastpa;
/* We will use this temporary va. */
tmpva = bootspace.spareva;
pte = PTE_BASE + pl1_i(tmpva);
/* Build L4 */
L4e_idx = pl4_i(startva);
nL4e = pmap_pagetree_nentries_range(startva, endva, NBPD_L4);
KASSERT(nL4e <= NL4_SLOT_DIRECT);
for (i = 0; i < nL4e; i++) {
KASSERT(L4_BASE[L4e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L4_BASE[L4e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L3 */
L3e_idx = pl3_i(startva);
nL3e = pmap_pagetree_nentries_range(startva, endva, NBPD_L3);
for (i = 0; i < nL3e; i++) {
KASSERT(L3_BASE[L3e_idx+i] == 0);
pa = pmap_bootstrap_palloc(1);
*pte = (pa & PTE_FRAME) | pteflags;
pmap_update_pg(tmpva);
memset(PAGE_ALIGNED(tmpva), 0, PAGE_SIZE);
L3_BASE[L3e_idx+i] = pa | pteflags | PTE_A;
}
/* Build L2 */
L2e_idx = pl2_i(startva);
nL2e = pmap_pagetree_nentries_range(startva, endva, NBPD_L2);
for (i = 0; i < nL2e; i++) {
KASSERT(L2_BASE[L2e_idx+i] == 0);
pa = (paddr_t)(i * NBPD_L2);
if (spahole <= pa && pa < epahole) {
L2_BASE[L2e_idx+i] = pa | holepteflags | PTE_A |
PTE_PS | pmap_pg_g;
} else {
L2_BASE[L2e_idx+i] = pa | pteflags | PTE_A |
PTE_PS | pmap_pg_g;
}
}
*pte = 0;
pmap_update_pg(tmpva);
pmap_direct_base = startva;
pmap_direct_end = endva;
tlbflush();
}
#endif /* __HAVE_DIRECT_MAP */
#if !defined(XENPV)
/*
* Remap all of the virtual pages created so far with the PTE_G bit.
*/
static void
pmap_remap_global(void)
{
vaddr_t kva, kva_end;
unsigned long p1i;
size_t i;
/* head */
kva = bootspace.head.va;
kva_end = kva + bootspace.head.sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
/* kernel segments */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type == BTSEG_NONE) {
continue;
}
kva = bootspace.segs[i].va;
kva_end = kva + bootspace.segs[i].sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
}
/* boot space */
kva = bootspace.boot.va;
kva_end = kva + bootspace.boot.sz;
for ( ; kva < kva_end; kva += PAGE_SIZE) {
p1i = pl1_i(kva);
if (pmap_valid_entry(PTE_BASE[p1i]))
PTE_BASE[p1i] |= pmap_pg_g;
}
}
#endif
#ifndef XENPV
/*
* Remap several kernel segments with large pages. We cover as many pages as we
* can. Called only once at boot time, if the CPU supports large pages.
*/
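/*
 * The resulting large-page permissions are: text RX, rodata RO+NX,
 * data+bss RW+NX, all marked global (pmap_pg_g) when supported.
 */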
static void
pmap_remap_largepages(void)
{
pd_entry_t *pde;
vaddr_t kva, kva_end;
paddr_t pa;
size_t i;
/* Remap the kernel text using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_TEXT) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | PTE_P;
tlbflushg();
}
}
/* Remap the kernel rodata using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_RODATA) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_P;
tlbflushg();
}
}
/* Remap the kernel data+bss using large pages. */
for (i = 0; i < BTSPACE_NSEGS; i++) {
if (bootspace.segs[i].type != BTSEG_DATA) {
continue;
}
kva = roundup(bootspace.segs[i].va, NBPD_L2);
if (kva < bootspace.segs[i].va) {
continue;
}
kva_end = rounddown(bootspace.segs[i].va +
bootspace.segs[i].sz, NBPD_L2);
pa = roundup(bootspace.segs[i].pa, NBPD_L2);
for (/* */; kva < kva_end; kva += NBPD_L2, pa += NBPD_L2) {
pde = &L2_BASE[pl2_i(kva)];
*pde = pa | pmap_pg_g | PTE_PS | pmap_pg_nx | PTE_W | PTE_P;
tlbflushg();
}
}
}
#endif /* !XENPV */
/*
* pmap_init: called from uvm_init, our job is to get the pmap system ready
* to manage mappings.
*/
void
pmap_init(void)
{
int flags;
/*
* initialize caches.
*/
pool_cache_bootstrap(&pmap_cache, sizeof(struct pmap), COHERENCY_UNIT,
0, 0, "pmappl", NULL, IPL_NONE, pmap_ctor, pmap_dtor, NULL);
#ifdef XENPV
/*
* pool_cache(9) should not touch cached objects, since they
* are pinned on xen and R/O for the domU
*/
flags = PR_NOTOUCH;
#else
flags = 0;
#endif
#ifdef PAE
pool_init(&pmap_pdp_pool, PAGE_SIZE * PDP_SIZE, 0, 0, flags,
"pdppl", &pmap_pdp_allocator, IPL_NONE);
#else
pool_init(&pmap_pdp_pool, PAGE_SIZE, 0, 0, flags,
"pdppl", NULL, IPL_NONE);
#endif
pool_cache_bootstrap(&pmap_pvp_cache, PAGE_SIZE, PAGE_SIZE,
0, 0, "pvpage", &pool_allocator_kmem,
IPL_NONE, pmap_pvp_ctor, pmap_pvp_dtor, NULL);
pmap_tlb_init();
/* XXX: Since cpu_hatch() is only for secondary CPUs. */
pmap_tlb_cpu_init(curcpu());
evcnt_attach_dynamic(&pmap_iobmp_evcnt, EVCNT_TYPE_MISC,
NULL, "x86", "io bitmap copy");
evcnt_attach_dynamic(&pmap_ldt_evcnt, EVCNT_TYPE_MISC,
NULL, "x86", "ldt sync");
/*
* The kernel doesn't keep track of PTPs, so there's nowhere handy
* to hang a tree of pv_entry records. Dynamically allocated
* pv_entry lists are not heavily used in the kernel's pmap (the
* usual case is embedded), so cop out and use a single RB tree
* to cover them.
*/
rb_tree_init(&pmap_kernel_rb, &pmap_rbtree_ops);
/*
* done: pmap module is up (and ready for business)
*/
pmap_initialized = true;
}
#ifndef XENPV
/*
* pmap_cpu_init_late: perform late per-CPU initialization.
*/
void
pmap_cpu_init_late(struct cpu_info *ci)
{
/*
* The BP has already its own PD page allocated during early
* MD startup.
*/
if (ci == &cpu_info_primary)
return;
#ifdef PAE
cpu_alloc_l3_page(ci);
#endif
}
#endif
#ifndef __HAVE_DIRECT_MAP
CTASSERT(CACHE_LINE_SIZE > sizeof(pt_entry_t));
CTASSERT(CACHE_LINE_SIZE % sizeof(pt_entry_t) == 0);
static void
pmap_vpage_cpualloc(struct cpu_info *ci)
{
bool primary = (ci == &cpu_info_primary);
size_t i, npages;
vaddr_t vabase;
vsize_t vrange;
npages = (CACHE_LINE_SIZE / sizeof(pt_entry_t));
KASSERT(npages >= VPAGE_MAX);
vrange = npages * PAGE_SIZE;
if (primary) {
while ((vabase = pmap_bootstrap_valloc(1)) % vrange != 0) {
/* Waste some pages to align properly */
}
/* The base is aligned, allocate the rest (contiguous) */
pmap_bootstrap_valloc(npages - 1);
} else {
vabase = uvm_km_alloc(kernel_map, vrange, vrange,
UVM_KMF_VAONLY);
if (vabase == 0) {
panic("%s: failed to allocate tmp VA for CPU %d\n",
__func__, cpu_index(ci));
}
}
KASSERT((vaddr_t)&PTE_BASE[pl1_i(vabase)] % CACHE_LINE_SIZE == 0);
for (i = 0; i < VPAGE_MAX; i++) {
ci->vpage[i] = vabase + i * PAGE_SIZE;
ci->vpage_pte[i] = PTE_BASE + pl1_i(ci->vpage[i]);
}
}
void
pmap_vpage_cpu_init(struct cpu_info *ci)
{
if (ci == &cpu_info_primary) {
/* cpu0 already taken care of in pmap_bootstrap */
return;
}
pmap_vpage_cpualloc(ci);
}
#endif
/*
* p v _ e n t r y f u n c t i o n s
*/
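/*
 * PV entries are carved out of page-sized PV pages (struct pv_page).
 * Each pmap keeps its PV pages on one of three lists according to how
 * many free entries remain (pvp_nfree):
 *
 *	pm_pvp_full	all PVE_PER_PVP entries free
 *	pm_pvp_part	some entries free, some in use
 *	pm_pvp_empty	no free entries left
 *
 * pmap_alloc_pv() and pmap_free_pv() migrate pages between the lists
 * as pvp_nfree changes; pmap_drain_pv() hands completely free pages
 * back to pmap_pvp_cache.
 */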
/*
 * pmap_pvp_ctor: pool_cache constructor for PV pages.
*/
static int
pmap_pvp_ctor(void *arg, void *obj, int flags)
{
struct pv_page *pvp = (struct pv_page *)obj;
struct pv_entry *pve = (struct pv_entry *)obj + 1;
struct pv_entry *maxpve = pve + PVE_PER_PVP;
KASSERT(sizeof(struct pv_page) <= sizeof(struct pv_entry));
KASSERT(trunc_page((vaddr_t)obj) == (vaddr_t)obj);
LIST_INIT(&pvp->pvp_pves);
pvp->pvp_nfree = PVE_PER_PVP;
pvp->pvp_pmap = NULL;
for (; pve < maxpve; pve++) {
LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
}
return 0;
}
/*
* pmap_pvp_dtor: pool_cache destructor for PV pages.
*/
static void
pmap_pvp_dtor(void *arg, void *obj)
{
struct pv_page *pvp __diagused = obj;
	KASSERT(pvp->pvp_pmap == NULL);
	KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
}
/*
* pmap_alloc_pv: allocate a PV entry (likely cached with pmap).
*/
static struct pv_entry *
pmap_alloc_pv(struct pmap *pmap)
{
struct pv_entry *pve;
struct pv_page *pvp;
KASSERT(mutex_owned(&pmap->pm_lock));
if (__predict_false((pvp = LIST_FIRST(&pmap->pm_pvp_part)) == NULL)) {
if ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
LIST_REMOVE(pvp, pvp_list);
} else {
pvp = pool_cache_get(&pmap_pvp_cache, PR_NOWAIT);
}
if (__predict_false(pvp == NULL)) {
return NULL;
}
/* full -> part */
LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
pvp->pvp_pmap = pmap;
}
	KASSERT(pvp->pvp_pmap == pmap);
	KASSERT(pvp->pvp_nfree > 0);
pve = LIST_FIRST(&pvp->pvp_pves);
LIST_REMOVE(pve, pve_list);
pvp->pvp_nfree--;
if (__predict_false(pvp->pvp_nfree == 0)) {
/* part -> empty */
		KASSERT(LIST_EMPTY(&pvp->pvp_pves));
		LIST_REMOVE(pvp, pvp_list);
		LIST_INSERT_HEAD(&pmap->pm_pvp_empty, pvp, pvp_list);
} else {
KASSERT(!LIST_EMPTY(&pvp->pvp_pves));
}
return pve;
}
/*
* pmap_free_pv: delayed free of a PV entry.
*/
static void
pmap_free_pv(struct pmap *pmap, struct pv_entry *pve)
{
struct pv_page *pvp = (struct pv_page *)trunc_page((vaddr_t)pve);
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(pvp->pvp_pmap == pmap);
	KASSERT(pvp->pvp_nfree < PVE_PER_PVP);
	LIST_INSERT_HEAD(&pvp->pvp_pves, pve, pve_list);
pvp->pvp_nfree++;
if (__predict_false(pvp->pvp_nfree == 1)) {
/* empty -> part */
		LIST_REMOVE(pvp, pvp_list);
		LIST_INSERT_HEAD(&pmap->pm_pvp_part, pvp, pvp_list);
	} else if (__predict_false(pvp->pvp_nfree == PVE_PER_PVP)) {
/* part -> full */
		LIST_REMOVE(pvp, pvp_list);
		LIST_INSERT_HEAD(&pmap->pm_pvp_full, pvp, pvp_list);
}
}
/*
* pmap_drain_pv: free full PV pages.
*/
static void
pmap_drain_pv(struct pmap *pmap)
{
struct pv_page *pvp;
	KASSERT(mutex_owned(&pmap->pm_lock));
	while ((pvp = LIST_FIRST(&pmap->pm_pvp_full)) != NULL) {
		LIST_REMOVE(pvp, pvp_list);
		KASSERT(pvp->pvp_pmap == pmap);
		KASSERT(pvp->pvp_nfree == PVE_PER_PVP);
pvp->pvp_pmap = NULL;
pool_cache_put(&pmap_pvp_cache, pvp);
}
}
/*
 * pmap_check_pv: verify that a {VA, PTP} pair is tracked (or untracked)
 * by the page, as expected
*/
static void
pmap_check_pv(struct pmap *pmap, struct vm_page *ptp, struct pmap_page *pp,
vaddr_t va, bool tracked)
{
#ifdef DEBUG
struct pv_pte *pvpte;
PMAP_CHECK_PP(pp);
mutex_spin_enter(&pp->pp_lock);
	for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
		if (pvpte->pte_ptp == ptp && pvpte->pte_va == va) {
break;
}
}
mutex_spin_exit(&pp->pp_lock);
	if (pvpte && !tracked) {
		panic("pmap_check_pv: %p/%lx found on pp %p", ptp, va, pp);
	} else if (!pvpte && tracked) {
		panic("pmap_check_pv: %p/%lx missing on pp %p", ptp, va, pp);
}
#endif
}
/*
* pmap_treelookup_pv: search the PV tree for a dynamic entry
*
* => pmap must be locked
*/
static struct pv_entry *
pmap_treelookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
const rb_tree_t *tree, const vaddr_t va)
{
struct pv_entry *pve;
rb_node_t *node;
/*
	 * Inlined lookup tailored to exactly what's needed here; it is
	 * quite a bit faster than using rb_tree_find_node().
*/
for (node = tree->rbt_root;;) {
if (__predict_false(RB_SENTINEL_P(node))) {
return NULL;
}
pve = (struct pv_entry *)
((uintptr_t)node - offsetof(struct pv_entry, pve_rb));
if (pve->pve_pte.pte_va == va) {
KASSERT(pve->pve_pte.pte_ptp == ptp);
return pve;
}
node = node->rb_nodes[pve->pve_pte.pte_va < va];
}
}
/*
* pmap_lookup_pv: look up a non-embedded pv entry for the given pmap
*
* => a PV entry must be known present (doesn't check for existence)
* => pmap must be locked
*/
static struct pv_entry *
pmap_lookup_pv(const struct pmap *pmap, const struct vm_page *ptp,
const struct pmap_page * const old_pp, const vaddr_t va)
{
struct pv_entry *pve;
const rb_tree_t *tree;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp != NULL || pmap == pmap_kernel());
/*
* [This mostly deals with the case of process-private pages, i.e.
* anonymous memory allocations or COW.]
*
* If the page is tracked with an embedded entry then the tree
* lookup can be avoided. It's safe to check for this specific
* set of values without pp_lock because both will only ever be
* set together for this pmap.
*
*/
	if (atomic_load_relaxed(&old_pp->pp_pte.pte_ptp) == ptp &&
	    atomic_load_relaxed(&old_pp->pp_pte.pte_va) == va) {
return NULL;
}
/*
* [This mostly deals with shared mappings, for example shared libs
* and executables.]
*
* Optimise for pmap_remove_ptes() which works by ascending scan:
* look at the lowest numbered node in the tree first. The tree is
* known non-empty because of the check above. For short lived
* processes where pmap_remove() isn't used much this gets close to
* a 100% hit rate.
*/
tree = (ptp != NULL ? &VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
KASSERT(!RB_SENTINEL_P(tree->rbt_root));
pve = (struct pv_entry *)
((uintptr_t)tree->rbt_minmax[RB_DIR_LEFT] -
offsetof(struct pv_entry, pve_rb));
if (__predict_true(pve->pve_pte.pte_va == va)) {
KASSERT(pve->pve_pte.pte_ptp == ptp);
return pve;
}
/* Search the RB tree for the key (uncommon). */
return pmap_treelookup_pv(pmap, ptp, tree, va);
}
/*
 * pmap_enter_pv: enter a mapping onto a pmap_page list
*
* => pmap must be locked
* => does NOT insert dynamic entries to tree (pmap_enter() does later)
*/
static int
pmap_enter_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
vaddr_t va, struct pv_entry **new_pve, struct pv_entry **old_pve,
bool *samepage, bool *new_embedded, rb_tree_t *tree)
{
struct pv_entry *pve;
int error;
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp_to_pmap(ptp) == pmap);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	PMAP_CHECK_PP(pp);
/*
* If entering the same page and it's already tracked with an
* embedded entry, we can avoid the expense below. It's safe
* to check for this very specific set of values without a lock
* because both will only ever be set together for this pmap.
*/
	if (atomic_load_relaxed(&pp->pp_pte.pte_ptp) == ptp &&
	    atomic_load_relaxed(&pp->pp_pte.pte_va) == va) {
*samepage = true;
pmap_check_pv(pmap, ptp, pp, va, true);
return 0;
}
/*
* Check for an existing dynamic mapping at this address. If it's
* for the same page, then it will be reused and nothing needs to be
* changed.
*/
	*old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
	if (*old_pve != NULL && (*old_pve)->pve_pp == pp) {
*samepage = true;
pmap_check_pv(pmap, ptp, pp, va, true);
return 0;
}
/*
* Need to put a new mapping in place. Grab a spare pv_entry in
* case it's needed; won't know for sure until the lock is taken.
*/
	if (pmap->pm_pve == NULL) {
		pmap->pm_pve = pmap_alloc_pv(pmap);
	}
error = 0;
pmap_check_pv(pmap, ptp, pp, va, false);
mutex_spin_enter(&pp->pp_lock);
if (!pv_pte_embedded(pp)) {
/*
* Embedded PV tracking available - easy.
*/
pp->pp_pte.pte_ptp = ptp;
pp->pp_pte.pte_va = va;
*new_embedded = true;
} else if (__predict_false(pmap->pm_pve == NULL)) {
/*
* No memory.
*/
error = ENOMEM;
} else {
/*
* Install new pv_entry on the page.
*/
pve = pmap->pm_pve;
pmap->pm_pve = NULL;
*new_pve = pve;
pve->pve_pte.pte_ptp = ptp;
pve->pve_pte.pte_va = va;
pve->pve_pp = pp;
LIST_INSERT_HEAD(&pp->pp_pvlist, pve, pve_list);
}
mutex_spin_exit(&pp->pp_lock);
if (error == 0) {
pmap_check_pv(pmap, ptp, pp, va, true);
}
return error;
}
/*
* pmap_remove_pv: try to remove a mapping from a pv_list
*
* => pmap must be locked
* => removes dynamic entries from tree and frees them
* => caller should adjust ptp's wire_count and free PTP if needed
*/
static void
pmap_remove_pv(struct pmap *pmap, struct pmap_page *pp, struct vm_page *ptp,
vaddr_t va, struct pv_entry *pve, uint8_t oattrs)
{
rb_tree_t *tree = (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(ptp_to_pmap(ptp) == pmap);
	KASSERT(ptp == NULL || ptp->uobject != NULL);
	KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
	KASSERT(ptp != NULL || pmap == pmap_kernel());
pmap_check_pv(pmap, ptp, pp, va, true);
if (pve == NULL) {
mutex_spin_enter(&pp->pp_lock);
		KASSERT(pp->pp_pte.pte_ptp == ptp);
		KASSERT(pp->pp_pte.pte_va == va);
pp->pp_attrs |= oattrs;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
mutex_spin_exit(&pp->pp_lock);
} else {
mutex_spin_enter(&pp->pp_lock);
KASSERT(pp->pp_pte.pte_ptp != ptp ||
pp->pp_pte.pte_va != va);
		KASSERT(pve->pve_pte.pte_ptp == ptp);
		KASSERT(pve->pve_pte.pte_va == va);
		KASSERT(pve->pve_pp == pp);
pp->pp_attrs |= oattrs;
LIST_REMOVE(pve, pve_list);
mutex_spin_exit(&pp->pp_lock);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == pve);
rb_tree_remove_node(tree, pve);
#ifdef DIAGNOSTIC
memset(pve, 0, sizeof(*pve));
#endif
pmap_free_pv(pmap, pve);
}
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
pmap_check_pv(pmap, ptp, pp, va, false);
}
/*
* p t p f u n c t i o n s
*/
static struct vm_page *
pmap_find_ptp(struct pmap *pmap, vaddr_t va, int level)
{
int lidx = level - 1;
off_t off = ptp_va2o(va, level);
struct vm_page *pg;
	KASSERT(mutex_owned(&pmap->pm_lock));
	if (pmap->pm_ptphint[lidx] && off == pmap->pm_ptphint[lidx]->offset) {
		KASSERT(pmap->pm_ptphint[lidx]->wire_count > 0);
pg = pmap->pm_ptphint[lidx];
PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
return pg;
}
PMAP_DUMMY_LOCK(pmap);
pg = uvm_pagelookup(&pmap->pm_obj[lidx], off);
PMAP_DUMMY_UNLOCK(pmap);
if (pg != NULL && __predict_false(pg->wire_count == 0)) {
/* This page is queued to be freed - ignore. */
pg = NULL;
}
if (pg != NULL) {
PMAP_CHECK_PP(VM_PAGE_TO_PP(pg));
}
pmap->pm_ptphint[lidx] = pg;
return pg;
}
static inline void
pmap_freepage(struct pmap *pmap, struct vm_page *ptp, int level)
{
int lidx;
	KASSERT(ptp->wire_count <= 1);
	PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
lidx = level - 1;
	pmap_stats_update(pmap, -ptp->wire_count, 0);
	if (pmap->pm_ptphint[lidx] == ptp)
		pmap->pm_ptphint[lidx] = NULL;
ptp->wire_count = 0;
ptp->uanon = NULL;
KASSERT(RB_TREE_MIN(&VM_PAGE_TO_PP(ptp)->pp_rb) == NULL);
/*
* Enqueue the PTP to be freed by pmap_update(). We can't remove
* the page from the uvm_object, as that can take further locks
* (intolerable right now because the PTEs are likely mapped in).
* Instead mark the PTP as free and if we bump into it again, we'll
* either ignore or reuse (depending on what's useful at the time).
*/
LIST_INSERT_HEAD(&pmap->pm_gc_ptp, ptp, mdpage.mp_pp.pp_link);
}
static void
pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va,
pt_entry_t *ptes, pd_entry_t * const *pdes)
{
unsigned long index;
int level;
vaddr_t invaladdr;
pd_entry_t opde;
	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
level = 1;
do {
index = pl_i(va, level + 1);
opde = pmap_pte_testset(&pdes[level - 1][index], 0);
/*
* On Xen-amd64 or SVS, we need to sync the top level page
* directory on each CPU.
*/
#if defined(XENPV) && defined(__x86_64__)
if (level == PTP_LEVELS - 1) {
xen_kpm_sync(pmap, index);
}
#elif defined(SVS)
if (svs_enabled && level == PTP_LEVELS - 1 &&
pmap_is_user(pmap)) {
svs_pmap_sync(pmap, index);
}
#endif
invaladdr = level == 1 ? (vaddr_t)ptes :
(vaddr_t)pdes[level - 2];
pmap_tlb_shootdown(pmap, invaladdr + index * PAGE_SIZE,
opde, TLBSHOOT_FREE_PTP);
#if defined(XENPV)
pmap_tlb_shootnow();
#endif
pmap_freepage(pmap, ptp, level);
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, level + 1);
ptp->wire_count--;
if (ptp->wire_count > 1)
break;
}
} while (++level < PTP_LEVELS);
pmap_pte_flush();
}
/*
* pmap_get_ptp: get a PTP (if there isn't one, allocate a new one)
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
* => we are not touching any PTEs yet, so they need not be mapped in
*/
static int
pmap_get_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
int flags, struct vm_page **resultp)
{
struct vm_page *ptp;
int i, aflags;
struct uvm_object *obj;
voff_t off;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
/*
* Loop through all page table levels allocating a page
* for any level where we don't already have one.
*/
memset(pt, 0, sizeof(*pt));
aflags = ((flags & PMAP_CANFAIL) ? 0 : UVM_PGA_USERESERVE) |
UVM_PGA_ZERO;
for (i = PTP_LEVELS; i > 1; i--) {
obj = &pmap->pm_obj[i - 2];
off = ptp_va2o(va, i - 1);
PMAP_DUMMY_LOCK(pmap);
pt->pg[i] = uvm_pagelookup(obj, off);
if (pt->pg[i] == NULL) {
pt->pg[i] = uvm_pagealloc(obj, off, NULL, aflags);
pt->alloced[i] = (pt->pg[i] != NULL);
} else if (pt->pg[i]->wire_count == 0) {
/* This page was queued to be freed; dequeue it. */
LIST_REMOVE(pt->pg[i], mdpage.mp_pp.pp_link);
pt->alloced[i] = true;
}
PMAP_DUMMY_UNLOCK(pmap);
if (pt->pg[i] == NULL) {
pmap_unget_ptp(pmap, pt);
return ENOMEM;
} else if (pt->alloced[i]) {
pt->pg[i]->uanon = (struct vm_anon *)(vaddr_t)~0L;
rb_tree_init(&VM_PAGE_TO_PP(pt->pg[i])->pp_rb,
&pmap_rbtree_ops);
PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
}
}
ptp = pt->pg[2];
KASSERT(ptp != NULL);
*resultp = ptp;
pmap->pm_ptphint[0] = ptp;
return 0;
}
/*
* pmap_install_ptp: install any freshly allocated PTPs
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
* => PTEs must be mapped
* => preemption must be disabled
*/
static void
pmap_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va,
pd_entry_t * const *pdes)
{
struct vm_page *ptp;
unsigned long index;
pd_entry_t *pva;
paddr_t pa;
int i;
	KASSERT(pmap != pmap_kernel());
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
/*
* Now that we have all the pages looked up or allocated,
* loop through again installing any new ones into the tree.
*/
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_i(va, i);
pva = pdes[i - 2];
if (pmap_valid_entry(pva[index])) {
KASSERT(!pt->alloced[i]);
continue;
}
ptp = pt->pg[i];
ptp->flags &= ~PG_BUSY; /* never busy */
ptp->wire_count = 1;
pmap->pm_ptphint[i - 2] = ptp;
pa = VM_PAGE_TO_PHYS(ptp);
pmap_pte_set(&pva[index], (pd_entry_t)
(pmap_pa2pte(pa) | PTE_U | PTE_W | PTE_P));
/*
* On Xen-amd64 or SVS, we need to sync the top level page
* directory on each CPU.
*/
#if defined(XENPV) && defined(__x86_64__)
if (i == PTP_LEVELS) {
xen_kpm_sync(pmap, index);
}
#elif defined(SVS)
if (svs_enabled && i == PTP_LEVELS &&
pmap_is_user(pmap)) {
svs_pmap_sync(pmap, index);
}
#endif
pmap_pte_flush();
pmap_stats_update(pmap, 1, 0);
/*
* If we're not in the top level, increase the
* wire count of the parent page.
*/
		if (i < PTP_LEVELS) {
			pt->pg[i + 1]->wire_count++;
		}
}
}
/*
 * pmap_unget_ptp: free unused PTPs
*
* => pmap should NOT be pmap_kernel()
* => pmap should be locked
*/
static void
pmap_unget_ptp(struct pmap *pmap, struct pmap_ptparray *pt)
{
int i;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
for (i = PTP_LEVELS; i > 1; i--) {
if (!pt->alloced[i]) {
continue;
}
KASSERT(pt->pg[i]->wire_count == 0);
PMAP_CHECK_PP(VM_PAGE_TO_PP(pt->pg[i]));
pmap_freepage(pmap, pt->pg[i], i - 1);
}
}
/*
* p m a p l i f e c y c l e f u n c t i o n s
*/
/*
 * pmap_pdp_init: construct a new PDP.
*/
static void
pmap_pdp_init(pd_entry_t *pdir)
{
paddr_t pdirpa = 0;
vaddr_t object;
int i;
#if !defined(XENPV) || !defined(__x86_64__)
int npde;
#endif
#ifdef XENPV
int s;
#endif
memset(PAGE_ALIGNED(pdir), 0, PDP_SIZE * PAGE_SIZE);
/*
* NOTE: This is all done unlocked, but we will check afterwards
* if we have raced with pmap_growkernel().
*/
#if defined(XENPV) && defined(__x86_64__)
/* Fetch the physical address of the page directory */
(void)pmap_extract(pmap_kernel(), (vaddr_t)pdir, &pdirpa);
/*
* This pdir will NEVER be active in kernel mode, so mark
* recursive entry invalid.
*/
pdir[PDIR_SLOT_PTE] = pmap_pa2pte(pdirpa);
/*
* PDP constructed this way won't be for the kernel, hence we
* don't put kernel mappings on Xen.
*
* But we need to make pmap_create() happy, so put a dummy
* (without PTE_P) value at the right place.
*/
pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1] =
(pd_entry_t)-1 & PTE_FRAME;
#else /* XENPV && __x86_64__*/
object = (vaddr_t)pdir;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* Fetch the physical address of the page directory */
(void)pmap_extract(pmap_kernel(), object, &pdirpa);
/* Put in recursive PDE to map the PTEs */
pdir[PDIR_SLOT_PTE + i] = pmap_pa2pte(pdirpa) | PTE_P |
pmap_pg_nx;
#ifndef XENPV
pdir[PDIR_SLOT_PTE + i] |= PTE_W;
#endif
}
/* Copy the kernel's top level PDE */
npde = nkptp[PTP_LEVELS - 1];
memcpy(&pdir[PDIR_SLOT_KERN], &PDP_BASE[PDIR_SLOT_KERN],
npde * sizeof(pd_entry_t));
if (VM_MIN_KERNEL_ADDRESS != KERNBASE) {
int idx = pl_i(KERNBASE, PTP_LEVELS);
pdir[idx] = PDP_BASE[idx];
}
#ifdef __HAVE_PCPU_AREA
pdir[PDIR_SLOT_PCPU] = PDP_BASE[PDIR_SLOT_PCPU];
#endif
#ifdef __HAVE_DIRECT_MAP
slotspace_copy(SLAREA_DMAP, pdir, PDP_BASE);
#endif
#ifdef KASAN
slotspace_copy(SLAREA_ASAN, pdir, PDP_BASE);
#endif
#ifdef KMSAN
slotspace_copy(SLAREA_MSAN, pdir, PDP_BASE);
#endif
#endif /* XENPV && __x86_64__*/
#ifdef XENPV
s = splvm();
object = (vaddr_t)pdir;
pmap_protect(pmap_kernel(), object, object + (PAGE_SIZE * PDP_SIZE),
VM_PROT_READ);
pmap_update(pmap_kernel());
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/*
* pin as L2/L4 page, we have to do the page with the
* PDIR_SLOT_PTE entries last
*/
#ifdef PAE
if (i == l2tol3(PDIR_SLOT_PTE))
continue;
#endif
(void) pmap_extract(pmap_kernel(), object, &pdirpa);
#ifdef __x86_64__
xpq_queue_pin_l4_table(xpmap_ptom_masked(pdirpa));
#else
xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
#endif
}
#ifdef PAE
object = ((vaddr_t)pdir) + PAGE_SIZE * l2tol3(PDIR_SLOT_PTE);
(void)pmap_extract(pmap_kernel(), object, &pdirpa);
xpq_queue_pin_l2_table(xpmap_ptom_masked(pdirpa));
#endif
splx(s);
#endif /* XENPV */
}
/*
* pmap_pdp_fini: destructor for the PDPs.
*/
static void
pmap_pdp_fini(pd_entry_t *pdir)
{
#ifdef XENPV
paddr_t pdirpa = 0; /* XXX: GCC */
vaddr_t object = (vaddr_t)pdir;
int i;
int s = splvm();
pt_entry_t *pte;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* fetch the physical address of the page directory. */
(void) pmap_extract(pmap_kernel(), object, &pdirpa);
/* unpin page table */
xpq_queue_unpin_table(xpmap_ptom_masked(pdirpa));
}
object = (vaddr_t)pdir;
for (i = 0; i < PDP_SIZE; i++, object += PAGE_SIZE) {
/* Set page RW again */
pte = kvtopte(object);
pmap_pte_set(pte, *pte | PTE_W);
xen_bcast_invlpg((vaddr_t)object);
}
splx(s);
#endif /* XENPV */
}
#ifdef PAE
static void *
pmap_pdp_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(kernel_map,
PAGE_SIZE * PDP_SIZE, PAGE_SIZE * PDP_SIZE,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) |
UVM_KMF_WIRED);
}
static void
pmap_pdp_free(struct pool *pp, void *v)
{
uvm_km_free(kernel_map, (vaddr_t)v, PAGE_SIZE * PDP_SIZE,
UVM_KMF_WIRED);
}
#endif /* PAE */
/*
* pmap_ctor: constructor for the pmap cache.
*/
static int
pmap_ctor(void *arg, void *obj, int flags)
{
struct pmap *pmap = obj;
pt_entry_t p;
int i;
KASSERT((flags & PR_WAITOK) != 0);
mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&pmap->pm_dummy_lock);
kcpuset_create(&pmap->pm_cpus, true);
kcpuset_create(&pmap->pm_kernel_cpus, true);
#ifdef XENPV
kcpuset_create(&pmap->pm_xen_ptp_cpus, true);
#endif
LIST_INIT(&pmap->pm_gc_ptp);
pmap->pm_pve = NULL;
LIST_INIT(&pmap->pm_pvp_full);
LIST_INIT(&pmap->pm_pvp_part);
LIST_INIT(&pmap->pm_pvp_empty);
/* allocate and init PDP */
pmap->pm_pdir = pool_get(&pmap_pdp_pool, PR_WAITOK);
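	/*
	 * If the last kernel PDE we expect (per nkptp[]) is still zero
	 * once we hold pmaps_lock, pmap_growkernel() raced with the
	 * unlocked copy done in pmap_pdp_init(); redo the init until
	 * the snapshot is consistent.
	 */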
for (;;) {
pmap_pdp_init(pmap->pm_pdir);
mutex_enter(&pmaps_lock);
p = pmap->pm_pdir[PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1] - 1];
if (__predict_true(p != 0)) {
break;
}
mutex_exit(&pmaps_lock);
}
for (i = 0; i < PDP_SIZE; i++)
pmap->pm_pdirpa[i] =
pmap_pte2pa(pmap->pm_pdir[PDIR_SLOT_PTE + i]);
LIST_INSERT_HEAD(&pmaps, pmap, pm_list);
mutex_exit(&pmaps_lock);
return 0;
}
/*
 * pmap_dtor: destructor for the pmap cache.
*/
static void
pmap_dtor(void *arg, void *obj)
{
struct pmap *pmap = obj;
mutex_enter(&pmaps_lock);
LIST_REMOVE(pmap, pm_list);
mutex_exit(&pmaps_lock);
pmap_pdp_fini(pmap->pm_pdir);
pool_put(&pmap_pdp_pool, pmap->pm_pdir);
mutex_destroy(&pmap->pm_lock);
rw_destroy(&pmap->pm_dummy_lock);
kcpuset_destroy(pmap->pm_cpus);
kcpuset_destroy(pmap->pm_kernel_cpus);
#ifdef XENPV
kcpuset_destroy(pmap->pm_xen_ptp_cpus);
#endif
}
/*
* pmap_create: create a pmap object.
*/
struct pmap *
pmap_create(void)
{
struct pmap *pmap;
int i;
pmap = pool_cache_get(&pmap_cache, PR_WAITOK);
/* init uvm_object */
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_init(&pmap->pm_obj[i], &pmap_pager, false, 1);
uvm_obj_setlock(&pmap->pm_obj[i], &pmap->pm_dummy_lock);
pmap->pm_ptphint[i] = NULL;
}
pmap->pm_stats.wired_count = 0;
/* count the PDP allocd below */
pmap->pm_stats.resident_count = PDP_SIZE;
#if !defined(__x86_64__)
pmap->pm_hiexec = 0;
#endif
/* Used by NVMM and Xen */
pmap->pm_enter = NULL;
pmap->pm_extract = NULL;
pmap->pm_remove = NULL;
pmap->pm_sync_pv = NULL;
pmap->pm_pp_remove_ent = NULL;
pmap->pm_write_protect = NULL;
pmap->pm_unwire = NULL;
pmap->pm_tlb_flush = NULL;
pmap->pm_data = NULL;
/* init the LDT */
pmap->pm_ldt = NULL;
pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
return pmap;
}
/*
* pmap_check_ptps: verify that none of the pmap's page table objects
* have any pages allocated to them.
*/
static void
pmap_check_ptps(struct pmap *pmap)
{
int i;
for (i = 0; i < PTP_LEVELS - 1; i++) {
KASSERTMSG(pmap->pm_obj[i].uo_npages == 0,
"pmap %p level %d still has %d pages",
pmap, i, (int)pmap->pm_obj[i].uo_npages);
}
}
static void
pmap_check_inuse(struct pmap *pmap)
{
#ifdef DEBUG
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
for (CPU_INFO_FOREACH(cii, ci)) {
if (ci->ci_pmap == pmap)
panic("destroying pmap being used");
#if defined(XENPV) && defined(__x86_64__)
for (int i = 0; i < PDIR_SLOT_USERLIM; i++) {
if (pmap->pm_pdir[i] != 0 &&
ci->ci_kpm_pdir[i] == pmap->pm_pdir[i]) {
printf("pmap_destroy(%p) pmap_kernel %p "
"curcpu %d cpu %d ci_pmap %p "
"ci->ci_kpm_pdir[%d]=%" PRIx64
" pmap->pm_pdir[%d]=%" PRIx64 "\n",
pmap, pmap_kernel(), curcpu()->ci_index,
ci->ci_index, ci->ci_pmap,
i, ci->ci_kpm_pdir[i],
i, pmap->pm_pdir[i]);
panic("%s: used pmap", __func__);
}
}
#endif
}
#endif /* DEBUG */
}
/*
* pmap_destroy: drop reference count on pmap. free pmap if reference
* count goes to zero.
*
* => we can be called from pmap_unmap_ptes() with a different, unrelated
* pmap's lock held. be careful!
*/
void
pmap_destroy(struct pmap *pmap)
{
int i;
/*
* drop reference count and verify not in use.
*/
if (atomic_dec_uint_nv(&pmap->pm_obj[0].uo_refs) > 0) {
return;
}
pmap_check_inuse(pmap);
/*
* handle any deferred frees.
*/
mutex_enter(&pmap->pm_lock);
	if (pmap->pm_pve != NULL) {
		pmap_free_pv(pmap, pmap->pm_pve);
pmap->pm_pve = NULL;
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
pmap_update(pmap);
/*
* Reference count is zero, free pmap resources and then free pmap.
*/
	pmap_check_ptps(pmap);
	KASSERT(LIST_EMPTY(&pmap->pm_gc_ptp));
#ifdef USER_LDT
if (pmap->pm_ldt != NULL) {
/*
* No need to switch the LDT; this address space is gone,
* nothing is using it.
*
* No need to lock the pmap for ldt_free (or anything else),
* we're the last one to use it.
*/
/* XXXAD can't take cpu_lock here - fix soon. */
mutex_enter(&cpu_lock);
ldt_free(pmap->pm_ldt_sel);
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)pmap->pm_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
}
#endif
for (i = 0; i < PTP_LEVELS - 1; i++) {
uvm_obj_destroy(&pmap->pm_obj[i], false);
}
kcpuset_zero(pmap->pm_cpus);
kcpuset_zero(pmap->pm_kernel_cpus);
#ifdef XENPV
kcpuset_zero(pmap->pm_xen_ptp_cpus);
#endif
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_full));
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_part));
	KASSERT(LIST_EMPTY(&pmap->pm_pvp_empty));
	pmap_check_ptps(pmap);
if (__predict_false(pmap->pm_enter != NULL)) {
/* XXX make this a different cache */
pool_cache_destruct_object(&pmap_cache, pmap);
} else {
pool_cache_put(&pmap_cache, pmap);
}
}
/*
* pmap_zap_ptp: clear out an entire PTP without modifying PTEs
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => must be called with kernel preemption disabled
* => does as little work as possible
*/
static void
pmap_zap_ptp(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t startva, vaddr_t blkendva)
{
#ifndef XENPV
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
rb_tree_t *tree;
vaddr_t va;
int wired;
uint8_t oattrs;
u_int cnt;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(ptp->wire_count > 1);
KASSERT(ptp->wire_count - 1 <= PAGE_SIZE / sizeof(pt_entry_t));
/*
* Start at the lowest entered VA, and scan until there are no more
* PTEs in the PTPs.
*/
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
pve = RB_TREE_MIN(tree);
wired = 0;
va = (vaddr_t)ptp->uanon;
pte += ((va - startva) >> PAGE_SHIFT);
for (cnt = ptp->wire_count; cnt > 1; pte++, va += PAGE_SIZE) {
/*
* No need for an atomic to clear the PTE. Nothing else can
* see the address space any more and speculative access (if
* possible) won't modify. Therefore there's no need to
* track the accessed/dirty bits.
*/
opte = *pte;
if (!pmap_valid_entry(opte)) {
continue;
}
/*
* Count the PTE. If it's not for a managed mapping
		 * there's nothing more to do.
*/
cnt--;
wired -= (opte & PTE_WIRED);
if ((opte & PTE_PVLIST) == 0) {
#ifndef DOM0OPS
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without PTE_PVLIST for %#"
PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without PTE_PVLIST for %#"
PRIxVADDR, va);
#endif
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb),
va) == NULL);
continue;
}
/*
* "pve" now points to the lowest (by VA) dynamic PV entry
* in the PTP. If it's for this VA, take advantage of it to
* avoid calling PHYS_TO_VM_PAGE(). Avoid modifying the RB
* tree by skipping to the next VA in the tree whenever
* there is a match here. The tree will be cleared out in
* one pass before return to pmap_remove_all().
*/
oattrs = pmap_pte_to_pp_attrs(opte);
if (pve != NULL && pve->pve_pte.pte_va == va) {
pp = pve->pve_pp;
KASSERT(pve->pve_pte.pte_ptp == ptp);
KASSERT(pp->pp_pte.pte_ptp != ptp ||
pp->pp_pte.pte_va != va);
mutex_spin_enter(&pp->pp_lock);
pp->pp_attrs |= oattrs;
LIST_REMOVE(pve, pve_list);
mutex_spin_exit(&pp->pp_lock);
/*
* pve won't be touched again until pmap_drain_pv(),
* so it's still safe to traverse the tree.
*/
pmap_free_pv(pmap, pve);
pve = RB_TREE_NEXT(tree, pve);
continue;
}
/*
* No entry in the tree so it must be embedded. Look up the
* page and cancel the embedded entry.
*/
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
pp = VM_PAGE_TO_PP(pg);
} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR
"(%#"PRIxPADDR")", __func__, va, pa, atop(pa));
}
mutex_spin_enter(&pp->pp_lock);
KASSERT(pp->pp_pte.pte_ptp == ptp);
KASSERT(pp->pp_pte.pte_va == va);
pp->pp_attrs |= oattrs;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
mutex_spin_exit(&pp->pp_lock);
}
/* PTP now empty - adjust the tree & stats to match. */
pmap_stats_update(pmap, -(ptp->wire_count - 1), wired / PTE_WIRED);
ptp->wire_count = 1;
#ifdef DIAGNOSTIC
rb_tree_init(tree, &pmap_rbtree_ops);
#endif
#else /* !XENPV */
/*
* XXXAD For XEN, it's not clear to me that we can do this, because
* I guess the hypervisor keeps track of PTEs too.
*/
pmap_remove_ptes(pmap, ptp, (vaddr_t)pte, startva, blkendva);
#endif /* !XENPV */
}
/*
* pmap_remove_all: remove all mappings from pmap in bulk.
*
* Ordinarily when removing mappings it's important to hold the UVM object's
* lock, so that pages do not gain a new identity while retaining stale TLB
* entries (the same lock hold covers both pmap_remove() and pmap_update()).
* Here it's known that the address space is no longer visible to any user
* process, so we don't need to worry about that.
*/
bool
pmap_remove_all(struct pmap *pmap)
{
struct vm_page *ptps[32];
vaddr_t va, blkendva;
struct pmap *pmap2;
pt_entry_t *ptes;
pd_entry_t pde __diagused;
pd_entry_t * const *pdes;
int lvl __diagused, i, n;
/* XXX Can't handle EPT just yet. */
if (pmap->pm_remove != NULL) {
return false;
}
for (;;) {
/* Fetch a block of PTPs from tree. */
mutex_enter(&pmap->pm_lock);
n = radix_tree_gang_lookup_node(&pmap->pm_obj[0].uo_pages, 0,
(void **)ptps, __arraycount(ptps), false);
if (n == 0) {
mutex_exit(&pmap->pm_lock);
break;
}
/* Remove all mappings in the set of PTPs. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
for (i = 0; i < n; i++) {
if (ptps[i]->wire_count == 0) {
/* It's dead: pmap_update() will expunge. */
continue;
}
/* Determine range of block. */
va = ptps[i]->offset * PAGE_SIZE / sizeof(pt_entry_t);
blkendva = x86_round_pdr(va + 1);
/* Make sure everything squares up... */
KASSERT(pmap_pdes_valid(va, pdes, &pde, &lvl));
KASSERT(lvl == 1);
KASSERT(pmap_find_ptp(pmap, va, 1) == ptps[i]);
/* Zap! */
pmap_zap_ptp(pmap, ptps[i], &ptes[pl1_i(va)], va,
blkendva);
/* PTP should now be unused - free it. */
KASSERT(ptps[i]->wire_count == 1);
pmap_free_ptp(pmap, ptps[i], va, ptes, pdes);
}
pmap_unmap_ptes(pmap, pmap2);
pmap_drain_pv(pmap);
pmap_tlb_shootdown(pmap, -1L, 0, TLBSHOOT_REMOVE_ALL);
mutex_exit(&pmap->pm_lock);
/* Process deferred frees. */
pmap_update(pmap);
/* A breathing point. */
preempt_point();
}
/* Verify that the pmap is now completely empty. */
pmap_check_ptps(pmap);
KASSERTMSG(pmap->pm_stats.resident_count == PDP_SIZE,
"pmap %p not empty", pmap);
return true;
}
#if defined(PMAP_FORK)
/*
* pmap_fork: perform any necessary data structure manipulation when
* a VM space is forked.
*/
void
pmap_fork(struct pmap *pmap1, struct pmap *pmap2)
{
#ifdef USER_LDT
union descriptor *new_ldt;
int sel;
if (__predict_true(pmap1->pm_ldt == NULL)) {
return;
}
/*
* Copy the LDT into the new process.
*
* Read pmap1's ldt pointer unlocked; if it changes behind our back
* we'll retry. This will starve if there's a stream of LDT changes
* in another thread but that should not happen.
*/
retry:
if (pmap1->pm_ldt != NULL) {
/* Allocate space for the new process's LDT */
new_ldt = (union descriptor *)uvm_km_alloc(kernel_map,
MAX_USERLDT_SIZE, 0, UVM_KMF_WIRED);
if (new_ldt == NULL) {
printf("WARNING: %s: unable to allocate LDT space\n",
__func__);
return;
}
mutex_enter(&cpu_lock);
/* Get a GDT slot for it */
sel = ldt_alloc(new_ldt, MAX_USERLDT_SIZE);
if (sel == -1) {
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)new_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
printf("WARNING: %s: unable to allocate LDT selector\n",
__func__);
return;
}
} else {
/* Wasn't anything there after all. */
new_ldt = NULL;
sel = -1;
mutex_enter(&cpu_lock);
}
/*
* Now that we have cpu_lock, ensure the LDT status is the same.
*/
if (pmap1->pm_ldt != NULL) {
if (new_ldt == NULL) {
/* A wild LDT just appeared. */
mutex_exit(&cpu_lock);
goto retry;
}
/* Copy the LDT data and install it in pmap2 */
memcpy(new_ldt, pmap1->pm_ldt, MAX_USERLDT_SIZE);
pmap2->pm_ldt = new_ldt;
pmap2->pm_ldt_sel = sel;
mutex_exit(&cpu_lock);
} else {
if (new_ldt != NULL) {
/* The LDT disappeared, drop what we did. */
ldt_free(sel);
mutex_exit(&cpu_lock);
uvm_km_free(kernel_map, (vaddr_t)new_ldt,
MAX_USERLDT_SIZE, UVM_KMF_WIRED);
return;
}
/* We're good, just leave. */
mutex_exit(&cpu_lock);
}
#endif /* USER_LDT */
}
#endif /* PMAP_FORK */
#ifdef USER_LDT
/*
* pmap_ldt_xcall: cross call used by pmap_ldt_sync. if the named pmap
* is active, reload LDTR.
*/
static void
pmap_ldt_xcall(void *arg1, void *arg2)
{
struct pmap *pm;
kpreempt_disable();
pm = arg1;
if (curcpu()->ci_pmap == pm) {
#if defined(SVS)
if (svs_enabled) {
svs_ldt_sync(pm);
} else
#endif
lldt(pm->pm_ldt_sel);
}
kpreempt_enable();
}
/*
* pmap_ldt_sync: LDT selector for the named pmap is changing. swap
* in the new selector on all CPUs.
*/
void
pmap_ldt_sync(struct pmap *pm)
{
uint64_t where;
KASSERT(mutex_owned(&cpu_lock));
pmap_ldt_evcnt.ev_count++;
where = xc_broadcast(0, pmap_ldt_xcall, pm, NULL);
xc_wait(where);
}
/*
* pmap_ldt_cleanup: if the pmap has a local LDT, deallocate it, and
* restore the default.
*/
void
pmap_ldt_cleanup(struct lwp *l)
{
pmap_t pmap = l->l_proc->p_vmspace->vm_map.pmap;
union descriptor *ldt;
int sel;
if (__predict_true(pmap->pm_ldt == NULL)) {
return;
}
mutex_enter(&cpu_lock);
if (pmap->pm_ldt != NULL) {
sel = pmap->pm_ldt_sel;
ldt = pmap->pm_ldt;
pmap->pm_ldt_sel = GSYSSEL(GLDT_SEL, SEL_KPL);
pmap->pm_ldt = NULL;
pmap_ldt_sync(pmap);
ldt_free(sel);
uvm_km_free(kernel_map, (vaddr_t)ldt, MAX_USERLDT_SIZE,
UVM_KMF_WIRED);
}
mutex_exit(&cpu_lock);
}
#endif /* USER_LDT */
/*
* pmap_activate: activate a process' pmap
*
* => must be called with kernel preemption disabled
* => if lwp is the curlwp, then set ci_want_pmapload so that
* actual MMU context switch will be done by pmap_load() later
*/
void
pmap_activate(struct lwp *l)
{
struct cpu_info *ci;
struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
KASSERT(kpreempt_disabled());
ci = curcpu();
if (l != ci->ci_curlwp)
return;
KASSERT(ci->ci_want_pmapload == 0);
KASSERT(ci->ci_tlbstate != TLBSTATE_VALID);
/*
* no need to switch to kernel vmspace because
* it's a subset of any vmspace.
*/
if (pmap == pmap_kernel()) {
ci->ci_want_pmapload = 0;
return;
}
ci->ci_want_pmapload = 1;
}
#if defined(XENPV) && defined(__x86_64__)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == ci->ci_xen_current_user_pgd || \
pmap == pmap_kernel())
#elif defined(PAE)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(ci->ci_pae_l3_pdir[0]))
#elif !defined(XENPV)
#define KASSERT_PDIRPA(pmap) \
KASSERT(pmap_pdirpa(pmap, 0) == pmap_pte2pa(rcr3()))
#else
#define KASSERT_PDIRPA(pmap) KASSERT(true) /* nothing to do */
#endif
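/*
* Note on ci_tlbstate as used below: TLBSTATE_VALID means this CPU is
* registered for shootdowns on the loaded pmap and its TLB can be trusted;
* TLBSTATE_LAZY (set by pmap_deactivate()) means the pmap is still loaded
* but the CPU has declared it no longer cares about invalidations, so the
* TLB may go stale. pmap_reactivate() flips LAZY back to VALID and, if the
* CPU was meanwhile dropped from pm_cpus, flushes the TLB to resynchronize.
*/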
/*
* pmap_reactivate: try to regain reference to the pmap.
*
* => Must be called with kernel preemption disabled.
*/
static void
pmap_reactivate(struct pmap *pmap)
{
struct cpu_info * const ci = curcpu();
const cpuid_t cid = cpu_index(ci);
KASSERT(kpreempt_disabled());
KASSERT_PDIRPA(pmap);
/*
* If we still have a lazy reference to this pmap, we can assume
* that there was no TLB shootdown for this pmap in the meantime.
*
* The order of events here is important as we must synchronize
* with TLB shootdown interrupts. Declare interest in invalidations
* (TLBSTATE_VALID) and then check the CPU set, which the IPIs can
* change only when the state is TLBSTATE_LAZY.
*/
ci->ci_tlbstate = TLBSTATE_VALID;
KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
if (__predict_true(kcpuset_isset(pmap->pm_cpus, cid))) {
/* We have the reference, state is valid. */
} else {
/*
* Must reload the TLB: the pmap was changed while this
* CPU had it deactivated.
*/
kcpuset_atomic_set(pmap->pm_cpus, cid);
tlbflush();
}
}
/*
* pmap_load: perform the actual pmap switch, i.e. fill in %cr3 register
* and relevant LDT info.
*
* Ensures that the current process' pmap is loaded on the current CPU's
* MMU and that there are no stale TLB entries.
*
* => The caller should disable kernel preemption or do check-and-retry
* to prevent a preemption from undoing our efforts.
* => This function may block.
*/
void
pmap_load(void)
{
struct cpu_info *ci;
struct pmap *pmap, *oldpmap;
struct lwp *l;
uint64_t pctr;
int ilevel __diagused;
u_long psl __diagused;
kpreempt_disable();
retry:
ci = curcpu();
if (!ci->ci_want_pmapload) {
kpreempt_enable();
return;
}
l = ci->ci_curlwp;
pctr = lwp_pctr();
__insn_barrier();
/* should be able to take ipis. */
KASSERTMSG((ilevel = ci->ci_ilevel) < IPL_HIGH, "ilevel=%d", ilevel);
#ifdef XENPV
/* Check that interrupts are enabled (i.e., no events are masked). */
KASSERTMSG((psl = x86_read_psl()) == 0, "psl=0x%lx", psl);
#else
KASSERTMSG(((psl = x86_read_psl()) & PSL_I) != 0, "psl=0x%lx", psl);
#endif
KASSERT(l != NULL);
pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
KASSERT(pmap != pmap_kernel());
oldpmap = ci->ci_pmap;
if (pmap == oldpmap) {
pmap_reactivate(pmap);
ci->ci_want_pmapload = 0;
kpreempt_enable();
return;
}
/*
* Acquire a reference to the new pmap and perform the switch.
*/
pmap_reference(pmap);
pmap_load1(l, pmap, oldpmap);
ci->ci_want_pmapload = 0;
/*
* we're now running with the new pmap. drop the reference
* to the old pmap. if we block, we need to go around again.
*/
pmap_destroy(oldpmap);
__insn_barrier();
if (lwp_pctr() != pctr) {
goto retry;
}
kpreempt_enable();
}
/*
* pmap_load1: the guts of pmap load, shared by pmap_map_ptes() and
* pmap_load(). It's critically important that this function does not
* block.
*/
static void
pmap_load1(struct lwp *l, struct pmap *pmap, struct pmap *oldpmap)
{
struct cpu_info *ci;
struct pcb *pcb;
cpuid_t cid;
KASSERT(kpreempt_disabled());
pcb = lwp_getpcb(l);
ci = l->l_cpu;
cid = cpu_index(ci);
kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
KASSERT_PDIRPA(oldpmap);
KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
/*
* Mark the pmap in use by this CPU. Again, we must synchronize
* with TLB shootdown interrupts, so set the state VALID first,
* then register us for shootdown events on this pmap.
*/
ci->ci_tlbstate = TLBSTATE_VALID;
kcpuset_atomic_set(pmap->pm_cpus, cid);
kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
ci->ci_pmap = pmap;
/*
* update tss. now that we have registered for invalidations
* from other CPUs, we're good to load the page tables.
*/
#ifdef PAE
pcb->pcb_cr3 = ci->ci_pae_l3_pdirpa;
#else
pcb->pcb_cr3 = pmap_pdirpa(pmap, 0);
#endif
#ifdef i386
#ifndef XENPV
ci->ci_tss->tss.tss_ldt = pmap->pm_ldt_sel;
ci->ci_tss->tss.tss_cr3 = pcb->pcb_cr3;
#endif
#endif
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(pmap);
} else
#endif
lldt(pmap->pm_ldt_sel);
cpu_load_pmap(pmap, oldpmap);
}
/*
* pmap_deactivate: deactivate a process' pmap.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
*/
void
pmap_deactivate(struct lwp *l)
{
struct pmap *pmap;
struct cpu_info *ci;
KASSERT(kpreempt_disabled());
if (l != curlwp) {
return;
}
/*
* Wait for pending TLB shootdowns to complete. Necessary because
* TLB shootdown state is per-CPU, and the LWP may be coming off
* the CPU before it has a chance to call pmap_update(), e.g. due
* to kernel preemption or blocking routine in between.
*/
pmap_tlb_shootnow();
ci = curcpu();
if (ci->ci_want_pmapload) {
/*
* ci_want_pmapload means that our pmap is not loaded on
* the CPU or TLB might be stale. note that pmap_kernel()
* is always considered loaded.
*/
KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
!= pmap_kernel());
KASSERT(vm_map_pmap(&l->l_proc->p_vmspace->vm_map)
!= ci->ci_pmap || ci->ci_tlbstate != TLBSTATE_VALID);
/*
* userspace has not been touched.
* nothing to do here.
*/
ci->ci_want_pmapload = 0;
return;
}
pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
if (pmap == pmap_kernel()) {
return;
}
KASSERT_PDIRPA(pmap);
KASSERT(ci->ci_pmap == pmap);
/*
* we aren't interested in TLB invalidations for this pmap,
* at least for the time being.
*/
KASSERT(ci->ci_tlbstate == TLBSTATE_VALID);
ci->ci_tlbstate = TLBSTATE_LAZY;
}
#ifdef EFI_RUNTIME
extern struct pmap *efi_runtime_pmap;
/*
* pmap_is_user: true if pmap, which must not be the kernel pmap, is
* for an unprivileged user process
*/
bool
pmap_is_user(struct pmap *pmap)
{
KASSERT(pmap != pmap_kernel());
return (pmap != efi_runtime_pmap);
}
/*
* pmap_activate_sync: synchronously activate specified pmap.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
* => Must not sleep before pmap_deactivate_sync.
*/
void *
pmap_activate_sync(struct pmap *pmap)
{
struct cpu_info *ci = curcpu();
struct pmap *oldpmap = ci->ci_pmap;
unsigned cid = cpu_index(ci);
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(!kcpuset_isset(pmap->pm_cpus, cid));
KASSERT(!kcpuset_isset(pmap->pm_kernel_cpus, cid));
if (oldpmap) {
KASSERT_PDIRPA(oldpmap);
kcpuset_atomic_clear(oldpmap->pm_cpus, cid);
kcpuset_atomic_clear(oldpmap->pm_kernel_cpus, cid);
}
ci->ci_tlbstate = TLBSTATE_VALID;
kcpuset_atomic_set(pmap->pm_cpus, cid);
kcpuset_atomic_set(pmap->pm_kernel_cpus, cid);
ci->ci_pmap = pmap;
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(pmap);
} else
#endif
lldt(pmap->pm_ldt_sel);
cpu_load_pmap(pmap, oldpmap);
return oldpmap;
}
/*
* pmap_deactivate_sync: synchronously deactivate specified pmap and
* restore whatever was active before pmap_activate_sync.
*
* => Must be called with kernel preemption disabled (high IPL is enough).
* => Must not have slept since pmap_activate_sync.
*/
void
pmap_deactivate_sync(struct pmap *pmap, void *cookie)
{
struct cpu_info *ci = curcpu();
struct pmap *oldpmap = cookie;
unsigned cid = cpu_index(ci);
KASSERT(kpreempt_disabled());
KASSERT(pmap != pmap_kernel());
KASSERT(ci->ci_pmap == pmap);
KASSERT_PDIRPA(pmap);
KASSERT(kcpuset_isset(pmap->pm_cpus, cid));
KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
pmap_tlb_shootnow();
kcpuset_atomic_clear(pmap->pm_cpus, cid);
kcpuset_atomic_clear(pmap->pm_kernel_cpus, cid);
ci->ci_tlbstate = TLBSTATE_VALID;
ci->ci_pmap = oldpmap;
if (oldpmap) {
kcpuset_atomic_set(oldpmap->pm_cpus, cid);
kcpuset_atomic_set(oldpmap->pm_kernel_cpus, cid);
#if defined(SVS) && defined(USER_LDT)
if (svs_enabled) {
svs_ldt_sync(oldpmap);
} else
#endif
lldt(oldpmap->pm_ldt_sel);
cpu_load_pmap(oldpmap, pmap);
} else {
lcr3(pmap_pdirpa(pmap_kernel(), 0));
}
}
#endif /* EFI_RUNTIME */
/*
* some misc. functions
*/
bool
pmap_pdes_valid(vaddr_t va, pd_entry_t * const *pdes, pd_entry_t *lastpde,
int *lastlvl)
{
unsigned long index;
pd_entry_t pde;
int i;
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_i(va, i);
pde = pdes[i - 2][index];
if ((pde & PTE_P) == 0) {
*lastlvl = i;
return false;
}
if (pde & PTE_PS)
break;
}
if (lastpde != NULL)
*lastpde = pde;
*lastlvl = i;
return true;
}
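/*
* Usage note: on return *lastlvl is 1 when the walk reached a normal L1
* leaf, or the level at which a large page (PTE_PS) or a missing PDE
* terminated the walk; pmap_extract() below relies on lvl == 2 to decode
* a large-page mapping via PTE_LGFRAME.
*/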
/*
* pmap_extract: extract a PA for the given VA
*/
bool
pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
pt_entry_t *ptes, pte;
pd_entry_t pde;
pd_entry_t * const *pdes;
struct pmap *pmap2;
paddr_t pa;
bool rv;
int lvl;
if (__predict_false(pmap->pm_extract != NULL)) {
return (*pmap->pm_extract)(pmap, va, pap);
}
#ifdef __HAVE_DIRECT_MAP
if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
if (pap != NULL) {
*pap = PMAP_DIRECT_UNMAP(va);
}
return true;
}
#endif
rv = false;
pa = 0;
if (pmap != pmap_kernel()) {
mutex_enter(&pmap->pm_lock);
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
if (lvl == 2) {
pa = (pde & PTE_LGFRAME) | (va & (NBPD_L2 - 1));
rv = true;
} else {
KASSERT(lvl == 1);
pte = ptes[pl1_i(va)];
if (__predict_true((pte & PTE_P) != 0)) {
pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
rv = true;
}
}
}
pmap_unmap_ptes(pmap, pmap2);
if (pmap != pmap_kernel()) {
mutex_exit(&pmap->pm_lock);
}
if (pap != NULL) {
*pap = pa;
}
return rv;
}
/*
* vtophys: virtual address to physical address. For use by
* machine-dependent code only.
*/
paddr_t
vtophys(vaddr_t va)
{
paddr_t pa;
if (pmap_extract(pmap_kernel(), va, &pa) == true)
return pa;
return 0;
}
__strict_weak_alias(pmap_extract_ma, pmap_extract);
#ifdef XENPV
/*
* vtomach: virtual address to machine address. For use by
* machine-dependent code only.
*/
paddr_t
vtomach(vaddr_t va)
{
paddr_t pa;
if (pmap_extract_ma(pmap_kernel(), va, &pa) == true)
return pa;
return 0;
}
#endif
/*
* pmap_virtual_space: used during bootup [pmap_steal_memory] to
* determine the bounds of the kernel virtual address space.
*/
void
pmap_virtual_space(vaddr_t *startp, vaddr_t *endp)
{
*startp = virtual_avail;
*endp = virtual_end;
}
void
pmap_zero_page(paddr_t pa)
{
#if defined(__HAVE_DIRECT_MAP)
memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_pagezero(pa);
return;
}
#endif
struct cpu_info *ci;
pt_entry_t *zpte;
vaddr_t zerova;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_D | PTE_A;
kpreempt_disable();
ci = curcpu();
zerova = ci->vpage[VPAGE_ZER];
zpte = ci->vpage_pte[VPAGE_ZER];
KASSERTMSG(!*zpte, "pmap_zero_page: lock botch");
pmap_pte_set(zpte, pmap_pa2pte(pa) | pteflags);
pmap_pte_flush();
pmap_update_pg(zerova); /* flush TLB */
memset(PAGE_ALIGNED(zerova), 0, PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(zpte, 0); /* zap ! */
pmap_pte_flush();
#endif
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
}
void
pmap_copy_page(paddr_t srcpa, paddr_t dstpa)
{
#if defined(__HAVE_DIRECT_MAP)
vaddr_t srcva = PMAP_DIRECT_MAP(srcpa);
vaddr_t dstva = PMAP_DIRECT_MAP(dstpa);
memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_copy_page(srcpa, dstpa);
return;
}
#endif
struct cpu_info *ci;
pt_entry_t *srcpte, *dstpte;
vaddr_t srcva, dstva;
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A;
kpreempt_disable();
ci = curcpu();
srcva = ci->vpage[VPAGE_SRC];
dstva = ci->vpage[VPAGE_DST];
srcpte = ci->vpage_pte[VPAGE_SRC];
dstpte = ci->vpage_pte[VPAGE_DST];
KASSERT(*srcpte == 0 && *dstpte == 0);
pmap_pte_set(srcpte, pmap_pa2pte(srcpa) | pteflags);
pmap_pte_set(dstpte, pmap_pa2pte(dstpa) | pteflags | PTE_D);
pmap_pte_flush();
pmap_update_pg(srcva);
pmap_update_pg(dstva);
memcpy(PAGE_ALIGNED(dstva), PAGE_ALIGNED(srcva), PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(srcpte, 0);
pmap_pte_set(dstpte, 0);
pmap_pte_flush();
#endif
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
}
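/*
* Without a direct map, pmap_zero_page() and pmap_copy_page() above and
* the pmap_map_ptp() helper below all use the same trick: each CPU owns a
* few reserved scratch VAs (ci->vpage[]) with preallocated PTE slots
* (ci->vpage_pte[]), so a temporary mapping can be installed with
* preemption disabled and torn down without any allocation or locking.
*/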
static pt_entry_t *
pmap_map_ptp(struct vm_page *ptp)
{
#ifdef __HAVE_DIRECT_MAP
return (void *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
#else
struct cpu_info *ci;
pt_entry_t *ptppte;
vaddr_t ptpva;
KASSERT(kpreempt_disabled());
#ifndef XENPV
const pd_entry_t pteflags = PTE_P | PTE_W | pmap_pg_nx | PTE_A | PTE_D;
#else
const pd_entry_t pteflags = PTE_P | pmap_pg_nx | PTE_A | PTE_D;
#endif
ci = curcpu();
ptpva = ci->vpage[VPAGE_PTP];
ptppte = ci->vpage_pte[VPAGE_PTP];
pmap_pte_set(ptppte, pmap_pa2pte(VM_PAGE_TO_PHYS(ptp)) | pteflags);
pmap_pte_flush();
pmap_update_pg(ptpva);
return (pt_entry_t *)ptpva;
#endif
}
static void
pmap_unmap_ptp(void)
{
#ifndef __HAVE_DIRECT_MAP
#if defined(DIAGNOSTIC) || defined(XENPV)
struct cpu_info *ci;
pt_entry_t *pte;
KASSERT(kpreempt_disabled());
ci = curcpu();
pte = ci->vpage_pte[VPAGE_PTP];
if (*pte != 0) {
pmap_pte_set(pte, 0);
pmap_pte_flush();
}
#endif
#endif
}
static pt_entry_t *
pmap_map_pte(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
{
KASSERT(kpreempt_disabled());
if (pmap_is_curpmap(pmap)) {
return &PTE_BASE[pl1_i(va)]; /* (k)vtopte */
}
KASSERT(ptp != NULL);
return pmap_map_ptp(ptp) + pl1_pi(va);
}
static void
pmap_unmap_pte(void)
{
KASSERT(kpreempt_disabled());
pmap_unmap_ptp();
}
/*
* p m a p r e m o v e f u n c t i o n s
*
* functions that remove mappings
*/
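/*
* Road map for this section: pmap_remove() takes pm_lock and calls
* pmap_remove_locked(), which walks the range one PTP at a time using
* pmap_remove_ptes()/pmap_remove_pte() for the V->P direction, while
* pmap_page_remove()/pmap_pv_remove() go through pmap_pp_remove() to tear
* down every mapping of a single physical page (the P->V direction).
*/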
/*
* pmap_remove_ptes: remove PTEs from a PTP
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => PTP should be null if pmap == pmap_kernel()
* => must be called with kernel preemption disabled
* => TLB shootdowns for removed mappings are issued by pmap_remove_pte()
*/
static void
pmap_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
vaddr_t startva, vaddr_t endva)
{
pt_entry_t *pte = (pt_entry_t *)ptpva;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
/*
* mappings are very often sparse, so clip the given range to the
* range of PTEs that are known present in the PTP.
*/
pmap_ptp_range_clip(ptp, &startva, &pte);
/*
* note that ptpva points to the PTE that maps startva. this may
* or may not be the first PTE in the PTP.
*
* we loop through the PTP while there are still PTEs to look at
* and the wire_count is greater than 1 (because we use the wire_count
* to keep track of the number of real PTEs in the PTP).
*/
while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
(void)pmap_remove_pte(pmap, ptp, pte, startva);
startva += PAGE_SIZE;
pte++;
}
}
/*
* pmap_remove_pte: remove a single PTE from a PTP.
*
* => caller must hold pmap's lock
* => PTP must be mapped into KVA
* => PTP should be null if pmap == pmap_kernel()
* => returns true if we removed a mapping
* => must be called with kernel preemption disabled
*/
static bool
pmap_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t va)
{
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
if (!pmap_valid_entry(*pte)) {
/* VA not mapped. */
return false;
}
/* Atomically save the old PTE and zap it. */
opte = pmap_pte_testset(pte, 0);
if (!pmap_valid_entry(opte)) {
return false;
}
pmap_exec_account(pmap, va, opte, 0);
pmap_stats_update_bypte(pmap, 0, opte);
if (ptp) {
/*
* Dropping a PTE. Make sure that the PDE is flushed.
*/
ptp->wire_count--;
if (ptp->wire_count <= 1) {
opte |= PTE_A;
}
}
if ((opte & PTE_A) != 0) {
pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_REMOVE_PTE);
}
/*
* If we are not on a pv list - we are done.
*/
if ((opte & PTE_PVLIST) == 0) {
#ifndef DOM0OPS
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without PTE_PVLIST for %#"PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without PTE_PVLIST for %#"PRIxVADDR, va);
#endif
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
return true;
}
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
pp = VM_PAGE_TO_PP(pg);
} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
__func__, va, pa, atop(pa));
}
/* Sync R/M bits. */
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_pte_to_pp_attrs(opte));
return true;
}
static void
pmap_remove_locked(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes;
pd_entry_t pde;
pd_entry_t * const *pdes;
bool result;
vaddr_t blkendva, va = sva;
struct vm_page *ptp;
struct pmap *pmap2;
int lvl;
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/*
* removing one page? take shortcut function.
*/
if (va + PAGE_SIZE == eva) {
if (pmap_pdes_valid(va, pdes, &pde, &lvl)) {
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL,
"%s: unmanaged PTP detected", __func__);
} else {
/* Never free kernel PTPs. */
ptp = NULL;
}
result = pmap_remove_pte(pmap, ptp,
&ptes[pl1_i(va)], va);
/*
* if mapping removed and the PTP is no longer
* being used, free it!
*/
if (result && ptp && ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
} else for (/* null */ ; va < eva ; va = blkendva) {
/* determine range of block */
blkendva = x86_round_pdr(va+1);
if (blkendva > eva)
blkendva = eva;
if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
/* Skip a range corresponding to an invalid pde. */
blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
continue;
}
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
__func__);
} else {
/* Never free kernel PTPs. */
ptp = NULL;
}
pmap_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_i(va)], va,
blkendva);
/* If PTP is no longer being used, free it. */
if (ptp && ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
}
pmap_unmap_ptes(pmap, pmap2);
pmap_drain_pv(pmap);
}
/*
* pmap_remove: mapping removal function.
*
* => caller should not be holding any pmap locks
*/
void
pmap_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
if (__predict_false(pmap->pm_remove != NULL)) {
(*pmap->pm_remove)(pmap, sva, eva);
return;
}
mutex_enter(&pmap->pm_lock);
pmap_remove_locked(pmap, sva, eva);
mutex_exit(&pmap->pm_lock);
}
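/*
* The PTE updates in pmap_sync_pv(), pmap_write_protect() and
* pmap_enter_ma() below share a lock-free retry pattern built on
* pmap_pte_cas(); an illustrative sketch (not part of the build):
*
*	do {
*		opte = *ptep;
*		npte = <new value derived from opte>;
*	} while (pmap_pte_cas(ptep, opte, npte) != opte);
*
* A concurrent hardware A/D-bit update or a change made by another CPU
* makes the compare-and-swap fail and simply forces another iteration.
*/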
/*
* pmap_sync_pv: clear pte bits and return the old value of the pp_attrs.
*
* => The 'clearbits' parameter is either ~0 or PP_ATTRS_...
* => Caller should disable kernel preemption.
* => issues tlb shootdowns if necessary.
*/
static int
pmap_sync_pv(struct pv_pte *pvpte, paddr_t pa, int clearbits, uint8_t *oattrs,
pt_entry_t *optep)
{
struct pmap *pmap;
struct vm_page *ptp;
vaddr_t va;
pt_entry_t *ptep;
pt_entry_t opte;
pt_entry_t npte;
pt_entry_t expect;
bool need_shootdown;
ptp = pvpte->pte_ptp;
va = pvpte->pte_va;
KASSERT(ptp == NULL || ptp->uobject != NULL);
KASSERT(ptp == NULL || ptp_va2o(va, 1) == ptp->offset);
pmap = ptp_to_pmap(ptp);
KASSERT(kpreempt_disabled());
if (__predict_false(pmap->pm_sync_pv != NULL)) {
return (*pmap->pm_sync_pv)(ptp, va, pa, clearbits, oattrs,
optep);
}
expect = pmap_pa2pte(pa) | PTE_P;
if (clearbits != ~0) {
KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
clearbits = pmap_pp_attrs_to_pte(clearbits);
}
ptep = pmap_map_pte(pmap, ptp, va);
do {
opte = *ptep;
KASSERT((opte & (PTE_D | PTE_A)) != PTE_D);
KASSERT((opte & (PTE_A | PTE_P)) != PTE_A);
KASSERT(opte == 0 || (opte & PTE_P) != 0);
if ((opte & (PTE_FRAME | PTE_P)) != expect) {
/*
* We lost a race with a V->P operation like
* pmap_remove(). Return EAGAIN so the caller can
* wait for the competitor to finish reflecting the
* PTE bits into pp_attrs.
*/
pmap_unmap_pte();
return EAGAIN;
}
/*
* Check if there's anything to do on this PTE.
*/
if ((opte & clearbits) == 0) {
need_shootdown = false;
break;
}
/*
* We need a shootdown if the PTE is cached (PTE_A) ...
* ... Unless we are clearing only the PTE_W bit and
* it isn't cached as RW (PTE_D).
*/
need_shootdown = (opte & PTE_A) != 0 && !(clearbits == PTE_W && (opte & PTE_D) == 0);
npte = opte & ~clearbits;
/*
* If we need a shootdown anyway, clear PTE_A and PTE_D.
*/
if (need_shootdown) {
npte &= ~(PTE_A | PTE_D);
}
KASSERT((npte & (PTE_D | PTE_A)) != PTE_D);
KASSERT((npte & (PTE_A | PTE_P)) != PTE_A);
KASSERT(npte == 0 || (opte & PTE_P) != 0);
} while (pmap_pte_cas(ptep, opte, npte) != opte);
if (need_shootdown) {
pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_SYNC_PV);
}
pmap_unmap_pte();
*oattrs = pmap_pte_to_pp_attrs(opte);
if (optep != NULL) {
*optep = opte;
}
return 0;
}
static void
pmap_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
vaddr_t va)
{
struct pmap *pmap2;
pt_entry_t *ptes;
pd_entry_t * const *pdes;
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
pmap_stats_update_bypte(pmap, 0, opte);
ptp->wire_count--;
if (ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
}
pmap_unmap_ptes(pmap, pmap2);
}
static void
pmap_pp_remove(struct pmap_page *pp, paddr_t pa)
{
struct pv_pte *pvpte;
struct vm_page *ptp;
uintptr_t sum;
uint8_t oattrs;
bool locked;
/*
* Do an unlocked check to see if the page has no mappings, eg when
* pmap_remove_all() was called before amap_wipeout() for a process
* private amap - common. The page being removed must be on the way
* out, so we don't have to worry about concurrent attempts to enter
* it (otherwise the caller either doesn't care or has screwed up).
*/
sum = (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_va);
sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pte.pte_ptp);
sum |= (uintptr_t)atomic_load_relaxed(&pp->pp_pvlist.lh_first);
if (sum == 0) {
return;
}
kpreempt_disable();
for (;;) {
struct pmap *pmap;
struct pv_entry *pve;
pt_entry_t opte;
vaddr_t va;
mutex_spin_enter(&pp->pp_lock);
if ((pvpte = pv_pte_first(pp)) == NULL) {
mutex_spin_exit(&pp->pp_lock);
break;
}
/*
* Add a reference to the pmap before clearing the pte.
* Otherwise the pmap can disappear behind us.
*/
ptp = pvpte->pte_ptp;
pmap = ptp_to_pmap(ptp);
KASSERT(pmap->pm_obj[0].uo_refs > 0);
if (ptp != NULL) {
pmap_reference(pmap);
}
/*
* Now try to lock it. We need a direct handoff between
* pp_lock and pm_lock to know the pv_entry is kept intact
* and kept associated with this pmap. If that can't be
* had, wait for the pmap's lock to become free and then
* retry.
*/
locked = mutex_tryenter(&pmap->pm_lock);
mutex_spin_exit(&pp->pp_lock);
if (!locked) {
mutex_enter(&pmap->pm_lock);
/* nothing, just wait for it */
mutex_exit(&pmap->pm_lock);
if (ptp != NULL) {
pmap_destroy(pmap);
}
continue;
}
va = pvpte->pte_va;
KASSERTMSG(pmap->pm_stats.resident_count > PDP_SIZE,
"va %lx pmap %p ptp %p is empty", va, pmap, ptp);
KASSERTMSG(ptp == NULL || (ptp->flags & PG_FREE) == 0,
"va %lx pmap %p ptp %p is free", va, pmap, ptp);
KASSERTMSG(ptp == NULL || ptp->wire_count > 1,
"va %lx pmap %p ptp %p is empty", va, pmap, ptp);
#ifdef DEBUG
pmap_check_pv(pmap, ptp, pp, pvpte->pte_va, true);
rb_tree_t *tree = (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb);
pve = pmap_treelookup_pv(pmap, ptp, tree, va);
if (pve == NULL) {
KASSERTMSG(&pp->pp_pte == pvpte,
"va %lx pmap %p ptp %p pvpte %p pve %p oops 1",
va, pmap, ptp, pvpte, pve);
} else {
KASSERTMSG(&pve->pve_pte == pvpte,
"va %lx pmap %p ptp %p pvpte %p pve %p oops 2",
va, pmap, ptp, pvpte, pve);
}
#endif
if (pmap_sync_pv(pvpte, pa, ~0, &oattrs, &opte)) {
panic("pmap_pp_remove: mapping not present");
}
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, oattrs);
/* Update the PTP reference count. Free if last reference. */
if (ptp != NULL) {
KASSERT(pmap != pmap_kernel());
pmap_tlb_shootnow();
if (__predict_false(pmap->pm_pp_remove_ent != NULL)) {
(*pmap->pm_pp_remove_ent)(pmap, ptp, opte, va);
} else {
pmap_pp_remove_ent(pmap, ptp, opte, va);
}
} else {
KASSERT(pmap == pmap_kernel());
pmap_stats_update_bypte(pmap, 0, opte);
}
pmap_tlb_shootnow();
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
if (ptp != NULL) {
pmap_destroy(pmap);
}
}
kpreempt_enable();
}
/*
* pmap_page_remove: remove a managed vm_page from all pmaps that map it
*
* => R/M bits are sync'd back to attrs
*/
void
pmap_page_remove(struct vm_page *pg)
{
struct pmap_page *pp;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
pa = VM_PAGE_TO_PHYS(pg);
pmap_pp_remove(pp, pa);
}
/*
* pmap_pv_remove: remove an unmanaged pv-tracked page from all pmaps
* that map it
*/
void
pmap_pv_remove(paddr_t pa)
{
struct pmap_page *pp;
pp = pmap_pv_tracked(pa);
if (pp == NULL)
panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
pmap_pp_remove(pp, pa);
}
/*
* p m a p a t t r i b u t e f u n c t i o n s
* functions that test/change managed page's attributes
* since a page can be mapped multiple times we must check each PTE that
* maps it by going down the pv lists.
*/
/*
* pmap_test_attrs: test a page's attributes
*/
bool
pmap_test_attrs(struct vm_page *pg, unsigned testbits)
{
struct pmap_page *pp;
struct pv_pte *pvpte;
struct pmap *pmap;
uint8_t oattrs;
u_int result;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
if ((pp->pp_attrs & testbits) != 0) {
return true;
}
pa = VM_PAGE_TO_PHYS(pg);
startover:
mutex_spin_enter(&pp->pp_lock);
for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
if ((pp->pp_attrs & testbits) != 0) {
break;
}
if (pmap_sync_pv(pvpte, pa, 0, &oattrs, NULL)) {
/*
* raced with a V->P operation. wait for the other
* side to finish by acquiring pmap's lock. if no
* wait, updates to pp_attrs by the other side may
* go unseen.
*/
pmap = ptp_to_pmap(pvpte->pte_ptp);
pmap_reference(pmap);
mutex_spin_exit(&pp->pp_lock);
mutex_enter(&pmap->pm_lock);
/* nothing. */
mutex_exit(&pmap->pm_lock);
pmap_destroy(pmap);
goto startover;
}
pp->pp_attrs |= oattrs;
}
result = pp->pp_attrs & testbits;
mutex_spin_exit(&pp->pp_lock);
/*
* note that we will exit the for loop with a non-null pvpte if
* we have found the bits we are testing for.
*/
return result != 0;
}
static bool
pmap_pp_clear_attrs(struct pmap_page *pp, paddr_t pa, unsigned clearbits)
{
struct pv_pte *pvpte;
struct pmap *pmap;
uint8_t oattrs;
u_int result;
startover:
mutex_spin_enter(&pp->pp_lock);
for (pvpte = pv_pte_first(pp); pvpte; pvpte = pv_pte_next(pp, pvpte)) {
if (pmap_sync_pv(pvpte, pa, clearbits, &oattrs, NULL)) {
/*
* raced with a V->P operation. wait for the other
* side to finish by acquiring pmap's lock. it is
* probably unmapping the page, and it will be gone
* when the loop is restarted.
*/
pmap = ptp_to_pmap(pvpte->pte_ptp);
pmap_reference(pmap);
mutex_spin_exit(&pp->pp_lock);
mutex_enter(&pmap->pm_lock);
/* nothing. */
mutex_exit(&pmap->pm_lock);
pmap_destroy(pmap);
goto startover;
}
pp->pp_attrs |= oattrs;
}
result = pp->pp_attrs & clearbits;
pp->pp_attrs &= ~clearbits;
pmap_tlb_shootnow();
mutex_spin_exit(&pp->pp_lock);
return result != 0;
}
/*
* pmap_clear_attrs: clear the specified attribute for a page.
*
* => we return true if we cleared one of the bits we were asked to
*/
bool
pmap_clear_attrs(struct vm_page *pg, unsigned clearbits)
{
struct pmap_page *pp;
paddr_t pa;
pp = VM_PAGE_TO_PP(pg);
pa = VM_PAGE_TO_PHYS(pg);
/*
* If this is a new page, assert it has no mappings and simply zap
* the stored attributes without taking any locks.
*/
if ((pg->flags & PG_FAKE) != 0) {
KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_va) == 0);
KASSERT(atomic_load_relaxed(&pp->pp_pte.pte_ptp) == NULL);
KASSERT(atomic_load_relaxed(&pp->pp_pvlist.lh_first) == NULL);
atomic_store_relaxed(&pp->pp_attrs, 0);
return false;
} else {
return pmap_pp_clear_attrs(pp, pa, clearbits);
}
}
/*
* pmap_pv_clear_attrs: clear the specified attributes for an unmanaged
* pv-tracked page.
*/
bool
pmap_pv_clear_attrs(paddr_t pa, unsigned clearbits)
{
struct pmap_page *pp;
pp = pmap_pv_tracked(pa);
if (pp == NULL)
panic("%s: page not pv-tracked: %#"PRIxPADDR, __func__, pa);
return pmap_pp_clear_attrs(pp, pa, clearbits);
}
/*
* p m a p p r o t e c t i o n f u n c t i o n s
*/
/*
* pmap_page_protect: change the protection of all recorded mappings
* of a managed page
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_pv_protect: change the protection of all recorded mappings
* of an unmanaged pv-tracked page
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_protect: set the protection in of the pages in a pmap
*
* => NOTE: this is an inline function in pmap.h
*/
/* see pmap.h */
/*
* pmap_write_protect: write-protect pages in a pmap.
*
* Note for Xen-amd64. Xen automatically adds PTE_U to the kernel pages, but we
* don't need to remove this bit when re-entering the PTEs here: Xen tracks the
* kernel pages with a reserved bit (_PAGE_GUEST_KERNEL), so even if PTE_U is
* present the page will still be considered as a kernel page, and the privilege
* separation will be enforced correctly.
*/
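/*
* Worked example of the bit selection below: for prot == VM_PROT_READ the
* write bit is stripped (bit_rem = PTE_W) and, where NX is supported, the
* no-execute bit is added (bit_put = pmap_pg_nx); PTEs that were dirty
* (PTE_D) additionally get a TLB shootdown so cached writable translations
* cannot linger.
*/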
void
pmap_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
pt_entry_t bit_rem, bit_put;
pt_entry_t *ptes;
pd_entry_t * const *pdes;
struct pmap *pmap2;
vaddr_t blockend, va;
int lvl, i;
if (__predict_false(pmap->pm_write_protect != NULL)) {
(*pmap->pm_write_protect)(pmap, sva, eva, prot);
return;
}
bit_rem = 0;
if (!(prot & VM_PROT_WRITE))
bit_rem = PTE_W;
bit_put = 0;
if (!(prot & VM_PROT_EXECUTE))
bit_put = pmap_pg_nx;
sva &= ~PAGE_MASK;
eva &= ~PAGE_MASK;
/*
* Acquire pmap. No need to lock the kernel pmap as we won't
* be touching PV entries nor stats and kernel PDEs aren't
* freed.
*/
if (pmap != pmap_kernel()) {
mutex_enter(&pmap->pm_lock);
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
for (va = sva ; va < eva; va = blockend) {
pt_entry_t *spte, *epte;
blockend = x86_round_pdr(va + 1);
if (blockend > eva)
blockend = eva;
/* Is it a valid block? */
if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
continue;
}
KASSERT(va < VM_MAXUSER_ADDRESS || va >= VM_MAX_ADDRESS);
KASSERT(lvl == 1);
spte = &ptes[pl1_i(va)];
epte = &ptes[pl1_i(blockend)];
for (i = 0; spte < epte; spte++, i++) {
pt_entry_t opte, npte;
do {
opte = *spte;
if (!pmap_valid_entry(opte)) {
goto next;
}
npte = (opte & ~bit_rem) | bit_put;
} while (pmap_pte_cas(spte, opte, npte) != opte);
if ((opte & PTE_D) != 0) {
vaddr_t tva = va + x86_ptob(i);
pmap_tlb_shootdown(pmap, tva, opte,
TLBSHOOT_WRITE_PROTECT);
}
next:;
}
}
/* Release pmap. */
pmap_unmap_ptes(pmap, pmap2);
if (pmap != pmap_kernel()) {
mutex_exit(&pmap->pm_lock);
}
}
/*
* pmap_unwire: clear the wired bit in the PTE.
*
* => Mapping should already be present.
*/
void
pmap_unwire(struct pmap *pmap, vaddr_t va)
{
pt_entry_t *ptes, *ptep, opte;
pd_entry_t * const *pdes;
struct pmap *pmap2;
int lvl;
if (__predict_false(pmap->pm_unwire != NULL)) {
(*pmap->pm_unwire)(pmap, va);
return;
}
/*
* Acquire pmap. Need to lock the kernel pmap only to protect the
* statistics.
*/
mutex_enter(&pmap->pm_lock);
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (!pmap_pdes_valid(va, pdes, NULL, &lvl)) {
panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
}
KASSERT(lvl == 1);
ptep = &ptes[pl1_i(va)];
opte = *ptep;
KASSERT(pmap_valid_entry(opte));
if (opte & PTE_WIRED) {
pt_entry_t npte = opte & ~PTE_WIRED;
opte = pmap_pte_testset(ptep, npte);
pmap_stats_update_bypte(pmap, npte, opte);
} else {
printf("%s: wiring for pmap %p va %#" PRIxVADDR
" did not change!\n", __func__, pmap, va);
}
/* Release pmap. */
pmap_unmap_ptes(pmap, pmap2);
mutex_exit(&pmap->pm_lock);
}
/*
* pmap_copy: copy mappings from one pmap to another
*
* => optional function
* void pmap_copy(dst_pmap, src_pmap, dst_addr, len, src_addr)
*/
/*
* defined as macro in pmap.h
*/
__strict_weak_alias(pmap_enter, pmap_enter_default);
int
pmap_enter_default(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
u_int flags)
{
if (__predict_false(pmap->pm_enter != NULL)) {
return (*pmap->pm_enter)(pmap, va, pa, prot, flags);
}
return pmap_enter_ma(pmap, va, pa, pa, prot, flags, 0);
}
/*
* pmap_enter: enter a mapping into a pmap
*
* => must be done "now" ... no lazy-evaluation
*/
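/*
* Outline of pmap_enter_ma() below: build the new PTE, resolve whether the
* new page is managed or pv-tracked, lock the pmap, look up (or allocate)
* the PTP, pre-install any PV entry needed, then swap the PTE in with a
* CAS loop. Only afterwards are the statistics, the PTP wire count and the
* old mapping's PV entry fixed up, and a TLB shootdown issued if the old
* PTE was live and the frame or writability changed.
*/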
int
pmap_enter_ma(struct pmap *pmap, vaddr_t va, paddr_t ma, paddr_t pa,
vm_prot_t prot, u_int flags, int domid)
{
pt_entry_t *ptes, opte, npte;
pt_entry_t *ptep;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct vm_page *new_pg, *old_pg;
struct pmap_page *new_pp, *old_pp;
struct pv_entry *old_pve, *new_pve;
bool wired = (flags & PMAP_WIRED) != 0;
struct pmap *pmap2;
struct pmap_ptparray pt;
int error;
bool getptp, samepage, new_embedded;
rb_tree_t *tree;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAX_KERNEL_ADDRESS);
KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
PRIxVADDR " over PDP!", __func__, va);
KASSERTMSG(va < VM_MIN_KERNEL_ADDRESS ||
pmap_valid_entry(pmap->pm_pdir[pl_i(va, PTP_LEVELS)]),
"%s: missing kernel PTP for va=%#" PRIxVADDR, __func__, va);
#ifdef XENPV
KASSERT(domid == DOMID_SELF || pa == 0);
#endif
npte = ma | protection_codes[prot] | PTE_P;
npte |= pmap_pat_flags(flags);
if (wired)
npte |= PTE_WIRED;
if (va < VM_MAXUSER_ADDRESS) {
KASSERTMSG(pmap != pmap_kernel(),
"entering user va %#"PRIxVADDR" into kernel pmap",
va);
if (pmap_is_user(pmap))
npte |= PTE_U;
}
if (pmap == pmap_kernel())
npte |= pmap_pg_g;
if (flags & VM_PROT_ALL) {
npte |= PTE_A;
if (flags & VM_PROT_WRITE) {
KASSERT((npte & PTE_W) != 0);
npte |= PTE_D;
}
}
#ifdef XENPV
if (domid != DOMID_SELF)
new_pg = NULL;
else
#endif
new_pg = PHYS_TO_VM_PAGE(pa);
if (new_pg != NULL) {
/* This is a managed page */
npte |= PTE_PVLIST;
new_pp = VM_PAGE_TO_PP(new_pg);
PMAP_CHECK_PP(new_pp);
} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
/* This is an unmanaged pv-tracked page */
npte |= PTE_PVLIST;
PMAP_CHECK_PP(new_pp);
} else {
new_pp = NULL;
}
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
if (error != 0) {
if (flags & PMAP_CANFAIL) {
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: get ptp failed, error=%d", __func__,
error);
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
} else {
/* Embedded PV entries rely on this. */
KASSERT(va != 0);
tree = &pmap_kernel_rb;
}
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
new_pve = NULL;
old_pve = NULL;
samepage = false;
new_embedded = false;
if (new_pp != NULL) {
error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
&old_pve, &samepage, &new_embedded, tree);
/*
* If a new pv_entry was needed and none was available, we
* can go no further.
*/
if (error != 0) {
if (flags & PMAP_CANFAIL) {
if (getptp) {
pmap_unget_ptp(pmap, &pt);
}
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: alloc pve failed", __func__);
}
} else {
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
}
/* Map PTEs into address space. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_install_ptp(pmap, &pt, va, pdes);
}
/* Check if there is an existing mapping. */
ptep = &ptes[pl1_i(va)];
opte = *ptep;
bool have_oldpa = pmap_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
do {
opte = *ptep;
/*
* if the same page, inherit PTE_A and PTE_D.
*/
if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
npte |= opte & (PTE_A | PTE_D);
}
#if defined(XENPV)
if (domid != DOMID_SELF) {
/* pmap_pte_cas with error handling */
int s = splvm();
if (opte != *ptep) {
splx(s);
continue;
}
error = xpq_update_foreign(
vtomach((vaddr_t)ptep), npte, domid, flags);
splx(s);
if (error) {
/* Undo pv_entry tracking - oof. */
if (new_pp != NULL) {
mutex_spin_enter(&new_pp->pp_lock);
if (new_pve != NULL) {
LIST_REMOVE(new_pve, pve_list);
KASSERT(pmap->pm_pve == NULL);
pmap->pm_pve = new_pve;
} else if (new_embedded) {
new_pp->pp_pte.pte_ptp = NULL;
new_pp->pp_pte.pte_va = 0;
}
mutex_spin_exit(&new_pp->pp_lock);
}
pmap_unmap_ptes(pmap, pmap2);
/* Free new PTP. */
if (ptp != NULL && ptp->wire_count <= 1) {
pmap_free_ptp(pmap, ptp, va, ptes,
pdes);
}
mutex_exit(&pmap->pm_lock);
return error;
}
break;
}
#endif /* defined(XENPV) */
} while (pmap_pte_cas(ptep, opte, npte) != opte);
/*
* Done with the PTEs: they can now be unmapped.
*/
pmap_unmap_ptes(pmap, pmap2);
/*
* Update statistics and PTP's reference count.
*/
pmap_stats_update_bypte(pmap, npte, opte);
if (ptp != NULL) {
if (!have_oldpa) {
ptp->wire_count++;
}
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
KASSERT(ptp == NULL || ptp->wire_count > 1);
/*
* If the same page, we can skip pv_entry handling.
*/
if (((opte ^ npte) & (PTE_FRAME | PTE_P)) == 0) {
KASSERT(((opte ^ npte) & PTE_PVLIST) == 0);
if ((npte & PTE_PVLIST) != 0) {
KASSERT(samepage);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
goto same_pa;
} else if ((npte & PTE_PVLIST) != 0) {
KASSERT(!samepage);
}
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR
" pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
__func__, va, oldpa, atop(pa));
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_pte_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
/*
* If new page is dynamically PV tracked, insert to tree.
*/
if (new_pve != NULL) {
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
old_pve = rb_tree_insert_node(tree, new_pve);
KASSERT(old_pve == new_pve);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
same_pa:
/*
* shootdown tlb if necessary.
*/
if ((~opte & (PTE_P | PTE_A)) == 0 &&
((opte ^ npte) & (PTE_FRAME | PTE_W)) != 0) {
pmap_tlb_shootdown(pmap, va, opte, TLBSHOOT_ENTER);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return 0;
}
#if defined(XEN) && defined(DOM0OPS)
struct pmap_data_gnt {
SLIST_ENTRY(pmap_data_gnt) pd_gnt_list;
vaddr_t pd_gnt_sva;
vaddr_t pd_gnt_eva; /* range covered by this gnt */
int pd_gnt_refs; /* ref counter */
struct gnttab_map_grant_ref pd_gnt_ops[1]; /* variable length */
};
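/*
* pd_gnt_ops is a variable-length trailing array: pmap_alloc_gnt() below
* sizes the allocation as sizeof(*pgnt) + (nentries - 1) * sizeof(struct
* gnttab_map_grant_ref), and pmap_free_gnt() recomputes nentries from the
* covered VA range so it frees the same amount.
*/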
SLIST_HEAD(pmap_data_gnt_head, pmap_data_gnt);
static void pmap_remove_gnt(struct pmap *, vaddr_t, vaddr_t);
static struct pmap_data_gnt *
pmap_find_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
struct pmap_data_gnt_head *headp;
struct pmap_data_gnt *pgnt;
KASSERT(mutex_owned(&pmap->pm_lock));
headp = pmap->pm_data;
KASSERT(headp != NULL);
SLIST_FOREACH(pgnt, headp, pd_gnt_list) {
if (pgnt->pd_gnt_sva <= sva && eva <= pgnt->pd_gnt_eva)
return pgnt;
/* check that we're not overlapping part of a region */
KASSERT(pgnt->pd_gnt_sva >= eva || pgnt->pd_gnt_eva <= sva);
}
return NULL;
}
static void
pmap_alloc_gnt(struct pmap *pmap, vaddr_t sva, int nentries,
const struct gnttab_map_grant_ref *ops)
{
struct pmap_data_gnt_head *headp;
struct pmap_data_gnt *pgnt;
vaddr_t eva = sva + nentries * PAGE_SIZE;
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(nentries >= 1);
if (pmap->pm_remove == NULL) {
pmap->pm_remove = pmap_remove_gnt;
KASSERT(pmap->pm_data == NULL);
headp = kmem_alloc(sizeof(*headp), KM_SLEEP);
SLIST_INIT(headp);
pmap->pm_data = headp;
} else {
KASSERT(pmap->pm_remove == pmap_remove_gnt);
KASSERT(pmap->pm_data != NULL);
headp = pmap->pm_data;
}
pgnt = pmap_find_gnt(pmap, sva, eva);
if (pgnt != NULL) {
KASSERT(pgnt->pd_gnt_sva == sva);
KASSERT(pgnt->pd_gnt_eva == eva);
return;
}
/* new entry */
pgnt = kmem_alloc(sizeof(*pgnt) +
(nentries - 1) * sizeof(struct gnttab_map_grant_ref), KM_SLEEP);
pgnt->pd_gnt_sva = sva;
pgnt->pd_gnt_eva = eva;
pgnt->pd_gnt_refs = 0;
memcpy(pgnt->pd_gnt_ops, ops,
sizeof(struct gnttab_map_grant_ref) * nentries);
SLIST_INSERT_HEAD(headp, pgnt, pd_gnt_list);
}
static void
pmap_free_gnt(struct pmap *pmap, struct pmap_data_gnt *pgnt)
{
struct pmap_data_gnt_head *headp = pmap->pm_data;
int nentries = (pgnt->pd_gnt_eva - pgnt->pd_gnt_sva) / PAGE_SIZE;
KASSERT(nentries >= 1);
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(pgnt->pd_gnt_refs == 0);
SLIST_REMOVE(headp, pgnt, pmap_data_gnt, pd_gnt_list);
kmem_free(pgnt, sizeof(*pgnt) +
(nentries - 1) * sizeof(struct gnttab_map_grant_ref));
if (SLIST_EMPTY(headp)) {
kmem_free(headp, sizeof(*headp));
pmap->pm_data = NULL;
pmap->pm_remove = NULL;
}
}
/*
* pmap_enter_gnt: enter a grant entry into a pmap
*
* => must be done "now" ... no lazy-evaluation
*/
int
pmap_enter_gnt(struct pmap *pmap, vaddr_t va, vaddr_t sva, int nentries,
const struct gnttab_map_grant_ref *oops)
{
struct pmap_data_gnt *pgnt;
pt_entry_t *ptes, opte;
#ifndef XENPV
pt_entry_t npte;
#endif
pt_entry_t *ptep;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct vm_page *old_pg;
struct pmap_page *old_pp;
struct pv_entry *old_pve;
struct pmap *pmap2;
struct pmap_ptparray pt;
int error;
bool getptp;
rb_tree_t *tree;
struct gnttab_map_grant_ref *op;
int ret;
int idx;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAX_KERNEL_ADDRESS);
KASSERTMSG(va != (vaddr_t)PDP_BASE, "%s: trying to map va=%#"
PRIxVADDR " over PDP!", __func__, va);
KASSERT(pmap != pmap_kernel());
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
pmap_alloc_gnt(pmap, sva, nentries, oops);
pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
KASSERT(pgnt != NULL);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, PMAP_CANFAIL, &ptp);
if (error != 0) {
mutex_exit(&pmap->pm_lock);
return error;
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
old_pve = NULL;
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
/* Map PTEs into address space. */
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_install_ptp(pmap, &pt, va, pdes);
}
/* Check if there is an existing mapping. */
ptep = &ptes[pl1_i(va)];
opte = *ptep;
bool have_oldpa = pmap_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
op = &pgnt->pd_gnt_ops[idx];
#ifdef XENPV
KASSERT(op->flags & GNTMAP_contains_pte);
op->host_addr = xpmap_ptetomach(ptep);
#else
KASSERT((op->flags & GNTMAP_contains_pte) == 0);
KASSERT(op->flags != 0);
KASSERT(op->host_addr != 0);
#endif
op->dev_bus_addr = 0;
op->status = GNTST_general_error;
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
if (__predict_false(ret)) {
printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
__func__, ret);
op->status = GNTST_general_error;
}
for (int d = 0; d < 256 && op->status == GNTST_eagain; d++) {
kpause("gntmap", false, mstohz(1), NULL);
ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, 1);
if (__predict_false(ret)) {
printf("%s: GNTTABOP_map_grant_ref failed: %d\n",
__func__, ret);
op->status = GNTST_general_error;
}
}
if (__predict_false(op->status != GNTST_okay)) {
printf("%s: GNTTABOP_map_grant_ref status: %d\n",
__func__, op->status);
if (have_oldpa) { /* XXX did the pte really change if XENPV ?*/
ptp->wire_count--;
}
} else {
#ifndef XENPV
npte = op->host_addr | pmap_pg_nx | PTE_U | PTE_P;
if ((op->flags & GNTMAP_readonly) == 0)
npte |= PTE_W;
do {
opte = *ptep;
} while (pmap_pte_cas(ptep, opte, npte) != opte);
#endif
pgnt->pd_gnt_refs++;
if (!have_oldpa) {
ptp->wire_count++;
}
KASSERT(ptp->wire_count > 1);
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
if (ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
/*
* Done with the PTEs: they can now be unmapped.
*/
pmap_unmap_ptes(pmap, pmap2);
/*
* Update statistics and PTP's reference count.
*/
pmap_stats_update_bypte(pmap, 0, opte);
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (PTE_P | PTE_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: PTE_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR " pa = %#" PRIxPADDR,
__func__, va, oldpa);
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_pte_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return op->status;
}
/*
* pmap_remove_gnt: grant mapping removal function.
*
* => caller should not be holding any pmap locks
*/
static void
pmap_remove_gnt(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
struct pmap_data_gnt *pgnt;
pt_entry_t *ptes;
pd_entry_t pde;
pd_entry_t * const *pdes;
struct vm_page *ptp;
struct pmap *pmap2;
vaddr_t va;
int lvl;
int idx;
struct gnttab_map_grant_ref *op;
struct gnttab_unmap_grant_ref unmap_op;
int ret;
KASSERT(pmap != pmap_kernel());
KASSERT(pmap->pm_remove == pmap_remove_gnt);
mutex_enter(&pmap->pm_lock);
for (va = sva; va < eva; va += PAGE_SIZE) {
pgnt = pmap_find_gnt(pmap, va, va + PAGE_SIZE);
if (pgnt == NULL) {
pmap_remove_locked(pmap, sva, eva);
continue;
}
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
if (!pmap_pdes_valid(va, pdes, &pde, &lvl)) {
panic("pmap_remove_gnt pdes not valid");
}
idx = (va - pgnt->pd_gnt_sva) / PAGE_SIZE;
op = &pgnt->pd_gnt_ops[idx];
KASSERT(lvl == 1);
/* Get PTP if non-kernel mapping. */
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL,
"%s: unmanaged PTP detected", __func__);
if (op->status == GNTST_okay) {
KASSERT(pmap_valid_entry(ptes[pl1_i(va)]));
#ifdef XENPV
unmap_op.host_addr = xpmap_ptetomach(&ptes[pl1_i(va)]);
#else
unmap_op.host_addr = op->host_addr;
pmap_pte_testset(&ptes[pl1_i(va)], 0);
#endif
unmap_op.handle = op->handle;
unmap_op.dev_bus_addr = 0;
ret = HYPERVISOR_grant_table_op(
GNTTABOP_unmap_grant_ref, &unmap_op, 1);
if (ret) {
printf("%s: GNTTABOP_unmap_grant_ref "
"failed: %d\n", __func__, ret);
}
ptp->wire_count--;
pgnt->pd_gnt_refs--;
}
if (pgnt->pd_gnt_refs == 0) {
pmap_free_gnt(pmap, pgnt);
}
/*
* if mapping removed and the PTP is no longer
* being used, free it!
*/
if (ptp->wire_count <= 1)
pmap_free_ptp(pmap, ptp, va, ptes, pdes);
pmap_unmap_ptes(pmap, pmap2);
}
mutex_exit(&pmap->pm_lock);
}
#endif /* XEN && DOM0OPS */
paddr_t
pmap_get_physpage(void)
{
struct vm_page *ptp;
struct pmap *kpm = pmap_kernel();
paddr_t pa;
if (!uvm.page_init_done) {
/*
* We're growing the kernel pmap early (from
* uvm_pageboot_alloc()). This case must be
* handled a little differently.
*/
if (!uvm_page_physget(&pa))
panic("%s: out of memory", __func__);
#if defined(__HAVE_DIRECT_MAP)
memset(PAGE_ALIGNED(PMAP_DIRECT_MAP(pa)), 0, PAGE_SIZE);
#else
#if defined(XENPV)
if (XEN_VERSION_SUPPORTED(3, 4)) {
xen_pagezero(pa);
return pa;
}
#endif
kpreempt_disable();
pmap_pte_set(early_zero_pte, pmap_pa2pte(pa) | PTE_P |
PTE_W | pmap_pg_nx);
pmap_pte_flush();
pmap_update_pg((vaddr_t)early_zerop);
memset(PAGE_ALIGNED(early_zerop), 0, PAGE_SIZE);
#if defined(DIAGNOSTIC) || defined(XENPV)
pmap_pte_set(early_zero_pte, 0);
pmap_pte_flush();
#endif /* defined(DIAGNOSTIC) */
kpreempt_enable();
#endif /* defined(__HAVE_DIRECT_MAP) */
} else {
/* XXX */
ptp = uvm_pagealloc(NULL, 0, NULL,
UVM_PGA_USERESERVE|UVM_PGA_ZERO);
if (ptp == NULL)
panic("%s: out of memory", __func__);
ptp->flags &= ~PG_BUSY;
ptp->wire_count = 1;
pa = VM_PAGE_TO_PHYS(ptp);
}
pmap_stats_update(kpm, 1, 0);
return pa;
}
/*
* Expand the page tree with the specified amount of PTPs, mapping virtual
* addresses starting at kva. We populate all the levels but the last one
* (L1). The nodes of the tree are created as RW, but the pages covered
* will be kentered in L1, with proper permissions.
*
* Used only by pmap_growkernel.
*/
static void
pmap_alloc_level(struct pmap *cpm, vaddr_t kva, long *needed_ptps)
{
unsigned long i;
paddr_t pa;
unsigned long index, endindex;
int level;
pd_entry_t *pdep;
#ifdef XENPV
int s = splvm(); /* protect xpq_* */
#endif
for (level = PTP_LEVELS; level > 1; level--) {
if (level == PTP_LEVELS)
pdep = cpm->pm_pdir;
else
pdep = normal_pdes[level - 2];
index = pl_i_roundup(kva, level);
endindex = index + needed_ptps[level - 1] - 1;
for (i = index; i <= endindex; i++) {
pt_entry_t pte;
KASSERT(!pmap_valid_entry(pdep[i]));
pa = pmap_get_physpage();
pte = pmap_pa2pte(pa) | PTE_P | PTE_W;
#ifdef __x86_64__
pte |= pmap_pg_nx;
#endif
pmap_pte_set(&pdep[i], pte);
#ifdef XENPV
if (level == PTP_LEVELS && i >= PDIR_SLOT_KERN) {
if (__predict_true(
cpu_info_primary.ci_flags & CPUF_PRESENT)) {
/* update per-cpu PMDs on all cpus */
xen_kpm_sync(pmap_kernel(), i);
} else {
/*
* too early; update primary CPU
* PMD only (without locks)
*/
#ifdef __x86_64__
pd_entry_t *cpu_pdep =
&cpu_info_primary.ci_kpm_pdir[i];
#else
pd_entry_t *cpu_pdep =
&cpu_info_primary.ci_kpm_pdir[l2tol2(i)];
#endif
pmap_pte_set(cpu_pdep, pte);
}
}
#endif
KASSERT(level != PTP_LEVELS || nkptp[level - 1] +
pl_i(VM_MIN_KERNEL_ADDRESS, level) == i);
nkptp[level - 1]++;
}
pmap_pte_flush();
}
#ifdef XENPV
splx(s);
#endif
}
/*
* pmap_growkernel: increase usage of KVM space.
*
* => we allocate new PTPs for the kernel and install them in all
* the pmaps on the system.
*/
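/*
* Note on propagation: pmap_alloc_level() fills in the intermediate levels
* for the new KVA range; if the count of top-level kernel entries grew,
* the new kernel PDEs are also copied into every pmap on the pmaps list
* (native case; XENPV handles this differently, see below).
*/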
vaddr_t
pmap_growkernel(vaddr_t maxkvaddr)
{
struct pmap *kpm = pmap_kernel();
struct pmap *cpm;
#if !defined(XENPV) || !defined(__x86_64__)
struct pmap *pm;
long old;
#endif
int s, i;
long needed_kptp[PTP_LEVELS], target_nptp;
bool invalidate = false;
s = splvm(); /* to be safe */
mutex_enter(&kpm->pm_lock);
if (maxkvaddr <= pmap_maxkvaddr) {
mutex_exit(&kpm->pm_lock);
splx(s);
return pmap_maxkvaddr;
}
maxkvaddr = x86_round_pdr(maxkvaddr);
#if !defined(XENPV) || !defined(__x86_64__)
old = nkptp[PTP_LEVELS - 1];
#endif
/* Initialize needed_kptp. */
for (i = PTP_LEVELS - 1; i >= 1; i--) {
target_nptp = pl_i_roundup(maxkvaddr, i + 1) -
pl_i_roundup(VM_MIN_KERNEL_ADDRESS, i + 1);
if (target_nptp > nkptpmax[i])
panic("out of KVA space");
KASSERT(target_nptp >= nkptp[i]);
needed_kptp[i] = target_nptp - nkptp[i];
}
#ifdef XENPV
/* only pmap_kernel(), or the per-cpu map, has kernel entries */
cpm = kpm;
#else
/* Get the current pmap */
if (__predict_true(cpu_info_primary.ci_flags & CPUF_PRESENT)) {
cpm = curcpu()->ci_pmap;
} else {
cpm = kpm;
}
#endif
kasan_shadow_map((void *)pmap_maxkvaddr,
(size_t)(maxkvaddr - pmap_maxkvaddr));
kmsan_shadow_map((void *)pmap_maxkvaddr,
(size_t)(maxkvaddr - pmap_maxkvaddr));
pmap_alloc_level(cpm, pmap_maxkvaddr, needed_kptp);
/*
* If the number of top level entries changed, update all pmaps.
*/
if (needed_kptp[PTP_LEVELS - 1] != 0) {
#ifdef XENPV
#ifdef __x86_64__
/* nothing, kernel entries are never entered in user pmap */
#else
int pdkidx;
mutex_enter(&pmaps_lock);
LIST_FOREACH(pm, &pmaps, pm_list) {
for (pdkidx = PDIR_SLOT_KERN + old;
pdkidx < PDIR_SLOT_KERN + nkptp[PTP_LEVELS - 1];
pdkidx++) {
pmap_pte_set(&pm->pm_pdir[pdkidx],
kpm->pm_pdir[pdkidx]);
}
pmap_pte_flush();
}
mutex_exit(&pmaps_lock);
#endif /* __x86_64__ */
#else /* XENPV */
size_t newpdes;
newpdes = nkptp[PTP_LEVELS - 1] - old;
if (cpm != kpm) {
memcpy(&kpm->pm_pdir[PDIR_SLOT_KERN + old],
&cpm->pm_pdir[PDIR_SLOT_KERN + old],
newpdes * sizeof(pd_entry_t));
}
mutex_enter(&pmaps_lock);
LIST_FOREACH(pm, &pmaps, pm_list) {
if (__predict_false(pm->pm_enter != NULL)) {
/*
* Not a native pmap, the kernel is not mapped,
* so nothing to synchronize.
*/
continue;
}
memcpy(&pm->pm_pdir[PDIR_SLOT_KERN + old],
&kpm->pm_pdir[PDIR_SLOT_KERN + old],
newpdes * sizeof(pd_entry_t));
}
mutex_exit(&pmaps_lock);
#endif
invalidate = true;
}
pmap_maxkvaddr = maxkvaddr;
mutex_exit(&kpm->pm_lock);
splx(s);
if (invalidate && pmap_initialized) {
/* Invalidate the pmap cache. */
pool_cache_invalidate(&pmap_cache);
}
return maxkvaddr;
}
#ifdef DEBUG
void pmap_dump(struct pmap *, vaddr_t, vaddr_t);
/*
* pmap_dump: dump all the mappings from a pmap
*
* => caller should not be holding any pmap locks
*/
void
pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes, *pte;
pd_entry_t * const *pdes;
struct pmap *pmap2;
vaddr_t blkendva;
int lvl;
/*
* if end is out of range truncate.
* if (end == start) update to max.
*/
if (eva > VM_MAXUSER_ADDRESS || eva <= sva)
eva = VM_MAXUSER_ADDRESS;
mutex_enter(&pmap->pm_lock);
pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
/*
* dumping a range of pages: we dump in PTP sized blocks (4MB)
*/
for (/* null */ ; sva < eva ; sva = blkendva) {
/* determine range of block */
blkendva = x86_round_pdr(sva+1);
if (blkendva > eva)
blkendva = eva;
/* valid block? */
if (!pmap_pdes_valid(sva, pdes, NULL, &lvl))
continue;
KASSERT(lvl == 1);
pte = &ptes[pl1_i(sva)];
for (/* null */; sva < blkendva ; sva += PAGE_SIZE, pte++) {
if (!pmap_valid_entry(*pte))
continue;
printf("va %#" PRIxVADDR " -> pa %#" PRIxPADDR
" (pte=%#" PRIxPADDR ")\n",
sva, (paddr_t)pmap_pte2pa(*pte), (paddr_t)*pte);
}
}
pmap_unmap_ptes(pmap, pmap2);
mutex_exit(&pmap->pm_lock);
}
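/*
* Illustrative sketch (not part of the original source): pmap_dump() is
* meant for ad-hoc debugging from a DEBUG kernel. A hypothetical caller
* that prints every user mapping of a process "p" might do:
*
*     pmap_dump(vm_map_pmap(&p->p_vmspace->vm_map), 0, VM_MAXUSER_ADDRESS);
*
* Passing eva <= sva (here 0) makes the routine dump up to
* VM_MAXUSER_ADDRESS, as the clamping at the top of the function shows.
*/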
#endif
/*
* pmap_update: process deferred invalidations and frees.
*/
void
pmap_update(struct pmap *pmap)
{
struct pmap_page *pp;
struct vm_page *ptp;
/*
* Initiate any pending TLB shootdowns. Wait for them to
* complete before returning control to the caller.
*/
kpreempt_disable();
pmap_tlb_shootnow();
kpreempt_enable();
/*
* Now that shootdowns are complete, process deferred frees. This
* is an unlocked check, but is safe as we're only interested in
* work done in this LWP - we won't get a false negative.
*/
if (atomic_load_relaxed(&pmap->pm_gc_ptp.lh_first) == NULL) {
return;
}
mutex_enter(&pmap->pm_lock);
while ((ptp = LIST_FIRST(&pmap->pm_gc_ptp)) != NULL) {
KASSERT(ptp->wire_count == 0);
KASSERT(ptp->uanon == NULL);
LIST_REMOVE(ptp, mdpage.mp_pp.pp_link);
pp = VM_PAGE_TO_PP(ptp);
LIST_INIT(&pp->pp_pvlist);
pp->pp_attrs = 0;
pp->pp_pte.pte_ptp = NULL;
pp->pp_pte.pte_va = 0;
PMAP_CHECK_PP(VM_PAGE_TO_PP(ptp));
/*
* XXX Hack to avoid extra locking, and lock
* assertions in uvm_pagefree(). Despite uobject
* being set, this isn't a managed page.
*/
PMAP_DUMMY_LOCK(pmap);
uvm_pagerealloc(ptp, NULL, 0);
PMAP_DUMMY_UNLOCK(pmap);
uvm_pagefree(ptp);
}
mutex_exit(&pmap->pm_lock);
}
#if PTP_LEVELS > 4
#error "Unsupported number of page table mappings"
#endif
paddr_t
pmap_init_tmp_pgtbl(paddr_t pg)
{
static bool maps_loaded;
static const paddr_t x86_tmp_pml_paddr[] = {
4 * PAGE_SIZE, /* L1 */
5 * PAGE_SIZE, /* L2 */
6 * PAGE_SIZE, /* L3 */
7 * PAGE_SIZE /* L4 */
};
static vaddr_t x86_tmp_pml_vaddr[] = { 0, 0, 0, 0 };
pd_entry_t *tmp_pml, *kernel_pml;
int level;
if (!maps_loaded) {
for (level = 0; level < PTP_LEVELS; ++level) {
x86_tmp_pml_vaddr[level] =
uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
UVM_KMF_VAONLY);
if (x86_tmp_pml_vaddr[level] == 0)
panic("mapping of real mode PML failed\n");
pmap_kenter_pa(x86_tmp_pml_vaddr[level],
x86_tmp_pml_paddr[level],
VM_PROT_READ | VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
maps_loaded = true;
}
/* Zero levels 1-3 */
for (level = 0; level < PTP_LEVELS - 1; ++level) {
tmp_pml = (void *)x86_tmp_pml_vaddr[level];
memset(PAGE_ALIGNED(tmp_pml), 0, PAGE_SIZE);
}
/* Copy PML4 */
kernel_pml = pmap_kernel()->pm_pdir;
tmp_pml = (void *)x86_tmp_pml_vaddr[PTP_LEVELS - 1];
memcpy(PAGE_ALIGNED(tmp_pml), PAGE_ALIGNED(kernel_pml), PAGE_SIZE);
#ifdef PAE
/*
* Use the last 4 entries of the L2 page as L3 PD entries. These
* last entries are unlikely to be used for temporary mappings.
* 508: maps 0->1GB (userland)
* 509: unused
* 510: unused
* 511: maps 3->4GB (kernel)
*/
tmp_pml[508] = x86_tmp_pml_paddr[PTP_LEVELS - 1] | PTE_P;
tmp_pml[509] = 0;
tmp_pml[510] = 0;
tmp_pml[511] = pmap_pdirpa(pmap_kernel(), PDIR_SLOT_KERN) | PTE_P;
#endif
for (level = PTP_LEVELS - 1; level > 0; --level) {
tmp_pml = (void *)x86_tmp_pml_vaddr[level];
tmp_pml[pl_i(pg, level + 1)] =
(x86_tmp_pml_paddr[level - 1] & PTE_FRAME) | PTE_W | PTE_P;
}
tmp_pml = (void *)x86_tmp_pml_vaddr[0];
tmp_pml[pl_i(pg, 1)] = (pg & PTE_FRAME) | PTE_W | PTE_P;
#ifdef PAE
/* Return the PA of the L3 page (entry 508 of the L2 page) */
return x86_tmp_pml_paddr[PTP_LEVELS - 1] + 508 * sizeof(pd_entry_t);
#endif
return x86_tmp_pml_paddr[PTP_LEVELS - 1];
}
u_int
x86_mmap_flags(paddr_t mdpgno)
{
u_int nflag = (mdpgno >> X86_MMAP_FLAG_SHIFT) & X86_MMAP_FLAG_MASK;
u_int pflag = 0;
if (nflag & X86_MMAP_FLAG_PREFETCH)
pflag |= PMAP_WRITE_COMBINE;
return pflag;
}
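/*
* Illustrative sketch (assumption, not from the original source): a
* driver's d_mmap routine can request a write-combining mapping by
* encoding X86_MMAP_FLAG_PREFETCH into the page number it returns, which
* x86_mmap_flags() above then translates into PMAP_WRITE_COMBINE. With
* "fb_pa" being a hypothetical framebuffer physical address:
*
*     return x86_btop(fb_pa + off) |
*         ((paddr_t)X86_MMAP_FLAG_PREFETCH << X86_MMAP_FLAG_SHIFT);
*/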
#if defined(__HAVE_DIRECT_MAP) && defined(__x86_64__) && !defined(XENPV)
/*
* -----------------------------------------------------------------------------
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* **************** HERE BEGINS THE EPT CODE, USED BY INTEL-VMX ****************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* *****************************************************************************
* -----------------------------------------------------------------------------
*
* These functions are invoked as callbacks from the code above. Contrary to
* native, EPT does not have a recursive slot; therefore, it is not possible
* to call pmap_map_ptes(). Instead, we use the direct map and walk down the
* tree manually.
*
* Apart from that, the logic is mostly the same as native. Once a pmap has
* been created, NVMM calls pmap_ept_transform() to make it an EPT pmap.
* After that we're good, and the callbacks will handle the translations
* for us.
*
* -----------------------------------------------------------------------------
*/
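/*
* Illustrative sketch (an assumption about the caller, not taken from
* NVMM itself): a hypervisor creates an ordinary pmap, converts it with
* pmap_ept_transform(), and then uses the regular pmap entry points,
* which dispatch to the pmap_ept_*() callbacks below via pm_enter,
* pm_extract, pm_remove and friends:
*
*     struct pmap *gpmap = pmap_create();
*     paddr_t pa;
*
*     pmap_ept_transform(gpmap);
*     pmap_enter(gpmap, gpa, hpa, VM_PROT_READ | VM_PROT_WRITE, 0);
*     if (pmap_extract(gpmap, gpa, &pa))
*         ...;
*     pmap_update(gpmap);
*
* "gpa" (guest-physical address) and "hpa" (host-physical address) are
* placeholder variables for this example.
*/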
/* Hardware bits. */
#define EPT_R __BIT(0) /* read */
#define EPT_W __BIT(1) /* write */
#define EPT_X __BIT(2) /* execute */
#define EPT_T __BITS(5,3) /* type */
#define TYPE_UC 0
#define TYPE_WC 1
#define TYPE_WT 4
#define TYPE_WP 5
#define TYPE_WB 6
#define EPT_NOPAT __BIT(6)
#define EPT_L __BIT(7) /* large */
#define EPT_A __BIT(8) /* accessed */
#define EPT_D __BIT(9) /* dirty */
/* Software bits. */
#define EPT_PVLIST __BIT(60)
#define EPT_WIRED __BIT(61)
#define pmap_ept_valid_entry(pte) ((pte) & EPT_R)
bool pmap_ept_has_ad __read_mostly;
static inline void
pmap_ept_stats_update_bypte(struct pmap *pmap, pt_entry_t npte, pt_entry_t opte)
{
int resid_diff = ((npte & EPT_R) ? 1 : 0) - ((opte & EPT_R) ? 1 : 0);
int wired_diff = ((npte & EPT_WIRED) ? 1 : 0) - ((opte & EPT_WIRED) ? 1 : 0);
KASSERT((npte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
KASSERT((opte & (EPT_R | EPT_WIRED)) != EPT_WIRED);
pmap_stats_update(pmap, resid_diff, wired_diff);
}
static pt_entry_t
pmap_ept_type(u_int flags)
{
u_int cacheflags = (flags & PMAP_CACHE_MASK);
pt_entry_t ret;
switch (cacheflags) {
case PMAP_NOCACHE:
case PMAP_NOCACHE_OVR:
ret = __SHIFTIN(TYPE_UC, EPT_T);
break;
case PMAP_WRITE_COMBINE:
ret = __SHIFTIN(TYPE_WC, EPT_T);
break;
case PMAP_WRITE_BACK:
default:
ret = __SHIFTIN(TYPE_WB, EPT_T);
break;
}
ret |= EPT_NOPAT;
return ret;
}
static inline pt_entry_t
pmap_ept_prot(vm_prot_t prot)
{
pt_entry_t res = 0;
if (prot & VM_PROT_READ)
res |= EPT_R;
if (prot & VM_PROT_WRITE)
res |= EPT_W;
if (prot & VM_PROT_EXECUTE)
res |= EPT_X;
return res;
}
static inline uint8_t
pmap_ept_to_pp_attrs(pt_entry_t ept)
{
uint8_t ret = 0;
if (pmap_ept_has_ad) {
if (ept & EPT_D)
ret |= PP_ATTRS_D;
if (ept & EPT_A)
ret |= PP_ATTRS_A;
} else {
ret |= (PP_ATTRS_D|PP_ATTRS_A);
}
if (ept & EPT_W)
ret |= PP_ATTRS_W;
return ret;
}
static inline pt_entry_t
pmap_pp_attrs_to_ept(uint8_t attrs)
{
pt_entry_t ept = 0;
if (attrs & PP_ATTRS_D)
ept |= EPT_D;
if (attrs & PP_ATTRS_A)
ept |= EPT_A;
if (attrs & PP_ATTRS_W)
ept |= EPT_W;
return ept;
}
/*
* Helper for pmap_ept_free_ptp.
* tree[0] = &L2[L2idx]
* tree[1] = &L3[L3idx]
* tree[2] = &L4[L4idx]
*/
static void
pmap_ept_get_tree(struct pmap *pmap, vaddr_t va, pd_entry_t **tree)
{
pt_entry_t *pteva;
paddr_t ptepa;
int i, index;
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_pi(va, i);
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
KASSERT(pmap_ept_valid_entry(pteva[index]));
tree[i - 2] = &pteva[index];
ptepa = pmap_pte2pa(pteva[index]);
}
}
static void
pmap_ept_free_ptp(struct pmap *pmap, struct vm_page *ptp, vaddr_t va)
{
pd_entry_t *tree[3];
int level;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
pmap_ept_get_tree(pmap, va, tree);
level = 1;
do {
(void)pmap_pte_testset(tree[level - 1], 0);
pmap_freepage(pmap, ptp, level);
if (level < PTP_LEVELS - 1) {
ptp = pmap_find_ptp(pmap, va, level + 1);
ptp->wire_count--;
if (ptp->wire_count > 1)
break;
}
} while (++level < PTP_LEVELS);
pmap_pte_flush();
}
/* Allocate L4->L3->L2. Return L2. */
static void
pmap_ept_install_ptp(struct pmap *pmap, struct pmap_ptparray *pt, vaddr_t va)
{
struct vm_page *ptp;
unsigned long index;
pd_entry_t *pteva;
paddr_t ptepa;
int i;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
/*
* Now that we have all the pages looked up or allocated,
* loop through again installing any new ones into the tree.
*/
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
index = pl_pi(va, i);
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
if (pmap_ept_valid_entry(pteva[index])) {
KASSERT(!pt->alloced[i]);
ptepa = pmap_pte2pa(pteva[index]);
continue;
}
ptp = pt->pg[i];
ptp->flags &= ~PG_BUSY; /* never busy */
ptp->wire_count = 1;
pmap->pm_ptphint[i - 2] = ptp;
ptepa = VM_PAGE_TO_PHYS(ptp);
pmap_pte_set(&pteva[index], ptepa | EPT_R | EPT_W | EPT_X);
pmap_pte_flush();
pmap_stats_update(pmap, 1, 0);
/*
* If we're not in the top level, increase the
* wire count of the parent page.
*/
if (i < PTP_LEVELS) {
pt->pg[i + 1]->wire_count++;
}
}
}
static int
pmap_ept_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot,
u_int flags)
{
pt_entry_t *ptes, opte, npte;
pt_entry_t *ptep;
struct vm_page *ptp;
struct vm_page *new_pg, *old_pg;
struct pmap_page *new_pp, *old_pp;
struct pv_entry *old_pve, *new_pve;
bool wired = (flags & PMAP_WIRED) != 0;
bool accessed;
struct pmap_ptparray pt;
int error;
bool getptp, samepage, new_embedded;
rb_tree_t *tree;
KASSERT(pmap_initialized);
KASSERT(va < VM_MAXUSER_ADDRESS);
npte = pa | pmap_ept_prot(prot) | pmap_ept_type(flags);
if (wired)
npte |= EPT_WIRED;
if (flags & VM_PROT_ALL) {
npte |= EPT_A;
if (flags & VM_PROT_WRITE) {
KASSERT((npte & EPT_W) != 0);
npte |= EPT_D;
}
}
new_pg = PHYS_TO_VM_PAGE(pa);
if (new_pg != NULL) {
/* This is a managed page */
npte |= EPT_PVLIST;
new_pp = VM_PAGE_TO_PP(new_pg);
} else if ((new_pp = pmap_pv_tracked(pa)) != NULL) {
/* This is an unmanaged pv-tracked page */
npte |= EPT_PVLIST;
} else {
new_pp = NULL;
}
/* Begin by locking the pmap. */
mutex_enter(&pmap->pm_lock);
/* Look up the PTP. Allocate if none present. */
ptp = NULL;
getptp = false;
if (pmap != pmap_kernel()) {
ptp = pmap_find_ptp(pmap, va, 1);
if (ptp == NULL) {
getptp = true;
error = pmap_get_ptp(pmap, &pt, va, flags, &ptp);
if (error != 0) {
if (flags & PMAP_CANFAIL) {
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: get ptp failed, error=%d", __func__,
error);
}
}
tree = &VM_PAGE_TO_PP(ptp)->pp_rb;
} else {
/* Embedded PV entries rely on this. */
KASSERT(va != 0);
tree = &pmap_kernel_rb;
}
/*
* Look up the old PV entry at this VA (if any), and insert a new PV
* entry if required for the new mapping. Temporarily track the old
* and new mappings concurrently. Only after the old mapping is
* evicted from the pmap will we remove its PV entry. Otherwise,
* our picture of modified/accessed state for either page could get
* out of sync (we need any P->V operation for either page to stall
* on pmap->pm_lock until done here).
*/
new_pve = NULL;
old_pve = NULL;
samepage = false;
new_embedded = false;
if (new_pp != NULL) {
error = pmap_enter_pv(pmap, new_pp, ptp, va, &new_pve,
&old_pve, &samepage, &new_embedded, tree);
/*
* If a new pv_entry was needed and none was available, we
* can go no further.
*/
if (error != 0) {
if (flags & PMAP_CANFAIL) {
if (getptp) {
pmap_unget_ptp(pmap, &pt);
}
mutex_exit(&pmap->pm_lock);
return error;
}
panic("%s: alloc pve failed", __func__);
}
} else {
old_pve = pmap_treelookup_pv(pmap, ptp, tree, va);
}
/* Map PTEs into address space. */
kpreempt_disable();
/* Install any newly allocated PTPs. */
if (getptp) {
pmap_ept_install_ptp(pmap, &pt, va);
}
/* Check if there is an existing mapping. */
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(VM_PAGE_TO_PHYS(ptp));
ptep = &ptes[pl1_pi(va)];
opte = *ptep;
bool have_oldpa = pmap_ept_valid_entry(opte);
paddr_t oldpa = pmap_pte2pa(opte);
/*
* Update the pte.
*/
do {
opte = *ptep;
/*
* if the same page, inherit EPT_A and EPT_D.
*/
if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
npte |= opte & (EPT_A | EPT_D);
}
} while (pmap_pte_cas(ptep, opte, npte) != opte);
/*
* Done with the PTEs: they can now be unmapped.
*/
kpreempt_enable();
/*
* Update statistics and PTP's reference count.
*/
pmap_ept_stats_update_bypte(pmap, npte, opte);
if (ptp != NULL) {
if (!have_oldpa) {
ptp->wire_count++;
}
/* Remember minimum VA in PTP. */
pmap_ptp_range_set(ptp, va);
}
KASSERT(ptp == NULL || ptp->wire_count > 1);
/*
* If the same page, we can skip pv_entry handling.
*/
if (((opte ^ npte) & (PTE_FRAME | EPT_R)) == 0) {
KASSERT(((opte ^ npte) & EPT_PVLIST) == 0);
if ((npte & EPT_PVLIST) != 0) {
KASSERT(samepage);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
goto same_pa;
} else if ((npte & EPT_PVLIST) != 0) {
KASSERT(!samepage);
}
/*
* If old page is pv-tracked, remove pv_entry from its list.
*/
if ((~opte & (EPT_R | EPT_PVLIST)) == 0) {
if ((old_pg = PHYS_TO_VM_PAGE(oldpa)) != NULL) {
old_pp = VM_PAGE_TO_PP(old_pg);
} else if ((old_pp = pmap_pv_tracked(oldpa)) == NULL) {
panic("%s: EPT_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR
" pa = %#" PRIxPADDR " (%#" PRIxPADDR ")",
__func__, va, oldpa, atop(pa));
}
pmap_remove_pv(pmap, old_pp, ptp, va, old_pve,
pmap_ept_to_pp_attrs(opte));
} else {
KASSERT(old_pve == NULL);
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
}
/*
* If new page is dynamically PV tracked, insert to tree.
*/
if (new_pve != NULL) {
KASSERT(pmap_treelookup_pv(pmap, ptp, tree, va) == NULL);
old_pve = rb_tree_insert_node(tree, new_pve);
KASSERT(old_pve == new_pve);
pmap_check_pv(pmap, ptp, new_pp, va, true);
}
same_pa:
/*
* shootdown tlb if necessary.
*/
if (pmap_ept_has_ad) {
accessed = (~opte & (EPT_R | EPT_A)) == 0;
} else {
accessed = (opte & EPT_R) != 0;
}
if (accessed && ((opte ^ npte) & (PTE_FRAME | EPT_W)) != 0) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_ENTER);
}
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
return 0;
}
/* Pay close attention, this returns L2. */
static int
pmap_ept_pdes_invalid(struct pmap *pmap, vaddr_t va, pd_entry_t *lastpde)
{
pt_entry_t *pteva;
paddr_t ptepa;
int i, index;
KASSERT(mutex_owned(&pmap->pm_lock));
ptepa = pmap->pm_pdirpa[0];
for (i = PTP_LEVELS; i > 1; i--) {
pteva = (pt_entry_t *)PMAP_DIRECT_MAP(ptepa);
index = pl_pi(va, i);
if (!pmap_ept_valid_entry(pteva[index]))
return i;
ptepa = pmap_pte2pa(pteva[index]);
}
if (lastpde != NULL) {
*lastpde = pteva[index];
}
return 0;
}
static bool
pmap_ept_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
pt_entry_t *ptes, pte;
pd_entry_t pde;
paddr_t ptppa, pa;
bool rv;
#ifdef __HAVE_DIRECT_MAP
if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
if (pap != NULL) {
*pap = PMAP_DIRECT_UNMAP(va);
}
return true;
}
#endif
rv = false;
pa = 0;
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
if (!pmap_ept_pdes_invalid(pmap, va, &pde)) {
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
pte = ptes[pl1_pi(va)];
if (__predict_true((pte & EPT_R) != 0)) {
pa = pmap_pte2pa(pte) | (va & (NBPD_L1 - 1));
rv = true;
}
}
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
if (pap != NULL) {
*pap = pa;
}
return rv;
}
static bool
pmap_ept_remove_pte(struct pmap *pmap, struct vm_page *ptp, pt_entry_t *pte,
vaddr_t va)
{
struct pv_entry *pve;
struct vm_page *pg;
struct pmap_page *pp;
pt_entry_t opte;
bool accessed;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
if (!pmap_ept_valid_entry(*pte)) {
/* VA not mapped. */
return false;
}
/* Atomically save the old PTE and zap it. */
opte = pmap_pte_testset(pte, 0);
if (!pmap_ept_valid_entry(opte)) {
return false;
}
pmap_ept_stats_update_bypte(pmap, 0, opte);
if (ptp) {
/*
* Dropping a PTE. Make sure that the PDE is flushed.
*/
ptp->wire_count--;
if (ptp->wire_count <= 1) {
opte |= EPT_A;
}
}
if (pmap_ept_has_ad) {
accessed = (opte & EPT_A) != 0;
} else {
accessed = true;
}
if (accessed) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_REMOVE_PTE);
}
/*
* If we are not on a pv list - we are done.
*/
if ((opte & EPT_PVLIST) == 0) {
KASSERTMSG((PHYS_TO_VM_PAGE(pmap_pte2pa(opte)) == NULL),
"managed page without EPT_PVLIST for %#"PRIxVADDR, va);
KASSERTMSG((pmap_pv_tracked(pmap_pte2pa(opte)) == NULL),
"pv-tracked page without EPT_PVLIST for %#"PRIxVADDR, va);
KASSERT(pmap_treelookup_pv(pmap, ptp, (ptp != NULL ?
&VM_PAGE_TO_PP(ptp)->pp_rb : &pmap_kernel_rb), va) == NULL);
return true;
}
if ((pg = PHYS_TO_VM_PAGE(pmap_pte2pa(opte))) != NULL) {
pp = VM_PAGE_TO_PP(pg);
} else if ((pp = pmap_pv_tracked(pmap_pte2pa(opte))) == NULL) {
paddr_t pa = pmap_pte2pa(opte);
panic("%s: EPT_PVLIST with pv-untracked page"
" va = %#"PRIxVADDR"pa = %#"PRIxPADDR" (%#"PRIxPADDR")",
__func__, va, pa, atop(pa));
}
/* Sync R/M bits. */
pve = pmap_lookup_pv(pmap, ptp, pp, va);
pmap_remove_pv(pmap, pp, ptp, va, pve, pmap_ept_to_pp_attrs(opte));
return true;
}
static void
pmap_ept_remove_ptes(struct pmap *pmap, struct vm_page *ptp, vaddr_t ptpva,
vaddr_t startva, vaddr_t endva)
{
pt_entry_t *pte = (pt_entry_t *)ptpva;
KASSERT(pmap != pmap_kernel());
KASSERT(mutex_owned(&pmap->pm_lock));
KASSERT(kpreempt_disabled());
/*
* mappings are very often sparse, so clip the given range to the
* range of PTEs that are known present in the PTP.
*/
pmap_ptp_range_clip(ptp, &startva, &pte);
/*
* note that ptpva points to the PTE that maps startva. this may
* or may not be the first PTE in the PTP.
*
* we loop through the PTP while there are still PTEs to look at
* and the wire_count is greater than 1 (because we use the wire_count
* to keep track of the number of real PTEs in the PTP).
*/
while (startva < endva && (ptp == NULL || ptp->wire_count > 1)) {
(void)pmap_ept_remove_pte(pmap, ptp, pte, startva);
startva += PAGE_SIZE;
pte++;
}
}
static void
pmap_ept_remove(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
{
pt_entry_t *ptes;
pd_entry_t pde;
paddr_t ptppa;
vaddr_t blkendva, va = sva;
struct vm_page *ptp;
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
for (/* null */ ; va < eva ; va = blkendva) {
int lvl;
/* determine range of block */
blkendva = x86_round_pdr(va+1);
if (blkendva > eva)
blkendva = eva;
lvl = pmap_ept_pdes_invalid(pmap, va, &pde);
if (lvl != 0) {
/* Skip a range corresponding to an invalid pde. */
blkendva = (va & ptp_frames[lvl - 1]) + nbpd[lvl - 1];
continue;
}
/* PA of the PTP */
ptppa = pmap_pte2pa(pde);
ptp = pmap_find_ptp(pmap, va, 1);
KASSERTMSG(ptp != NULL, "%s: unmanaged PTP detected",
__func__);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
pmap_ept_remove_ptes(pmap, ptp, (vaddr_t)&ptes[pl1_pi(va)], va,
blkendva);
/* If PTP is no longer being used, free it. */
if (ptp && ptp->wire_count <= 1) {
pmap_ept_free_ptp(pmap, ptp, va);
}
}
kpreempt_enable();
pmap_drain_pv(pmap);
mutex_exit(&pmap->pm_lock);
}
static int
pmap_ept_sync_pv(struct vm_page *ptp, vaddr_t va, paddr_t pa, int clearbits,
uint8_t *oattrs, pt_entry_t *optep)
{
struct pmap *pmap;
pt_entry_t *ptep;
pt_entry_t opte;
pt_entry_t npte;
pt_entry_t expect;
bool need_shootdown;
expect = pmap_pa2pte(pa) | EPT_R;
pmap = ptp_to_pmap(ptp);
if (clearbits != ~0) {
KASSERT((clearbits & ~(PP_ATTRS_D|PP_ATTRS_A|PP_ATTRS_W)) == 0);
clearbits = pmap_pp_attrs_to_ept(clearbits);
}
ptep = pmap_map_pte(pmap, ptp, va);
do {
opte = *ptep;
KASSERT((opte & (EPT_D | EPT_A)) != EPT_D);
KASSERT((opte & (EPT_A | EPT_R)) != EPT_A);
KASSERT(opte == 0 || (opte & EPT_R) != 0);
if ((opte & (PTE_FRAME | EPT_R)) != expect) {
/*
* We lost a race with a V->P operation like
* pmap_remove(). Wait for the competitor
* reflecting pte bits into mp_attrs.
*/
pmap_unmap_pte();
return EAGAIN;
}
/*
* Check if there's anything to do on this PTE.
*/
if ((opte & clearbits) == 0) {
need_shootdown = false;
break;
}
/*
* We need a shootdown if the PTE is cached (EPT_A) ...
* ... Unless we are clearing only the EPT_W bit and
* it isn't cached as RW (EPT_D).
*/
if (pmap_ept_has_ad) {
need_shootdown = (opte & EPT_A) != 0 &&
!(clearbits == EPT_W && (opte & EPT_D) == 0);
} else {
need_shootdown = true;
}
npte = opte & ~clearbits;
/*
* If we need a shootdown anyway, clear EPT_A and EPT_D.
*/
if (need_shootdown) {
npte &= ~(EPT_A | EPT_D);
}
KASSERT((npte & (EPT_D | EPT_A)) != EPT_D);
KASSERT((npte & (EPT_A | EPT_R)) != EPT_A);
KASSERT(npte == 0 || (opte & EPT_R) != 0);
} while (pmap_pte_cas(ptep, opte, npte) != opte);
if (need_shootdown) {
pmap_tlb_shootdown(pmap, va, 0, TLBSHOOT_SYNC_PV);
}
pmap_unmap_pte();
*oattrs = pmap_ept_to_pp_attrs(opte);
if (optep != NULL)
*optep = opte;
return 0;
}
static void
pmap_ept_pp_remove_ent(struct pmap *pmap, struct vm_page *ptp, pt_entry_t opte,
vaddr_t va)
{
KASSERT(mutex_owned(&pmap->pm_lock));
pmap_ept_stats_update_bypte(pmap, 0, opte);
ptp->wire_count--;
if (ptp->wire_count <= 1) {
pmap_ept_free_ptp(pmap, ptp, va);
}
}
static void
pmap_ept_write_protect(struct pmap *pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
{
pt_entry_t bit_rem;
pt_entry_t *ptes, *spte;
pt_entry_t opte, npte;
pd_entry_t pde;
paddr_t ptppa;
vaddr_t va;
bool modified;
bit_rem = 0;
if (!(prot & VM_PROT_WRITE))
bit_rem = EPT_W;
sva &= PTE_FRAME;
eva &= PTE_FRAME;
/* Acquire pmap. */
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
for (va = sva; va < eva; va += PAGE_SIZE) {
if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
continue;
}
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
spte = &ptes[pl1_pi(va)];
do {
opte = *spte;
if (!pmap_ept_valid_entry(opte)) {
goto next;
}
npte = (opte & ~bit_rem);
} while (pmap_pte_cas(spte, opte, npte) != opte);
if (pmap_ept_has_ad) {
modified = (opte & EPT_D) != 0;
} else {
modified = true;
}
if (modified) {
vaddr_t tva = x86_ptob(spte - ptes);
pmap_tlb_shootdown(pmap, tva, 0,
TLBSHOOT_WRITE_PROTECT);
}
next:;
}
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
}
static void
pmap_ept_unwire(struct pmap *pmap, vaddr_t va)
{
pt_entry_t *ptes, *ptep, opte;
pd_entry_t pde;
paddr_t ptppa;
/* Acquire pmap. */
mutex_enter(&pmap->pm_lock);
kpreempt_disable();
if (pmap_ept_pdes_invalid(pmap, va, &pde)) {
panic("%s: invalid PDE va=%#" PRIxVADDR, __func__, va);
}
ptppa = pmap_pte2pa(pde);
ptes = (pt_entry_t *)PMAP_DIRECT_MAP(ptppa);
ptep = &ptes[pl1_pi(va)];
opte = *ptep;
KASSERT(pmap_ept_valid_entry(opte));
if (opte & EPT_WIRED) {
pt_entry_t npte = opte & ~EPT_WIRED;
opte = pmap_pte_testset(ptep, npte);
pmap_ept_stats_update_bypte(pmap, npte, opte);
} else {
printf("%s: wiring for pmap %p va %#" PRIxVADDR
"did not change!\n", __func__, pmap, va);
}
/* Release pmap. */
kpreempt_enable();
mutex_exit(&pmap->pm_lock);
}
/* -------------------------------------------------------------------------- */
void
pmap_ept_transform(struct pmap *pmap)
{
pmap->pm_enter = pmap_ept_enter;
pmap->pm_extract = pmap_ept_extract;
pmap->pm_remove = pmap_ept_remove;
pmap->pm_sync_pv = pmap_ept_sync_pv;
pmap->pm_pp_remove_ent = pmap_ept_pp_remove_ent;
pmap->pm_write_protect = pmap_ept_write_protect;
pmap->pm_unwire = pmap_ept_unwire;
memset(PAGE_ALIGNED(pmap->pm_pdir), 0, PAGE_SIZE);
}
#endif /* __HAVE_DIRECT_MAP && __x86_64__ && !XENPV */
/* $NetBSD: uvm_km.c,v 1.165 2023/04/09 09:00:56 riastradh Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* Copyright (c) 1991, 1993, The Regents of the University of California.
*
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* The Mach Operating System project at Carnegie-Mellon University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_kern.c 8.3 (Berkeley) 1/12/94
* from: Id: uvm_km.c,v 1.1.2.14 1998/02/06 05:19:27 chs Exp
*
*
* Copyright (c) 1987, 1990 Carnegie-Mellon University.
* All rights reserved.
*
* Permission to use, copy, modify and distribute this software and
* its documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
* FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie the
* rights to redistribute these changes.
*/
/*
* uvm_km.c: handle kernel memory allocation and management
*/
/*
* overview of kernel memory management:
*
* the kernel virtual address space is mapped by "kernel_map." kernel_map
* starts at VM_MIN_KERNEL_ADDRESS and goes to VM_MAX_KERNEL_ADDRESS.
* note that VM_MIN_KERNEL_ADDRESS is equal to vm_map_min(kernel_map).
*
* the kernel_map has several "submaps." submaps can only appear in
* the kernel_map (user processes can't use them). submaps "take over"
* the management of a sub-range of the kernel's address space. submaps
* are typically allocated at boot time and are never released. kernel
* virtual address space that is mapped by a submap is locked by the
* submap's lock -- not the kernel_map's lock.
*
* thus, the useful feature of submaps is that they allow us to break
* up the locking and protection of the kernel address space into smaller
* chunks.
*
* the vm system has several standard kernel submaps/arenas, including:
* kmem_arena => used for kmem/pool (memoryallocators(9))
* pager_map => used to map "buf" structures into kernel space
* exec_map => used during exec to handle exec args
* etc...
*
* The kmem_arena is a "special submap", as it lives in a fixed map entry
* within the kernel_map and is controlled by vmem(9).
*
* the kernel allocates its private memory out of special uvm_objects whose
* reference count is set to UVM_OBJ_KERN (thus indicating that the objects
* are "special" and never die). all kernel objects should be thought of
* as large, fixed-sized, sparsely populated uvm_objects. each kernel
* object is equal to the size of kernel virtual address space (i.e. the
* value "VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS").
*
* note that just because a kernel object spans the entire kernel virtual
* address space doesn't mean that it has to be mapped into the entire space.
* large chunks of a kernel object's space go unused either because
* that area of kernel VM is unmapped, or there is some other type of
* object mapped into that range (e.g. a vnode). for submap's kernel
* objects, the only part of the object that can ever be populated is the
* offsets that are managed by the submap.
*
* note that the "offset" in a kernel object is always the kernel virtual
* address minus the VM_MIN_KERNEL_ADDRESS (aka vm_map_min(kernel_map)).
* example:
* suppose VM_MIN_KERNEL_ADDRESS is 0xf8000000 and the kernel does a
* uvm_km_alloc(kernel_map, PAGE_SIZE) [allocate 1 wired down page in the
* kernel map]. if uvm_km_alloc returns virtual address 0xf8235000,
* then that means that the page at offset 0x235000 in kernel_object is
* mapped at 0xf8235000.
*
* kernel objects have one other special property: when the kernel virtual
* memory mapping them is unmapped, the backing memory in the object is
* freed right away. this is done with the uvm_km_pgremove() function.
* this has to be done because there is no backing store for kernel pages
* and no need to save them after they are no longer referenced.
*
* Generic arenas:
*
* kmem_arena:
* Main arena controlling the kernel KVA used by other arenas.
*
* kmem_va_arena:
* Implements quantum caching in order to speed up allocations and
* reduce fragmentation. The kmem(9) subsystem and pool(9) (unless a
* pool is created with a custom meta-data allocator) use this arena.
*
* Arenas for meta-data allocations are used by vmem(9) and pool(9).
* These arenas cannot use the quantum cache. However, kmem_va_meta_arena
* compensates for this by importing larger chunks from kmem_arena.
*
* kmem_va_meta_arena:
* Space for meta-data.
*
* kmem_meta_arena:
* Imports from kmem_va_meta_arena. Allocations from this arena are
* backed with the pages.
*
* Arena stacking:
*
* kmem_arena
* kmem_va_arena
* kmem_va_meta_arena
* kmem_meta_arena
*/
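/*
* Illustrative example (not part of the original comment): most kernel
* code reaches these arenas indirectly, e.g. through kmem(9). A small
* allocation such as
*
*     void *p = kmem_alloc(128, KM_SLEEP);
*     ...
*     kmem_free(p, 128);
*
* is ultimately backed by KVA from kmem_va_arena (whose quantum cache
* absorbs the page-sized requests), which imports from kmem_arena and
* gets its physical pages via uvm_km_kmem_alloc() below.
*/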
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_km.c,v 1.165 2023/04/09 09:00:56 riastradh Exp $");
#include "opt_uvmhist.h"
#include "opt_kmempages.h"
#ifndef NKMEMPAGES
#define NKMEMPAGES 0
#endif
/*
* Defaults for lower and upper-bounds for the kmem_arena page count.
* Can be overridden by kernel config options.
*/
#ifndef NKMEMPAGES_MIN
#define NKMEMPAGES_MIN NKMEMPAGES_MIN_DEFAULT
#endif
#ifndef NKMEMPAGES_MAX
#define NKMEMPAGES_MAX NKMEMPAGES_MAX_DEFAULT
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/vmem.h>
#include <sys/vmem_impl.h>
#include <sys/kmem.h>
#include <sys/msan.h>
#include <uvm/uvm.h>
/*
* global data structures
*/
struct vm_map *kernel_map = NULL;
/*
* local data structures
*/
static struct vm_map kernel_map_store;
static struct vm_map_entry kernel_image_mapent_store;
static struct vm_map_entry kernel_kmem_mapent_store;
size_t nkmempages = 0;
vaddr_t kmembase;
vsize_t kmemsize;
static struct vmem kmem_arena_store;
vmem_t *kmem_arena = NULL;
static struct vmem kmem_va_arena_store;
vmem_t *kmem_va_arena;
/*
* kmeminit_nkmempages: calculate the size of kmem_arena.
*/
void
kmeminit_nkmempages(void)
{
size_t npages;
if (nkmempages != 0) {
/*
* It's already been set (by us being here before);
* bail out now.
*/
return;
}
#if defined(NKMEMPAGES_MAX_UNLIMITED) && !defined(KMSAN)
npages = physmem;
#else
#if defined(KMSAN)
npages = (physmem / 4);
#elif defined(PMAP_MAP_POOLPAGE)
npages = (physmem / 4);
#else
npages = (physmem / 3) * 2;
#endif /* defined(PMAP_MAP_POOLPAGE) */
#if !defined(NKMEMPAGES_MAX_UNLIMITED)
if (npages > NKMEMPAGES_MAX)
npages = NKMEMPAGES_MAX;
#endif
#endif
if (npages < NKMEMPAGES_MIN)
npages = NKMEMPAGES_MIN;
nkmempages = npages;
}
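/*
* Worked example (illustrative only): on a machine with 4 GiB of RAM
* and 4 KiB pages, physmem is 1048576 pages. With PMAP_MAP_POOLPAGE
* defined and no NKMEMPAGES_MAX_UNLIMITED, the first estimate is
* physmem / 4 = 262144 pages (1 GiB of kmem_arena KVA), which is then
* clamped to the [NKMEMPAGES_MIN, NKMEMPAGES_MAX] range supplied by the
* port or the kernel configuration.
*/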
/*
* uvm_km_bootstrap: init kernel maps and objects to reflect reality (i.e.
* KVM already allocated for text, data, bss, and static data structures).
*
* => KVM is defined by VM_MIN_KERNEL_ADDRESS/VM_MAX_KERNEL_ADDRESS.
* we assume that [vmin -> start] has already been allocated and that
* "end" is the end.
*/
void
uvm_km_bootstrap(vaddr_t start, vaddr_t end)
{
bool kmem_arena_small;
vaddr_t base = VM_MIN_KERNEL_ADDRESS;
struct uvm_map_args args;
int error;
UVMHIST_FUNC(__func__);
UVMHIST_CALLARGS(maphist, "start=%#jx end=%#jx", start, end, 0,0);
kmeminit_nkmempages();
kmemsize = (vsize_t)nkmempages * PAGE_SIZE;
kmem_arena_small = kmemsize < 64 * 1024 * 1024;
UVMHIST_LOG(maphist, "kmemsize=%#jx", kmemsize, 0,0,0);
/*
* next, init kernel memory objects.
*/
/* kernel_object: for pageable anonymous kernel memory */
uvm_kernel_object = uao_create(VM_MAX_KERNEL_ADDRESS -
VM_MIN_KERNEL_ADDRESS, UAO_FLAG_KERNOBJ);
/*
* init the map and reserve any space that might already
* have been allocated kernel space before installing.
*/
uvm_map_setup(&kernel_map_store, base, end, VM_MAP_PAGEABLE);
kernel_map_store.pmap = pmap_kernel();
if (start != base) {
error = uvm_map_prepare(&kernel_map_store,
base, start - base,
NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args);
if (!error) {
kernel_image_mapent_store.flags =
UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE;
error = uvm_map_enter(&kernel_map_store, &args,
&kernel_image_mapent_store);
}
if (error)
panic(
"uvm_km_bootstrap: could not reserve space for kernel");
kmembase = args.uma_start + args.uma_size;
} else {
kmembase = base;
}
error = uvm_map_prepare(&kernel_map_store,
kmembase, kmemsize,
NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, UVM_FLAG_FIXED), &args);
if (!error) {
kernel_kmem_mapent_store.flags =
UVM_MAP_KERNEL | UVM_MAP_STATIC | UVM_MAP_NOMERGE;
error = uvm_map_enter(&kernel_map_store, &args,
&kernel_kmem_mapent_store);
}
if (error)
panic("uvm_km_bootstrap: could not reserve kernel kmem");
/*
* install!
*/
kernel_map = &kernel_map_store;
pool_subsystem_init();
kmem_arena = vmem_init(&kmem_arena_store, "kmem",
kmembase, kmemsize, PAGE_SIZE, NULL, NULL, NULL,
0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);
#ifdef PMAP_GROWKERNEL
/*
* kmem_arena VA allocations happen independently of uvm_map.
* grow kernel to accommodate the kmem_arena.
*/
if (uvm_maxkaddr < kmembase + kmemsize) {
uvm_maxkaddr = pmap_growkernel(kmembase + kmemsize);
KASSERTMSG(uvm_maxkaddr >= kmembase + kmemsize,
"%#"PRIxVADDR" %#"PRIxVADDR" %#"PRIxVSIZE,
uvm_maxkaddr, kmembase, kmemsize);
}
#endif
vmem_subsystem_init(kmem_arena);
UVMHIST_LOG(maphist, "kmem vmem created (base=%#jx, size=%#jx",
kmembase, kmemsize, 0,0);
kmem_va_arena = vmem_init(&kmem_va_arena_store, "kva",
0, 0, PAGE_SIZE, vmem_alloc, vmem_free, kmem_arena,
(kmem_arena_small ? 4 : VMEM_QCACHE_IDX_MAX) * PAGE_SIZE,
VM_NOSLEEP, IPL_VM);
UVMHIST_LOG(maphist, "<- done", 0,0,0,0);
}
/*
* uvm_km_init: init the kernel maps virtual memory caches
* and start the pool/kmem allocator.
*/
void
uvm_km_init(void)
{
kmem_init();
}
/*
* uvm_km_suballoc: allocate a submap in the kernel map. once a submap
* is allocated all references to that area of VM must go through it. this
* allows the locking of VAs in kernel_map to be broken up into regions.
*
* => if `fixed' is true, *vmin specifies where the region described
* by the submap must start
* => if submap is non NULL we use that as the submap, otherwise we
* alloc a new map
*/
struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *vmin /* IN/OUT */,
vaddr_t *vmax /* OUT */, vsize_t size, int flags, bool fixed,
struct vm_map *submap)
{
int mapflags = UVM_FLAG_NOMERGE | (fixed ? UVM_FLAG_FIXED : 0);
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(vm_map_pmap(map) == pmap_kernel());
size = round_page(size); /* round up to pagesize */
/*
* first allocate a blank spot in the parent map
*/
if (uvm_map(map, vmin, size, NULL, UVM_UNKNOWN_OFFSET, 0,
UVM_MAPFLAG(UVM_PROT_ALL, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM, mapflags)) != 0) {
panic("%s: unable to allocate space in parent map", __func__);
}
/*
* set VM bounds (vmin is filled in by uvm_map)
*/
*vmax = *vmin + size;
/*
* add references to pmap and create or init the submap
*/
pmap_reference(vm_map_pmap(map));
if (submap == NULL) {
submap = kmem_alloc(sizeof(*submap), KM_SLEEP);
}
uvm_map_setup(submap, *vmin, *vmax, flags);
submap->pmap = vm_map_pmap(map);
/*
* now let uvm_map_submap plug in it...
*/
if (uvm_map_submap(map, *vmin, *vmax, submap) != 0)
panic("uvm_km_suballoc: submap allocation failed");
return(submap);
}
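/*
* Illustrative sketch (assumption; it mirrors how submaps such as
* pager_map and exec_map are created elsewhere in the tree). The names
* "foo_map", "foo_sva", "foo_eva" and FOO_MAP_SIZE are hypothetical:
*
*     vaddr_t foo_sva, foo_eva;
*     struct vm_map *foo_map;
*
*     foo_map = uvm_km_suballoc(kernel_map, &foo_sva, &foo_eva,
*         FOO_MAP_SIZE, 0, false, NULL);
*
* From then on, KVA in [foo_sva, foo_eva) is managed and locked through
* foo_map rather than through kernel_map.
*/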
/*
* uvm_km_pgremove: remove pages from a kernel uvm_object and KVA.
*/
void
uvm_km_pgremove(vaddr_t startva, vaddr_t endva)
{
struct uvm_object * const uobj = uvm_kernel_object;
const voff_t start = startva - vm_map_min(kernel_map);
const voff_t end = endva - vm_map_min(kernel_map);
struct vm_page *pg;
voff_t curoff, nextoff;
int swpgonlydelta = 0;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(VM_MIN_KERNEL_ADDRESS <= startva); KASSERT(startva < endva); KASSERT(endva <= VM_MAX_KERNEL_ADDRESS);
rw_enter(uobj->vmobjlock, RW_WRITER);
pmap_remove(pmap_kernel(), startva, endva);
for (curoff = start; curoff < end; curoff = nextoff) {
nextoff = curoff + PAGE_SIZE;
pg = uvm_pagelookup(uobj, curoff);
if (pg != NULL && pg->flags & PG_BUSY) {
uvm_pagewait(pg, uobj->vmobjlock, "km_pgrm");
rw_enter(uobj->vmobjlock, RW_WRITER);
nextoff = curoff;
continue;
}
/*
* free the swap slot, then the page.
*/
if (pg == NULL &&
uao_find_swslot(uobj, curoff >> PAGE_SHIFT) > 0) {
swpgonlydelta++;
}
uao_dropswap(uobj, curoff >> PAGE_SHIFT);
if (pg != NULL) {
uvm_pagefree(pg);
}
}
rw_exit(uobj->vmobjlock); if (swpgonlydelta > 0) { KASSERT(uvmexp.swpgonly >= swpgonlydelta);
atomic_add_int(&uvmexp.swpgonly, -swpgonlydelta);
}
}
/*
* uvm_km_pgremove_intrsafe: like uvm_km_pgremove(), but for non object backed
* regions.
*
* => when you unmap a part of anonymous kernel memory you want to toss
* the pages right away. (this is called from uvm_unmap_...).
* => none of the pages will ever be busy, and none of them will ever
* be on the active or inactive queues (because they have no object).
*/
void
uvm_km_pgremove_intrsafe(struct vm_map *map, vaddr_t start, vaddr_t end)
{
#define __PGRM_BATCH 16
struct vm_page *pg;
paddr_t pa[__PGRM_BATCH];
int npgrm, i;
vaddr_t va, batch_vastart;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(VM_MAP_IS_KERNEL(map)); KASSERTMSG(vm_map_min(map) <= start,
"vm_map_min(map) [%#"PRIxVADDR"] <= start [%#"PRIxVADDR"]"
" (size=%#"PRIxVSIZE")",
vm_map_min(map), start, end - start);
KASSERT(start < end); KASSERT(end <= vm_map_max(map)); for (va = start; va < end;) {
batch_vastart = va;
/* create a batch of at most __PGRM_BATCH pages to free */
for (i = 0;
i < __PGRM_BATCH && va < end;
va += PAGE_SIZE) {
if (!pmap_extract(pmap_kernel(), va, &pa[i])) {
continue;
}
i++;
}
npgrm = i;
/* now remove the mappings */
pmap_kremove(batch_vastart, va - batch_vastart);
/* and free the pages */
for (i = 0; i < npgrm; i++) {
pg = PHYS_TO_VM_PAGE(pa[i]);
KASSERT(pg); KASSERT(pg->uobject == NULL); KASSERT(pg->uanon == NULL); KASSERT((pg->flags & PG_BUSY) == 0);
uvm_pagefree(pg);
}
}
#undef __PGRM_BATCH
}
#if defined(DEBUG)
void
uvm_km_check_empty(struct vm_map *map, vaddr_t start, vaddr_t end)
{
vaddr_t va;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KDASSERT(VM_MAP_IS_KERNEL(map)); KDASSERT(vm_map_min(map) <= start); KDASSERT(start < end); KDASSERT(end <= vm_map_max(map)); for (va = start; va < end; va += PAGE_SIZE) {
paddr_t pa;
if (pmap_extract(pmap_kernel(), va, &pa)) {
panic("uvm_km_check_empty: va %p has pa %#llx",
(void *)va, (long long)pa);
}
/*
* kernel_object should not have pages for the corresponding
* region. check it.
*
* why trylock? because:
* - caller might not want to block.
* - we can recurse when allocating radix_node for
* kernel_object.
*/
if (rw_tryenter(uvm_kernel_object->vmobjlock, RW_READER)) {
struct vm_page *pg;
pg = uvm_pagelookup(uvm_kernel_object,
va - vm_map_min(kernel_map));
rw_exit(uvm_kernel_object->vmobjlock);
if (pg) {
panic("uvm_km_check_empty: "
"has page hashed at %p",
(const void *)va);
}
}
}
}
#endif /* defined(DEBUG) */
/*
* uvm_km_alloc: allocate an area of kernel memory.
*
* => NOTE: we can return 0 even if we can wait if there is not enough
* free VM space in the map... caller should be prepared to handle
* this case.
* => we return KVA of memory allocated
*/
vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
vaddr_t kva, loopva;
vaddr_t offset;
vsize_t loopsize;
struct vm_page *pg;
struct uvm_object *obj;
int pgaflags;
vm_prot_t prot, vaprot;
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT(vm_map_pmap(map) == pmap_kernel()); KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY);
KASSERT((flags & UVM_KMF_VAONLY) != 0 || (flags & UVM_KMF_COLORMATCH) == 0);
KASSERT((flags & UVM_KMF_COLORMATCH) == 0 || (flags & UVM_KMF_VAONLY) != 0);
/*
* setup for call
*/
kva = vm_map_min(map); /* hint */
size = round_page(size);
obj = (flags & UVM_KMF_PAGEABLE) ? uvm_kernel_object : NULL;
UVMHIST_LOG(maphist," (map=%#jx, obj=%#jx, size=%#jx, flags=%#jx)",
(uintptr_t)map, (uintptr_t)obj, size, flags);
/*
* allocate some virtual space
*/
vaprot = (flags & UVM_KMF_EXEC) ? UVM_PROT_ALL : UVM_PROT_RW;
if (__predict_false(uvm_map(map, &kva, size, obj, UVM_UNKNOWN_OFFSET,
align, UVM_MAPFLAG(vaprot, UVM_PROT_ALL, UVM_INH_NONE,
UVM_ADV_RANDOM,
(flags & (UVM_KMF_TRYLOCK | UVM_KMF_NOWAIT | UVM_KMF_WAITVA
| UVM_KMF_COLORMATCH)))) != 0)) {
UVMHIST_LOG(maphist, "<- done (no VM)",0,0,0,0);
return(0);
}
/*
* if all we wanted was VA, return now
*/
if (flags & (UVM_KMF_VAONLY | UVM_KMF_PAGEABLE)) {
UVMHIST_LOG(maphist,"<- done valloc (kva=%#jx)", kva,0,0,0);
return(kva);
}
/*
* recover object offset from virtual address
*/
offset = kva - vm_map_min(kernel_map);
UVMHIST_LOG(maphist, " kva=%#jx, offset=%#jx", kva, offset,0,0);
/*
* now allocate and map in the memory... note that we are the only ones
* who should ever get a handle on this area of VM.
*/
loopva = kva;
loopsize = size;
pgaflags = UVM_FLAG_COLORMATCH;
if (flags & UVM_KMF_NOWAIT)
pgaflags |= UVM_PGA_USERESERVE;
if (flags & UVM_KMF_ZERO)
pgaflags |= UVM_PGA_ZERO;
prot = VM_PROT_READ | VM_PROT_WRITE;
if (flags & UVM_KMF_EXEC)
prot |= VM_PROT_EXECUTE;
while (loopsize) {
KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, NULL),
"loopva=%#"PRIxVADDR, loopva);
pg = uvm_pagealloc_strat(NULL, offset, NULL, pgaflags,
#ifdef UVM_KM_VMFREELIST
UVM_PGA_STRAT_ONLY, UVM_KM_VMFREELIST
#else
UVM_PGA_STRAT_NORMAL, 0
#endif
);
/*
* out of memory?
*/
if (__predict_false(pg == NULL)) {
if ((flags & UVM_KMF_NOWAIT) ||
((flags & UVM_KMF_CANFAIL) && !uvm_reclaimable())) {
/* free everything! */
uvm_km_free(map, kva, size,
flags & UVM_KMF_TYPEMASK);
return (0);
} else {
uvm_wait("km_getwait2"); /* sleep here */
continue;
}
}
pg->flags &= ~PG_BUSY; /* new page */
UVM_PAGE_OWN(pg, NULL);
/*
* map it in
*/
pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
prot, PMAP_KMPAGE);
loopva += PAGE_SIZE;
offset += PAGE_SIZE;
loopsize -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
if ((flags & UVM_KMF_ZERO) == 0) {
kmsan_orig((void *)kva, size, KMSAN_TYPE_UVM, __RET_ADDR);
kmsan_mark((void *)kva, size, KMSAN_STATE_UNINIT);
}
UVMHIST_LOG(maphist,"<- done (kva=%#jx)", kva,0,0,0);
return(kva);
}
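/*
* Illustrative sketch (not from the original source): a typical wired,
* zeroed allocation and its release. The type flag passed to
* uvm_km_free() must match the one used at allocation time, and a
* return value of 0 must be handled even for sleeping allocations:
*
*     vaddr_t va;
*
*     va = uvm_km_alloc(kernel_map, size, 0,
*         UVM_KMF_WIRED | UVM_KMF_ZERO | UVM_KMF_WAITVA);
*     if (va == 0)
*         return ENOMEM;
*     ...
*     uvm_km_free(kernel_map, va, size, UVM_KMF_WIRED);
*/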
/*
* uvm_km_protect: change the protection of an allocated area
*/
int
uvm_km_protect(struct vm_map *map, vaddr_t addr, vsize_t size, vm_prot_t prot)
{
return uvm_map_protect(map, addr, addr + round_page(size), prot, false);
}
/*
* uvm_km_free: free an area of kernel memory
*/
void
uvm_km_free(struct vm_map *map, vaddr_t addr, vsize_t size, uvm_flag_t flags)
{
UVMHIST_FUNC(__func__); UVMHIST_CALLED(maphist);
KASSERT((flags & UVM_KMF_TYPEMASK) == UVM_KMF_WIRED ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_PAGEABLE ||
(flags & UVM_KMF_TYPEMASK) == UVM_KMF_VAONLY);
KASSERT((addr & PAGE_MASK) == 0);
KASSERT(vm_map_pmap(map) == pmap_kernel());
size = round_page(size);
if (flags & UVM_KMF_PAGEABLE) {
uvm_km_pgremove(addr, addr + size);
} else if (flags & UVM_KMF_WIRED) {
/*
* Note: uvm_km_pgremove_intrsafe() extracts mapping, thus
* remove it after. See comment below about KVA visibility.
*/
uvm_km_pgremove_intrsafe(map, addr, addr + size);
}
/*
* Note: uvm_unmap_remove() calls pmap_update() for us, before
* KVA becomes globally available.
*/
uvm_unmap1(map, addr, addr + size, UVM_FLAG_VAONLY);
}
/* Sanity; must specify both or none. */
#if (defined(PMAP_MAP_POOLPAGE) || defined(PMAP_UNMAP_POOLPAGE)) && \
(!defined(PMAP_MAP_POOLPAGE) || !defined(PMAP_UNMAP_POOLPAGE))
#error Must specify MAP and UNMAP together.
#endif
#if defined(PMAP_ALLOC_POOLPAGE) && \
!defined(PMAP_MAP_POOLPAGE) && !defined(PMAP_UNMAP_POOLPAGE)
#error Must specify ALLOC with MAP and UNMAP
#endif
int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
vmem_addr_t *addr)
{
struct vm_page *pg;
vmem_addr_t va;
int rc;
vaddr_t loopva;
vsize_t loopsize;
size = round_page(size);
#if defined(PMAP_MAP_POOLPAGE)
if (size == PAGE_SIZE) {
again:
#ifdef PMAP_ALLOC_POOLPAGE
pg = PMAP_ALLOC_POOLPAGE((flags & VM_SLEEP) ?
0 : UVM_PGA_USERESERVE);
#else
pg = uvm_pagealloc(NULL, 0, NULL,
(flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE);
#endif /* PMAP_ALLOC_POOLPAGE */
if (__predict_false(pg == NULL)) {
if (flags & VM_SLEEP) {
uvm_wait("plpg");
goto again;
}
return ENOMEM;
}
va = PMAP_MAP_POOLPAGE(VM_PAGE_TO_PHYS(pg));
KASSERT(va != 0);
*addr = va;
return 0;
}
#endif /* PMAP_MAP_POOLPAGE */
rc = vmem_alloc(vm, size, flags, &va);
if (rc != 0)
return rc;
#ifdef PMAP_GROWKERNEL
/*
* These VA allocations happen independently of uvm_map
* so this allocation must not extend beyond the current limit.
*/
KASSERTMSG(uvm_maxkaddr >= va + size,
"%#"PRIxVADDR" %#"PRIxPTR" %#zx",
uvm_maxkaddr, va, size);
#endif
loopva = va;
loopsize = size;
while (loopsize) {
paddr_t pa __diagused;
KASSERTMSG(!pmap_extract(pmap_kernel(), loopva, &pa),
"loopva=%#"PRIxVADDR" loopsize=%#"PRIxVSIZE
" pa=%#"PRIxPADDR" vmem=%p",
loopva, loopsize, pa, vm);
pg = uvm_pagealloc(NULL, loopva, NULL,
UVM_FLAG_COLORMATCH
| ((flags & VM_SLEEP) ? 0 : UVM_PGA_USERESERVE));
if (__predict_false(pg == NULL)) {
if (flags & VM_SLEEP) {
uvm_wait("plpg");
continue;
} else {
uvm_km_pgremove_intrsafe(kernel_map, va,
va + size);
vmem_free(vm, va, size);
return ENOMEM;
}
}
pg->flags &= ~PG_BUSY; /* new page */
UVM_PAGE_OWN(pg, NULL);
pmap_kenter_pa(loopva, VM_PAGE_TO_PHYS(pg),
VM_PROT_READ|VM_PROT_WRITE, PMAP_KMPAGE);
loopva += PAGE_SIZE;
loopsize -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
*addr = va;
return 0;
}
void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, size_t size)
{
size = round_page(size);
#if defined(PMAP_UNMAP_POOLPAGE)
if (size == PAGE_SIZE) {
paddr_t pa;
pa = PMAP_UNMAP_POOLPAGE(addr);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
return;
}
#endif /* PMAP_UNMAP_POOLPAGE */
uvm_km_pgremove_intrsafe(kernel_map, addr, addr + size);
pmap_update(pmap_kernel());
vmem_free(vm, addr, size);
}
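/*
* Illustrative sketch (assumption): this pair is the backend used by
* kmem(9) and pool(9) to turn KVA from a vmem arena into mapped, wired
* pages. A direct caller drawing from the kva arena would do roughly:
*
*     vmem_addr_t va;
*
*     if (uvm_km_kmem_alloc(kmem_va_arena, PAGE_SIZE, VM_SLEEP, &va) == 0) {
*         ...use (void *)va...
*         uvm_km_kmem_free(kmem_va_arena, va, PAGE_SIZE);
*     }
*/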
bool
uvm_km_va_starved_p(void)
{
vmem_size_t total;
vmem_size_t free;
if (kmem_arena == NULL)
return false;
total = vmem_size(kmem_arena, VMEM_ALLOC|VMEM_FREE);
free = vmem_size(kmem_arena, VMEM_FREE);
return (free < (total / 10));
}
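/*
* Illustrative sketch (assumption): callers such as the page daemon use
* this predicate to decide whether kernel memory should be reclaimed
* even when free pages look plentiful:
*
*     if (uvm_km_va_starved_p()) {
*         // less than 10% of kmem_arena's KVA is free; trigger
*         // pool/kmem reclamation
*     }
*/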
/* $NetBSD: subr_copy.c,v 1.19 2023/05/22 14:07:24 riastradh Exp $ */
/*-
* Copyright (c) 1997, 1998, 1999, 2002, 2007, 2008, 2019
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratory.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_subr.c 8.4 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_copy.c,v 1.19 2023/05/22 14:07:24 riastradh Exp $");
#define __UFETCHSTORE_PRIVATE
#define __UCAS_PRIVATE
#include <sys/param.h>
#include <sys/fcntl.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <uvm/uvm_extern.h>
void
uio_setup_sysspace(struct uio *uio)
{
uio->uio_vmspace = vmspace_kernel();
}
int
uiomove(void *buf, size_t n, struct uio *uio)
{
struct vmspace *vm = uio->uio_vmspace;
struct iovec *iov;
size_t cnt;
int error = 0;
char *cp = buf;
ASSERT_SLEEPABLE();
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
while (n > 0 && uio->uio_resid) {
KASSERT(uio->uio_iovcnt > 0);
iov = uio->uio_iov;
cnt = iov->iov_len;
if (cnt == 0) {
KASSERT(uio->uio_iovcnt > 1);
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
if (cnt > n)
cnt = n;
if (!VMSPACE_IS_KERNEL_P(vm)) {
preempt_point();
}
if (uio->uio_rw == UIO_READ) {
error = copyout_vmspace(vm, cp, iov->iov_base,
cnt);
} else {
error = copyin_vmspace(vm, iov->iov_base, cp,
cnt);
}
if (error) {
break;
}
iov->iov_base = (char *)iov->iov_base + cnt;
iov->iov_len -= cnt;
uio->uio_resid -= cnt;
uio->uio_offset += cnt;
cp += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
return (error);
}
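/*
* Illustrative sketch (not from the original source): a character
* device read routine usually copies out of a kernel buffer with
* uiomove(), which advances uio_resid/uio_offset as it goes. The
* softc fields sc_buf and sc_len are hypothetical:
*
*     // inside a d_read routine: int foo_read(dev_t, struct uio *, int)
*     error = uiomove(sc->sc_buf, MIN(sc->sc_len, uio->uio_resid), uio);
*     return error;
*/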
/*
* Wrapper for uiomove() that validates the arguments against a known-good
* kernel buffer.
*/
int
uiomove_frombuf(void *buf, size_t buflen, struct uio *uio)
{
size_t offset;
if (uio->uio_offset < 0 || /* uio->uio_resid < 0 || */
(offset = uio->uio_offset) != uio->uio_offset)
return (EINVAL);
if (offset >= buflen)
return (0);
return (uiomove((char *)buf + offset, buflen - offset, uio));
}
int
uiopeek(void *buf, size_t n, struct uio *uio)
{
struct vmspace *vm = uio->uio_vmspace;
struct iovec *iov;
size_t cnt;
int error = 0;
char *cp = buf;
size_t resid = uio->uio_resid;
int iovcnt = uio->uio_iovcnt;
char *base;
size_t len;
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
if (n == 0 || resid == 0)
return 0;
iov = uio->uio_iov;
base = iov->iov_base;
len = iov->iov_len;
while (n > 0 && resid > 0) {
KASSERT(iovcnt > 0);
cnt = len;
if (cnt == 0) {
KASSERT(iovcnt > 1);
iov++;
iovcnt--;
base = iov->iov_base;
len = iov->iov_len;
continue;
}
if (cnt > n)
cnt = n;
if (!VMSPACE_IS_KERNEL_P(vm)) {
preempt_point();
}
if (uio->uio_rw == UIO_READ) {
error = copyout_vmspace(vm, cp, base, cnt);
} else {
error = copyin_vmspace(vm, base, cp, cnt);
}
if (error) {
break;
}
base += cnt;
len -= cnt;
resid -= cnt;
cp += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
return error;
}
void
uioskip(size_t n, struct uio *uio)
{
struct iovec *iov;
size_t cnt;
KASSERTMSG(n <= uio->uio_resid, "n=%zu resid=%zu", n, uio->uio_resid);
KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE);
while (n > 0 && uio->uio_resid) {
KASSERT(uio->uio_iovcnt > 0);
iov = uio->uio_iov;
cnt = iov->iov_len;
if (cnt == 0) {
KASSERT(uio->uio_iovcnt > 1);
uio->uio_iov++;
uio->uio_iovcnt--;
continue;
}
if (cnt > n)
cnt = n;
iov->iov_base = (char *)iov->iov_base + cnt;
iov->iov_len -= cnt;
uio->uio_resid -= cnt;
uio->uio_offset += cnt;
KDASSERT(cnt <= n);
n -= cnt;
}
}
/*
* Give next character to user as result of read.
*/
int
ureadc(int c, struct uio *uio)
{
struct iovec *iov;
if (uio->uio_resid <= 0)
panic("ureadc: non-positive resid");
again:
if (uio->uio_iovcnt <= 0)
panic("ureadc: non-positive iovcnt");
iov = uio->uio_iov;
if (iov->iov_len <= 0) {
uio->uio_iovcnt--;
uio->uio_iov++;
goto again;
}
if (!VMSPACE_IS_KERNEL_P(uio->uio_vmspace)) {
int error;
if ((error = ustore_char(iov->iov_base, c)) != 0)
return (error);
} else {
*(char *)iov->iov_base = c;
}
iov->iov_base = (char *)iov->iov_base + 1;
iov->iov_len--;
uio->uio_resid--;
uio->uio_offset++;
return (0);
}
/*
* Like copyin(), but operates on an arbitrary vmspace.
*/
int
copyin_vmspace(struct vmspace *vm, const void *uaddr, void *kaddr, size_t len)
{
struct iovec iov;
struct uio uio;
int error;
if (len == 0)
return (0);
if (VMSPACE_IS_KERNEL_P(vm)) {
return kcopy(uaddr, kaddr, len);
}
if (__predict_true(vm == curproc->p_vmspace)) {
return copyin(uaddr, kaddr, len);
}
iov.iov_base = kaddr;
iov.iov_len = len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(uintptr_t)uaddr;
uio.uio_resid = len;
uio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&uio);
error = uvm_io(&vm->vm_map, &uio, 0);
return (error);
}
/*
* Like copyout(), but operates on an arbitrary vmspace.
*/
int
copyout_vmspace(struct vmspace *vm, const void *kaddr, void *uaddr, size_t len)
{
struct iovec iov;
struct uio uio;
int error;
if (len == 0)
return (0);
if (VMSPACE_IS_KERNEL_P(vm)) {
return kcopy(kaddr, uaddr, len);
}
if (__predict_true(vm == curproc->p_vmspace)) {
return copyout(kaddr, uaddr, len);
}
iov.iov_base = __UNCONST(kaddr); /* XXXUNCONST cast away const */
iov.iov_len = len;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = (off_t)(uintptr_t)uaddr;
uio.uio_resid = len;
uio.uio_rw = UIO_WRITE;
UIO_SETUP_SYSSPACE(&uio);
error = uvm_io(&vm->vm_map, &uio, 0);
return (error);
}
/*
* Like copyin(), but operates on an arbitrary process.
*/
int
copyin_proc(struct proc *p, const void *uaddr, void *kaddr, size_t len)
{
struct vmspace *vm;
int error;
error = proc_vmspace_getref(p, &vm);
if (error) {
return error;
}
error = copyin_vmspace(vm, uaddr, kaddr, len);
uvmspace_free(vm);
return error;
}
/*
* Like copyout(), but operates on an arbitrary process.
*/
int
copyout_proc(struct proc *p, const void *kaddr, void *uaddr, size_t len)
{
struct vmspace *vm;
int error;
error = proc_vmspace_getref(p, &vm);
if (error) {
return error;
}
error = copyout_vmspace(vm, kaddr, uaddr, len);
uvmspace_free(vm);
return error;
}
/*
* Like copyin(), but operates on an arbitrary pid.
*/
int
copyin_pid(pid_t pid, const void *uaddr, void *kaddr, size_t len)
{
struct proc *p;
struct vmspace *vm;
int error;
mutex_enter(&proc_lock);
p = proc_find(pid);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
mutex_enter(p->p_lock);
error = proc_vmspace_getref(p, &vm);
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
if (error == 0) {
error = copyin_vmspace(vm, uaddr, kaddr, len);
uvmspace_free(vm);
}
return error;
}
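/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): copyin_pid() suits debugger-style paths that only
 * have a pid in hand; it can fail with ESRCH if the process is gone or
 * EFAULT on a bad address.  example_peek_word() is hypothetical.
 */
#if 0
static int
example_peek_word(pid_t pid, vaddr_t uva, uint32_t *valp)
{
	return copyin_pid(pid, (const void *)uva, valp, sizeof(*valp));
}
#endif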
/*
* Like copyin(), except it operates on kernel addresses when the FKIOCTL
* flag is passed in `ioctlflags' from the ioctl call.
*/
int
ioctl_copyin(int ioctlflags, const void *src, void *dst, size_t len)
{
if (ioctlflags & FKIOCTL)
return kcopy(src, dst, len);
return copyin(src, dst, len);
}
/*
* Like copyout(), except it operates on kernel addresses when the FKIOCTL
* flag is passed in `ioctlflags' from the ioctl call.
*/
int
ioctl_copyout(int ioctlflags, const void *src, void *dst, size_t len)
{
if (ioctlflags & FKIOCTL)
return kcopy(src, dst, len);
return copyout(src, dst, len);
}
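/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): a driver ioctl whose argument embeds a pointer can
 * use ioctl_copyin() so that in-kernel callers passing FKIOCTL (and a
 * kernel pointer) work the same as user callers.  EXAMPLEIOC_LOAD and
 * struct example_ioc_args are hypothetical.
 */
#if 0
struct example_ioc_args {
	void	*eia_buf;	/* user pointer, or kernel with FKIOCTL */
	size_t	 eia_len;
};

static int
example_ioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	struct example_ioc_args *args = data;
	uint8_t tmp[64];

	switch (cmd) {
	case EXAMPLEIOC_LOAD:
		if (args->eia_len > sizeof(tmp))
			return EINVAL;
		/* ioctl_copyin() picks kcopy() or copyin() based on flag. */
		return ioctl_copyin(flag, args->eia_buf, tmp, args->eia_len);
	default:
		return ENOTTY;
	}
}
#endif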
/*
* User-space CAS / fetch / store
*/
#ifdef __NO_STRICT_ALIGNMENT
#define CHECK_ALIGNMENT(x) __nothing
#else /* ! __NO_STRICT_ALIGNMENT */
static bool
ufetchstore_aligned(uintptr_t uaddr, size_t size)
{
return (uaddr & (size - 1)) == 0;
}
#define CHECK_ALIGNMENT() \
do { \
if (!ufetchstore_aligned((uintptr_t)uaddr, sizeof(*uaddr))) \
return EFAULT; \
} while (/*CONSTCOND*/0)
#endif /* __NO_STRICT_ALIGNMENT */
/*
* __HAVE_UCAS_FULL platforms provide _ucas_32() and _ucas_64() themselves.
* _RUMPKERNEL also provides its own _ucas_32() and _ucas_64().
*
* In all other cases, we provide generic implementations that work on
* all platforms.
*/
#if !defined(__HAVE_UCAS_FULL) && !defined(_RUMPKERNEL)
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/once.h>
#include <sys/mutex.h>
#include <sys/ipi.h>
static int ucas_critical_splcookie;
static volatile u_int ucas_critical_pausing_cpus;
static u_int ucas_critical_ipi;
static ONCE_DECL(ucas_critical_init_once)
static void
ucas_critical_cpu_gate(void *arg __unused)
{
int count = SPINLOCK_BACKOFF_MIN;
KASSERT(atomic_load_relaxed(&ucas_critical_pausing_cpus) > 0);
/*
* Notify ucas_critical_wait that we have stopped. Using
* store-release ensures all our memory operations up to the
* IPI happen before the ucas -- no buffered stores on our end
* can clobber it later on, for instance.
*
* Matches atomic_load_acquire in ucas_critical_wait -- turns
* the following atomic_dec_uint into a store-release.
*/
membar_release();
atomic_dec_uint(&ucas_critical_pausing_cpus);
/*
* Wait for ucas_critical_exit to reopen the gate and let us
* proceed. Using a load-acquire ensures the ucas happens
* before any of our memory operations when we return from the
* IPI and proceed -- we won't observe any stale cached value
* that the ucas overwrote, for instance.
*
* Matches atomic_store_release in ucas_critical_exit.
*/
while (atomic_load_acquire(&ucas_critical_pausing_cpus) != (u_int)-1) {
SPINLOCK_BACKOFF(count);
}
}
static int
ucas_critical_init(void)
{
ucas_critical_ipi = ipi_register(ucas_critical_cpu_gate, NULL);
return 0;
}
static void
ucas_critical_wait(void)
{
int count = SPINLOCK_BACKOFF_MIN;
/*
* Wait for all CPUs to stop at the gate. Using a load-acquire
* ensures all memory operations before they stop at the gate
* happen before the ucas -- no buffered stores in other CPUs
* can clobber it later on, for instance.
*
* Matches membar_release/atomic_dec_uint (store-release) in
* ucas_critical_cpu_gate.
*/
while (atomic_load_acquire(&ucas_critical_pausing_cpus) > 0) {
SPINLOCK_BACKOFF(count);
}
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
static inline void
ucas_critical_enter(lwp_t * const l)
{
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
if (ncpu > 1) {
RUN_ONCE(&ucas_critical_init_once, ucas_critical_init);
/*
* Acquire the mutex first, then go to splhigh() and
* broadcast the IPI to lock all of the other CPUs
* behind the gate.
*
* N.B. Going to splhigh() implicitly disables preemption,
* so there's no need to do it explicitly.
*/
mutex_enter(&cpu_lock);
ucas_critical_splcookie = splhigh();
ucas_critical_pausing_cpus = ncpu - 1;
ipi_trigger_broadcast(ucas_critical_ipi, true);
ucas_critical_wait();
return;
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
KPREEMPT_DISABLE(l);
}
static inline void
ucas_critical_exit(lwp_t * const l)
{
#if !defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)
if (ncpu > 1) {
/*
* Open the gate and notify all CPUs in
* ucas_critical_cpu_gate that they can now proceed.
* Using a store-release ensures the ucas happens
* before any memory operations they issue after the
* IPI -- they won't observe any stale cache of the
* target word, for instance.
*
* Matches atomic_load_acquire in ucas_critical_cpu_gate.
*/
atomic_store_release(&ucas_critical_pausing_cpus, (u_int)-1);
splx(ucas_critical_splcookie);
mutex_exit(&cpu_lock);
return;
}
#endif /* ! __HAVE_UCAS_MP && MULTIPROCESSOR */
KPREEMPT_ENABLE(l);
}
int
_ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret)
{
lwp_t * const l = curlwp;
uint32_t *uva = ((void *)(uintptr_t)uaddr);
int error;
/*
* Wire the user address down to avoid taking a page fault during
* the critical section.
*/
error = uvm_vslock(l->l_proc->p_vmspace, uva, sizeof(*uaddr),
VM_PROT_READ | VM_PROT_WRITE);
if (error)
return error;
ucas_critical_enter(l);
error = _ufetch_32(uva, ret);
if (error == 0 && *ret == old) {
error = _ustore_32(uva, new);
}
ucas_critical_exit(l);
uvm_vsunlock(l->l_proc->p_vmspace, uva, sizeof(*uaddr));
return error;
}
#ifdef _LP64
int
_ucas_64(volatile uint64_t *uaddr, uint64_t old, uint64_t new, uint64_t *ret)
{
lwp_t * const l = curlwp;
uint64_t *uva = ((void *)(uintptr_t)uaddr);
int error;
/*
* Wire the user address down to avoid taking a page fault during
* the critical section.
*/
error = uvm_vslock(l->l_proc->p_vmspace, uva, sizeof(*uaddr),
VM_PROT_READ | VM_PROT_WRITE);
if (error)
return error;
ucas_critical_enter(l);
error = _ufetch_64(uva, ret);
if (error == 0 && *ret == old) {
error = _ustore_64(uva, new);
}
ucas_critical_exit(l);
uvm_vsunlock(l->l_proc->p_vmspace, uva, sizeof(*uaddr));
return error;
}
#endif /* _LP64 */
#endif /* ! __HAVE_UCAS_FULL && ! _RUMPKERNEL */
int
ucas_32(volatile uint32_t *uaddr, uint32_t old, uint32_t new, uint32_t *ret)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
#if (defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)) && \
!defined(_RUMPKERNEL)
if (ncpu > 1) {
return _ucas_32_mp(uaddr, old, new, ret);
}
#endif /* __HAVE_UCAS_MP && MULTIPROCESSOR */
return _ucas_32(uaddr, old, new, ret);
}
#ifdef _LP64
int
ucas_64(volatile uint64_t *uaddr, uint64_t old, uint64_t new, uint64_t *ret)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
#if (defined(__HAVE_UCAS_MP) && defined(MULTIPROCESSOR)) && \
!defined(_RUMPKERNEL)
if (ncpu > 1) {
return _ucas_64_mp(uaddr, old, new, ret);
}
#endif /* __HAVE_UCAS_MP && MULTIPROCESSOR */
return _ucas_64(uaddr, old, new, ret);
}
#endif /* _LP64 */
__strong_alias(ucas_int,ucas_32);
#ifdef _LP64
__strong_alias(ucas_ptr,ucas_64);
#else
__strong_alias(ucas_ptr,ucas_32);
#endif /* _LP64 */
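/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): ucas_int() performs an atomic compare-and-swap on a
 * word in the current LWP's user address space.  The unlocked==0 lock
 * word protocol below is hypothetical.
 */
#if 0
static int
example_user_trylock(volatile u_int *ulockword, u_int owner)
{
	u_int actual;
	int error;

	error = ucas_int(ulockword, 0 /* expect unlocked */, owner, &actual);
	if (error)
		return error;		/* e.g. EFAULT */
	return (actual == 0) ? 0 : EBUSY;
}
#endif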
int
ufetch_8(const uint8_t *uaddr, uint8_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_8(uaddr, valp);
}
int
ufetch_16(const uint16_t *uaddr, uint16_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_16(uaddr, valp);
}
int
ufetch_32(const uint32_t *uaddr, uint32_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_32(uaddr, valp);
}
#ifdef _LP64
int
ufetch_64(const uint64_t *uaddr, uint64_t *valp)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ufetch_64(uaddr, valp);
}
#endif /* _LP64 */
__strong_alias(ufetch_char,ufetch_8);
__strong_alias(ufetch_short,ufetch_16);
__strong_alias(ufetch_int,ufetch_32);
#ifdef _LP64
__strong_alias(ufetch_long,ufetch_64);
__strong_alias(ufetch_ptr,ufetch_64);
#else
__strong_alias(ufetch_long,ufetch_32);
__strong_alias(ufetch_ptr,ufetch_32);
#endif /* _LP64 */
int
ustore_8(uint8_t *uaddr, uint8_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_8(uaddr, val);
}
int
ustore_16(uint16_t *uaddr, uint16_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_16(uaddr, val);
}
int
ustore_32(uint32_t *uaddr, uint32_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_32(uaddr, val);
}
#ifdef _LP64
int
ustore_64(uint64_t *uaddr, uint64_t val)
{
ASSERT_SLEEPABLE();
CHECK_ALIGNMENT();
return _ustore_64(uaddr, val);
}
#endif /* _LP64 */
__strong_alias(ustore_char,ustore_8);
__strong_alias(ustore_short,ustore_16);
__strong_alias(ustore_int,ustore_32);
#ifdef _LP64
__strong_alias(ustore_long,ustore_64);
__strong_alias(ustore_ptr,ustore_64);
#else
__strong_alias(ustore_long,ustore_32);
__strong_alias(ustore_ptr,ustore_32);
#endif /* _LP64 */
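/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): the ufetch/ustore routines access one aligned word
 * in the current LWP's user space and return an error instead of
 * faulting.  The "counter kept in userspace" is hypothetical, and note
 * that the read-modify-write below is not atomic with respect to userland.
 */
#if 0
static int
example_bump_user_counter(u_int *ucounter)
{
	u_int val;
	int error;

	if ((error = ufetch_int(ucounter, &val)) != 0)
		return error;
	return ustore_int(ucounter, val + 1);
}
#endif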
/* $NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $ */
/*-
* Copyright (c) 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#define _LWP_API_PRIVATE
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_lwp_specificdata.c,v 1.4 2019/05/17 03:34:26 ozaki-r Exp $");
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/specificdata.h>
static specificdata_domain_t lwp_specificdata_domain;
void
lwpinit_specificdata(void)
{
lwp_specificdata_domain = specificdata_domain_create();
KASSERT(lwp_specificdata_domain != NULL);
}
/*
* lwp_specific_key_create --
* Create a key for subsystem lwp-specific data.
*/
int
lwp_specific_key_create(specificdata_key_t *keyp, specificdata_dtor_t dtor)
{
return (specificdata_key_create(lwp_specificdata_domain, keyp, dtor));
}
/*
* lwp_specific_key_delete --
* Delete a key for subsystem lwp-specific data.
*/
void
lwp_specific_key_delete(specificdata_key_t key)
{
specificdata_key_delete(lwp_specificdata_domain, key);
}
/*
* lwp_initspecific --
* Initialize an LWP's specificdata container.
*/
void
lwp_initspecific(struct lwp *l)
{
int error __diagused;
error = specificdata_init(lwp_specificdata_domain, &l->l_specdataref);
KASSERT(error == 0);
}
/*
* lwp_finispecific --
* Finalize an LWP's specificdata container.
*/
void
lwp_finispecific(struct lwp *l)
{
specificdata_fini(lwp_specificdata_domain, &l->l_specdataref);
}
/*
* lwp_getspecific --
* Return lwp-specific data corresponding to the specified key.
*
* Note: LWP specific data is NOT INTERLOCKED. An LWP should access
* only its OWN SPECIFIC DATA. If it is necessary to access another
* LWP's specific data, care must be taken to ensure that doing so
* would not cause internal data structure inconsistency (i.e. caller
* can guarantee that the target LWP is not inside an lwp_getspecific()
* or lwp_setspecific() call).
*/
void *
lwp_getspecific(specificdata_key_t key)
{
return (specificdata_getspecific_unlocked(lwp_specificdata_domain,
&curlwp->l_specdataref, key));
}
void *
_lwp_getspecific_by_lwp(struct lwp *l, specificdata_key_t key)
{
return (specificdata_getspecific_unlocked(lwp_specificdata_domain,
&l->l_specdataref, key));
}
/*
* lwp_setspecific --
* Set lwp-specific data corresponding to the specified key.
*/
void
lwp_setspecific(specificdata_key_t key, void *data)
{
specificdata_setspecific(lwp_specificdata_domain,
&curlwp->l_specdataref, key, data);
}
void
lwp_setspecific_by_lwp(struct lwp *l, specificdata_key_t key, void *data)
{
specificdata_setspecific(lwp_specificdata_domain,
&l->l_specdataref, key, data);
}
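/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): a subsystem normally creates one key at init time and
 * attaches its per-LWP state lazily.  struct example_lwp_state,
 * example_key and the helpers are hypothetical; kmem_zalloc()/kmem_free()
 * are assumed to be available via <sys/kmem.h>.
 */
#if 0
struct example_lwp_state {
	uint64_t	els_count;
};

static specificdata_key_t example_key;

static void
example_lwp_dtor(void *arg)
{
	kmem_free(arg, sizeof(struct example_lwp_state));
}

static void
example_init(void)
{
	int error __diagused;

	error = lwp_specific_key_create(&example_key, example_lwp_dtor);
	KASSERT(error == 0);
}

static struct example_lwp_state *
example_get_state(void)
{
	struct example_lwp_state *els = lwp_getspecific(example_key);

	if (els == NULL) {
		els = kmem_zalloc(sizeof(*els), KM_SLEEP);
		lwp_setspecific(example_key, els);
	}
	return els;
}
#endif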
/* $NetBSD: vm_machdep.c,v 1.46 2023/10/06 11:53:27 skrll Exp $ */
/*-
* Copyright (c) 1982, 1986 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
*/
/*-
* Copyright (c) 1995 Charles M. Hannum. All rights reserved.
* Copyright (c) 1989, 1990 William Jolitz
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department, and William Jolitz.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vm_machdep.c 7.3 (Berkeley) 5/13/91
*/
/*
* Utah $Hdr: vm_machdep.c 1.16.1.1 89/06/23$
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.46 2023/10/06 11:53:27 skrll Exp $");
#include "opt_mtrr.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#include <sys/core.h>
#include <sys/exec.h>
#include <sys/ptrace.h>
#include <uvm/uvm.h>
#include <machine/cpu.h>
#include <machine/gdt.h>
#include <machine/reg.h>
#include <machine/specialreg.h>
#ifdef MTRR
#include <machine/mtrr.h>
#endif
#include <x86/fpu.h>
#include <x86/dbregs.h>
extern struct pool x86_dbregspl;
void
cpu_proc_fork(struct proc *p1, struct proc *p2)
{
p2->p_md.md_flags = p1->p_md.md_flags;
}
/*
* cpu_lwp_fork: finish a new LWP (l2) operation.
*
* First LWP (l1) is the process being forked. If it is &lwp0, then we
* are creating a kthread, where return path and argument are specified
* with `func' and `arg'.
*
* If an alternate user-level stack is requested (with non-zero values
* in both the stack and stacksize arguments), then set up the user stack
* pointer accordingly.
*/
void
cpu_lwp_fork(struct lwp *l1, struct lwp *l2, void *stack, size_t stacksize,
void (*func)(void *), void *arg)
{
struct pcb *pcb1, *pcb2;
struct trapframe *tf;
struct switchframe *sf;
vaddr_t uv;
KASSERT(l1 == curlwp || l1 == &lwp0);
pcb1 = lwp_getpcb(l1);
pcb2 = lwp_getpcb(l2);
/* Copy the PCB from parent, except the FPU state. */
memcpy(pcb2, pcb1, offsetof(struct pcb, pcb_savefpu));
/* Fork the FPU state. */
fpu_lwp_fork(l1, l2);
/* Never inherit CPU Debug Registers */
pcb2->pcb_dbregs = NULL;
pcb2->pcb_flags &= ~PCB_DBREGS;
#if defined(XENPV)
pcb2->pcb_iopl = IOPL_KPL;
#endif
/*
* Set the kernel stack address (from the address to uarea) and
* trapframe address for child.
*
* Rig kernel stack so that it would start out in lwp_trampoline()
* and call child_return() with l2 as an argument. This causes the
* newly-created child process to go directly to user level with a
* parent return value of 0 from fork(), while the parent process
* returns normally.
*/
uv = uvm_lwp_getuarea(l2);
KASSERT(uv % PAGE_SIZE == 0);
#ifdef __x86_64__
#ifdef SVS
pcb2->pcb_rsp0 = (uv + USPACE - PAGE_SIZE +
sizeof(struct trapframe));
KASSERT((pcb2->pcb_rsp0 & 0xF) == 0);
#else
pcb2->pcb_rsp0 = (uv + USPACE - 16);
#endif
tf = (struct trapframe *)pcb2->pcb_rsp0 - 1;
#else
pcb2->pcb_esp0 = (uv + USPACE - 16);
tf = (struct trapframe *)pcb2->pcb_esp0 - 1;
pcb2->pcb_iomap = NULL;
#endif
l2->l_md.md_regs = tf;
/*
* Copy the trapframe from parent, so that return to userspace
* will be to right address, with correct registers.
*/
memcpy(tf, l1->l_md.md_regs, sizeof(struct trapframe));
/* Child LWP might get aston() before returning to userspace. */
tf->tf_trapno = T_ASTFLT;
/* If specified, set a different user stack for a child. */
if (stack != NULL) {
#ifdef __x86_64__
tf->tf_rsp = (uint64_t)stack + stacksize;
#else
tf->tf_esp = (uint32_t)stack + stacksize;
#endif
}
l2->l_md.md_flags = l1->l_md.md_flags;
KASSERT(l2->l_md.md_astpending == 0);
sf = (struct switchframe *)tf - 1;
#ifdef __x86_64__
sf->sf_r12 = (uint64_t)func;
sf->sf_r13 = (uint64_t)arg;
sf->sf_rip = (uint64_t)lwp_trampoline;
pcb2->pcb_rsp = (uint64_t)sf;
pcb2->pcb_rbp = (uint64_t)l2;
#else
/*
* XXX Is there a reason sf->sf_edi isn't initialized here?
* Could this leak potentially sensitive information to new
* userspace processes?
*/
sf->sf_esi = (int)func;
sf->sf_ebx = (int)arg;
sf->sf_eip = (int)lwp_trampoline;
pcb2->pcb_esp = (int)sf;
pcb2->pcb_ebp = (int)l2;
#endif
}
/*
* cpu_lwp_free is called from exit() to let machine-dependent
* code free machine-dependent resources. Note that this routine
* must not block. NB: this may be called with l != curlwp in
* error paths.
*/
void
cpu_lwp_free(struct lwp *l, int proc)
{
if (l != curlwp)
return;
/* Abandon the FPU state. */
fpu_lwp_abandon(l);
/* Abandon the dbregs state. */
x86_dbregs_abandon(l);
#ifdef MTRR
if (proc && l->l_proc->p_md.md_flags & MDP_USEDMTRR)
mtrr_clean(l->l_proc);
#endif
}
/*
* cpu_lwp_free2 is called when an LWP is being reaped.
* This routine may block.
*/
void
cpu_lwp_free2(struct lwp *l)
{
struct pcb *pcb;
pcb = lwp_getpcb(l);
KASSERT((pcb->pcb_flags & PCB_DBREGS) == 0);
if (pcb->pcb_dbregs) {
pool_put(&x86_dbregspl, pcb->pcb_dbregs);
pcb->pcb_dbregs = NULL;
}
}
/*
* Convert kernel VA to physical address
*/
paddr_t
kvtop(void *addr)
{
paddr_t pa;
bool ret __diagused;
ret = pmap_extract(pmap_kernel(), (vaddr_t)addr, &pa);
KASSERT(ret == true);
return pa;
}
/*
* Map a user I/O request into kernel virtual address space.
* Note: the pages are already locked by uvm_vslock(), so we
* do not need to pass an access_type to pmap_enter().
*/
int
vmapbuf(struct buf *bp, vsize_t len)
{
vaddr_t faddr, taddr, off;
paddr_t fpa;
KASSERT((bp->b_flags & B_PHYS) != 0);
bp->b_saveaddr = bp->b_data;
faddr = trunc_page((vaddr_t)bp->b_data);
off = (vaddr_t)bp->b_data - faddr;
len = round_page(off + len);
taddr = uvm_km_alloc(phys_map, len, 0, UVM_KMF_VAONLY | UVM_KMF_WAITVA);
bp->b_data = (void *)(taddr + off);
/*
* The region is locked, so we expect that pmap_extract() will return
* true.
* XXX: unwise to expect this in a multithreaded environment.
* anything can happen to a pmap between the time we lock a
* region, release the pmap lock, and then relock it for
* the pmap_extract().
*
* no need to flush TLB since we expect nothing to be mapped
* where we just allocated (TLB will be flushed when our
* mapping is removed).
*/
while (len) {
(void) pmap_extract(vm_map_pmap(&bp->b_proc->p_vmspace->vm_map),
faddr, &fpa);
pmap_kenter_pa(taddr, fpa, VM_PROT_READ|VM_PROT_WRITE, 0);
faddr += PAGE_SIZE;
taddr += PAGE_SIZE;
len -= PAGE_SIZE;
}
pmap_update(pmap_kernel());
return 0;
}
/*
* Unmap a previously-mapped user I/O request.
*/
void
vunmapbuf(struct buf *bp, vsize_t len)
{
vaddr_t addr, off;
KASSERT((bp->b_flags & B_PHYS) != 0);
addr = trunc_page((vaddr_t)bp->b_data);
off = (vaddr_t)bp->b_data - addr;
len = round_page(off + len);
pmap_kremove(addr, len);
pmap_update(pmap_kernel());
uvm_km_free(phys_map, addr, len, UVM_KMF_VAONLY);
bp->b_data = bp->b_saveaddr;
bp->b_saveaddr = 0;
}
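/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): the usual shape of a driver that needs the user pages
 * of a physio buffer mapped into kernel VA for programmed I/O.  The
 * buffer's pages are assumed to be wired already (physio/uvm_vslock), and
 * example_do_pio() is hypothetical.
 */
#if 0
static void
example_strategy(struct buf *bp)
{
	int error;

	error = vmapbuf(bp, bp->b_bcount);
	if (error) {
		bp->b_error = error;
		biodone(bp);
		return;
	}
	example_do_pio(bp->b_data, bp->b_bcount);
	vunmapbuf(bp, bp->b_bcount);
	biodone(bp);
}
#endif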
#ifdef __HAVE_CPU_UAREA_ROUTINES
/*
* Layout of the uarea:
* Page[0] = PCB
* Page[1] = RedZone
* Page[2] = Stack
* Page[...] = Stack
* Page[UPAGES-1] = Stack
* Page[UPAGES] = RedZone
* There is a redzone at the beginning of the stack, and another one at the
* end. The former is to protect against deep recursions that could corrupt
* the PCB, the latter to protect against severe stack overflows.
*/
void *
cpu_uarea_alloc(bool system)
{
vaddr_t base, va;
paddr_t pa;
base = uvm_km_alloc(kernel_map, USPACE + PAGE_SIZE, 0,
UVM_KMF_WIRED|UVM_KMF_WAITVA);
/* Page[1] = RedZone */
va = base + PAGE_SIZE;
if (!pmap_extract(pmap_kernel(), va, &pa)) {
panic("%s: impossible, Page[1] unmapped", __func__);
}
pmap_kremove(va, PAGE_SIZE);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
/* Page[UPAGES] = RedZone */
va = base + USPACE;
if (!pmap_extract(pmap_kernel(), va, &pa)) {
panic("%s: impossible, Page[UPAGES] unmapped", __func__);
}
pmap_kremove(va, PAGE_SIZE);
uvm_pagefree(PHYS_TO_VM_PAGE(pa));
pmap_update(pmap_kernel());
return (void *)base;
}
bool
cpu_uarea_free(void *addr)
{
vaddr_t base = (vaddr_t)addr;
KASSERT(!pmap_extract(pmap_kernel(), base + PAGE_SIZE, NULL));
KASSERT(!pmap_extract(pmap_kernel(), base + USPACE, NULL));
uvm_km_free(kernel_map, base, USPACE + PAGE_SIZE, UVM_KMF_WIRED);
return true;
}
#endif /* __HAVE_CPU_UAREA_ROUTINES */
/* $NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $ */
/*-
* Copyright (c) 2008, 2019, 2020, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_cache.c 8.3 (Berkeley) 8/22/94
*/
/*
* Name caching:
*
* Names found by directory scans are retained in a cache for future
* reference. It is managed LRU, so frequently used names will hang
* around. The cache is indexed by hash value obtained from the name.
*
* The name cache is the brainchild of Robert Elz and was introduced in
* 4.3BSD. See "Using gprof to Tune the 4.2BSD Kernel", Marshall Kirk
* McKusick, May 21 1984.
*
* Data structures:
*
* Most Unix namecaches very sensibly use a global hash table to index
* names. The global hash table works well, but can cause concurrency
* headaches for the kernel hacker. In the NetBSD 10.0 implementation
* we are not sensible, and use a per-directory data structure to index
* names, but the cache otherwise functions the same.
*
* The index is a red-black tree. It should not be difficult to
* experiment with other types of index; note, however, that a tree
* can trivially be made to support lockless lookup.
*
* Each cached name is stored in a struct namecache, along with a
* pointer to the associated vnode (nc_vp). Names longer than a
* maximum length of NCHNAMLEN are allocated with kmem_alloc(); they
* occur infrequently, and names shorter than this are stored directly
* in struct namecache. If it is a "negative" entry (i.e. for a name
* that is known NOT to exist), the vnode pointer will be NULL.
*
* In practice this implementation is not any slower than the hash
* table that preceded it and in some cases it significantly
* outperforms the hash table. Some reasons why this might be:
*
* - natural partitioning provided by the file system structure, which
* the prior implementation discarded (global hash table).
* - worst case tree traversal of O(log n), the hash table could have
* many collisions.
* - minimized cache misses & total L2/L3 CPU cache footprint; struct
* namecache and vnode_impl_t are laid out to keep cache footprint
* minimal in the lookup path; no hash table buckets to cache.
* - minimized number of conditionals & string comparisons.
*
* For a directory with 3 cached names for 3 distinct vnodes, the
* various vnodes and namecache structs would be connected like this
* (the root is at the bottom of the diagram):
*
* ...
* ^
* |- vi_nc_tree
* |
* +----o----+ +---------+ +---------+
* | VDIR | | VCHR | | VREG |
* | vnode o-----+ | vnode o-----+ | vnode o------+
* +---------+ | +---------+ | +---------+ |
* ^ | ^ | ^ |
* |- nc_vp |- vi_nc_list |- nc_vp |- vi_nc_list |- nc_vp |
* | | | | | |
* +----o----+ | +----o----+ | +----o----+ |
* +---onamecache|<----+ +---onamecache|<----+ +---onamecache|<-----+
* | +---------+ | +---------+ | +---------+
* | ^ | ^ | ^
* | | | | | |
* | | +----------------------+ | |
* |-nc_dvp | +-------------------------------------------------+
* | |/- vi_nc_tree | |
* | | |- nc_dvp |- nc_dvp
* | +----o----+ | |
* +-->| VDIR |<----------+ |
* | vnode |<------------------------------------+
* +---------+
*
* START HERE
*
* Replacement:
*
* As the cache becomes full, old and unused entries are purged as new
* entries are added. The synchronization overhead in maintaining a
* strict ordering would be prohibitive, so the VM system's "clock" or
* "second chance" page replacement algorithm is aped here. New
* entries go to the tail of the active list. After they age out and
* reach the head of the list, they are moved to the tail of the
* inactive list. Any use of the deactivated cache entry reactivates
* it, saving it from impending doom; if not reactivated, the entry
* eventually reaches the head of the inactive list and is purged.
*
* Concurrency:
*
* From a performance perspective, cache_lookup(nameiop == LOOKUP) is
* what really matters; insertion of new entries with cache_enter() is
* comparatively infrequent, and overshadowed by the cost of expensive
* file system metadata operations (which may involve disk I/O). We
* therefore want to keep the lookup path as simple as possible.
*
* struct namecache is mostly stable except for list and tree related
* entries, changes to which don't affect the cached name or vnode.
* For changes to name+vnode, entries are purged in preference to
* modifying them.
*
* Read access to namecache entries is made via tree, list, or LRU
* list. A lock corresponding to the direction of access should be
* held. See definition of "struct namecache" in src/sys/namei.src,
* and the definition of "struct vnode" for the particulars.
*
* Per-CPU statistics and LRU list totals are read unlocked, since an
* approximate value is OK. We maintain 32-bit sized per-CPU counters
* and 64-bit global counters since 32-bit sized counters can be
* observed locklessly while the global counters are protected by a
* mutex.
*
* The lock order is:
*
* 1) vi->vi_nc_lock (tree or parent -> child direction,
* used during forward lookup)
*
* 2) vi->vi_nc_listlock (list or child -> parent direction,
* used during reverse lookup)
*
* 3) cache_lru_lock (LRU list direction, used during reclaim)
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_cache.c,v 1.156 2023/10/02 21:50:18 ad Exp $");
#define __NAMECACHE_PRIVATE
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "opt_dtrace.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/errno.h>
#include <sys/evcnt.h>
#include <sys/hash.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/param.h>
#include <sys/pool.h>
#include <sys/sdt.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vnode_impl.h>
#include <miscfs/genfs/genfs.h>
/*
* Assert that data structure layout hasn't changed unintentionally.
*/
#ifdef _LP64
CTASSERT(sizeof(struct namecache) == 128);
#else
CTASSERT(sizeof(struct namecache) == 64);
#endif
CTASSERT(NC_NLEN_MASK >= MAXPATHLEN);
static void cache_activate(struct namecache *);
static void cache_update_stats(void *);
static int cache_compare_nodes(void *, const void *, const void *);
static void cache_deactivate(void);
static void cache_reclaim(void);
static int cache_stat_sysctl(SYSCTLFN_ARGS);
/*
* Global pool cache.
*/
static pool_cache_t cache_pool __read_mostly;
/*
* LRU replacement.
*/
enum cache_lru_id {
LRU_ACTIVE,
LRU_INACTIVE,
LRU_COUNT
};
static struct {
TAILQ_HEAD(, namecache) list[LRU_COUNT];
u_int count[LRU_COUNT];
} cache_lru __cacheline_aligned;
static kmutex_t cache_lru_lock __cacheline_aligned;
/*
* Cache effectiveness statistics. nchstats holds system-wide total.
*/
struct nchstats nchstats;
struct nchstats_percpu _NAMEI_CACHE_STATS(uint32_t);
struct nchcpu {
struct nchstats_percpu cur;
struct nchstats_percpu last;
};
static callout_t cache_stat_callout;
static kmutex_t cache_stat_lock __cacheline_aligned;
#define COUNT(f) do { \
lwp_t *l = curlwp; \
KPREEMPT_DISABLE(l); \
struct nchcpu *nchcpu = curcpu()->ci_data.cpu_nch; \
nchcpu->cur.f++; \
KPREEMPT_ENABLE(l); \
} while (/* CONSTCOND */ 0);
#define UPDATE(nchcpu, f) do { \
uint32_t cur = atomic_load_relaxed(&nchcpu->cur.f); \
nchstats.f += (uint32_t)(cur - nchcpu->last.f); \
nchcpu->last.f = cur; \
} while (/* CONSTCOND */ 0)
/*
* Tunables. cache_maxlen replaces the historical doingcache:
* set it to zero to disable caching for debugging purposes.
*/
int cache_lru_maxdeact __read_mostly = 2; /* max # to deactivate */
int cache_lru_maxscan __read_mostly = 64; /* max # to scan/reclaim */
int cache_maxlen __read_mostly = NC_NLEN_MASK; /* max name length to cache */
int cache_stat_interval __read_mostly = 300; /* in seconds */
/*
* sysctl stuff.
*/
static struct sysctllog *cache_sysctllog;
/*
* This is a dummy name that cannot usually occur anywhere in the cache nor
* file system. It's used when caching the root vnode of mounted file
* systems. The name is attached to the directory that the file system is
* mounted on.
*/
static const char cache_mp_name[] = "";
static const int cache_mp_nlen = sizeof(cache_mp_name) - 1;
/*
* Red-black tree stuff.
*/
static const rb_tree_ops_t cache_rbtree_ops = {
.rbto_compare_nodes = cache_compare_nodes,
.rbto_compare_key = cache_compare_nodes,
.rbto_node_offset = offsetof(struct namecache, nc_tree),
.rbto_context = NULL
};
/*
* dtrace probes.
*/
SDT_PROBE_DEFINE1(vfs, namecache, invalidate, done, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, parents, "struct vnode *");
SDT_PROBE_DEFINE1(vfs, namecache, purge, children, "struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, purge, name, "char *", "size_t");
SDT_PROBE_DEFINE1(vfs, namecache, purge, vfs, "struct mount *");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, miss, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, lookup, toolong, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE2(vfs, namecache, revlookup, success, "struct vnode *",
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, revlookup, fail, "struct vnode *",
"int");
SDT_PROBE_DEFINE2(vfs, namecache, prune, done, "int", "int");
SDT_PROBE_DEFINE3(vfs, namecache, enter, toolong, "struct vnode *",
"char *", "size_t");
SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *",
"char *", "size_t");
/*
* rbtree: compare two nodes.
*/
static int
cache_compare_nodes(void *context, const void *n1, const void *n2)
{
const struct namecache *nc1 = n1;
const struct namecache *nc2 = n2;
if (nc1->nc_key < nc2->nc_key) {
return -1;
}
if (nc1->nc_key > nc2->nc_key) {
return 1;
}
KASSERT(NC_NLEN(nc1) == NC_NLEN(nc2));
return memcmp(nc1->nc_name, nc2->nc_name, NC_NLEN(nc1));
}
/*
* Compute a key value for the given name. The name length is encoded in
* the key value to try and improve uniqueness, and so that length doesn't
* need to be compared separately for string comparisons.
*/
static uintptr_t
cache_key(const char *name, size_t nlen)
{
uintptr_t key;
KASSERT((nlen & ~NC_NLEN_MASK) == 0);
key = hash32_buf(name, nlen, HASH32_STR_INIT);
return (key << NC_NLEN_BITS) | (uintptr_t)nlen;
}
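/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): because the low NC_NLEN_BITS bits of the key hold the
 * length, the length can be recovered without touching the name, and two
 * names of different lengths can never compare equal on the key alone.
 * The decode below mirrors what the NC_NLEN() macro is assumed to do.
 */
#if 0
static size_t
example_key_to_nlen(uintptr_t key)
{
	return (size_t)(key & NC_NLEN_MASK);
}
#endif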
/*
* Remove an entry from the cache. vi_nc_lock must be held, and if dir2node
* is true, then we're locking in the conventional direction and the list
* lock will be acquired when removing the entry from the vnode list.
*/
static void
cache_remove(struct namecache *ncp, const bool dir2node)
{
struct vnode *vp, *dvp = ncp->nc_dvp;
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
size_t namelen = NC_NLEN(ncp);
KASSERT(rw_write_held(&dvi->vi_nc_lock));
KASSERT(cache_key(ncp->nc_name, namelen) == ncp->nc_key);
KASSERT(rb_tree_find_node(&dvi->vi_nc_tree, ncp) == ncp);
SDT_PROBE(vfs, namecache, invalidate, done, ncp, 0, 0, 0, 0);
/*
* Remove from the vnode's list. This excludes cache_revlookup(),
* and then it's safe to remove from the LRU lists.
*/
if ((vp = ncp->nc_vp) != NULL) {
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
if (__predict_true(dir2node)) {
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
rw_exit(&vi->vi_nc_listlock);
} else {
TAILQ_REMOVE(&vi->vi_nc_list, ncp, nc_list);
}
}
/* Remove from the directory's rbtree. */
rb_tree_remove_node(&dvi->vi_nc_tree, ncp);
/* Remove from the LRU lists. */
mutex_enter(&cache_lru_lock);
TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
cache_lru.count[ncp->nc_lrulist]--;
mutex_exit(&cache_lru_lock);
/* Finally, free it. */
if (namelen > NCHNAMLEN) {
size_t sz = offsetof(struct namecache, nc_name[namelen]);
kmem_free(ncp, sz);
} else {
pool_cache_put(cache_pool, ncp);
}
}
/*
* Find a single cache entry and return it. vi_nc_lock must be held.
*/
static struct namecache * __noinline
cache_lookup_entry(struct vnode *dvp, const char *name, size_t namelen,
uintptr_t key)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct rb_node *node = dvi->vi_nc_tree.rbt_root;
struct namecache *ncp;
enum cache_lru_id lrulist;
int diff;
KASSERT(namelen <= MAXPATHLEN);
KASSERT(rw_lock_held(&dvi->vi_nc_lock));
/*
* Search the RB tree for the key. This is an inlined lookup
* tailored for exactly what's needed here that turns out to be
* quite a bit faster than using rb_tree_find_node().
*
* For a matching key memcmp() needs to be called once to confirm
* that the correct name has been found. Very rarely there will be
* a key value collision and the search will continue.
*/
for (;;) {
if (__predict_false(RB_SENTINEL_P(node))) {
return NULL;
}
ncp = (struct namecache *)node;
KASSERT((void *)&ncp->nc_tree == (void *)ncp);
KASSERT(ncp->nc_dvp == dvp);
if (ncp->nc_key == key) {
KASSERT(NC_NLEN(ncp) == namelen);
diff = memcmp(ncp->nc_name, name, namelen);
if (__predict_true(diff == 0)) {
break;
}
node = node->rb_nodes[diff < 0];
} else {
node = node->rb_nodes[ncp->nc_key < key];
}
}
/*
* If the entry is on the wrong LRU list, requeue it. This is an
* unlocked check, but it will rarely be wrong and even then there
* will be no harm caused.
*/
lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
if (__predict_false(lrulist != LRU_ACTIVE)) {
cache_activate(ncp);
}
return ncp;
}
/*
* Look for the name in the cache. We don't do this
* if the segment name is long, simply so the cache can avoid
* holding long names (which would either waste space, or
* add greatly to the complexity).
*
* Lookup is called with DVP pointing to the directory to search,
* and CNP providing the name of the entry being sought: cn_nameptr
* is the name, cn_namelen is its length, and cn_flags is the flags
* word from the namei operation.
*
* DVP must be locked.
*
* There are three possible non-error return states:
* 1. Nothing was found in the cache. Nothing is known about
* the requested name.
* 2. A negative entry was found in the cache, meaning that the
* requested name definitely does not exist.
* 3. A positive entry was found in the cache, meaning that the
* requested name does exist and that we are providing the
* vnode.
* In these cases the results are:
* 1. 0 returned; VN is set to NULL.
* 2. 1 returned; VN is set to NULL.
* 3. 1 returned; VN is set to the vnode found.
*
* The additional result argument ISWHT is set to zero, unless a
* negative entry is found that was entered as a whiteout, in which
* case ISWHT is set to one.
*
* The ISWHT_RET argument pointer may be null. In this case an
* assertion is made that the whiteout flag is not set. File systems
* that do not support whiteouts can/should do this.
*
* Filesystems that do support whiteouts should add ISWHITEOUT to
* cnp->cn_flags if ISWHT comes back nonzero.
*
* When a vnode is returned, it is locked, as per the vnode lookup
* locking protocol.
*
* There is no way for this function to fail, in the sense of
* generating an error that requires aborting the namei operation.
*
* (Prior to October 2012, this function returned an integer status,
* and a vnode, and mucked with the flags word in CNP for whiteouts.
* The integer status was -1 for "nothing found", ENOENT for "a
* negative entry found", 0 for "a positive entry found", and possibly
* other errors, and the value of VN might or might not have been set
* depending on what error occurred.)
*/
bool
cache_lookup(struct vnode *dvp, const char *name, size_t namelen,
uint32_t nameiop, uint32_t cnflags,
int *iswht_ret, struct vnode **vn_ret)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
struct vnode *vp;
uintptr_t key;
int error;
bool hit;
krw_t op;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* Establish default result values */
if (iswht_ret != NULL) {
*iswht_ret = 0;
}
*vn_ret = NULL;
if (__predict_false(namelen > cache_maxlen)) {
SDT_PROBE(vfs, namecache, lookup, toolong, dvp,
name, namelen, 0, 0);
COUNT(ncs_long);
return false;
}
/* Compute the key up front - don't need the lock. */
key = cache_key(name, namelen);
/* Could the entry be purged below? */
if ((cnflags & ISLASTCN) != 0 &&
((cnflags & MAKEENTRY) == 0 || nameiop == CREATE)) {
op = RW_WRITER;
} else {
op = RW_READER;
}
/* Now look for the name. */
rw_enter(&dvi->vi_nc_lock, op);
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (__predict_false(ncp == NULL)) {
rw_exit(&dvi->vi_nc_lock);
COUNT(ncs_miss);
SDT_PROBE(vfs, namecache, lookup, miss, dvp,
name, namelen, 0, 0);
return false;
}
if (__predict_false((cnflags & MAKEENTRY) == 0)) {
/*
* Last component and we are renaming or deleting,
* the cache entry is invalid, or otherwise don't
* want cache entry to exist.
*/
KASSERT((cnflags & ISLASTCN) != 0);
cache_remove(ncp, true);
rw_exit(&dvi->vi_nc_lock);
COUNT(ncs_badhits);
return false;
}
if ((vp = ncp->nc_vp) == NULL) {
if (iswht_ret != NULL) {
/*
* Restore the ISWHITEOUT flag saved earlier.
*/
*iswht_ret = ncp->nc_whiteout;
} else {
KASSERT(!ncp->nc_whiteout);
}
if (nameiop == CREATE && (cnflags & ISLASTCN) != 0) {
/*
* Last component and we are preparing to create
* the named object, so flush the negative cache
* entry.
*/
COUNT(ncs_badhits);
cache_remove(ncp, true);
hit = false;
} else {
COUNT(ncs_neghits);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name,
namelen, 0, 0);
/* found neg entry; vn is already null from above */
hit = true;
}
rw_exit(&dvi->vi_nc_lock);
return hit;
}
error = vcache_tryvget(vp);
rw_exit(&dvi->vi_nc_lock);
if (error) {
KASSERT(error == EBUSY);
/*
* This vnode is being cleaned out.
* XXX badhits?
*/
COUNT(ncs_falsehits);
return false;
}
COUNT(ncs_goodhits);
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
/* found it */
*vn_ret = vp;
return true;
}
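/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): how a file system lookup routine is expected to
 * consume the three result states described above.
 * example_scan_directory() is hypothetical.
 */
#if 0
static int
example_lookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	int iswht = 0;

	if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
	    cnp->cn_nameiop, cnp->cn_flags, &iswht, vpp)) {
		if (iswht)
			cnp->cn_flags |= ISWHITEOUT;
		/* Negative entry: the name is known not to exist. */
		return (*vpp == NULL) ? ENOENT : 0;
	}
	/* Cache miss: fall back to scanning the directory. */
	return example_scan_directory(dvp, vpp, cnp);
}
#endif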
/*
* Version of the above without the nameiop argument, for NFS.
*/
bool
cache_lookup_raw(struct vnode *dvp, const char *name, size_t namelen,
uint32_t cnflags,
int *iswht_ret, struct vnode **vn_ret)
{
return cache_lookup(dvp, name, namelen, LOOKUP, cnflags | MAKEENTRY,
iswht_ret, vn_ret);
}
/*
* Used by namei() to walk down a path, component by component by looking up
* names in the cache. The node locks are chained along the way: a parent's
* lock is not dropped until the child's is acquired.
*/
bool
cache_lookup_linked(struct vnode *dvp, const char *name, size_t namelen,
struct vnode **vn_ret, krwlock_t **plock,
kauth_cred_t cred)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
krwlock_t *oldlock, *newlock;
struct vnode *vp;
uintptr_t key;
int error;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* If disabled, or file system doesn't support this, bail out. */
if (__predict_false((dvp->v_mount->mnt_iflag & IMNT_NCLOOKUP) == 0)) {
return false;
}
if (__predict_false(namelen > cache_maxlen)) {
COUNT(ncs_long);
return false;
}
/* Compute the key up front - don't need the lock. */
key = cache_key(name, namelen);
/*
* Acquire the directory lock. Once we have that, we can drop the
* previous one (if any).
*
* The two lock holds mean that the directory can't go away while
* here: the directory must be purged with cache_purge() before
* being freed, and both parent & child's vi_nc_lock must be taken
* before that point is passed.
*
* However if there's no previous lock, like at the root of the
* chain, then "dvp" must be referenced to prevent dvp going away
* before we get its lock.
*
* Note that the two locks can be the same if looking up a dot, for
* example: /usr/bin/. If looking up the parent (..) we can't wait
* on the lock as child -> parent is the wrong direction.
*/
if (*plock != &dvi->vi_nc_lock) {
oldlock = *plock;
newlock = &dvi->vi_nc_lock;
if (!rw_tryenter(&dvi->vi_nc_lock, RW_READER)) {
return false;
}
} else {
oldlock = NULL;
newlock = NULL;
if (*plock == NULL) {
KASSERT(vrefcnt(dvp) > 0);
}
}
/*
* First up check if the user is allowed to look up files in this
* directory.
*/
if (cred != FSCRED) {
if (dvi->vi_nc_mode == VNOVAL) {
if (newlock != NULL) {
rw_exit(newlock);
}
return false;
}
KASSERT(dvi->vi_nc_uid != VNOVAL);
KASSERT(dvi->vi_nc_gid != VNOVAL);
error = kauth_authorize_vnode(cred,
KAUTH_ACCESS_ACTION(VEXEC,
dvp->v_type, dvi->vi_nc_mode & ALLPERMS), dvp, NULL,
genfs_can_access(dvp, cred, dvi->vi_nc_uid, dvi->vi_nc_gid,
dvi->vi_nc_mode & ALLPERMS, NULL, VEXEC));
if (error != 0) {
if (newlock != NULL) {
rw_exit(newlock);
}
COUNT(ncs_denied);
return false;
}
}
/*
* Now look for a matching cache entry.
*/
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (__predict_false(ncp == NULL)) {
if (newlock != NULL) {
rw_exit(newlock);
}
COUNT(ncs_miss);
SDT_PROBE(vfs, namecache, lookup, miss, dvp,
name, namelen, 0, 0);
return false;
}
if ((vp = ncp->nc_vp) == NULL) {
/* found negative entry; vn is already null from above */
KASSERT(namelen != cache_mp_nlen);
KASSERT(name != cache_mp_name);
COUNT(ncs_neghits);
} else {
COUNT(ncs_goodhits); /* XXX can be "badhits" */
}
SDT_PROBE(vfs, namecache, lookup, hit, dvp, name, namelen, 0, 0);
/*
* Return with the directory lock still held. It will either be
* returned to us with another call to cache_lookup_linked() when
* looking up the next component, or the caller will release it
* manually when finished.
*/
if (oldlock) {
rw_exit(oldlock);
}
if (newlock) {
*plock = newlock;
}
*vn_ret = vp;
return true;
}
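/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): how a caller chains the per-directory locks while
 * walking several components.  The fixed "usr"/"bin" path and
 * example_walk() are hypothetical; a real caller would also take a
 * reference on the final vnode before dropping the chained lock.
 */
#if 0
static bool
example_walk(struct vnode *startdvp, kauth_cred_t cred)
{
	static const char *comps[] = { "usr", "bin" };
	krwlock_t *plock = NULL;
	struct vnode *dvp = startdvp, *vp = NULL;
	bool ok = true;
	size_t i;

	/* startdvp must be referenced by the caller (see above). */
	for (i = 0; i < __arraycount(comps); i++) {
		if (!cache_lookup_linked(dvp, comps[i], strlen(comps[i]),
		    &vp, &plock, cred) || vp == NULL) {
			ok = false;	/* miss or negative: take slow path */
			break;
		}
		dvp = vp;	/* valid while the chained lock is held */
	}
	if (plock != NULL)
		rw_exit(plock);
	return ok;
}
#endif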
/*
* Scan cache looking for name of directory entry pointing at vp.
* Will not search for "." or "..".
*
* If the lookup succeeds the vnode is referenced and stored in dvpp.
*
* If bufp is non-NULL, also place the name in the buffer which starts
* at bufp, immediately before *bpp, and move bpp backwards to point
* at the start of it. (Yes, this is a little baroque, but it's done
* this way to cater to the whims of getcwd).
*
* Returns 0 on success, -1 on cache miss, positive errno on failure.
*/
int
cache_revlookup(struct vnode *vp, struct vnode **dvpp, char **bpp, char *bufp,
bool checkaccess, accmode_t accmode)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
struct namecache *ncp;
enum cache_lru_id lrulist;
struct vnode *dvp;
int error, nlen;
char *bp;
KASSERT(vp != NULL);
if (cache_maxlen == 0)
goto out;
rw_enter(&vi->vi_nc_listlock, RW_READER);
if (checkaccess) {
/*
* Check if the user is allowed to see. NOTE: this is
* checking for access on the "wrong" directory. getcwd()
* wants to see that there is access on every component
* along the way, not that there is access to any individual
* component. Don't use this to check you can look in vp.
*
* I don't like it, I didn't come up with it, don't blame me!
*/
if (vi->vi_nc_mode == VNOVAL) {
rw_exit(&vi->vi_nc_listlock);
return -1;
}
KASSERT(vi->vi_nc_uid != VNOVAL);
KASSERT(vi->vi_nc_gid != VNOVAL);
error = kauth_authorize_vnode(kauth_cred_get(),
KAUTH_ACCESS_ACTION(VEXEC, vp->v_type, vi->vi_nc_mode &
ALLPERMS), vp, NULL, genfs_can_access(vp, curlwp->l_cred,
vi->vi_nc_uid, vi->vi_nc_gid, vi->vi_nc_mode & ALLPERMS,
NULL, accmode));
if (error != 0) {
rw_exit(&vi->vi_nc_listlock);
COUNT(ncs_denied);
return EACCES;
}
}
TAILQ_FOREACH(ncp, &vi->vi_nc_list, nc_list) {
KASSERT(ncp->nc_vp == vp);
KASSERT(ncp->nc_dvp != NULL);
nlen = NC_NLEN(ncp);
/*
* Ignore mountpoint entries.
*/
if (nlen == cache_mp_nlen) {
continue;
}
/*
* The queue is partially sorted. Once we hit dots, nothing
* else remains but dots and dotdots, so bail out.
*/
if (ncp->nc_name[0] == '.') {
if (nlen == 1 ||
(nlen == 2 && ncp->nc_name[1] == '.')) {
break;
}
}
/*
* Record a hit on the entry. This is an unlocked read but
* even if wrong it doesn't matter too much.
*/
lrulist = atomic_load_relaxed(&ncp->nc_lrulist);
if (lrulist != LRU_ACTIVE) {
cache_activate(ncp);
}
if (bufp) {
bp = *bpp;
bp -= nlen;
if (bp <= bufp) {
*dvpp = NULL;
rw_exit(&vi->vi_nc_listlock);
SDT_PROBE(vfs, namecache, revlookup,
fail, vp, ERANGE, 0, 0, 0);
return (ERANGE);
}
memcpy(bp, ncp->nc_name, nlen);
*bpp = bp;
}
dvp = ncp->nc_dvp;
error = vcache_tryvget(dvp);
rw_exit(&vi->vi_nc_listlock);
if (error) {
KASSERT(error == EBUSY);
if (bufp)
(*bpp) += nlen;
*dvpp = NULL;
SDT_PROBE(vfs, namecache, revlookup, fail, vp,
error, 0, 0, 0);
return -1;
}
*dvpp = dvp;
SDT_PROBE(vfs, namecache, revlookup, success, vp, dvp,
0, 0, 0);
COUNT(ncs_revhits);
return (0);
}
rw_exit(&vi->vi_nc_listlock);
COUNT(ncs_revmiss);
out:
*dvpp = NULL;
return (-1);
}
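/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): one step of a getcwd()-style walk using the backwards
 * buffer convention described above.  Error handling is trimmed and
 * example_one_step() is hypothetical.
 */
#if 0
static int
example_one_step(struct vnode *vp, struct vnode **dvpp,
    char *buf, char **bpp)
{
	int error;

	/* On entry *bpp points just past the last name already placed. */
	error = cache_revlookup(vp, dvpp, bpp, buf, false, 0);
	if (error == 0) {
		/* The name of vp now starts at *bpp; prepend a slash. */
		*--(*bpp) = '/';
	}
	return error;	/* -1 means: fall back to scanning the directory */
}
#endif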
/*
* Add an entry to the cache.
*/
void
cache_enter(struct vnode *dvp, struct vnode *vp,
const char *name, size_t namelen, uint32_t cnflags)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp, *oncp;
int total;
KASSERT(namelen != cache_mp_nlen || name == cache_mp_name);
/* First, check whether we can/should add a cache entry. */
if ((cnflags & MAKEENTRY) == 0 ||
__predict_false(namelen > cache_maxlen)) {
SDT_PROBE(vfs, namecache, enter, toolong, vp, name, namelen,
0, 0);
return;
}
SDT_PROBE(vfs, namecache, enter, done, vp, name, namelen, 0, 0);
/*
* Reclaim some entries if over budget. This is an unlocked check,
* but it doesn't matter. Just need to catch up with things
* eventually: it doesn't matter if we go over temporarily.
*/
total = atomic_load_relaxed(&cache_lru.count[LRU_ACTIVE]);
total += atomic_load_relaxed(&cache_lru.count[LRU_INACTIVE]);
if (__predict_false(total > desiredvnodes)) {
cache_reclaim();
}
/* Now allocate a fresh entry. */
if (__predict_true(namelen <= NCHNAMLEN)) {
ncp = pool_cache_get(cache_pool, PR_WAITOK);
} else {
size_t sz = offsetof(struct namecache, nc_name[namelen]);
ncp = kmem_alloc(sz, KM_SLEEP);
}
/*
* Fill in cache info. For negative hits, save the ISWHITEOUT flag
* so we can restore it later when the cache entry is used again.
*/
ncp->nc_vp = vp;
ncp->nc_dvp = dvp;
ncp->nc_key = cache_key(name, namelen);
ncp->nc_whiteout = ((cnflags & ISWHITEOUT) != 0);
memcpy(ncp->nc_name, name, namelen);
/*
* Insert to the directory. Concurrent lookups may race for a cache
* entry. If there's an entry there already, purge it.
*/
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
if (oncp != ncp) {
KASSERT(oncp->nc_key == ncp->nc_key);
KASSERT(NC_NLEN(oncp) == NC_NLEN(ncp));
KASSERT(memcmp(oncp->nc_name, name, namelen) == 0);
cache_remove(oncp, true);
oncp = rb_tree_insert_node(&dvi->vi_nc_tree, ncp);
KASSERT(oncp == ncp);
}
/*
* With the directory lock still held, insert to the tail of the
* ACTIVE LRU list (new) and take the opportunity to incrementally
* balance the lists.
*/
mutex_enter(&cache_lru_lock);
ncp->nc_lrulist = LRU_ACTIVE;
cache_lru.count[LRU_ACTIVE]++;
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
cache_deactivate();
mutex_exit(&cache_lru_lock);
/*
* Finally, insert to the vnode and unlock. With everything set up
* it's safe to let cache_revlookup() see the entry. Partially sort
* the per-vnode list: dots go to back so cache_revlookup() doesn't
* have to consider them.
*/
if (vp != NULL) {
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
if ((namelen == 1 && name[0] == '.') ||
(namelen == 2 && name[0] == '.' && name[1] == '.')) {
TAILQ_INSERT_TAIL(&vi->vi_nc_list, ncp, nc_list);
} else {
TAILQ_INSERT_HEAD(&vi->vi_nc_list, ncp, nc_list);
}
rw_exit(&vi->vi_nc_listlock);
}
rw_exit(&dvi->vi_nc_lock);
}
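/*
 * Illustrative sketch added by the editor (guarded out, not part of the
 * original source): a file system typically calls cache_enter() right
 * after resolving a component the hard way, entering negative results
 * with a NULL vnode.  example_scan_directory() is hypothetical.
 */
#if 0
static int
example_lookup_slow(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	int error;

	error = example_scan_directory(dvp, vpp, cnp);
	if (error == 0) {
		cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
		    cnp->cn_flags);
	} else if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) {
		/* Negative entry: remember that the name does not exist. */
		cache_enter(dvp, NULL, cnp->cn_nameptr, cnp->cn_namelen,
		    cnp->cn_flags);
	}
	return error;
}
#endif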
/*
* Set identity info in cache for a vnode. We only care about directories
* so ignore other updates. The cached info may be marked invalid if the
* inode has an ACL.
*/
void
cache_enter_id(struct vnode *vp, mode_t mode, uid_t uid, gid_t gid, bool valid)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
if (vp->v_type == VDIR) {
/* Grab both locks, for forward & reverse lookup. */
rw_enter(&vi->vi_nc_lock, RW_WRITER);
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
if (valid) {
vi->vi_nc_mode = mode;
vi->vi_nc_uid = uid;
vi->vi_nc_gid = gid;
} else {
vi->vi_nc_mode = VNOVAL;
vi->vi_nc_uid = VNOVAL;
vi->vi_nc_gid = VNOVAL;
}
rw_exit(&vi->vi_nc_listlock);
rw_exit(&vi->vi_nc_lock);
}
}
/*
* Return true if we have identity for the given vnode, and use as an
* opportunity to confirm that everything squares up.
*
* Because of shared code, some file systems could provide partial
* information, missing some updates, so check the mount flag too.
*/
bool
cache_have_id(struct vnode *vp)
{
if (vp->v_type == VDIR &&
(vp->v_mount->mnt_iflag & IMNT_NCLOOKUP) != 0 &&
atomic_load_relaxed(&VNODE_TO_VIMPL(vp)->vi_nc_mode) != VNOVAL) {
return true;
} else {
return false;
}
}
/*
* Enter a mount point. cvp is the covered vnode, and rvp is the root of
* the mounted file system.
*/
void
cache_enter_mount(struct vnode *cvp, struct vnode *rvp)
{
KASSERT(vrefcnt(cvp) > 0);
KASSERT(vrefcnt(rvp) > 0);
KASSERT(cvp->v_type == VDIR);
KASSERT((rvp->v_vflag & VV_ROOT) != 0);
if (rvp->v_type == VDIR) {
cache_enter(cvp, rvp, cache_mp_name, cache_mp_nlen, MAKEENTRY);
}
}
/*
* Look up a cached mount point. Used in the strongly locked path.
*/
bool
cache_lookup_mount(struct vnode *dvp, struct vnode **vn_ret)
{
bool ret;
ret = cache_lookup(dvp, cache_mp_name, cache_mp_nlen, LOOKUP,
MAKEENTRY, NULL, vn_ret);
KASSERT((*vn_ret != NULL) == ret);
return ret;
}
/*
* Try to cross a mount point. For use with cache_lookup_linked().
*/
bool
cache_cross_mount(struct vnode **dvp, krwlock_t **plock)
{
return cache_lookup_linked(*dvp, cache_mp_name, cache_mp_nlen,
dvp, plock, FSCRED);
}
/*
* Name cache initialization, from vfs_init() when the system is booting.
*/
void
nchinit(void)
{
cache_pool = pool_cache_init(sizeof(struct namecache),
coherency_unit, 0, 0, "namecache", NULL, IPL_NONE, NULL,
NULL, NULL);
KASSERT(cache_pool != NULL);
mutex_init(&cache_lru_lock, MUTEX_DEFAULT, IPL_NONE);
TAILQ_INIT(&cache_lru.list[LRU_ACTIVE]);
TAILQ_INIT(&cache_lru.list[LRU_INACTIVE]);
mutex_init(&cache_stat_lock, MUTEX_DEFAULT, IPL_NONE);
callout_init(&cache_stat_callout, CALLOUT_MPSAFE);
callout_setfunc(&cache_stat_callout, cache_update_stats, NULL);
callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
KASSERT(cache_sysctllog == NULL);
sysctl_createv(&cache_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "namecache_stats",
SYSCTL_DESCR("namecache statistics"),
cache_stat_sysctl, 0, NULL, 0,
CTL_VFS, CTL_CREATE, CTL_EOL);
}
/*
* Called once for each CPU in the system as attached.
*/
void
cache_cpu_init(struct cpu_info *ci)
{
size_t sz;
sz = roundup2(sizeof(struct nchcpu), coherency_unit);
ci->ci_data.cpu_nch = kmem_zalloc(sz, KM_SLEEP);
KASSERT(((uintptr_t)ci->ci_data.cpu_nch & (coherency_unit - 1)) == 0);
}
/*
* A vnode is being allocated: set up cache structures.
*/
void
cache_vnode_init(struct vnode *vp)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
rw_init(&vi->vi_nc_lock);
rw_init(&vi->vi_nc_listlock);
rb_tree_init(&vi->vi_nc_tree, &cache_rbtree_ops);
TAILQ_INIT(&vi->vi_nc_list);
vi->vi_nc_mode = VNOVAL;
vi->vi_nc_uid = VNOVAL;
vi->vi_nc_gid = VNOVAL;
}
/*
* A vnode is being freed: finish cache structures.
*/
void
cache_vnode_fini(struct vnode *vp)
{
vnode_impl_t *vi = VNODE_TO_VIMPL(vp);
KASSERT(RB_TREE_MIN(&vi->vi_nc_tree) == NULL);
KASSERT(TAILQ_EMPTY(&vi->vi_nc_list));
rw_destroy(&vi->vi_nc_lock);
rw_destroy(&vi->vi_nc_listlock);
}
/*
* Helper for cache_purge1(): purge cache entries for the given vnode from
* all directories that the vnode is cached in.
*/
static void
cache_purge_parents(struct vnode *vp)
{
vnode_impl_t *dvi, *vi = VNODE_TO_VIMPL(vp);
struct vnode *dvp, *blocked;
struct namecache *ncp;
SDT_PROBE(vfs, namecache, purge, parents, vp, 0, 0, 0, 0);
blocked = NULL;
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
while ((ncp = TAILQ_FIRST(&vi->vi_nc_list)) != NULL) {
/*
* Locking in the wrong direction. Try for a hold on the
* directory node's lock, and if we get it then all good,
* nuke the entry and move on to the next.
*/
dvp = ncp->nc_dvp;
dvi = VNODE_TO_VIMPL(dvp);
if (rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
cache_remove(ncp, false);
rw_exit(&dvi->vi_nc_lock);
blocked = NULL;
continue;
}
/*
* We can't wait on the directory node's lock with our list
* lock held or the system could deadlock.
*
* Take a hold on the directory vnode to prevent it from
* being freed (taking the vnode & lock with it). Then
* wait for the lock to become available with no other locks
* held, and retry.
*
* If this happens twice in a row, give the other side a
* breather; we can do nothing until it lets go.
*/
vhold(dvp);
rw_exit(&vi->vi_nc_listlock);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
/* Do nothing. */
rw_exit(&dvi->vi_nc_lock);
holdrele(dvp);
if (blocked == dvp) {
kpause("ncpurge", false, 1, NULL);
}
rw_enter(&vi->vi_nc_listlock, RW_WRITER);
blocked = dvp;
}
rw_exit(&vi->vi_nc_listlock);
}
/*
* Helper for cache_purge1(): purge all cache entries hanging off the given
* directory vnode.
*/
static void
cache_purge_children(struct vnode *dvp)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
SDT_PROBE(vfs, namecache, purge, children, dvp, 0, 0, 0, 0);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
while ((ncp = RB_TREE_MIN(&dvi->vi_nc_tree)) != NULL) {
cache_remove(ncp, true);
}
rw_exit(&dvi->vi_nc_lock);
}
/*
* Helper for cache_purge1(): purge cache entry from the given vnode,
* finding it by name.
*/
static void
cache_purge_name(struct vnode *dvp, const char *name, size_t namelen)
{
vnode_impl_t *dvi = VNODE_TO_VIMPL(dvp);
struct namecache *ncp;
uintptr_t key;
SDT_PROBE(vfs, namecache, purge, name, name, namelen, 0, 0, 0);
key = cache_key(name, namelen);
rw_enter(&dvi->vi_nc_lock, RW_WRITER);
ncp = cache_lookup_entry(dvp, name, namelen, key);
if (ncp) {
cache_remove(ncp, true);
}
rw_exit(&dvi->vi_nc_lock);
}
/*
* Cache flush, a particular vnode; called when a vnode is renamed to
* hide entries that would now be invalid.
*/
void
cache_purge1(struct vnode *vp, const char *name, size_t namelen, int flags)
{
if (flags & PURGE_PARENTS) {
cache_purge_parents(vp);
}
if (flags & PURGE_CHILDREN) {
cache_purge_children(vp);
}
if (name != NULL) {
cache_purge_name(vp, name, namelen);
}
}
/*
* vnode filter for cache_purgevfs().
*/
static bool
cache_vdir_filter(void *cookie, vnode_t *vp)
{
return vp->v_type == VDIR;
}
/*
* Cache flush, a whole filesystem; called when filesys is umounted to
* remove entries that would now be invalid.
*/
void
cache_purgevfs(struct mount *mp)
{
struct vnode_iterator *iter;
vnode_t *dvp;
vfs_vnode_iterator_init(mp, &iter);
for (;;) {
dvp = vfs_vnode_iterator_next(iter, cache_vdir_filter, NULL);
if (dvp == NULL) {
break;
}
cache_purge_children(dvp);
vrele(dvp);
}
vfs_vnode_iterator_destroy(iter);
}
/*
* Re-queue an entry onto the tail of the active LRU list, after it has
* scored a hit.
*/
static void
cache_activate(struct namecache *ncp)
{
mutex_enter(&cache_lru_lock);
TAILQ_REMOVE(&cache_lru.list[ncp->nc_lrulist], ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
cache_lru.count[ncp->nc_lrulist]--;
cache_lru.count[LRU_ACTIVE]++;
ncp->nc_lrulist = LRU_ACTIVE;
mutex_exit(&cache_lru_lock);
}
/*
* Try to balance the LRU lists. Pick some victim entries, and re-queue
* them from the head of the active list to the tail of the inactive list.
*/
static void
cache_deactivate(void)
{
struct namecache *ncp;
int total, i;
KASSERT(mutex_owned(&cache_lru_lock));
/* If we're nowhere near budget yet, don't bother. */
total = cache_lru.count[LRU_ACTIVE] + cache_lru.count[LRU_INACTIVE];
if (total < (desiredvnodes >> 1)) {
return;
}
/*
* Aim for a 1:1 ratio of active to inactive. This is to allow each
* potential victim a reasonable amount of time to cycle through the
* inactive list in order to score a hit and be reactivated, while
* trying not to cause reactivations too frequently.
*/
if (cache_lru.count[LRU_ACTIVE] < cache_lru.count[LRU_INACTIVE]) {
return;
}
/* Move only a few at a time; will catch up eventually. */
for (i = 0; i < cache_lru_maxdeact; i++) {
ncp = TAILQ_FIRST(&cache_lru.list[LRU_ACTIVE]);
if (ncp == NULL) {
break;
}
KASSERT(ncp->nc_lrulist == LRU_ACTIVE);
ncp->nc_lrulist = LRU_INACTIVE;
TAILQ_REMOVE(&cache_lru.list[LRU_ACTIVE], ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE], ncp, nc_lru);
cache_lru.count[LRU_ACTIVE]--;
cache_lru.count[LRU_INACTIVE]++;
}
}
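/*
 * Worked example (illustrative only, not from the original source): with
 * desiredvnodes = 10000, cache_deactivate() does nothing until the two
 * lists together hold at least 5000 entries, and it only moves entries
 * once the active list is at least as long as the inactive list, so the
 * steady state converges on roughly a 1:1 active:inactive split.
 */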
/*
* Free some entries from the cache, when we have gone over budget.
*
* We don't want to cause too much work for any individual caller, and it
* doesn't matter if we temporarily go over budget. This is also "just a
* cache" so it's not a big deal if we screw up and throw out something we
* shouldn't. So we take a relaxed attitude to this process to reduce its
* impact.
*/
static void
cache_reclaim(void)
{
struct namecache *ncp;
vnode_impl_t *dvi;
int toscan;
/*
* Scan up to a preset maximum number of entries, but no more than
* 0.8% of the total at once (to allow for very small systems).
*
* On bigger systems, do a larger chunk of work to reduce the number
* of times that cache_lru_lock is held for any length of time.
*/
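/*
 * Illustrative arithmetic (not from the original source): with
 * desiredvnodes = 100000, desiredvnodes >> 7 is 781, so each call
 * examines at most MIN(cache_lru_maxscan, 781) entries before dropping
 * cache_lru_lock for good.
 */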
mutex_enter(&cache_lru_lock);
toscan = MIN(cache_lru_maxscan, desiredvnodes >> 7);
toscan = MAX(toscan, 1);
SDT_PROBE(vfs, namecache, prune, done, cache_lru.count[LRU_ACTIVE] +
cache_lru.count[LRU_INACTIVE], toscan, 0, 0, 0);
while (toscan-- != 0) {
/* First try to balance the lists. */
cache_deactivate();
/* Now look for a victim on head of inactive list (old). */
ncp = TAILQ_FIRST(&cache_lru.list[LRU_INACTIVE]);
if (ncp == NULL) {
break;
}
dvi = VNODE_TO_VIMPL(ncp->nc_dvp);
KASSERT(ncp->nc_lrulist == LRU_INACTIVE);
KASSERT(dvi != NULL);
/*
* Locking in the wrong direction. If we can't get the
* lock, the directory is actively busy, and it could also
* cause problems for the next guy in here, so send the
* entry to the back of the list.
*/
if (!rw_tryenter(&dvi->vi_nc_lock, RW_WRITER)) {
TAILQ_REMOVE(&cache_lru.list[LRU_INACTIVE],
ncp, nc_lru);
TAILQ_INSERT_TAIL(&cache_lru.list[LRU_INACTIVE],
ncp, nc_lru);
continue;
}
/*
* Now have the victim entry locked. Drop the LRU list
* lock, purge the entry, and start over. The hold on
* vi_nc_lock will prevent the vnode from vanishing until
* finished (cache_purge() will be called on dvp before it
* disappears, and that will wait on vi_nc_lock).
*/
mutex_exit(&cache_lru_lock);
cache_remove(ncp, true);
rw_exit(&dvi->vi_nc_lock);
mutex_enter(&cache_lru_lock);
}
mutex_exit(&cache_lru_lock);
}
/*
* For file system code: count a lookup that required a full re-scan of
* directory metadata.
*/
void
namecache_count_pass2(void)
{
COUNT(ncs_pass2);
}
/*
* For file system code: count a lookup that scored a hit in the directory
* metadata near the location of the last lookup.
*/
void
namecache_count_2passes(void)
{
COUNT(ncs_2passes);
}
/*
* Sum the stats from all CPUs into nchstats. This needs to run at least
* once within every window where a 32-bit counter could roll over. It's
* called regularly by a timer to ensure this.
*/
static void
cache_update_stats(void *cookie)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
mutex_enter(&cache_stat_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
struct nchcpu *nchcpu = ci->ci_data.cpu_nch;
UPDATE(nchcpu, ncs_goodhits);
UPDATE(nchcpu, ncs_neghits);
UPDATE(nchcpu, ncs_badhits);
UPDATE(nchcpu, ncs_falsehits);
UPDATE(nchcpu, ncs_miss);
UPDATE(nchcpu, ncs_long);
UPDATE(nchcpu, ncs_pass2);
UPDATE(nchcpu, ncs_2passes);
UPDATE(nchcpu, ncs_revhits);
UPDATE(nchcpu, ncs_revmiss);
UPDATE(nchcpu, ncs_denied);
}
if (cookie != NULL) {
memcpy(cookie, &nchstats, sizeof(nchstats));
}
/* Reset the timer; arrive back here in N minutes at latest. */
callout_schedule(&cache_stat_callout, cache_stat_interval * hz);
mutex_exit(&cache_stat_lock);
}
/*
* Fetch the current values of the stats for sysctl.
*/
static int
cache_stat_sysctl(SYSCTLFN_ARGS)
{
struct nchstats stats;
if (oldp == NULL) {
*oldlenp = sizeof(nchstats);
return 0;
}
if (*oldlenp <= 0) {
*oldlenp = 0;
return 0;
}
/* Refresh the global stats. */
sysctl_unlock();
cache_update_stats(&stats);
sysctl_relock();
*oldlenp = MIN(sizeof(stats), *oldlenp);
return sysctl_copyout(l, &stats, oldp, *oldlenp);
}
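/*
 * Illustrative userland sketch (not part of this file), assuming the node
 * created in nchinit() is reachable as "vfs.namecache_stats" and that
 * struct nchstats is visible to userland via <sys/namei.h>:
 *
 *	#include <sys/sysctl.h>
 *	#include <sys/namei.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		struct nchstats ns;
 *		size_t len = sizeof(ns);
 *
 *		if (sysctlbyname("vfs.namecache_stats", &ns, &len,
 *		    NULL, 0) == -1)
 *			return 1;
 *		printf("goodhits %llu miss %llu\n",
 *		    (unsigned long long)ns.ncs_goodhits,
 *		    (unsigned long long)ns.ncs_miss);
 *		return 0;
 *	}
 */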
/*
* For the debugger, given the address of a vnode, print all associated
* names in the cache.
*/
#ifdef DDB
void
namecache_print(struct vnode *vp, void (*pr)(const char *, ...))
{
struct vnode *dvp = NULL;
struct namecache *ncp;
enum cache_lru_id id;
for (id = 0; id < LRU_COUNT; id++) {
TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
if (ncp->nc_vp == vp) {
(*pr)("name %.*s\n", NC_NLEN(ncp),
ncp->nc_name);
dvp = ncp->nc_dvp;
}
}
}
if (dvp == NULL) {
(*pr)("name not found\n");
return;
}
for (id = 0; id < LRU_COUNT; id++) {
TAILQ_FOREACH(ncp, &cache_lru.list[id], nc_lru) {
if (ncp->nc_vp == dvp) {
(*pr)("parent %.*s\n", NC_NLEN(ncp),
ncp->nc_name);
}
}
}
}
#endif
/* $NetBSD: syscall.c,v 1.22 2023/10/05 19:41:06 ad Exp $ */
/*-
* Copyright (c) 1998, 2000, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: syscall.c,v 1.22 2023/10/05 19:41:06 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/signal.h>
#include <sys/ktrace.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscall_stats.h>
#include <uvm/uvm_extern.h>
#include <machine/cpu.h>
#include <machine/psl.h>
#include <machine/userret.h>
#include "opt_dtrace.h"
#ifndef __x86_64__
int x86_copyargs(void *, void *, size_t);
#endif
void syscall_intern(struct proc *);
static void syscall(struct trapframe *);
void
md_child_return(struct lwp *l)
{
struct trapframe *tf = l->l_md.md_regs;
X86_TF_RAX(tf) = 0;
X86_TF_RFLAGS(tf) &= ~PSL_C;
userret(l);
}
/*
* Process the tail end of a posix_spawn() for the child.
*/
void
cpu_spawn_return(struct lwp *l)
{
userret(l);
}
/*
* syscall(frame):
* System call request from POSIX system call gate interface to kernel.
* Like trap(), argument is call by reference.
*/
#ifdef KDTRACE_HOOKS
void syscall(struct trapframe *);
#else
static
#endif
void
syscall(struct trapframe *frame)
{
const struct sysent *callp;
struct proc *p;
struct lwp *l;
int error;
register_t code, rval[2];
#ifdef __x86_64__
/* Verify that the syscall args will fit in the trapframe space */
CTASSERT(offsetof(struct trapframe, tf_arg9) >=
sizeof(register_t) * (2 + SYS_MAXSYSARGS - 1));
#define args (&frame->tf_rdi)
#else
register_t args[2 + SYS_MAXSYSARGS];
#endif
l = curlwp;
p = l->l_proc;
code = X86_TF_RAX(frame) & (SYS_NSYSENT - 1);
callp = p->p_emul->e_sysent + code;
SYSCALL_COUNT(syscall_counts, code);
SYSCALL_TIME_SYS_ENTRY(l, syscall_times, code);
#ifdef __x86_64__
/*
* The first 6 syscall args are passed in rdi, rsi, rdx, r10, r8 and r9
* (rcx gets copied to r10 in the libc stub because the syscall
* instruction overwrites %cx) and are together in the trap frame
* with space following for 4 more entries.
*/
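/*
 * Worked example (illustrative only): a system call taking 8 register_t
 * arguments has sy_argsize = 8 * 8, so the copyin() below fetches the
 * remaining 16 bytes (arguments 7 and 8) from just above the return
 * address on the user stack into the tf_arg6..tf_arg9 area of the
 * trap frame.
 */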
if (__predict_false(callp->sy_argsize > 6 * 8)) {
error = copyin((register_t *)frame->tf_rsp + 1,
&frame->tf_arg6, callp->sy_argsize - 6 * 8);
if (error != 0)
goto bad;
}
#else
if (callp->sy_argsize) {
error = x86_copyargs((char *)frame->tf_esp + sizeof(int), args,
callp->sy_argsize);
if (__predict_false(error != 0))
goto bad;
}
#endif
error = sy_invoke(callp, l, args, rval, code);
if (__predict_true(error == 0)) {
X86_TF_RAX(frame) = rval[0];
X86_TF_RDX(frame) = rval[1];
X86_TF_RFLAGS(frame) &= ~PSL_C; /* carry bit */
} else {
switch (error) {
case ERESTART:
/*
* The offset to adjust the PC by depends on whether we
* entered the kernel through the trap or call gate.
* We saved the instruction size in tf_err on entry.
*/
X86_TF_RIP(frame) -= frame->tf_err;
break;
case EJUSTRETURN:
/* nothing to do */
break;
default:
bad:
X86_TF_RAX(frame) = error;
X86_TF_RFLAGS(frame) |= PSL_C; /* carry bit */
break;
}
}
SYSCALL_TIME_SYS_EXIT(l);
userret(l);
}
void
syscall_intern(struct proc *p)
{
p->p_md.md_syscall = syscall;
}
/* $NetBSD: in6_var.h,v 1.104 2020/06/16 17:12:18 maxv Exp $ */
/* $KAME: in6_var.h,v 1.81 2002/06/08 11:16:51 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1985, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_var.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET6_IN6_VAR_H_
#define _NETINET6_IN6_VAR_H_
#include <sys/callout.h>
#include <sys/ioccom.h>
/*
* Interface address, Internet version. One of these structures
* is allocated for each interface with an Internet address.
* The ifaddr structure contains the protocol-independent part
* of the structure and is assumed to be first.
*/
/*
* pltime/vltime are just for future reference (required to implement the 2
* hour rule for hosts). They should never be modified by nd6_timeout or
* anywhere else.
* userland -> kernel: accept pltime/vltime
* kernel -> userland: throw up everything
* in kernel: modify preferred/expire only
*/
struct in6_addrlifetime {
time_t ia6t_expire; /* valid lifetime expiration time */
time_t ia6t_preferred; /* preferred lifetime expiration time */
u_int32_t ia6t_vltime; /* valid lifetime */
u_int32_t ia6t_pltime; /* prefix lifetime */
};
struct lltable;
struct nd_kifinfo;
struct in6_ifextra {
struct in6_ifstat *in6_ifstat;
struct icmp6_ifstat *icmp6_ifstat;
struct nd_kifinfo *nd_ifinfo;
struct scope6_id *scope6_id;
struct lltable *lltable;
};
LIST_HEAD(in6_multihead, in6_multi);
struct in6_ifaddr {
struct ifaddr ia_ifa; /* protocol-independent info */
#define ia_ifp ia_ifa.ifa_ifp
#define ia_flags ia_ifa.ifa_flags
struct sockaddr_in6 ia_addr; /* interface address */
struct sockaddr_in6 ia_net; /* network number of interface */
struct sockaddr_in6 ia_dstaddr; /* space for destination addr */
struct sockaddr_in6 ia_prefixmask; /* prefix mask */
u_int32_t ia_plen; /* prefix length */
/* DEPRECATED. Keep it to avoid breaking kvm(3) users */
struct in6_ifaddr *ia_next; /* next in6 list of IP6 addresses */
/* DEPRECATED. Keep it to avoid breaking kvm(3) users */
struct in6_multihead _ia6_multiaddrs;
/* list of multicast addresses */
int ia6_flags;
struct in6_addrlifetime ia6_lifetime;
time_t ia6_createtime; /* the creation time of this address, which is
* currently used for temporary addresses only.
*/
time_t ia6_updatetime;
/* multicast addresses joined from the kernel */
LIST_HEAD(, in6_multi_mship) ia6_memberships;
#ifdef _KERNEL
struct pslist_entry ia6_pslist_entry;
#endif
};
#ifdef _KERNEL
static __inline void
ia6_acquire(struct in6_ifaddr *ia, struct psref *psref)
{
KASSERT(ia != NULL);
ifa_acquire(&ia->ia_ifa, psref);
}
static __inline void
ia6_release(struct in6_ifaddr *ia, struct psref *psref)
{
if (ia == NULL)
return;
ifa_release(&ia->ia_ifa, psref);
}
#endif
/* control structure to manage address selection policy */
struct in6_addrpolicy {
struct sockaddr_in6 addr; /* prefix address */
struct sockaddr_in6 addrmask; /* prefix mask */
int preced; /* precedence */
int label; /* matching label */
u_quad_t use; /* statistics */
};
/*
* IPv6 interface statistics, as defined in RFC2465 Ipv6IfStatsEntry (p12).
*/
struct in6_ifstat {
u_quad_t ifs6_in_receive; /* # of total input datagram */
u_quad_t ifs6_in_hdrerr; /* # of datagrams with invalid hdr */
u_quad_t ifs6_in_toobig; /* # of datagrams exceeded MTU */
u_quad_t ifs6_in_noroute; /* # of datagrams with no route */
u_quad_t ifs6_in_addrerr; /* # of datagrams with invalid dst */
u_quad_t ifs6_in_protounknown; /* # of datagrams with unknown proto */
/* NOTE: increment on final dst if */
u_quad_t ifs6_in_truncated; /* # of truncated datagrams */
u_quad_t ifs6_in_discard; /* # of discarded datagrams */
/* NOTE: fragment timeout is not here */
u_quad_t ifs6_in_deliver; /* # of datagrams delivered to ULP */
/* NOTE: increment on final dst if */
u_quad_t ifs6_out_forward; /* # of datagrams forwarded */
/* NOTE: increment on outgoing if */
u_quad_t ifs6_out_request; /* # of outgoing datagrams from ULP */
/* NOTE: does not include forwards */
u_quad_t ifs6_out_discard; /* # of discarded datagrams */
u_quad_t ifs6_out_fragok; /* # of datagrams fragmented */
u_quad_t ifs6_out_fragfail; /* # of datagrams failed on fragment */
u_quad_t ifs6_out_fragcreat; /* # of fragment datagrams */
/* NOTE: this is # after fragment */
u_quad_t ifs6_reass_reqd; /* # of incoming fragmented packets */
/* NOTE: increment on final dst if */
u_quad_t ifs6_reass_ok; /* # of reassembled packets */
/* NOTE: this is # after reass */
/* NOTE: increment on final dst if */
u_quad_t ifs6_reass_fail; /* # of reass failures */
/* NOTE: may not be packet count */
/* NOTE: increment on final dst if */
u_quad_t ifs6_in_mcast; /* # of inbound multicast datagrams */
u_quad_t ifs6_out_mcast; /* # of outbound multicast datagrams */
};
/*
* ICMPv6 interface statistics, as defined in RFC2466 Ipv6IfIcmpEntry.
* XXX: I'm not sure if this file is the right place for this structure...
*/
struct icmp6_ifstat {
/*
* Input statistics
*/
/* ipv6IfIcmpInMsgs, total # of input messages */
u_quad_t ifs6_in_msg;
/* ipv6IfIcmpInErrors, # of input error messages */
u_quad_t ifs6_in_error;
/* ipv6IfIcmpInDestUnreachs, # of input dest unreach errors */
u_quad_t ifs6_in_dstunreach;
/* ipv6IfIcmpInAdminProhibs, # of input administratively prohibited errs */
u_quad_t ifs6_in_adminprohib;
/* ipv6IfIcmpInTimeExcds, # of input time exceeded errors */
u_quad_t ifs6_in_timeexceed;
/* ipv6IfIcmpInParmProblems, # of input parameter problem errors */
u_quad_t ifs6_in_paramprob;
/* ipv6IfIcmpInPktTooBigs, # of input packet too big errors */
u_quad_t ifs6_in_pkttoobig;
/* ipv6IfIcmpInEchos, # of input echo requests */
u_quad_t ifs6_in_echo;
/* ipv6IfIcmpInEchoReplies, # of input echo replies */
u_quad_t ifs6_in_echoreply;
/* ipv6IfIcmpInRouterSolicits, # of input router solicitations */
u_quad_t ifs6_in_routersolicit;
/* ipv6IfIcmpInRouterAdvertisements, # of input router advertisements */
u_quad_t ifs6_in_routeradvert;
/* ipv6IfIcmpInNeighborSolicits, # of input neighbor solicitations */
u_quad_t ifs6_in_neighborsolicit;
/* ipv6IfIcmpInNeighborAdvertisements, # of input neighbor advertisements */
u_quad_t ifs6_in_neighboradvert;
/* ipv6IfIcmpInRedirects, # of input redirects */
u_quad_t ifs6_in_redirect;
/* ipv6IfIcmpInGroupMembQueries, # of input MLD queries */
u_quad_t ifs6_in_mldquery;
/* ipv6IfIcmpInGroupMembResponses, # of input MLD reports */
u_quad_t ifs6_in_mldreport;
/* ipv6IfIcmpInGroupMembReductions, # of input MLD done */
u_quad_t ifs6_in_mlddone;
/*
* Output statistics. We should solve unresolved routing problem...
*/
/* ipv6IfIcmpOutMsgs, total # of output messages */
u_quad_t ifs6_out_msg;
/* ipv6IfIcmpOutErrors, # of output error messages */
u_quad_t ifs6_out_error;
/* ipv6IfIcmpOutDestUnreachs, # of output dest unreach errors */
u_quad_t ifs6_out_dstunreach;
/* ipv6IfIcmpOutAdminProhibs, # of output administratively prohibited errs */
u_quad_t ifs6_out_adminprohib;
/* ipv6IfIcmpOutTimeExcds, # of output time exceeded errors */
u_quad_t ifs6_out_timeexceed;
/* ipv6IfIcmpOutParmProblems, # of output parameter problem errors */
u_quad_t ifs6_out_paramprob;
/* ipv6IfIcmpOutPktTooBigs, # of output packet too big errors */
u_quad_t ifs6_out_pkttoobig;
/* ipv6IfIcmpOutEchos, # of output echo requests */
u_quad_t ifs6_out_echo;
/* ipv6IfIcmpOutEchoReplies, # of output echo replies */
u_quad_t ifs6_out_echoreply;
/* ipv6IfIcmpOutRouterSolicits, # of output router solicitations */
u_quad_t ifs6_out_routersolicit;
/* ipv6IfIcmpOutRouterAdvertisements, # of output router advertisements */
u_quad_t ifs6_out_routeradvert;
/* ipv6IfIcmpOutNeighborSolicits, # of output neighbor solicitations */
u_quad_t ifs6_out_neighborsolicit;
/* ipv6IfIcmpOutNeighborAdvertisements, # of output neighbor advertisements */
u_quad_t ifs6_out_neighboradvert;
/* ipv6IfIcmpOutRedirects, # of output redirects */
u_quad_t ifs6_out_redirect;
/* ipv6IfIcmpOutGroupMembQueries, # of output MLD queries */
u_quad_t ifs6_out_mldquery;
/* ipv6IfIcmpOutGroupMembResponses, # of output MLD reports */
u_quad_t ifs6_out_mldreport;
/* ipv6IfIcmpOutGroupMembReductions, # of output MLD done */
u_quad_t ifs6_out_mlddone;
};
/*
* If you make changes that change the size of in6_ifreq,
* make sure you fix compat/netinet6/in6_var.h
*/
struct in6_ifreq {
char ifr_name[IFNAMSIZ];
union {
struct sockaddr_in6 ifru_addr;
struct sockaddr_in6 ifru_dstaddr;
short ifru_flags;
int ifru_flags6;
int ifru_metric;
void * ifru_data;
struct in6_addrlifetime ifru_lifetime;
struct in6_ifstat ifru_stat;
struct icmp6_ifstat ifru_icmp6stat;
} ifr_ifru;
};
struct in6_aliasreq {
char ifra_name[IFNAMSIZ];
struct sockaddr_in6 ifra_addr;
struct sockaddr_in6 ifra_dstaddr;
struct sockaddr_in6 ifra_prefixmask;
int ifra_flags;
struct in6_addrlifetime ifra_lifetime;
};
/*
* Given a pointer to an in6_ifaddr (ifaddr),
* return a pointer to the addr as a sockaddr_in6
*/
#define IA6_IN6(ia) (&((ia)->ia_addr.sin6_addr))
#define IA6_DSTIN6(ia) (&((ia)->ia_dstaddr.sin6_addr))
#define IA6_MASKIN6(ia) (&((ia)->ia_prefixmask.sin6_addr))
#define IA6_SIN6(ia) (&((ia)->ia_addr))
#define IA6_DSTSIN6(ia) (&((ia)->ia_dstaddr))
#define IFA_IN6(x) (&((struct sockaddr_in6 *)((x)->ifa_addr))->sin6_addr)
#define IFA_DSTIN6(x) (&((struct sockaddr_in6 *)((x)->ifa_dstaddr))->sin6_addr)
#ifdef _KERNEL
#define IN6_ARE_MASKED_ADDR_EQUAL(d, a, m) ( \
(((d)->s6_addr32[0] ^ (a)->s6_addr32[0]) & (m)->s6_addr32[0]) == 0 && \
(((d)->s6_addr32[1] ^ (a)->s6_addr32[1]) & (m)->s6_addr32[1]) == 0 && \
(((d)->s6_addr32[2] ^ (a)->s6_addr32[2]) & (m)->s6_addr32[2]) == 0 && \
(((d)->s6_addr32[3] ^ (a)->s6_addr32[3]) & (m)->s6_addr32[3]) == 0 )
#endif
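/*
 * Illustrative usage sketch (not part of the original header): checking
 * whether a destination falls within the prefix of a configured address,
 * using the struct in6_ifaddr fields defined above. in6_dst_on_prefix()
 * is a hypothetical helper named only for this example.
 *
 *	static bool
 *	in6_dst_on_prefix(const struct in6_addr *dst,
 *	    const struct in6_ifaddr *ia)
 *	{
 *		return IN6_ARE_MASKED_ADDR_EQUAL(dst,
 *		    &ia->ia_addr.sin6_addr, &ia->ia_prefixmask.sin6_addr);
 *	}
 */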
#define SIOCSIFADDR_IN6 _IOW('i', 12, struct in6_ifreq)
#define SIOCGIFADDR_IN6 _IOWR('i', 33, struct in6_ifreq)
#ifdef _KERNEL
/*
* SIOCSxxx ioctls should be unused (see comments in in6.c), but
* we do not shift numbers for binary compatibility.
*/
#define SIOCSIFDSTADDR_IN6 _IOW('i', 14, struct in6_ifreq)
#define SIOCSIFNETMASK_IN6 _IOW('i', 22, struct in6_ifreq)
#endif
#define SIOCGIFDSTADDR_IN6 _IOWR('i', 34, struct in6_ifreq)
#define SIOCGIFNETMASK_IN6 _IOWR('i', 37, struct in6_ifreq)
#define SIOCDIFADDR_IN6 _IOW('i', 25, struct in6_ifreq)
/* 26 was OSIOCAIFADDR_IN6 */
/* 70 was OSIOCSIFPHYADDR_IN6 */
#define SIOCGIFPSRCADDR_IN6 _IOWR('i', 71, struct in6_ifreq)
#define SIOCGIFPDSTADDR_IN6 _IOWR('i', 72, struct in6_ifreq)
#define SIOCGIFAFLAG_IN6 _IOWR('i', 73, struct in6_ifreq)
/*
* 74 was SIOCGDRLST_IN6
* 75 was SIOCGPRLST_IN6
* 76 was OSIOCGIFINFO_IN6
* 77 was SIOCSNDFLUSH_IN6
*/
#define SIOCGNBRINFO_IN6 _IOWR('i', 78, struct in6_nbrinfo)
/*
* 79 was SIOCSPFXFLUSH_IN6
* 80 was SIOCSRTRFLUSH_IN6
* 81 was SIOCGIFALIFETIME_IN6
*/
#if 0
/* withdrawn - do not reuse number 82 */
#define SIOCSIFALIFETIME_IN6 _IOWR('i', 82, struct in6_ifreq)
#endif
#define SIOCGIFSTAT_IN6 _IOWR('i', 83, struct in6_ifreq)
#define SIOCGIFSTAT_ICMP6 _IOWR('i', 84, struct in6_ifreq)
/*
* 85 was SIOCSDEFIFACE_IN6
* 86 was SIOCGDEFIFACE_IN6
* 87 was OSIOCSIFINFO_FLAGS
* 100 was SIOCSIFPREFIX_IN6
* 101 was SIOCGIFPREFIX_IN6
* 102 was SIOCDIFPREFIX_IN6
* 103 was SIOCAIFPREFIX_IN6
* 104 was SIOCCIFPREFIX_IN6
* 105 was SIOCSGIFPREFIX_IN6
*/
#define SIOCGIFALIFETIME_IN6 _IOWR('i', 106, struct in6_ifreq)
#define SIOCAIFADDR_IN6 _IOW('i', 107, struct in6_aliasreq)
/* 108 was OSIOCGIFINFO_IN6_90
* 109 was OSIOCSIFINFO_IN6_90 */
#define SIOCSIFPHYADDR_IN6 _IOW('i', 110, struct in6_aliasreq)
/* 110 - 112 are defined in net/if_pppoe.h */
#define SIOCGIFINFO_IN6 _IOWR('i', 113, struct in6_ndireq)
#define SIOCSIFINFO_IN6 _IOWR('i', 114, struct in6_ndireq)
#define SIOCSIFINFO_FLAGS _IOWR('i', 115, struct in6_ndireq)
/* XXX: Someone decided to switch to 'u' here for unknown reasons! */
#define SIOCGETSGCNT_IN6 _IOWR('u', 106, \
struct sioc_sg_req6) /* get s,g pkt cnt */
#define SIOCGETMIFCNT_IN6 _IOWR('u', 107, \
struct sioc_mif_req6) /* get pkt cnt per if */
#define SIOCAADDRCTL_POLICY _IOW('u', 108, struct in6_addrpolicy)
#define SIOCDADDRCTL_POLICY _IOW('u', 109, struct in6_addrpolicy)
#define IN6_IFF_ANYCAST 0x01 /* anycast address */
#define IN6_IFF_TENTATIVE 0x02 /* tentative address */
#define IN6_IFF_DUPLICATED 0x04 /* DAD detected duplicate */
#define IN6_IFF_DETACHED 0x08 /* may be detached from the link */
#define IN6_IFF_DEPRECATED 0x10 /* deprecated address */
#define IN6_IFF_NODAD 0x20 /* don't perform DAD on this address
* (used only at first SIOC* call)
*/
#define IN6_IFF_AUTOCONF 0x40 /* autoconfigurable address. */
#define IN6_IFF_TEMPORARY 0x80 /* temporary (anonymous) address. */
#define IN6_IFFBITS \
"\020\1ANYCAST\2TENTATIVE\3DUPLICATED\4DETACHED\5DEPRECATED\6NODAD" \
"\7AUTOCONF\10TEMPORARY"
/* do not input/output */
#define IN6_IFF_NOTREADY (IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED)
#ifdef _KERNEL
#define IN6_ARE_SCOPE_CMP(a,b) ((a)-(b))
#define IN6_ARE_SCOPE_EQUAL(a,b) ((a)==(b))
#endif
#ifdef _KERNEL
#include <sys/mutex.h>
#include <sys/pserialize.h>
#include <net/pktqueue.h>
extern pktqueue_t *ip6_pktq;
MALLOC_DECLARE(M_IP6OPT);
extern struct pslist_head in6_ifaddr_list;
extern kmutex_t in6_ifaddr_lock;
#define IN6_ADDRLIST_ENTRY_INIT(__ia) \
PSLIST_ENTRY_INIT((__ia), ia6_pslist_entry)
#define IN6_ADDRLIST_ENTRY_DESTROY(__ia) \
PSLIST_ENTRY_DESTROY((__ia), ia6_pslist_entry)
#define IN6_ADDRLIST_READER_EMPTY() \
(PSLIST_READER_FIRST(&in6_ifaddr_list, struct in6_ifaddr, \
ia6_pslist_entry) == NULL)
#define IN6_ADDRLIST_READER_FIRST() \
PSLIST_READER_FIRST(&in6_ifaddr_list, struct in6_ifaddr, \
ia6_pslist_entry)
#define IN6_ADDRLIST_READER_NEXT(__ia) \
PSLIST_READER_NEXT((__ia), struct in6_ifaddr, ia6_pslist_entry)
#define IN6_ADDRLIST_READER_FOREACH(__ia) \
PSLIST_READER_FOREACH((__ia), &in6_ifaddr_list, \
struct in6_ifaddr, ia6_pslist_entry)
#define IN6_ADDRLIST_WRITER_INSERT_HEAD(__ia) \
PSLIST_WRITER_INSERT_HEAD(&in6_ifaddr_list, (__ia), ia6_pslist_entry)
#define IN6_ADDRLIST_WRITER_REMOVE(__ia) \
PSLIST_WRITER_REMOVE((__ia), ia6_pslist_entry)
#define IN6_ADDRLIST_WRITER_FOREACH(__ia) \
PSLIST_WRITER_FOREACH((__ia), &in6_ifaddr_list, struct in6_ifaddr, \
ia6_pslist_entry)
#define IN6_ADDRLIST_WRITER_FIRST() \
PSLIST_WRITER_FIRST(&in6_ifaddr_list, struct in6_ifaddr, \
ia6_pslist_entry)
#define IN6_ADDRLIST_WRITER_NEXT(__ia) \
PSLIST_WRITER_NEXT((__ia), struct in6_ifaddr, ia6_pslist_entry)
#define IN6_ADDRLIST_WRITER_INSERT_AFTER(__ia, __new) \
PSLIST_WRITER_INSERT_AFTER((__ia), (__new), ia6_pslist_entry)
#define IN6_ADDRLIST_WRITER_EMPTY() \
(PSLIST_WRITER_FIRST(&in6_ifaddr_list, struct in6_ifaddr, \
ia6_pslist_entry) == NULL)
#define IN6_ADDRLIST_WRITER_INSERT_TAIL(__new) \
do { \
if (IN6_ADDRLIST_WRITER_EMPTY()) { \
IN6_ADDRLIST_WRITER_INSERT_HEAD((__new)); \
} else { \
struct in6_ifaddr *__ia; \
IN6_ADDRLIST_WRITER_FOREACH(__ia) { \
if (IN6_ADDRLIST_WRITER_NEXT(__ia) == NULL) { \
IN6_ADDRLIST_WRITER_INSERT_AFTER(__ia,\
(__new)); \
break; \
} \
} \
} \
} while (0)
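/*
 * Illustrative sketch only (not part of the original header): a typical
 * lockless reader walks the address list under pserialize, e.g.
 *
 *	struct in6_ifaddr *ia;
 *	int s;
 *
 *	s = pserialize_read_enter();
 *	IN6_ADDRLIST_READER_FOREACH(ia) {
 *		...use ia, or take a reference before leaving the section...
 *	}
 *	pserialize_read_exit(s);
 *
 * Writers are assumed to hold in6_ifaddr_lock while using the
 * IN6_ADDRLIST_WRITER_* macros.
 */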
#define in6_ifstat_inc(ifp, tag) \
do { \
if (ifp) \
((struct in6_ifextra *)((ifp)->if_afdata[AF_INET6]))->in6_ifstat->tag++; \
} while (/*CONSTCOND*/ 0)
extern const struct in6_addr zeroin6_addr;
extern const u_char inet6ctlerrmap[];
extern bool in6_present;
/*
* Macro for finding the internet address structure (in6_ifaddr) corresponding
* to a given interface (ifnet structure).
*/
static __inline struct in6_ifaddr *
in6_get_ia_from_ifp(struct ifnet *ifp)
{
struct ifaddr *ifa;
IFADDR_READER_FOREACH(ifa, ifp) {
if (ifa->ifa_addr->sa_family == AF_INET6)
break;
}
return (struct in6_ifaddr *)ifa;
}
static __inline struct in6_ifaddr *
in6_get_ia_from_ifp_psref(struct ifnet *ifp, struct psref *psref)
{
struct in6_ifaddr *ia;
int s;
s = pserialize_read_enter();
ia = in6_get_ia_from_ifp(ifp);
if (ia != NULL)
ia6_acquire(ia, psref);
pserialize_read_exit(s);
return ia;
}
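/*
 * Illustrative usage sketch (not part of the original header): the psref
 * variant keeps the address from going away after the pserialize read
 * section ends, so the caller releases it when done (callers are also
 * expected to bind the LWP as psref(9) requires).
 *
 *	struct in6_ifaddr *ia;
 *	struct psref psref;
 *
 *	ia = in6_get_ia_from_ifp_psref(ifp, &psref);
 *	if (ia != NULL) {
 *		...use ia...
 *		ia6_release(ia, &psref);
 *	}
 */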
#endif /* _KERNEL */
/*
* Multi-cast membership entry. One for each group/ifp that a PCB
* belongs to.
*/
struct in6_multi_mship {
struct in6_multi *i6mm_maddr; /* Multicast address pointer */
LIST_ENTRY(in6_multi_mship) i6mm_chain; /* multicast options chain */
};
struct in6_multi {
LIST_ENTRY(in6_multi) in6m_entry; /* list glue */
struct in6_addr in6m_addr; /* IP6 multicast address */
struct ifnet *in6m_ifp; /* back pointer to ifnet */
/* DEPRECATED. Keep it to avoid breaking kvm(3) users */
struct in6_ifaddr *_in6m_ia; /* back pointer to in6_ifaddr */
u_int in6m_refcount; /* # membership claims by sockets */
u_int in6m_state; /* state of the membership */
int in6m_timer; /* delay to send the 1st report */
struct timeval in6m_timer_expire; /* when the timer expires */
callout_t in6m_timer_ch;
};
#define IN6M_TIMER_UNDEF -1
#ifdef _KERNEL
/* flags to in6_update_ifa */
#define IN6_IFAUPDATE_DADDELAY 0x1 /* first time to configure an address */
#if 0
/*
* Macros for looking up the in6_multi_mship record for a given IP6 multicast
* address on a given interface. If no matching record is found, "imm"
* is set to NULL.
*/
static __inline struct in6_multi_mship *
in6_lookup_mship(struct in6_addr *addr, struct ifnet *ifp,
struct ip6_moptions *imop)
{
struct in6_multi_mship *imm;
LIST_FOREACH(imm, &imop->im6o_memberships, i6mm_chain) {
if (imm->i6mm_maddr->in6m_ifp != ifp)
continue;
if (IN6_ARE_ADDR_EQUAL(&imm->i6mm_maddr->in6m_addr,
addr))
break;
}
return imm;
}
#define IN6_LOOKUP_MSHIP(__addr, __ifp, __imop, __imm) \
/* struct in6_addr __addr; */ \
/* struct ifnet *__ifp; */ \
/* struct ip6_moptions *__imop */ \
/* struct in6_multi_mship *__imm; */ \
do { \
(__imm) = in6_lookup_mship(&(__addr), (__ifp), (__imop)); \
} while (/*CONSTCOND*/ 0)
#endif
void in6_init(void);
void in6_multi_lock(int);
void in6_multi_unlock(void);
bool in6_multi_locked(int);
struct in6_multi *
in6_lookup_multi(const struct in6_addr *, const struct ifnet *);
bool in6_multi_group(const struct in6_addr *, const struct ifnet *);
void in6_purge_multi(struct ifnet *);
struct in6_multi *in6_addmulti(struct in6_addr *, struct ifnet *,
int *, int);
void in6_delmulti(struct in6_multi *);
void in6_delmulti_locked(struct in6_multi *);
void in6_lookup_and_delete_multi(const struct in6_addr *,
const struct ifnet *);
struct in6_multi_mship *in6_joingroup(struct ifnet *, struct in6_addr *,
int *, int);
int in6_leavegroup(struct in6_multi_mship *);
int in6_mask2len(struct in6_addr *, u_char *);
int in6_control(struct socket *, u_long, void *, struct ifnet *);
int in6_update_ifa(struct ifnet *, struct in6_aliasreq *, int);
void in6_purgeaddr(struct ifaddr *);
void in6_purgeif(struct ifnet *);
void *in6_domifattach(struct ifnet *);
void in6_domifdetach(struct ifnet *, void *);
void in6_ifremlocal(struct ifaddr *);
void in6_ifaddlocal(struct ifaddr *);
struct in6_ifaddr *
in6ifa_ifpforlinklocal(const struct ifnet *, int);
struct in6_ifaddr *
in6ifa_ifpforlinklocal_psref(const struct ifnet *, int, struct psref *);
struct in6_ifaddr *
in6ifa_ifpwithaddr(const struct ifnet *, const struct in6_addr *);
struct in6_ifaddr *
in6ifa_ifpwithaddr_psref(const struct ifnet *, const struct in6_addr *,
struct psref *);
struct in6_ifaddr *in6ifa_ifwithaddr(const struct in6_addr *, uint32_t);
int in6_matchlen(struct in6_addr *, struct in6_addr *);
void in6_prefixlen2mask(struct in6_addr *, int);
void in6_purge_mcast_references(struct in6_multi *);
int ip6flow_fastforward(struct mbuf **); /* IPv6 fast forward routine */
int in6_src_ioctl(u_long, void *);
int in6_is_addr_deprecated(struct sockaddr_in6 *);
struct in6pcb;
#define LLTABLE6(ifp) (((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->lltable)
void in6_sysctl_multicast_setup(struct sysctllog **);
#endif /* _KERNEL */
#endif /* !_NETINET6_IN6_VAR_H_ */
/* $NetBSD: joy.c,v 1.21 2017/10/28 04:53:55 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1995 Jean-Marc Zucconi
* All rights reserved.
*
* Ported to NetBSD by Matthieu Herrb <matthieu@laas.fr>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer
* in this position and unchanged.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: joy.c,v 1.21 2017/10/28 04:53:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/device.h>
#include <sys/errno.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/vnode.h>
#include <sys/bus.h>
#include <sys/joystick.h>
#include <dev/ic/joyvar.h>
#include "ioconf.h"
/*
* The game port can manage 4 buttons and 4 variable resistors (usually 2
* joysticks, each with 2 buttons and 2 pots.) via the port at address 0x201.
* Getting the state of the buttons is done by reading the game port;
* buttons 1-4 correspond to bits 4-7 and resistors 1-4 (X1, Y1, X2, Y2)
* to bits 0-3. If button 1 (resp. 2, 3, 4) is pressed, bit 4 (resp. 5,
* 6, 7) is set to 0. To read the value of a resistor, write the value 0xff
* to the port and wait until the corresponding bit returns to 0.
*/
#define JOYPART(d) (minor(d) & 1)
#define JOYUNIT(d) (minor(d) >> 1)
#ifndef JOY_TIMEOUT
#define JOY_TIMEOUT 2000 /* 2 milliseconds */
#endif
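/*
 * Illustrative sketch only (not from the original source): decoding one
 * raw game-port byte according to the layout described above, for the
 * first joystick's button 1 (bit 4, active low) and X1 resistor (bit 0).
 *
 *	uint8_t v = bus_space_read_1(sc->sc_iot, sc->sc_ioh, 0);
 *	bool button1_down = (v & 0x10) == 0;
 *	bool x1_still_charging = (v & 0x01) != 0;
 */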
static dev_type_open(joyopen);
static dev_type_close(joyclose);
static dev_type_read(joyread);
static dev_type_ioctl(joyioctl);
const struct cdevsw joy_cdevsw = {
.d_open = joyopen,
.d_close = joyclose,
.d_read = joyread,
.d_write = nowrite,
.d_ioctl = joyioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
void
joyattach(struct joy_softc *sc)
{
if (sc->sc_lock == NULL) {
panic("joyattach: no lock");
}
sc->timeout[0] = 0;
sc->timeout[1] = 0;
mutex_enter(sc->sc_lock);
bus_space_write_1(sc->sc_iot, sc->sc_ioh, 0, 0xff);
DELAY(10000); /* 10 ms delay */
aprint_normal_dev(sc->sc_dev, "joystick %sconnected\n",
(bus_space_read_1(sc->sc_iot, sc->sc_ioh, 0) & 0x0f) == 0x0f ?
"not " : "");
mutex_exit(sc->sc_lock);
}
int
joydetach(struct joy_softc *sc, int flags)
{
int maj, mn;
maj = cdevsw_lookup_major(&joy_cdevsw);
mn = device_unit(sc->sc_dev) << 1;
vdevgone(maj, mn, mn, VCHR);
vdevgone(maj, mn + 1, mn + 1, VCHR);
return 0;
}
static int
joyopen(dev_t dev, int flag, int mode, struct lwp *l)
{
int unit = JOYUNIT(dev);
int i = JOYPART(dev);
struct joy_softc *sc;
sc = device_lookup_private(&joy_cd, unit);
if (sc == NULL)
return ENXIO;
mutex_enter(sc->sc_lock);
if (sc->timeout[i]) {
mutex_exit(sc->sc_lock);
return EBUSY;
}
sc->x_off[i] = sc->y_off[i] = 0;
sc->timeout[i] = JOY_TIMEOUT;
mutex_exit(sc->sc_lock);
return 0;
}
static int
joyclose(dev_t dev, int flag, int mode, struct lwp *l)
{
int unit = JOYUNIT(dev);
int i = JOYPART(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
mutex_enter(sc->sc_lock);
sc->timeout[i] = 0;
mutex_exit(sc->sc_lock);
return 0;
}
static int
joyread(dev_t dev, struct uio *uio, int flag)
{
int unit = JOYUNIT(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
bus_space_tag_t iot = sc->sc_iot;
bus_space_handle_t ioh = sc->sc_ioh;
struct joystick c;
struct timeval start, now, diff;
int state = 0, x = 0, y = 0, i;
mutex_enter(sc->sc_lock);
bus_space_write_1(iot, ioh, 0, 0xff);
microtime(&start);
now = start; /* structure assignment */
i = sc->timeout[JOYPART(dev)];
for (;;) {
timersub(&now, &start, &diff);
if (diff.tv_sec > 0 || diff.tv_usec > i)
break;
state = bus_space_read_1(iot, ioh, 0);
if (JOYPART(dev) == 1)
state >>= 2;
if (!x && !(state & 0x01))
x = diff.tv_usec;
if (!y && !(state & 0x02))
y = diff.tv_usec;
if (x && y)
break;
microtime(&now);
}
mutex_exit(sc->sc_lock);
c.x = x ? sc->x_off[JOYPART(dev)] + x : 0x80000000;
c.y = y ? sc->y_off[JOYPART(dev)] + y : 0x80000000;
state >>= 4;
c.b1 = ~state & 1;
c.b2 = ~(state >> 1) & 1;
return uiomove(&c, sizeof(struct joystick), uio);
}
static int
joyioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
int unit = JOYUNIT(dev);
struct joy_softc *sc = device_lookup_private(&joy_cd, unit);
int i = JOYPART(dev), x, error;
mutex_enter(sc->sc_lock);
error = 0;
switch (cmd) {
case JOY_SETTIMEOUT:
x = *(int *)data;
if (x < 1 || x > 10000) { /* 10ms maximum! */
error = EINVAL;
break;
}
sc->timeout[i] = x;
break;
case JOY_GETTIMEOUT:
*(int *)data = sc->timeout[i];
break;
case JOY_SET_X_OFFSET:
sc->x_off[i] = *(int *)data;
break;
case JOY_SET_Y_OFFSET:
sc->y_off[i] = *(int *)data;
break;
case JOY_GET_X_OFFSET:
*(int *)data = sc->x_off[i];
break;
case JOY_GET_Y_OFFSET:
*(int *)data = sc->y_off[i];
break;
default:
error = ENXIO;
break;
}
mutex_exit(sc->sc_lock);
return error;
}
/* $NetBSD: exec_script.c,v 1.83 2021/05/03 10:25:14 fcambus Exp $ */
/*
* Copyright (c) 1993, 1994, 1996 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_script.c,v 1.83 2021/05/03 10:25:14 fcambus Exp $");
#ifdef _KERNEL_OPT
#include "opt_script.h"
#endif
#if defined(SETUIDSCRIPTS) && !defined(FDSCRIPTS)
#define FDSCRIPTS /* Need this for safe set-id scripts. */
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/file.h>
#ifdef SETUIDSCRIPTS
#include <sys/stat.h>
#endif
#include <sys/filedesc.h>
#include <sys/exec.h>
#include <sys/resourcevar.h>
#include <sys/module.h>
#include <sys/exec_script.h>
#include <sys/exec_elf.h>
MODULE(MODULE_CLASS_EXEC, exec_script, NULL);
static struct execsw exec_script_execsw = {
.es_hdrsz = SCRIPT_HDR_SIZE,
.es_makecmds = exec_script_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = NULL,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = 0,
.es_copyargs = NULL,
.es_setregs = NULL,
.es_coredump = NULL,
.es_setup_stack = exec_setup_stack,
};
static int
exec_script_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(&exec_script_execsw, 1);
case MODULE_CMD_FINI:
return exec_remove(&exec_script_execsw, 1);
case MODULE_CMD_AUTOUNLOAD:
/*
* We don't want to be autounloaded because our use is
* transient: no executables with p_execsw equal to
* exec_script_execsw will exist, so FINI will never
* return EBUSY. However, the system will run scripts
* often. Return EBUSY here to prevent this module from
* ping-ponging in and out of the kernel.
*/
return EBUSY;
default:
return ENOTTY;
}
}
/*
* exec_script_makecmds(): Check if it's an executable shell script.
*
* Given a proc pointer and an exec package pointer, see if the referent
* of the epp is a shell script. If it is, then set things up so that
* the script can be run. This involves preparing the address space
* and arguments for the shell which will run the script.
*
* This function is ultimately responsible for creating a set of vmcmds
* which can be used to build the process's vm space and inserting them
* into the exec package.
*/
int
exec_script_makecmds(struct lwp *l, struct exec_package *epp)
{
int error, hdrlinelen, shellnamelen, shellarglen;
char *hdrstr = epp->ep_hdr;
char *cp, *shellname, *shellarg;
size_t shellargp_len;
struct exec_fakearg *shellargp;
struct exec_fakearg *tmpsap;
struct pathbuf *shell_pathbuf;
struct vnode *scriptvp;
#ifdef SETUIDSCRIPTS
/* GCC needs these initialized to avoid a spurious uninitialized warning */
uid_t script_uid = (uid_t) -1;
gid_t script_gid = NOGROUP;
u_short script_sbits;
#endif
/*
* if the magic isn't that of a shell script, or we've already
* done shell script processing for this exec, punt on it.
*/
if ((epp->ep_flags & EXEC_INDIR) != 0 || epp->ep_hdrvalid < EXEC_SCRIPT_MAGICLEN ||
strncmp(hdrstr, EXEC_SCRIPT_MAGIC, EXEC_SCRIPT_MAGICLEN))
return ENOEXEC;
/*
* Check that the shell spec is terminated by a newline, and that
* it isn't too large.
*/
hdrlinelen = uimin(epp->ep_hdrvalid, SCRIPT_HDR_SIZE);
for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; cp < hdrstr + hdrlinelen;
cp++) {
if (*cp == '\n') {
*cp = '\0';
break;
}
}
if (cp >= hdrstr + hdrlinelen)
return ENOEXEC;
/* strip spaces before the shell name */
for (cp = hdrstr + EXEC_SCRIPT_MAGICLEN; *cp == ' ' || *cp == '\t';
cp++)
;
if (*cp == '\0')
return ENOEXEC;
shellarg = NULL;
shellarglen = 0;
/* collect the shell name; remember its length for later */
shellname = cp;
shellnamelen = 0;
for ( /* cp = cp */ ; *cp != '\0' && *cp != ' ' && *cp != '\t'; cp++)
shellnamelen++;
if (*cp == '\0')
goto check_shell;
*cp++ = '\0';
/* skip spaces before any argument */
for ( /* cp = cp */ ; *cp == ' ' || *cp == '\t'; cp++)
;
if (*cp == '\0')
goto check_shell;
/*
* collect the shell argument. everything after the shell name
* is passed as ONE argument; that's the correct (historical)
* behaviour.
*/
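/*
 * For example (illustrative only): a script /tmp/foo.sh whose first line
 * is "#!/bin/sh -x -v" ends up with the fake argument list
 * { "/bin/sh", "-x -v", "/tmp/foo.sh" }, i.e. "-x -v" stays a single
 * argument rather than being split in two (or "/dev/fd/N" replaces the
 * path for set-id/unreadable scripts, see below).
 */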
shellarg = cp;
for ( /* cp = cp */ ; *cp != '\0'; cp++)
shellarglen++;
*cp++ = '\0';
check_shell:
#ifdef SETUIDSCRIPTS
/*
* MNT_NOSUID has already been taken care of by check_exec,
* so we don't need to worry about it now or later. We
* will need to check PSL_TRACED later, however.
*/
script_sbits = epp->ep_vap->va_mode & (S_ISUID | S_ISGID);
if (script_sbits != 0) {
script_uid = epp->ep_vap->va_uid;
script_gid = epp->ep_vap->va_gid;
}
#endif
#ifdef FDSCRIPTS
/*
* if the script isn't readable, or it's set-id, then we've
* gotta supply a "/dev/fd/..." for the shell to read.
* Note that stupid shells (csh) do the wrong thing, and
* close all open fd's when they start. That kills this
* method of implementing "safe" set-id and x-only scripts.
*/
vn_lock(epp->ep_vp, LK_SHARED | LK_RETRY);
error = VOP_ACCESS(epp->ep_vp, VREAD, l->l_cred);
VOP_UNLOCK(epp->ep_vp);
if (error == EACCES
#ifdef SETUIDSCRIPTS
|| script_sbits
#endif
) {
struct file *fp;
KASSERT(!(epp->ep_flags & EXEC_HASFD));
if ((error = fd_allocfile(&fp, &epp->ep_fd)) != 0) {
scriptvp = NULL;
shellargp = NULL;
goto fail;
}
epp->ep_flags |= EXEC_HASFD;
fp->f_type = DTYPE_VNODE;
fp->f_ops = &vnops;
fp->f_vnode = epp->ep_vp;
fp->f_flag = FREAD;
fd_affix(curproc, fp, epp->ep_fd);
}
#endif
/* set up the fake args list */
shellargp_len = 4 * sizeof(*shellargp);
shellargp = kmem_alloc(shellargp_len, KM_SLEEP);
tmpsap = shellargp;
tmpsap->fa_len = shellnamelen + 1;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
strlcpy(tmpsap->fa_arg, shellname, tmpsap->fa_len);
tmpsap++;
if (shellarg != NULL) {
tmpsap->fa_len = shellarglen + 1;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
strlcpy(tmpsap->fa_arg, shellarg, tmpsap->fa_len);
tmpsap++;
}
tmpsap->fa_len = MAXPATHLEN;
tmpsap->fa_arg = kmem_alloc(tmpsap->fa_len, KM_SLEEP);
#ifdef FDSCRIPTS
if ((epp->ep_flags & EXEC_HASFD) == 0) {
#endif
/* normally can't fail, but check for it if diagnostic */
error = copystr(epp->ep_kname, tmpsap->fa_arg, MAXPATHLEN,
NULL);
KASSERT(error == 0);
tmpsap++;
#ifdef FDSCRIPTS
} else {
snprintf(tmpsap->fa_arg, MAXPATHLEN, "/dev/fd/%d", epp->ep_fd);
tmpsap++;
}
#endif
tmpsap->fa_arg = NULL;
/* Save the old vnode so we can clean it up later. */
scriptvp = epp->ep_vp;
epp->ep_vp = NULL;
/* Note that we're trying recursively. */
epp->ep_flags |= EXEC_INDIR;
/*
* mark the header we have as invalid; check_exec will read
* the header from the new executable
*/
epp->ep_hdrvalid = 0;
/* try loading the interpreter */
if ((error = exec_makepathbuf(l, shellname, UIO_SYSSPACE,
&shell_pathbuf, NULL)) == 0) {
error = check_exec(l, epp, shell_pathbuf, NULL);
pathbuf_destroy(shell_pathbuf);
}
/* note that we've clobbered the header */
epp->ep_flags |= EXEC_DESTR;
if (error == 0) {
/*
* It succeeded. Unlock the script and
* close it if we aren't using it any more.
* Also, set things up so that the fake args
* list will be used.
*/
if ((epp->ep_flags & EXEC_HASFD) == 0) {
vn_lock(scriptvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(scriptvp, FREAD, l->l_cred);
vput(scriptvp);
}
epp->ep_flags |= (EXEC_HASARGL | EXEC_SKIPARG);
epp->ep_fa = shellargp;
epp->ep_fa_len = shellargp_len;
#ifdef SETUIDSCRIPTS
/*
* set things up so that set-id scripts will be
* handled appropriately. PSL_TRACED will be
* checked later when the shell is actually
* exec'd.
*/
epp->ep_vap->va_mode |= script_sbits;
if (script_sbits & S_ISUID)
epp->ep_vap->va_uid = script_uid;
if (script_sbits & S_ISGID)
epp->ep_vap->va_gid = script_gid;
#endif
return (0);
}
#ifdef FDSCRIPTS
fail:
#endif
/* kill the opened file descriptor, else close the file */
if (epp->ep_flags & EXEC_HASFD) {
epp->ep_flags &= ~EXEC_HASFD;
fd_close(epp->ep_fd);
} else if (scriptvp) {
vn_lock(scriptvp, LK_EXCLUSIVE | LK_RETRY);
VOP_CLOSE(scriptvp, FREAD, l->l_cred);
vput(scriptvp);
}
/* free the fake arg list, because we're not returning it */
if ((tmpsap = shellargp) != NULL) {
while (tmpsap->fa_arg != NULL) {
kmem_free(tmpsap->fa_arg, tmpsap->fa_len);
tmpsap++;
}
kmem_free(shellargp, shellargp_len);
}
/*
* free any vmspace-creation commands,
* and release their references
*/
kill_vmcmds(&epp->ep_vmcmds);
return error;
}
/* $NetBSD: subr_xcall.c,v 1.38 2024/03/01 04:32:38 mrg Exp $ */
/*-
* Copyright (c) 2007-2010, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran and Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Cross call support
*
* Background
*
* Sometimes it is necessary to modify hardware state that is tied
* directly to individual CPUs (such as a CPU's local timer), and
 * these updates cannot be done remotely by another CPU. The LWP
* requesting the update may be unable to guarantee that it will be
* running on the CPU where the update must occur, when the update
* occurs.
*
* Additionally, it's sometimes necessary to modify per-CPU software
* state from a remote CPU. Where these update operations are so
* rare or the access to the per-CPU data so frequent that the cost
* of using locking or atomic operations to provide coherency is
* prohibitive, another way must be found.
*
* Cross calls help to solve these types of problem by allowing
* any LWP in the system to request that an arbitrary function be
* executed on a specific CPU.
*
* Implementation
*
* A slow mechanism for making low priority cross calls is
* provided. The function to be executed runs on the remote CPU
* within a bound kthread. No queueing is provided, and the
* implementation uses global state. The function being called may
* block briefly on locks, but in doing so must be careful to not
* interfere with other cross calls in the system. The function is
* called with thread context and not from a soft interrupt, so it
* can ensure that it is not interrupting other code running on the
* CPU, and so has exclusive access to the CPU. Since this facility
* is heavyweight, it's expected that it will not be used often.
*
* Cross calls must not allocate memory, as the pagedaemon uses cross
* calls (and memory allocation may need to wait on the pagedaemon).
*
* A low-overhead mechanism for high priority calls (XC_HIGHPRI) is
* also provided. The function to be executed runs in software
* interrupt context at IPL_SOFTSERIAL level, and is expected to
* be very lightweight, e.g. avoid blocking.
*/
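/*
 * Example usage (an illustrative sketch; example_func() is a
 * hypothetical name, not a function in this file): run a function once
 * on every CPU and wait for it to complete everywhere.
 *
 *	static void
 *	example_func(void *arg1, void *arg2)
 *	{
 *		... per-CPU work, runs in thread context ...
 *	}
 *
 *	uint64_t where = xc_broadcast(0, example_func, NULL, NULL);
 *	xc_wait(where);
 */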
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_xcall.c,v 1.38 2024/03/01 04:32:38 mrg Exp $");
#include <sys/types.h>
#include <sys/param.h>
#include <sys/xcall.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/evcnt.h>
#include <sys/kthread.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#ifdef _RUMPKERNEL
#include "rump_private.h"
#endif
/* Cross-call state box. */
typedef struct {
kmutex_t xc_lock;
kcondvar_t xc_busy;
xcfunc_t xc_func;
void * xc_arg1;
void * xc_arg2;
uint64_t xc_headp;
uint64_t xc_donep;
unsigned int xc_ipl;
} xc_state_t;
/* Bit indicating high (1) or low (0) priority. */
#define XC_PRI_BIT (1ULL << 63)
/* Low priority xcall structures. */
static xc_state_t xc_low_pri __cacheline_aligned;
/* High priority xcall structures. */
static xc_state_t xc_high_pri __cacheline_aligned;
static void * xc_sihs[4] __cacheline_aligned;
/* Event counters. */
static struct evcnt xc_unicast_ev __cacheline_aligned;
static struct evcnt xc_broadcast_ev __cacheline_aligned;
static void xc_init(void);
static void xc_thread(void *);
static inline uint64_t xc_highpri(xcfunc_t, void *, void *, struct cpu_info *,
unsigned int);
static inline uint64_t xc_lowpri(xcfunc_t, void *, void *, struct cpu_info *);
/* The internal form of IPL */
#define XC_IPL_MASK 0xff00
/*
* Assign 0 to XC_IPL_SOFTSERIAL to treat IPL_SOFTSERIAL as the default value
* (just XC_HIGHPRI).
*/
#define XC_IPL_SOFTSERIAL 0
#define XC_IPL_SOFTNET 1
#define XC_IPL_SOFTBIO 2
#define XC_IPL_SOFTCLOCK 3
#define XC_IPL_MAX XC_IPL_SOFTCLOCK
CTASSERT(XC_IPL_MAX <= __arraycount(xc_sihs));
/*
* xc_init:
*
* Initialize low and high priority cross-call structures.
*/
static void
xc_init(void)
{
xc_state_t *xclo = &xc_low_pri, *xchi = &xc_high_pri;
memset(xclo, 0, sizeof(xc_state_t));
mutex_init(&xclo->xc_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&xclo->xc_busy, "xclow");
memset(xchi, 0, sizeof(xc_state_t));
mutex_init(&xchi->xc_lock, MUTEX_DEFAULT, IPL_SOFTSERIAL);
cv_init(&xchi->xc_busy, "xchigh");
/* Set up a softint for each IPL_SOFT*. */
#define SETUP_SOFTINT(xipl, sipl) do { \
		xc_sihs[(xipl)] = softint_establish((sipl) | SOFTINT_MPSAFE, \
		    xc__highpri_intr, NULL);				\
KASSERT(xc_sihs[(xipl)] != NULL); \
} while (0)
SETUP_SOFTINT(XC_IPL_SOFTSERIAL, SOFTINT_SERIAL);
/*
	 * If an IPL_SOFTXXX level has the same value as the previous one,
	 * we don't use that IPL (see xc_encode_ipl), so we don't need to
	 * allocate a softint for it.
*/
#if IPL_SOFTNET != IPL_SOFTSERIAL
SETUP_SOFTINT(XC_IPL_SOFTNET, SOFTINT_NET);
#endif
#if IPL_SOFTBIO != IPL_SOFTNET
SETUP_SOFTINT(XC_IPL_SOFTBIO, SOFTINT_BIO);
#endif
#if IPL_SOFTCLOCK != IPL_SOFTBIO
SETUP_SOFTINT(XC_IPL_SOFTCLOCK, SOFTINT_CLOCK);
#endif
#undef SETUP_SOFTINT
evcnt_attach_dynamic(&xc_unicast_ev, EVCNT_TYPE_MISC, NULL,
"crosscall", "unicast");
evcnt_attach_dynamic(&xc_broadcast_ev, EVCNT_TYPE_MISC, NULL,
"crosscall", "broadcast");
}
/*
* Encode an IPL to a form that can be embedded into flags of xc_broadcast
* or xc_unicast.
*/
unsigned int
xc_encode_ipl(int ipl)
{

	switch (ipl) {
case IPL_SOFTSERIAL:
return __SHIFTIN(XC_IPL_SOFTSERIAL, XC_IPL_MASK);
/* IPL_SOFT* can be the same value (e.g., on sparc or mips). */
#if IPL_SOFTNET != IPL_SOFTSERIAL
case IPL_SOFTNET:
return __SHIFTIN(XC_IPL_SOFTNET, XC_IPL_MASK);
#endif
#if IPL_SOFTBIO != IPL_SOFTNET
case IPL_SOFTBIO:
return __SHIFTIN(XC_IPL_SOFTBIO, XC_IPL_MASK);
#endif
#if IPL_SOFTCLOCK != IPL_SOFTBIO
case IPL_SOFTCLOCK:
return __SHIFTIN(XC_IPL_SOFTCLOCK, XC_IPL_MASK);
#endif
}
panic("Invalid IPL: %d", ipl);
}
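/*
 * Example (an illustrative sketch; example_func, arg and ci are
 * hypothetical names): request a high priority call whose softint runs
 * at IPL_SOFTNET instead of the default IPL_SOFTSERIAL, by combining
 * XC_HIGHPRI with an encoded IPL.
 *
 *	where = xc_unicast(XC_HIGHPRI | xc_encode_ipl(IPL_SOFTNET),
 *	    example_func, arg, NULL, ci);
 *	xc_wait(where);
 */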
/*
* Extract an XC_IPL from flags of xc_broadcast or xc_unicast.
*/
static inline unsigned int
xc_extract_ipl(unsigned int flags)
{
return __SHIFTOUT(flags, XC_IPL_MASK);
}
/*
* xc_init_cpu:
*
* Initialize the cross-call subsystem. Called once for each CPU
* in the system as they are attached.
*/
void
xc_init_cpu(struct cpu_info *ci)
{
static bool again = false;
int error __diagused;
if (!again) {
/* Autoconfiguration will prevent re-entry. */
xc_init();
again = true;
}
cv_init(&ci->ci_data.cpu_xcall, "xcall");
error = kthread_create(PRI_XCALL, KTHREAD_MPSAFE, ci, xc_thread,
NULL, NULL, "xcall/%u", ci->ci_index);
KASSERT(error == 0);
}
/*
* xc_broadcast:
*
* Trigger a call on all CPUs in the system.
*/
uint64_t
xc_broadcast(unsigned int flags, xcfunc_t func, void *arg1, void *arg2)
{

	KASSERT(!cpu_intr_p());
	KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
int s, bound;
if (flags & XC_HIGHPRI)
s = splsoftserial();
else
bound = curlwp_bind();
(*func)(arg1, arg2);
if (flags & XC_HIGHPRI)
splx(s);
else
curlwp_bindx(bound);
return 0;
}
if ((flags & XC_HIGHPRI) != 0) {
int ipl = xc_extract_ipl(flags);
return xc_highpri(func, arg1, arg2, NULL, ipl);
} else {
return xc_lowpri(func, arg1, arg2, NULL);
}
}
static void
xc_nop(void *arg1, void *arg2)
{
return;
}
/*
* xc_barrier:
*
* Broadcast a nop to all CPUs in the system.
*/
void
xc_barrier(unsigned int flags)
{
uint64_t where;
where = xc_broadcast(flags, xc_nop, NULL, NULL);
xc_wait(where);
}
/*
* xc_unicast:
*
* Trigger a call on one CPU.
*/
uint64_t
xc_unicast(unsigned int flags, xcfunc_t func, void *arg1, void *arg2,
struct cpu_info *ci)
{

	KASSERT(ci != NULL);
	KASSERT(!cpu_intr_p());
	KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
int s, bound;
KASSERT(ci == curcpu());
if (flags & XC_HIGHPRI)
s = splsoftserial();
else
bound = curlwp_bind();
(*func)(arg1, arg2);
if (flags & XC_HIGHPRI)
splx(s);
else
curlwp_bindx(bound);
return 0;
}
if ((flags & XC_HIGHPRI) != 0) {
int ipl = xc_extract_ipl(flags);
return xc_highpri(func, arg1, arg2, ci, ipl);
} else {
return xc_lowpri(func, arg1, arg2, ci);
}
}
/*
* xc_wait:
*
* Wait for a cross call to complete.
*/
void
xc_wait(uint64_t where)
{
xc_state_t *xc;
	KASSERT(!cpu_intr_p());
	KASSERT(!cpu_softintr_p());
ASSERT_SLEEPABLE();
if (__predict_false(!mp_online)) {
return;
}
/* Determine whether it is high or low priority cross-call. */
if ((where & XC_PRI_BIT) != 0) {
xc = &xc_high_pri;
where &= ~XC_PRI_BIT;
} else {
xc = &xc_low_pri;
}
#ifdef __HAVE_ATOMIC64_LOADSTORE
/* Fast path, if already done. */
if (atomic_load_acquire(&xc->xc_donep) >= where) {
return;
}
#endif
/* Slow path: block until awoken. */
mutex_enter(&xc->xc_lock);
while (xc->xc_donep < where) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
mutex_exit(&xc->xc_lock);
}
/*
* xc_lowpri:
*
* Trigger a low priority call on one or more CPUs.
*/
static inline uint64_t
xc_lowpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci)
{
xc_state_t *xc = &xc_low_pri;
CPU_INFO_ITERATOR cii;
uint64_t where;
mutex_enter(&xc->xc_lock);
while (xc->xc_headp != xc->xc_donep) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
xc->xc_arg1 = arg1;
xc->xc_arg2 = arg2;
xc->xc_func = func;
if (ci == NULL) {
xc_broadcast_ev.ev_count++;
		for (CPU_INFO_FOREACH(cii, ci)) {
			if ((ci->ci_schedstate.spc_flags & SPCF_RUNNING) == 0)
continue;
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
} else {
xc_unicast_ev.ev_count++;
xc->xc_headp += 1;
ci->ci_data.cpu_xcall_pending = true;
cv_signal(&ci->ci_data.cpu_xcall);
}
KASSERT(xc->xc_donep < xc->xc_headp);
where = xc->xc_headp;
mutex_exit(&xc->xc_lock);
/* Return a low priority ticket. */
KASSERT((where & XC_PRI_BIT) == 0);
return where;
}
/*
* xc_thread:
*
* One thread per-CPU to dispatch low priority calls.
*/
static void
xc_thread(void *cookie)
{
struct cpu_info *ci = curcpu();
xc_state_t *xc = &xc_low_pri;
void *arg1, *arg2;
xcfunc_t func;
struct lwp *l = curlwp;
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d",
l, l->l_nopreempt);
mutex_enter(&xc->xc_lock);
for (;;) {
while (!ci->ci_data.cpu_xcall_pending) {
if (xc->xc_headp == xc->xc_donep) {
cv_broadcast(&xc->xc_busy);
}
cv_wait(&ci->ci_data.cpu_xcall, &xc->xc_lock);
KASSERT(ci == curcpu());
}
ci->ci_data.cpu_xcall_pending = false;
func = xc->xc_func;
arg1 = xc->xc_arg1;
arg2 = xc->xc_arg2;
mutex_exit(&xc->xc_lock);
KASSERT(func != NULL);
(*func)(arg1, arg2);
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d func %p",
l, l->l_nopreempt, func);
mutex_enter(&xc->xc_lock);
#ifdef __HAVE_ATOMIC64_LOADSTORE
atomic_store_release(&xc->xc_donep, xc->xc_donep + 1);
#else
xc->xc_donep++;
#endif
}
/* NOTREACHED */
}
/*
* xc_ipi_handler:
*
* Handler of cross-call IPI.
*/
void
xc_ipi_handler(void)
{
	xc_state_t *xc = &xc_high_pri;

	KASSERT(xc->xc_ipl < __arraycount(xc_sihs));
	KASSERT(xc_sihs[xc->xc_ipl] != NULL);
/* Executes xc__highpri_intr() via software interrupt. */
softint_schedule(xc_sihs[xc->xc_ipl]);
}
/*
* xc__highpri_intr:
*
* A software interrupt handler for high priority calls.
*/
void
xc__highpri_intr(void *dummy)
{
xc_state_t *xc = &xc_high_pri;
void *arg1, *arg2;
xcfunc_t func;
KASSERTMSG(!cpu_intr_p(), "high priority xcall for function %p",
xc->xc_func);
/*
* Lock-less fetch of function and its arguments.
* Safe since it cannot change at this point.
*/
func = xc->xc_func;
arg1 = xc->xc_arg1;
arg2 = xc->xc_arg2;
KASSERT(func != NULL);
(*func)(arg1, arg2);
/*
* Note the request as done, and if we have reached the head,
* cross-call has been processed - notify waiters, if any.
*/
mutex_enter(&xc->xc_lock);
KASSERT(xc->xc_donep < xc->xc_headp);
#ifdef __HAVE_ATOMIC64_LOADSTORE
atomic_store_release(&xc->xc_donep, xc->xc_donep + 1);
#else
xc->xc_donep++;
#endif
if (xc->xc_donep == xc->xc_headp) {
cv_broadcast(&xc->xc_busy);
}
mutex_exit(&xc->xc_lock);
}
/*
* xc_highpri:
*
* Trigger a high priority call on one or more CPUs.
*/
static inline uint64_t
xc_highpri(xcfunc_t func, void *arg1, void *arg2, struct cpu_info *ci,
unsigned int ipl)
{
xc_state_t *xc = &xc_high_pri;
uint64_t where;
mutex_enter(&xc->xc_lock);
while (xc->xc_headp != xc->xc_donep) {
cv_wait(&xc->xc_busy, &xc->xc_lock);
}
xc->xc_func = func;
xc->xc_arg1 = arg1;
xc->xc_arg2 = arg2;
xc->xc_headp += (ci ? 1 : ncpu);
xc->xc_ipl = ipl;
where = xc->xc_headp;
mutex_exit(&xc->xc_lock);
/*
* Send the IPI once lock is released.
* Note: it will handle the local CPU case.
*/
#ifdef _RUMPKERNEL
rump_xc_highpri(ci);
#else
#ifdef MULTIPROCESSOR
kpreempt_disable();
if (curcpu() == ci) {
/* Unicast: local CPU. */
xc_ipi_handler();
} else if (ci) {
/* Unicast: remote CPU. */
xc_send_ipi(ci);
} else {
/* Broadcast: all, including local. */
xc_send_ipi(NULL);
xc_ipi_handler();
}
kpreempt_enable();
#else
KASSERT(ci == NULL || curcpu() == ci);
xc_ipi_handler();
#endif
#endif
/* Indicate a high priority ticket. */
return (where | XC_PRI_BIT);
}
/* $NetBSD: lwp.h,v 1.231 2023/11/02 10:31:55 martin Exp $ */
/*
* Copyright (c) 2001, 2006, 2007, 2008, 2009, 2010, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Nathan J. Williams and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_LWP_H_
#define _SYS_LWP_H_
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <sys/param.h>
#include <sys/callout.h>
#include <sys/condvar.h>
#include <sys/kcpuset.h>
#include <sys/mutex.h>
#include <sys/queue.h>
#include <sys/resource.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/specificdata.h>
#include <sys/time.h>
#include <sys/wchan.h>
#if defined(_KERNEL)
struct lwp;
/* forward declare this for <machine/cpu.h> so it can get l_cpu. */
static __inline struct cpu_info *lwp_getcpu(struct lwp *);
#include <machine/cpu.h> /* curcpu() and cpu_info */
#include <sys/atomic.h>
#ifdef _KERNEL_OPT
#include "opt_kcov.h"
#include "opt_kmsan.h"
#include "opt_maxlwp.h"
#endif
#endif
#include <machine/proc.h> /* Machine-dependent proc substruct. */
/*
* Lightweight process. Field markings and the corresponding locks:
*
* a: proc_lock
* c: condition variable interlock, passed to cv_wait()
* l: *l_mutex
* p: l_proc->p_lock
* s: spc_mutex, which may or may not be referenced by l_mutex
* S: l_selcluster->sc_lock
* (: unlocked, stable
* !: unlocked, may only be reliably accessed by the LWP itself
*
* Fields are clustered together by usage (to increase the likelihood
* of cache hits) and by size (to reduce dead space in the structure).
*/
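/*
 * For example (an illustrative sketch, not part of the original
 * header), a field marked 'l:' such as l_stat must only be accessed
 * with the LWP locked:
 *
 *	lwp_lock(l);
 *	if (l->l_stat == LSSLEEP)
 *		... the LWP is currently asleep ...
 *	lwp_unlock(l);
 */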
#include <sys/pcu.h>
struct lockdebug;
struct sysent;
struct lwp {
/* Must not be zeroed on free. */
struct cpu_info *volatile l_cpu;/* s: CPU we're on if LSONPROC */
kmutex_t * volatile l_mutex; /* l: ptr to mutex on sched state */
struct turnstile *l_ts; /* l: current turnstile */
int l_stat; /* l: overall LWP status */
int l__reserved; /* : padding - reuse as needed */
/* Scheduling and overall state. */
#define l_startzero l_runq
TAILQ_ENTRY(lwp) l_runq; /* s: run queue */
union {
void * info; /* s: scheduler-specific structure */
u_int timeslice; /* l: time-quantum for SCHED_M2 */
} l_sched;
void *l_addr; /* l: PCB address; use lwp_getpcb() */
struct mdlwp l_md; /* l: machine-dependent fields. */
struct bintime l_rtime; /* l: real time */
struct bintime l_stime; /* l: start time (while ONPROC) */
int l_flag; /* l: misc flag values */
u_int l_swtime; /* l: time swapped in or out */
u_int l_rticks; /* l: Saved start time of run */
u_int l_rticksum; /* l: Sum of ticks spent running */
u_int l_slpticks; /* l: Saved start time of sleep */
u_int l_slpticksum; /* l: Sum of ticks spent sleeping */
int l_class; /* l: scheduling class */
pri_t l_boostpri; /* l: boosted priority after blocking */
pri_t l_priority; /* l: scheduler priority */
pri_t l_inheritedprio;/* l: inherited priority */
pri_t l_protectprio; /* l: for PTHREAD_PRIO_PROTECT */
pri_t l_auxprio; /* l: max(inherit,protect) priority */
int l_protectdepth; /* l: for PTHREAD_PRIO_PROTECT */
u_int l_cpticks; /* (: Ticks of CPU time */
psetid_t l_psid; /* l: assigned processor-set ID */
fixpt_t l_pctcpu; /* p: %cpu during l_swtime */
fixpt_t l_estcpu; /* l: cpu time for SCHED_4BSD */
SLIST_HEAD(, turnstile) l_pi_lenders; /* l: ts lending us priority */
struct cpu_info *l_target_cpu; /* l: target CPU to migrate */
struct lwpctl *l_lwpctl; /* p: lwpctl block kernel address */
struct lcpage *l_lcpage; /* p: lwpctl containing page */
kcpuset_t *l_affinity; /* l: CPU set for affinity */
/* Synchronisation. */
const struct syncobj *l_syncobj;/* l: sync object operations set */
LIST_ENTRY(lwp) l_sleepchain; /* l: sleep queue */
wchan_t l_wchan; /* l: sleep address */
const char *l_wmesg; /* l: reason for sleep */
struct sleepq *l_sleepq; /* l: current sleep queue */
callout_t l_timeout_ch; /* !: callout for tsleep */
kcondvar_t l_waitcv; /* a: vfork() wait */
u_int l_slptime; /* l: time since last blocked */
bool l_vforkwaiting; /* a: vfork() waiting */
/* User-space synchronization. */
uintptr_t l_robust_head; /* !: list of robust futexes */
uint32_t l___rsvd1; /* reserved for future use */
#if PCU_UNIT_COUNT > 0
struct cpu_info * volatile l_pcu_cpu[PCU_UNIT_COUNT];
uint32_t l_pcu_valid;
#endif
/* Process level and global state, misc. */
lwpid_t l_lid; /* (: LWP identifier; local to proc */
LIST_ENTRY(lwp) l_list; /* a: entry on list of all LWPs */
void *l_ctxlink; /* p: uc_link {get,set}context */
struct proc *l_proc; /* p: parent process */
LIST_ENTRY(lwp) l_sibling; /* p: entry on proc's list of LWPs */
char *l_name; /* (: name, optional */
lwpid_t l_waiter; /* p: first LWP waiting on us */
lwpid_t l_waitingfor; /* p: specific LWP we are waiting on */
int l_prflag; /* p: process level flags */
u_int l_refcnt; /* p: reference count on this LWP */
/* State of select() or poll(). */
int l_selflag; /* S: polling state flags */
int l_selret; /* S: return value of select/poll */
SLIST_HEAD(,selinfo) l_selwait; /* S: descriptors waited on */
uintptr_t l_selrec; /* !: argument for selrecord() */
struct selcluster *l_selcluster;/* !: associated cluster data */
void * l_selbits; /* (: select() bit-field */
size_t l_selni; /* (: size of a single bit-field */
/* Signals. */
int l_sigrestore; /* p: need to restore old sig mask */
sigset_t l_sigwaitset; /* p: signals being waited for */
kcondvar_t l_sigcv; /* p: for sigsuspend() */
struct ksiginfo *l_sigwaited; /* p: delivered signals from set */
sigpend_t *l_sigpendset; /* p: XXX issignal()/postsig() baton */
LIST_ENTRY(lwp) l_sigwaiter; /* p: chain on list of waiting LWPs */
stack_t l_sigstk; /* p: sp & on stack state variable */
sigset_t l_sigmask; /* p: signal mask */
sigpend_t l_sigpend; /* p: signals to this LWP */
sigset_t l_sigoldmask; /* p: mask for sigpause */
/* Private data. */
specificdata_reference
l_specdataref; /* !: subsystem lwp-specific data */
struct timespec l_ktrcsw; /* !: for ktrace CSW trace XXX */
void *l_private; /* !: svr4-style lwp-private data */
struct lwp *l_switchto; /* !: mi_switch: switch to this LWP */
struct kauth_cred *l_cred; /* !: cached credentials */
struct filedesc *l_fd; /* !: cached copy of proc::p_fd */
void *l_emuldata; /* !: kernel lwp-private data */
struct fstrans_lwp_info *l_fstrans; /* (: fstrans private data */
u_short l_shlocks; /* !: lockdebug: shared locks held */
u_short l_exlocks; /* !: lockdebug: excl. locks held */
u_short l_psrefs; /* !: count of psref held */
u_short l_blcnt; /* !: count of kernel_lock held */
volatile int l_nopreempt; /* !: don't preempt me! */
volatile u_int l_dopreempt; /* s: kernel preemption pending */
int l_pflag; /* !: LWP private flags */
int l_dupfd; /* !: side return from cloning devs XXX */
const struct sysent * volatile l_sysent;/* !: currently active syscall */
struct rusage l_ru; /* !: accounting information */
uint64_t l_pfailtime; /* !: for kernel preemption */
uintptr_t l_pfailaddr; /* !: for kernel preemption */
uintptr_t l_pfaillock; /* !: for kernel preemption */
_TAILQ_HEAD(,struct lockdebug,volatile) l_ld_locks;/* !: locks held by LWP */
volatile void *l_ld_wanted; /* !: lock currently wanted by LWP */
uintptr_t l_rwcallsite; /* !: rwlock actual callsite */
int l_tcgen; /* !: for timecounter removal */
/* These are only used by 'options SYSCALL_TIMES'. */
uint32_t l_syscall_time; /* !: time epoch for current syscall */
uint64_t *l_syscall_counter; /* !: counter for current process */
struct kdtrace_thread *l_dtrace; /* (: DTrace-specific data. */
#ifdef KMSAN
void *l_kmsan; /* !: KMSAN private data. */
#endif
#ifdef KCOV
void *l_kcov; /* !: KCOV private data. */
#endif
};
/*
* UAREA_PCB_OFFSET: an offset of PCB structure in the uarea. MD code may
* define it in <machine/proc.h>, to indicate a different uarea layout.
*/
#ifndef UAREA_PCB_OFFSET
#define UAREA_PCB_OFFSET 0
#endif
LIST_HEAD(lwplist, lwp); /* A list of LWPs. */
#ifdef _KERNEL
extern struct lwplist alllwp; /* List of all LWPs. */
extern lwp_t lwp0; /* LWP for proc0. */
extern int maxlwp __read_mostly; /* max number of lwps */
#ifndef MAXLWP
#define MAXLWP 4096 /* default max */
#endif
#ifndef MAXMAXLWP
#define MAXMAXLWP 65535 /* absolute max */
#endif
#endif
#endif /* _KERNEL || _KMEMUSER */
/*
* These flags are kept in l_flag, and they are modified only with the LWP
* locked.
*/
#define LW_IDLE 0x00000001 /* Idle lwp. */
#define LW_LWPCTL 0x00000002 /* Adjust lwpctl in userret */
#define LW_STIMO 0x00000040 /* Sleep timed out */
#define LW_SINTR 0x00000080 /* Sleep is interruptible. */
#define LW_CATCHINTR 0x00000100 /* LW_SINTR intent; see sleepq_block(). */
#define LW_SYSTEM 0x00000200 /* Kernel thread */
#define LW_SYSTEM_FPU 0x00000400 /* Kernel thread with vector/FP enabled */
#define LW_DBGSUSPEND 0x00010000 /* Suspend by debugger */
#define LW_WSUSPEND 0x00020000 /* Suspend before return to user */
#define LW_BATCH 0x00040000 /* LWP tends to hog CPU */
#define LW_WCORE 0x00080000 /* Stop for core dump on return to user */
#define LW_WEXIT 0x00100000 /* Exit before return to user */
#define LW_PENDSIG 0x01000000 /* Pending signal for us */
#define LW_CANCELLED 0x02000000 /* tsleep should not sleep */
#define LW_CACHECRED 0x04000000 /* Cache new process credential */
#define LW_WREBOOT 0x08000000 /* System is rebooting, please suspend */
#define LW_UNPARKED 0x10000000 /* Unpark op pending */
#define LW_RUMP_CLEAR 0x40000000 /* Clear curlwp in RUMP scheduler */
#define LW_RUMP_QEXIT 0x80000000 /* LWP should exit ASAP */
/*
* The second set of flags is kept in l_pflag, and they are modified only by
* the LWP itself, or modified when it's known the LWP cannot be running.
* LP_RUNNING is typically updated with the LWP locked, but not always in
* the case of soft interrupt handlers.
*/
#define LP_KTRACTIVE 0x00000001 /* Executing ktrace operation */
#define LP_KTRCSW 0x00000002 /* ktrace context switch marker */
#define LP_KTRCSWUSER 0x00000004 /* ktrace context switch marker */
/* 0x00000008 was LP_PIDLID */
#define LP_OWEUPC 0x00000010 /* Owe user profiling tick */
#define LP_MPSAFE 0x00000020 /* Starts life without kernel_lock */
#define LP_INTR 0x00000040 /* Soft interrupt handler */
#define LP_SYSCTLWRITE 0x00000080 /* sysctl write lock held */
#define LP_MUSTJOIN 0x00000100 /* Must join kthread on exit */
#define LP_SINGLESTEP 0x00000400 /* Single step thread in ptrace(2) */
#define LP_TIMEINTR 0x00010000 /* Time this soft interrupt */
#define LP_PREEMPTING 0x00020000 /* mi_switch called involuntarily */
#define LP_RUNNING 0x20000000 /* Active on a CPU */
#define LP_TELEPORT 0x40000000 /* Teleport to new CPU on preempt() */
#define LP_BOUND 0x80000000 /* Bound to a CPU */
/*
* The third set of flags is kept in l_prflag and they are modified only
* with p_lock held.
*/
#define LPR_DETACHED 0x00800000 /* Won't be waited for. */
#define LPR_DRAINING 0x80000000 /* Draining references before exiting */
/*
* Mask indicating that there is "exceptional" work to be done on return to
* user.
*/
#define LW_USERRET (LW_WEXIT | LW_PENDSIG | LW_WREBOOT | LW_WSUSPEND \
| LW_WCORE | LW_LWPCTL | LW_CACHECRED)
/*
* Status values.
*
* A note about LSRUN and LSONPROC: LSRUN indicates that a process is
* runnable but *not* yet running, i.e. is on a run queue. LSONPROC
* indicates that the process is actually executing on a CPU, i.e.
* it is no longer on a run queue.
*
* These values are set in stone and must not be reused with future changes.
*/
#define LSIDL 1 /* Process being created by fork. */
#define LSRUN 2 /* Currently runnable. */
#define LSSLEEP 3 /* Sleeping on an address. */
#define LSSTOP 4 /* Process debugging or suspension. */
#define LSZOMB 5 /* Awaiting collection by parent. */
/* define LSDEAD 6 Process is almost a zombie. (removed in 5.0) */
#define LSONPROC 7 /* Process is currently on a CPU. */
#define LSSUSPENDED 8 /* Not running, not signalable. */
#if defined(_KERNEL) || defined(_KMEMUSER)
static __inline void *
lwp_getpcb(struct lwp *l)
{
return l->l_addr;
}
#endif /* _KERNEL || _KMEMUSER */
#ifdef _KERNEL
void lwpinit(void);
void lwp0_init(void);
void lwp_startup(lwp_t *, lwp_t *);
void startlwp(void *);
void lwp_lock(lwp_t *);
void lwp_unlock(lwp_t *);
pri_t lwp_eprio(lwp_t *);
int lwp_locked(lwp_t *, kmutex_t *);
kmutex_t *lwp_setlock(lwp_t *, kmutex_t *);
void lwp_unlock_to(lwp_t *, kmutex_t *);
int lwp_trylock(lwp_t *);
void lwp_changepri(lwp_t *, pri_t);
void lwp_lendpri(lwp_t *, pri_t);
void lwp_addref(lwp_t *);
void lwp_delref(lwp_t *);
void lwp_delref2(lwp_t *);
bool lwp_drainrefs(lwp_t *);
bool lwp_alive(lwp_t *);
lwp_t *lwp_find_first(proc_t *);
int lwp_wait(lwp_t *, lwpid_t, lwpid_t *, bool);
void lwp_continue(lwp_t *);
void lwp_unsleep(lwp_t *, bool);
void lwp_unstop(lwp_t *);
void lwp_exit(lwp_t *);
int lwp_suspend(lwp_t *, lwp_t *);
int lwp_create1(lwp_t *, const void *, size_t, u_long, lwpid_t *);
void lwp_start(lwp_t *, int);
void lwp_migrate(lwp_t *, struct cpu_info *);
lwp_t * lwp_find2(pid_t, lwpid_t);
lwp_t * lwp_find(proc_t *, int);
void lwp_userret(lwp_t *);
void lwp_need_userret(lwp_t *);
void lwp_free(lwp_t *, bool, bool);
long lwp_pctr(void);
int lwp_setprivate(lwp_t *, void *);
int do_lwp_create(lwp_t *, void *, u_long, lwp_t **, const sigset_t *,
const stack_t *);
void lwp_thread_cleanup(lwp_t *);
void lwpinit_specificdata(void);
int lwp_specific_key_create(specificdata_key_t *, specificdata_dtor_t);
void lwp_specific_key_delete(specificdata_key_t);
void lwp_initspecific(lwp_t *);
void lwp_finispecific(lwp_t *);
void *lwp_getspecific(specificdata_key_t);
#if defined(_LWP_API_PRIVATE)
void *_lwp_getspecific_by_lwp(lwp_t *, specificdata_key_t);
#endif
void lwp_setspecific(specificdata_key_t, void *);
void lwp_setspecific_by_lwp(lwp_t *, specificdata_key_t, void *);
/* Syscalls. */
int lwp_park(clockid_t, int, struct timespec *);
int lwp_unpark(const lwpid_t *, const u_int);
/* DDB. */
void lwp_whatis(uintptr_t, void (*)(const char *, ...) __printflike(1, 2));
int lwp_create(lwp_t *, struct proc *, vaddr_t, int, void *, size_t,
void (*)(void *), void *, lwp_t **, int, const sigset_t *, const stack_t *);
/*
* XXX _MODULE
* We should provide real stubs for the below that modules can use.
*/
static __inline void
spc_lock(struct cpu_info *ci)
{
mutex_spin_enter(ci->ci_schedstate.spc_mutex);
}
static __inline void
spc_unlock(struct cpu_info *ci)
{
mutex_spin_exit(ci->ci_schedstate.spc_mutex);
}
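/*
 * Lock the scheduler state of two CPUs at once.  The two spc_mutexes
 * are always acquired in a fixed (address) order so that concurrent
 * callers cannot deadlock against each other.
 */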
static __inline void
spc_dlock(struct cpu_info *ci1, struct cpu_info *ci2)
{
struct schedstate_percpu *spc1 = &ci1->ci_schedstate;
struct schedstate_percpu *spc2 = &ci2->ci_schedstate;
KASSERT(ci1 != ci2);
if (ci1 < ci2) {
mutex_spin_enter(spc1->spc_mutex);
mutex_spin_enter(spc2->spc_mutex);
} else {
mutex_spin_enter(spc2->spc_mutex);
mutex_spin_enter(spc1->spc_mutex);
}
}
/*
* Allow machine-dependent code to override curlwp in <machine/cpu.h> for
* its own convenience. Otherwise, we declare it as appropriate.
*/
#if !defined(curlwp)
#if defined(MULTIPROCESSOR)
#define curlwp curcpu()->ci_curlwp /* Current running LWP */
#else
extern struct lwp *curlwp; /* Current running LWP */
#endif /* MULTIPROCESSOR */
#endif /* ! curlwp */
#define curproc (curlwp->l_proc)
/*
* This provides a way for <machine/cpu.h> to get l_cpu for curlwp before
* struct lwp is defined.
*/
static __inline struct cpu_info *
lwp_getcpu(struct lwp *l)
{
return l->l_cpu;
}
static __inline bool
CURCPU_IDLE_P(void)
{
struct cpu_info *ci = curcpu();
return ci->ci_onproc == ci->ci_data.cpu_idlelwp;
}
/*
* Disable and re-enable preemption. Only for low-level kernel
* use. Device drivers and anything that could potentially be
* compiled as a module should use kpreempt_disable() and
* kpreempt_enable().
*/
static __inline void
KPREEMPT_DISABLE(lwp_t *l)
{
struct lwp *l1 __diagused;
KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1);
l->l_nopreempt++;
__insn_barrier();
}
static __inline void
KPREEMPT_ENABLE(lwp_t *l)
{
struct lwp *l1 __diagused;
	KASSERTMSG(l == (l1 = curlwp), "l=%p curlwp=%p", l, l1);
	KASSERT(l->l_nopreempt > 0);
__insn_barrier();
l->l_nopreempt--;
__insn_barrier();
	if (__predict_false(l->l_dopreempt))
		kpreempt(0);
}
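/*
 * Example (illustrative only): a short critical section that must not
 * be preempted, using the low-level helpers on curlwp.  Code that may
 * be built as a module should use kpreempt_disable()/kpreempt_enable()
 * instead.
 *
 *	lwp_t *l = curlwp;
 *
 *	KPREEMPT_DISABLE(l);
 *	... touch per-CPU state ...
 *	KPREEMPT_ENABLE(l);
 */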
/* For lwp::l_dopreempt */
#define DOPREEMPT_ACTIVE 0x01
#define DOPREEMPT_COUNTED 0x02
/*
* Prevent curlwp from migrating between CPUs between curlwp_bind and
* curlwp_bindx. One use case is psref(9) that has a contract that
* forbids migrations.
*/
static __inline int
curlwp_bind(void)
{
int bound;
bound = curlwp->l_pflag & LP_BOUND;
curlwp->l_pflag |= LP_BOUND;
__insn_barrier();
return bound;
}
static __inline void
curlwp_bindx(int bound)
{
KASSERT(curlwp->l_pflag & LP_BOUND);
__insn_barrier();
curlwp->l_pflag ^= bound ^ LP_BOUND;
}
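/*
 * Example (illustrative only): keep curlwp on its current CPU across a
 * region that must not migrate, restoring the previous binding state
 * afterwards.
 *
 *	int bound = curlwp_bind();
 *	... access CPU-local data ...
 *	curlwp_bindx(bound);
 */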
#endif /* _KERNEL */
/* Flags for _lwp_create(), as per Solaris. */
#define LWP_DETACHED 0x00000040
#define LWP_SUSPENDED 0x00000080
/* Kernel-internal flags for LWP creation. */
/* 0x40000000 was LWP_PIDLID */
#define LWP_VFORK 0x80000000
#endif /* !_SYS_LWP_H_ */
/* $NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $ */
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
*/
/*-
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_physio.c,v 1.102 2022/07/10 23:11:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/once.h>
#include <sys/workqueue.h>
#include <sys/kmem.h>
#include <uvm/uvm_extern.h>
ONCE_DECL(physio_initialized);
struct workqueue *physio_workqueue;
int physio_concurrency = 16;
/* #define PHYSIO_DEBUG */
#if defined(PHYSIO_DEBUG)
#define DPRINTF(a) printf a
#else /* defined(PHYSIO_DEBUG) */
#define DPRINTF(a) /* nothing */
#endif /* defined(PHYSIO_DEBUG) */
struct physio_stat {
int ps_running;
int ps_error;
int ps_failed;
off_t ps_endoffset;
size_t ps_resid;
buf_t *ps_orig_bp;
kmutex_t ps_lock;
kcondvar_t ps_cv;
};
static void
physio_done(struct work *wk, void *dummy)
{
struct buf *bp = (void *)wk;
size_t todo = bp->b_bufsize;
size_t done = bp->b_bcount - bp->b_resid;
struct physio_stat *ps = bp->b_private;
bool is_iobuf;
KASSERT(&bp->b_work == wk);
KASSERT(bp->b_bcount <= todo);
KASSERT(bp->b_resid <= bp->b_bcount);
KASSERT((bp->b_flags & B_PHYS) != 0);
KASSERT(dummy == NULL);
vunmapbuf(bp, todo);
uvm_vsunlock(bp->b_proc->p_vmspace, bp->b_data, todo);
mutex_enter(&ps->ps_lock);
is_iobuf = (bp != ps->ps_orig_bp);
if (__predict_false(done != todo)) {
off_t endoffset = dbtob(bp->b_blkno) + done;
/*
* we got an error or hit EOM.
*
* we only care about the first one.
* ie. the one at the lowest offset.
*/
KASSERT(ps->ps_endoffset != endoffset);
DPRINTF(("%s: error=%d at %" PRIu64 " - %" PRIu64
", blkno=%" PRIu64 ", bcount=%d, flags=0x%x\n",
__func__, bp->b_error, dbtob(bp->b_blkno), endoffset,
bp->b_blkno, bp->b_bcount, bp->b_flags));
if (ps->ps_endoffset == -1 || endoffset < ps->ps_endoffset) {
DPRINTF(("%s: ps=%p, error %d -> %d, endoff %" PRIu64
" -> %" PRIu64 "\n",
__func__, ps,
ps->ps_error, bp->b_error,
ps->ps_endoffset, endoffset));
ps->ps_endoffset = endoffset;
ps->ps_error = bp->b_error;
}
ps->ps_failed++;
ps->ps_resid += todo - done;
} else {
KASSERT(bp->b_error == 0);
}
ps->ps_running--;
cv_signal(&ps->ps_cv);
mutex_exit(&ps->ps_lock);
if (is_iobuf)
putiobuf(bp);
}
static void
physio_biodone(struct buf *bp)
{
#if defined(DIAGNOSTIC)
struct physio_stat *ps = bp->b_private;
size_t todo = bp->b_bufsize;
size_t done = bp->b_bcount - bp->b_resid;
	KASSERT(ps->ps_running > 0);
	KASSERT(bp->b_bcount <= todo);
	KASSERT(bp->b_resid <= bp->b_bcount);
	if (done == todo)
		KASSERTMSG(bp->b_error == 0, "error=%d", bp->b_error);
#endif /* defined(DIAGNOSTIC) */
workqueue_enqueue(physio_workqueue, &bp->b_work, NULL);
}
static void
physio_wait(struct physio_stat *ps, int n)
{
	KASSERT(mutex_owned(&ps->ps_lock));

	while (ps->ps_running > n)
cv_wait(&ps->ps_cv, &ps->ps_lock);
}
static int
physio_init(void)
{
int error;
KASSERT(physio_workqueue == NULL);
error = workqueue_create(&physio_workqueue, "physiod",
physio_done, NULL, PRI_BIO, IPL_BIO, WQ_MPSAFE);
return error;
}
/*
* Do "physical I/O" on behalf of a user. "Physical I/O" is I/O directly
* from the raw device to user buffers, and bypasses the buffer cache.
*/
int
physio(void (*strategy)(struct buf *), struct buf *obp, dev_t dev, int flags,
void (*min_phys)(struct buf *), struct uio *uio)
{
struct iovec *iovp;
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
int i, error;
struct buf *bp = NULL;
struct physio_stat *ps;
int concurrency = physio_concurrency - 1;
int isdisk;
	error = RUN_ONCE(&physio_initialized, physio_init);
	if (__predict_false(error != 0)) {
return error;
}
DPRINTF(("%s: called: off=%" PRIu64 ", resid=%zu\n",
__func__, uio->uio_offset, uio->uio_resid));
flags &= B_READ | B_WRITE;
ps = kmem_zalloc(sizeof(*ps), KM_SLEEP);
/* ps->ps_running = 0; */
/* ps->ps_error = 0; */
/* ps->ps_failed = 0; */
ps->ps_orig_bp = obp;
ps->ps_endoffset = -1;
ps->ps_resid = 0;
mutex_init(&ps->ps_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&ps->ps_cv, "physio");
/* Allow concurrent I/O only for disks */
isdisk = cdev_type(dev) == D_DISK;
if (!isdisk)
concurrency = 0;
/* Make sure we have a buffer, creating one if necessary. */
if (obp != NULL) {
mutex_enter(&bufcache_lock);
/* Mark it busy, so nobody else will use it. */
while (bbusy(obp, false, 0, NULL) == EPASSTHROUGH)
;
mutex_exit(&bufcache_lock);
concurrency = 0; /* see "XXXkludge" comment below */
}
for (i = 0; i < uio->uio_iovcnt; i++) {
bool sync = true;
iovp = &uio->uio_iov[i];
while (iovp->iov_len > 0) {
size_t todo;
vaddr_t endp;
mutex_enter(&ps->ps_lock);
if (ps->ps_failed != 0) {
goto done_locked;
}
physio_wait(ps, sync ? 0 : concurrency);
mutex_exit(&ps->ps_lock);
if (obp != NULL) {
/*
* XXXkludge
* some drivers use "obp" as an identifier.
*/
bp = obp;
} else {
bp = getiobuf(NULL, true);
bp->b_cflags |= BC_BUSY;
}
bp->b_dev = dev;
bp->b_proc = p;
bp->b_private = ps;
/*
			 * Mark the buffer busy for physical I/O. Also set
* B_PHYS because it's an I/O to user memory, and
* B_RAW because B_RAW is to be "set by physio for
* raw transfers".
*/
bp->b_oflags = 0;
bp->b_cflags |= BC_BUSY;
bp->b_flags = flags | B_PHYS | B_RAW;
bp->b_iodone = physio_biodone;
/* Set up the buffer for a maximum-sized transfer. */
bp->b_blkno = btodb(uio->uio_offset);
if (isdisk) {
/*
* For disks, check that offsets are at least block
				 * aligned; the block addresses are used to track
* errors of finished requests.
*/
if (uio->uio_offset & (DEV_BSIZE - 1)) {
error = EINVAL;
goto done;
}
/*
* Split request into MAXPHYS chunks
*/
bp->b_bcount = MIN(MAXPHYS, iovp->iov_len);
} else {
bp->b_bcount = MIN(INT_MAX, iovp->iov_len);
}
bp->b_data = iovp->iov_base;
/*
* Call minphys to bound the transfer size,
* and remember the amount of data to transfer,
* for later comparison.
*/
(*min_phys)(bp);
todo = bp->b_bufsize = bp->b_bcount;
#if defined(DIAGNOSTIC)
if (todo > MAXPHYS)
panic("todo(%zu) > MAXPHYS; minphys broken",
todo);
#endif /* defined(DIAGNOSTIC) */
sync = false;
endp = (vaddr_t)bp->b_data + todo;
if (trunc_page(endp) != endp) {
/*
* Following requests can overlap.
				 * Note that uvm_vslock does round_page.
*/
sync = true;
}
/*
* Lock the part of the user address space involved
* in the transfer.
*/
error = uvm_vslock(p->p_vmspace, bp->b_data, todo,
(flags & B_READ) ? VM_PROT_WRITE : VM_PROT_READ);
if (error) {
goto done;
}
/*
* Beware vmapbuf(); if successful it clobbers
* b_data and saves it in b_saveaddr.
* However, vunmapbuf() restores b_data.
*/
if ((error = vmapbuf(bp, todo)) != 0) {
uvm_vsunlock(p->p_vmspace, bp->b_data, todo);
goto done;
}
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
mutex_enter(&ps->ps_lock);
ps->ps_running++;
mutex_exit(&ps->ps_lock);
/* Call strategy to start the transfer. */
(*strategy)(bp);
bp = NULL;
iovp->iov_len -= todo;
iovp->iov_base = (char *)iovp->iov_base + todo;
uio->uio_offset += todo;
uio->uio_resid -= todo;
}
}
done:
mutex_enter(&ps->ps_lock);
done_locked:
physio_wait(ps, 0);
mutex_exit(&ps->ps_lock);
KASSERT(ps->ps_failed || ps->ps_endoffset == -1);
/*
	 * Compute the residual; for disks, adjust for the
	 * lowest-numbered block that returned an error.
*/
if (isdisk) {
if (ps->ps_failed != 0) {
off_t delta;
delta = uio->uio_offset - ps->ps_endoffset;
KASSERT(delta > 0);
uio->uio_resid += delta;
/* uio->uio_offset = ps->ps_endoffset; */
}
} else {
uio->uio_resid += ps->ps_resid;
}
	if (bp != NULL && bp != obp) {
		putiobuf(bp);
}
	if (error == 0) {
		error = ps->ps_error;
}
mutex_destroy(&ps->ps_lock);
cv_destroy(&ps->ps_cv);
kmem_free(ps, sizeof(*ps));
/*
* Clean up the state of the buffer. Remember if somebody wants
* it, so we can wake them up below. Also, if we had to steal it,
* give it back.
*/
	if (obp != NULL) {
		KASSERT((obp->b_cflags & BC_BUSY) != 0);
/*
* If another process is waiting for the raw I/O buffer,
* wake up processes waiting to do physical I/O;
*/
mutex_enter(&bufcache_lock);
obp->b_cflags &= ~(BC_BUSY | BC_WANTED);
obp->b_flags &= ~(B_PHYS | B_RAW);
obp->b_iodone = NULL;
cv_broadcast(&obp->b_busy);
mutex_exit(&bufcache_lock);
}
DPRINTF(("%s: done: off=%" PRIu64 ", resid=%zu\n",
__func__, uio->uio_offset, uio->uio_resid));
return error;
}
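/*
 * Example (a hypothetical driver sketch; xxread and xxstrategy are
 * placeholder names): a character device read routine typically wraps
 * physio(), passing its strategy routine and the default minphys().
 *
 *	int
 *	xxread(dev_t dev, struct uio *uio, int flags)
 *	{
 *
 *		return physio(xxstrategy, NULL, dev, B_READ, minphys, uio);
 *	}
 */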
/*
* A minphys() routine is called by physio() to adjust the size of each
* I/O transfer before the latter is passed to the strategy routine.
*
* This minphys() is a default that must be called to enforce limits
* that are applicable to all devices, because of limitations in the
* kernel or the hardware platform.
*/
void
minphys(struct buf *bp)
{

	if (bp->b_bcount > MAXPHYS)
		bp->b_bcount = MAXPHYS;
}
/* $NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $ */
/*
* Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
*
* Further information about snapshots can be obtained from:
*
* Marshall Kirk McKusick http://www.mckusick.com/softdep/
* 1614 Oxford Street mckusick@mckusick.com
* Berkeley, CA 94709-1608 +1-510-843-9542
* USA
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ffs_snapshot.c 8.11 (McKusick) 7/23/00
*
* from FreeBSD: ffs_snapshot.c,v 1.79 2004/02/13 02:02:06 kuriyama Exp
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ffs_snapshot.c,v 1.155 2023/05/11 23:11:25 chs Exp $");
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#include "opt_quota.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/sched.h>
#include <sys/stat.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/wapbl.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <uvm/uvm.h>
TAILQ_HEAD(inodelst, inode); /* List of active snapshots */
struct snap_info {
kmutex_t si_lock; /* Lock this snapinfo */
kmutex_t si_snaplock; /* Snapshot vnode common lock */
lwp_t *si_owner; /* Snaplock owner */
struct inodelst si_snapshots; /* List of active snapshots */
daddr_t *si_snapblklist; /* Snapshot block hints list */
uint32_t si_gen; /* Incremented on change */
};
#if !defined(FFS_NO_SNAPSHOT)
typedef int (*acctfunc_t)
(struct vnode *, void *, int, int, struct fs *, daddr_t, int);
static int snapshot_setup(struct mount *, struct vnode *);
static int snapshot_copyfs(struct mount *, struct vnode *, void **);
static int snapshot_expunge(struct mount *, struct vnode *,
struct fs *, daddr_t *, daddr_t **);
static int snapshot_expunge_snap(struct mount *, struct vnode *,
struct fs *, daddr_t);
static int snapshot_writefs(struct mount *, struct vnode *, void *);
static int cgaccount(struct vnode *, int, int *);
static int cgaccount1(int, struct vnode *, void *, int);
static int expunge(struct vnode *, struct inode *, struct fs *,
acctfunc_t, int);
static int indiracct(struct vnode *, struct vnode *, int, daddr_t,
daddr_t, daddr_t, daddr_t, daddr_t, struct fs *, acctfunc_t, int);
static int fullacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
static int snapacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
static int mapacct(struct vnode *, void *, int, int, struct fs *,
daddr_t, int);
#endif /* !defined(FFS_NO_SNAPSHOT) */
static int ffs_copyonwrite(void *, struct buf *, bool);
static int snapblkaddr(struct vnode *, daddr_t, daddr_t *);
static int rwfsblk(struct vnode *, int, void *, daddr_t);
static int syncsnap(struct vnode *);
static int wrsnapblk(struct vnode *, void *, daddr_t);
#if !defined(FFS_NO_SNAPSHOT)
static int blocks_in_journal(struct fs *);
#endif
static inline bool is_active_snapshot(struct snap_info *, struct inode *);
static inline daddr_t db_get(struct inode *, int);
static inline void db_assign(struct inode *, int, daddr_t);
static inline daddr_t ib_get(struct inode *, int);
static inline daddr_t idb_get(struct inode *, void *, int);
static inline void idb_assign(struct inode *, void *, int, daddr_t);
#ifdef DEBUG
static int snapdebug = 0;
#endif
int
ffs_snapshot_init(struct ufsmount *ump)
{
struct snap_info *si;
si = ump->um_snapinfo = kmem_alloc(sizeof(*si), KM_SLEEP);
TAILQ_INIT(&si->si_snapshots);
mutex_init(&si->si_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&si->si_snaplock, MUTEX_DEFAULT, IPL_NONE);
si->si_owner = NULL;
si->si_gen = 0;
si->si_snapblklist = NULL;
return 0;
}
void
ffs_snapshot_fini(struct ufsmount *ump)
{
struct snap_info *si;
si = ump->um_snapinfo;
ump->um_snapinfo = NULL;
KASSERT(TAILQ_EMPTY(&si->si_snapshots));
mutex_destroy(&si->si_lock);
mutex_destroy(&si->si_snaplock);
KASSERT(si->si_snapblklist == NULL);
kmem_free(si, sizeof(*si));
}
/*
* Create a snapshot file and initialize it for the filesystem.
* Vnode is locked on entry and return.
*/
int
ffs_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ctime)
{
#if defined(FFS_NO_SNAPSHOT)
return EOPNOTSUPP;
}
#else /* defined(FFS_NO_SNAPSHOT) */
bool suspended = false;
int error, redo = 0, snaploc;
void *sbbuf = NULL;
daddr_t *snaplist = NULL, snaplistsize = 0;
struct buf *bp, *nbp;
struct fs *copy_fs = NULL;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
struct timespec ts;
struct timeval starttime;
#ifdef DEBUG
struct timeval endtime;
#endif
struct vnode *devvp = ip->i_devvp;
/*
* If the vnode already is a snapshot, return.
*/
if ((ip->i_flags & SF_SNAPSHOT)) {
if ((ip->i_flags & SF_SNAPINVAL))
return EINVAL;
if (ctime) {
ctime->tv_sec = DIP(ip, mtime);
ctime->tv_nsec = DIP(ip, mtimensec);
}
return 0;
}
/*
* Check for free snapshot slot in the superblock.
*/
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
if (fs->fs_snapinum[snaploc] == 0)
break;
if (snaploc == FSMAXSNAP)
return (ENOSPC);
/*
* Prepare the vnode to become a snapshot.
*/
error = snapshot_setup(mp, vp);
if (error)
goto out;
/*
* Copy all the cylinder group maps. Although the
* filesystem is still active, we hope that only a few
* cylinder groups will change between now and when we
* suspend operations. Thus, we will be able to quickly
* touch up the few cylinder groups that changed during
* the suspension period.
*/
error = cgaccount(vp, 1, NULL);
if (error)
goto out;
/*
* snapshot is now valid
*/
ip->i_flags &= ~SF_SNAPINVAL;
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Ensure that the snapshot is completely on disk.
* Since we have marked it as a snapshot it is safe to
* unlock it as no process will be allowed to write to it.
*/
error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
if (error)
goto out;
VOP_UNLOCK(vp);
/*
* All allocations are done, so we can now suspend the filesystem.
*/
error = vfs_suspend(vp->v_mount, 0);
if (error == 0) {
suspended = true;
vrele_flush(vp->v_mount);
error = VFS_SYNC(vp->v_mount, MNT_WAIT, curlwp->l_cred);
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (error)
goto out;
getmicrotime(&starttime);
/*
* First, copy all the cylinder group maps that have changed.
*/
error = cgaccount(vp, 2, &redo);
if (error)
goto out;
/*
* Create a copy of the superblock and its summary information.
*/
error = snapshot_copyfs(mp, vp, &sbbuf);
if (error)
goto out;
copy_fs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
/*
* Expunge unlinked files from our view.
*/
error = snapshot_expunge(mp, vp, copy_fs, &snaplistsize, &snaplist);
if (error)
goto out;
/*
* Record snapshot inode. Since this is the newest snapshot,
* it must be placed at the end of the list.
*/
if (ip->i_nlink > 0)
fs->fs_snapinum[snaploc] = ip->i_number;
mutex_enter(&si->si_lock);
if (is_active_snapshot(si, ip))
panic("ffs_snapshot: %"PRIu64" already on list", ip->i_number);
TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
if (TAILQ_FIRST(&si->si_snapshots) == ip) {
/*
* If this is the first snapshot on this filesystem, put the
* preliminary list in place and establish the cow handler.
*/
si->si_snapblklist = snaplist;
fscow_establish(mp, ffs_copyonwrite, devvp);
}
si->si_gen++;
mutex_exit(&si->si_lock);
vp->v_vflag |= VV_SYSTEM;
/*
* Set the mtime to the time the snapshot has been taken.
*/
TIMEVAL_TO_TIMESPEC(&starttime, &ts);
if (ctime)
*ctime = ts;
DIP_ASSIGN(ip, mtime, ts.tv_sec);
DIP_ASSIGN(ip, mtimensec, ts.tv_nsec);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
/*
* Copy allocation information from all snapshots and then
* expunge them from our view.
*/
error = snapshot_expunge_snap(mp, vp, copy_fs, snaplistsize);
if (error)
goto out;
/*
* Write the superblock and its summary information to the snapshot.
*/
error = snapshot_writefs(mp, vp, sbbuf);
if (error)
goto out;
/*
* We're nearly done, ensure that the snapshot is completely on disk.
*/
error = VOP_FSYNC(vp, l->l_cred, FSYNC_WAIT, 0, 0);
if (error)
goto out;
/*
* Invalidate and free all pages on the snapshot vnode.
* We will read and write through the buffercache.
*/
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
error = VOP_PUTPAGES(vp, 0, 0,
PGO_ALLPAGES | PGO_CLEANIT | PGO_SYNCIO | PGO_FREE);
if (error)
goto out;
/*
* Invalidate short ( < fs_bsize ) buffers. We will always read
* full size buffers later.
*/
mutex_enter(&bufcache_lock);
KASSERT(LIST_FIRST(&vp->v_dirtyblkhd) == NULL);
for (bp = LIST_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) {
nbp = LIST_NEXT(bp, b_vnbufs);
if (bp->b_bcount == fs->fs_bsize)
continue;
error = bbusy(bp, false, 0, NULL);
if (error != 0) {
if (error == EPASSTHROUGH) {
nbp = LIST_FIRST(&vp->v_cleanblkhd);
continue;
}
break;
}
brelsel(bp, BC_INVAL | BC_VFLUSH);
}
mutex_exit(&bufcache_lock);
out:
if (sbbuf != NULL) {
free(copy_fs->fs_csp, M_UFSMNT);
free(sbbuf, M_UFSMNT);
}
if (fs->fs_active != NULL) {
free(fs->fs_active, M_DEVBUF);
fs->fs_active = NULL;
}
mutex_enter(&si->si_lock);
if (snaplist != NULL) {
if (si->si_snapblklist == snaplist)
si->si_snapblklist = NULL;
free(snaplist, M_UFSMNT);
}
if (error) {
fs->fs_snapinum[snaploc] = 0;
} else {
/*
* As this is the newest list, it is the most inclusive, so
* should replace the previous list.
*/
si->si_snapblklist = ip->i_snapblklist;
}
si->si_gen++;
mutex_exit(&si->si_lock);
if (suspended) {
VOP_UNLOCK(vp);
vfs_resume(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef DEBUG
getmicrotime(&endtime);
timersub(&endtime, &starttime, &endtime);
printf("%s: suspended %lld.%03d sec, redo %d of %d\n",
mp->mnt_stat.f_mntonname, (long long)endtime.tv_sec,
endtime.tv_usec / 1000, redo, fs->fs_ncg);
#endif
}
if (error) {
if (UFS_WAPBL_BEGIN(mp) == 0) {
/*
* We depend on ffs_truncate() to call ffs_snapremove()
* before it may return an error. On failed
* ffs_truncate() we have normal file with leaked
* (meta-) data, but no snapshot to use.
*/
(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
UFS_WAPBL_END(mp);
}
} else if (ip->i_nlink > 0)
vref(vp);
return (error);
}
/*
* Prepare vnode to become a snapshot.
*/
static int
snapshot_setup(struct mount *mp, struct vnode *vp)
{
int error, n, len, loc, cg;
daddr_t blkno, numblks;
struct buf *ibp, *nbp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct lwp *l = curlwp;
const int wbreak = blocks_in_journal(fs)/8;
struct inode *ip = VTOI(vp);
/*
* Check mount, readonly reference and owner.
*/
if (vp->v_mount != mp)
return EXDEV;
if (vp->v_writecount != 0)
return EBUSY;
error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_FS_SNAPSHOT,
0, mp, vp, NULL);
if (error)
return EACCES;
/*
* Must completely truncate the file here. Allocated
* blocks on a snapshot mean that block has been copied
* on write, see ffs_copyonwrite() testing "blkno != 0"
*/
error = ufs_truncate_all(vp);
if (error)
return error;
/* Change inode to snapshot type file. */
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
#if defined(QUOTA) || defined(QUOTA2)
/* snapshot inodes are not accounted in quotas */
chkiq(ip, -1, l->l_cred, 0);
#endif
ip->i_flags |= (SF_SNAPSHOT | SF_SNAPINVAL);
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
ffs_update(vp, NULL, NULL, UPDATE_WAIT);
UFS_WAPBL_END(mp);
KASSERT(ip->i_flags & SF_SNAPSHOT);
/*
* Write an empty list of preallocated blocks to the end of
* the snapshot to set size to at least that of the filesystem.
*/
numblks = howmany(fs->fs_size, fs->fs_frag);
blkno = 1;
blkno = ufs_rw64(blkno, UFS_FSNEEDSWAP(fs));
error = vn_rdwr(UIO_WRITE, vp,
(void *)&blkno, sizeof(blkno), ffs_lblktosize(fs, (off_t)numblks),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, l->l_cred, NULL, NULL);
if (error)
return error;
/*
* Preallocate critical data structures so that we can copy
* them in without further allocation after we suspend all
* operations on the filesystem. We would like to just release
* the allocated buffers without writing them since they will
* be filled in below once we are ready to go, but this upsets
* the soft update code, so we go ahead and write the new buffers.
*
* Allocate all indirect blocks and mark all of them as not
* needing to be copied.
*/
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for (blkno = UFS_NDADDR, n = 0; blkno < numblks; blkno += FFS_NINDIR(fs)) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
goto out;
brelse(ibp, 0);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
/*
* Allocate copies for the superblock and its summary information.
*/
error = ffs_balloc(vp, fs->fs_sblockloc, fs->fs_sbsize, l->l_cred,
0, &nbp);
if (error)
goto out;
bawrite(nbp);
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
len = howmany(fs->fs_cssize, fs->fs_bsize);
for (loc = 0; loc < len; loc++) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(blkno + loc)),
fs->fs_bsize, l->l_cred, 0, &nbp);
if (error)
goto out;
bawrite(nbp);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
/*
* Allocate all cylinder group blocks.
*/
for (cg = 0; cg < fs->fs_ncg; cg++) {
error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
fs->fs_bsize, l->l_cred, 0, &nbp);
if (error)
goto out;
bawrite(nbp);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
out:
UFS_WAPBL_END(mp);
return error;
}
/*
* Create a copy of the superblock and its summary information.
* It is up to the caller to free copyfs and copyfs->fs_csp.
*/
static int
snapshot_copyfs(struct mount *mp, struct vnode *vp, void **sbbuf)
{
int error, i, len, loc, size;
void *space;
int32_t *lp;
struct buf *bp;
struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
struct vnode *devvp = VTOI(vp)->i_devvp;
/*
* Grab a copy of the superblock and its summary information.
* We delay writing it until the suspension is released below.
*/
*sbbuf = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
loc = ffs_blkoff(fs, fs->fs_sblockloc);
if (loc > 0)
memset(*sbbuf, 0, loc);
copyfs = (struct fs *)((char *)(*sbbuf) + loc);
memcpy(copyfs, fs, fs->fs_sbsize);
size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
if (fs->fs_sbsize < size)
memset((char *)(*sbbuf) + loc + fs->fs_sbsize, 0,
size - fs->fs_sbsize);
size = ffs_blkroundup(fs, fs->fs_cssize);
if (fs->fs_contigsumsize > 0)
size += fs->fs_ncg * sizeof(int32_t);
space = malloc(size, M_UFSMNT, M_WAITOK);
copyfs->fs_csp = space;
memcpy(copyfs->fs_csp, fs->fs_csp, fs->fs_cssize);
space = (char *)space + fs->fs_cssize;
loc = howmany(fs->fs_cssize, fs->fs_fsize);
i = fs->fs_frag - loc % fs->fs_frag;
len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
if (len > 0) {
if ((error = bread(devvp, FFS_FSBTODB(fs, fs->fs_csaddr + loc),
len, 0, &bp)) != 0) {
free(copyfs->fs_csp, M_UFSMNT);
free(*sbbuf, M_UFSMNT);
*sbbuf = NULL;
return error;
}
memcpy(space, bp->b_data, (u_int)len);
space = (char *)space + len;
brelse(bp, BC_INVAL | BC_NOCACHE);
}
if (fs->fs_contigsumsize > 0) {
copyfs->fs_maxcluster = lp = space;
for (i = 0; i < fs->fs_ncg; i++)
*lp++ = fs->fs_contigsumsize;
}
if (mp->mnt_wapbl)
copyfs->fs_flags &= ~FS_DOWAPBL;
return 0;
}
struct snapshot_expunge_ctx {
struct vnode *logvp;
struct vnode *vp;
struct fs *copy_fs;
};
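/*
 * Select the vnodes snapshot_expunge() must process: the
 * in-filesystem journal vnode, plus any unlinked inode that the
 * snapshot's filesystem copy does not already show as free.
 */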
static bool
snapshot_expunge_selector(void *cl, struct vnode *xvp)
{
struct snapshot_expunge_ctx *c = cl;
struct inode *xp;
KASSERT(mutex_owned(xvp->v_interlock));
xp = VTOI(xvp);
if (xvp->v_type == VNON || VTOI(xvp) == NULL ||
(xp->i_flags & SF_SNAPSHOT))
return false;
#ifdef DEBUG
if (snapdebug)
vprint("ffs_snapshot: busy vnode", xvp);
#endif
if (xvp == c->logvp)
return true;
if (xp->i_nlink > 0)
return false;
if (ffs_checkfreefile(c->copy_fs, c->vp, xp->i_number))
return false;
return true;
}
/*
* We must check for active files that have been unlinked (i.e., with a zero
* link count). We have to expunge all trace of these files from the snapshot
* so that they are not reclaimed prematurely by fsck or unnecessarily dumped.
* Note that we skip unlinked snapshot files as they will be handled separately.
* Calculate the snapshot list size and create a preliminary list.
*/
static int
snapshot_expunge(struct mount *mp, struct vnode *vp, struct fs *copy_fs,
daddr_t *snaplistsize, daddr_t **snaplist)
{
int cg, error = 0, len, loc;
daddr_t blkno, *blkp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *xp;
struct vnode *logvp = NULL, *xvp;
struct vnode_iterator *marker;
struct snapshot_expunge_ctx ctx;
*snaplist = NULL;
/*
* Get the log inode if any.
*/
if ((fs->fs_flags & FS_DOWAPBL) &&
fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
error = VFS_VGET(mp, fs->fs_journallocs[UFS_WAPBL_INFS_INO],
LK_EXCLUSIVE, &logvp);
if (error)
goto out;
}
/*
* We also calculate the needed size for the snapshot list.
*/
*snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
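/*
 * This is only an upper bound: one more entry is added below for
 * every unlinked inode that gets expunged, and the list is trimmed
 * to the number of entries actually used in snapshot_expunge_snap().
 */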
vfs_vnode_iterator_init(mp, &marker);
ctx.logvp = logvp;
ctx.vp = vp;
ctx.copy_fs = copy_fs;
while ((xvp = vfs_vnode_iterator_next(marker, snapshot_expunge_selector,
&ctx)))
{
/*
* If there is a fragment, clear it here.
*/
xp = VTOI(xvp);
blkno = 0;
loc = howmany(xp->i_size, fs->fs_bsize) - 1;
if (loc < UFS_NDADDR) {
len = ffs_fragroundup(fs, ffs_blkoff(fs, xp->i_size));
if (len > 0 && len < fs->fs_bsize) {
error = UFS_WAPBL_BEGIN(mp);
if (error) {
vrele(xvp);
vfs_vnode_iterator_destroy(marker);
goto out;
}
ffs_blkfree_snap(copy_fs, vp, db_get(xp, loc),
len, xp->i_number);
blkno = db_get(xp, loc);
db_assign(xp, loc, 0);
UFS_WAPBL_END(mp);
}
}
*snaplistsize += 1;
error = expunge(vp, xp, copy_fs, fullacct, BLK_NOCOPY);
if (blkno)
db_assign(xp, loc, blkno);
if (!error) {
error = UFS_WAPBL_BEGIN(mp);
if (!error) {
error = ffs_freefile_snap(copy_fs, vp,
xp->i_number, xp->i_mode);
UFS_WAPBL_END(mp);
}
}
vrele(xvp);
if (error) {
vfs_vnode_iterator_destroy(marker);
goto out;
}
}
vfs_vnode_iterator_destroy(marker);
/*
* Create a preliminary list of preallocated snapshot blocks.
*/
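/*
 * Slot 0 holds the number of entries in use (filled in at the end);
 * the remaining slots hold logical block numbers in ascending order,
 * which the binary search in ffs_copyonwrite() relies on.
 */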
*snaplist = malloc(*snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
blkp = &(*snaplist)[1];
*blkp++ = ffs_lblkno(fs, fs->fs_sblockloc);
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
for (cg = 0; cg < fs->fs_ncg; cg++) {
if (ffs_fragstoblks(fs, cgtod(fs, cg)) > blkno)
break;
*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
}
len = howmany(fs->fs_cssize, fs->fs_bsize);
for (loc = 0; loc < len; loc++)
*blkp++ = blkno + loc;
for (; cg < fs->fs_ncg; cg++)
*blkp++ = ffs_fragstoblks(fs, cgtod(fs, cg));
(*snaplist)[0] = blkp - &(*snaplist)[0];
out:
if (logvp != NULL)
vput(logvp);
if (error && *snaplist != NULL) {
free(*snaplist, M_UFSMNT);
*snaplist = NULL;
}
return error;
}
/*
* Copy allocation information from all the snapshots into this snapshot and
* then expunge them from its view. Also, collect the list of allocated
* blocks in i_snapblklist.
*/
static int
snapshot_expunge_snap(struct mount *mp, struct vnode *vp,
struct fs *copy_fs, daddr_t snaplistsize)
{
int error = 0, i;
daddr_t numblks, *snaplist = NULL;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp), *xp;
struct lwp *l = curlwp;
struct snap_info *si = VFSTOUFS(mp)->um_snapinfo;
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap) {
if (xp != ip) {
error = expunge(vp, xp, fs, snapacct, BLK_SNAP);
if (error)
break;
}
if (xp->i_nlink != 0)
continue;
error = UFS_WAPBL_BEGIN(mp);
if (error)
break;
error = ffs_freefile_snap(copy_fs, vp, xp->i_number, xp->i_mode);
UFS_WAPBL_END(mp);
if (error)
break;
}
if (error)
goto out;
/*
* Allocate space for the full list of preallocated snapshot blocks.
*/
snaplist = malloc(snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
ip->i_snapblklist = &snaplist[1];
/*
* Expunge the blocks used by the snapshots from the set of
* blocks marked as used in the snapshot bitmaps. Also, collect
* the list of allocated blocks in i_snapblklist.
*/
error = expunge(vp, ip, copy_fs, mapacct, BLK_SNAP);
if (error)
goto out;
if (snaplistsize < ip->i_snapblklist - snaplist)
panic("ffs_snapshot: list too small");
snaplistsize = ip->i_snapblklist - snaplist;
snaplist[0] = snaplistsize;
ip->i_snapblklist = &snaplist[0];
/*
* Write out the list of allocated blocks to the end of the snapshot.
*/
numblks = howmany(fs->fs_size, fs->fs_frag);
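/*
 * Byte-swap the list to on-disk order for the write, then swap it
 * back so the in-core copy stays in host byte order.
 */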
for (i = 0; i < snaplistsize; i++)
snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
error = vn_rdwr(UIO_WRITE, vp, (void *)snaplist,
snaplistsize * sizeof(daddr_t), ffs_lblktosize(fs, (off_t)numblks),
UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT, l->l_cred, NULL, NULL);
for (i = 0; i < snaplistsize; i++)
snaplist[i] = ufs_rw64(snaplist[i], UFS_FSNEEDSWAP(fs));
out:
if (error && snaplist != NULL) {
free(snaplist, M_UFSMNT);
ip->i_snapblklist = NULL;
}
return error;
}
/*
* Write the superblock and its summary information to the snapshot.
* Make sure the first UFS_NDADDR blocks get copied to the snapshot.
*/
static int
snapshot_writefs(struct mount *mp, struct vnode *vp, void *sbbuf)
{
int error, len, loc;
void *space;
daddr_t blkno;
struct buf *bp;
struct fs *copyfs, *fs = VFSTOUFS(mp)->um_fs;
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
copyfs = (struct fs *)((char *)sbbuf + ffs_blkoff(fs, fs->fs_sblockloc));
/*
* Write the superblock and its summary information
* to the snapshot.
*/
blkno = ffs_fragstoblks(fs, fs->fs_csaddr);
len = howmany(fs->fs_cssize, fs->fs_bsize);
space = copyfs->fs_csp;
#ifdef FFS_EI
if (UFS_FSNEEDSWAP(fs)) {
ffs_sb_swap(copyfs, copyfs);
ffs_csum_swap(space, space, fs->fs_cssize);
}
#endif
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for (loc = 0; loc < len; loc++) {
error = bread(vp, blkno + loc, fs->fs_bsize,
B_MODIFY, &bp);
if (error) {
break;
}
memcpy(bp->b_data, space, fs->fs_bsize);
space = (char *)space + fs->fs_bsize;
bawrite(bp);
}
if (error)
goto out;
error = bread(vp, ffs_lblkno(fs, fs->fs_sblockloc),
fs->fs_bsize, B_MODIFY, &bp);
if (error) {
goto out;
} else {
memcpy(bp->b_data, sbbuf, fs->fs_bsize);
bawrite(bp);
}
/*
* Copy the first UFS_NDADDR blocks to the snapshot so
* ffs_copyonwrite() and ffs_snapblkfree() will always work on
* indirect blocks.
*/
for (loc = 0; loc < UFS_NDADDR; loc++) {
if (db_get(ip, loc) != 0)
continue;
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)loc),
fs->fs_bsize, l->l_cred, 0, &bp);
if (error)
break;
error = rwfsblk(vp, B_READ, bp->b_data, loc);
if (error) {
brelse(bp, 0);
break;
}
bawrite(bp);
}
out:
UFS_WAPBL_END(mp);
return error;
}
/*
* Copy all cylinder group maps.
*/
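/*
 * Pass 1 allocates fs->fs_active and copies every group; pass 2
 * (run with the filesystem suspended) only redoes the groups whose
 * fs_active bit is no longer set. *redo returns the number of
 * groups processed in this pass.
 */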
static int
cgaccount(struct vnode *vp, int passno, int *redo)
{
int cg, error = 0;
struct buf *nbp;
struct fs *fs = VTOI(vp)->i_fs;
if (redo != NULL)
*redo = 0;
if (passno == 1)
fs->fs_active = malloc(howmany(fs->fs_ncg, NBBY),
M_DEVBUF, M_WAITOK | M_ZERO);
for (cg = 0; cg < fs->fs_ncg; cg++) {
if (passno == 2 && ACTIVECG_ISSET(fs, cg))
continue;
if (redo != NULL)
*redo += 1;
error = UFS_WAPBL_BEGIN(vp->v_mount);
if (error)
return error;
error = ffs_balloc(vp, ffs_lfragtosize(fs, cgtod(fs, cg)),
fs->fs_bsize, curlwp->l_cred, 0, &nbp);
if (error) {
UFS_WAPBL_END(vp->v_mount);
break;
}
error = cgaccount1(cg, vp, nbp->b_data, passno);
bawrite(nbp);
UFS_WAPBL_END(vp->v_mount);
if (error)
break;
}
return error;
}
/*
* Copy a cylinder group map. All the unallocated blocks are marked
* BLK_NOCOPY so that the snapshot knows that it need not copy them
* if they are later written. If passno is 1, then this is a first
* pass, so only setting needs to be done. If passno is 2, then this
* is a revision to a previous pass which must be undone as the
* replacement pass is done.
*/
static int
cgaccount1(int cg, struct vnode *vp, void *data, int passno)
{
struct buf *bp, *ibp;
struct inode *ip;
struct cg *cgp;
struct fs *fs;
struct lwp *l = curlwp;
daddr_t base, numblks;
int error, len, loc, ns __unused, indiroff;
ip = VTOI(vp);
fs = ip->i_fs;
ns = UFS_FSNEEDSWAP(fs);
error = bread(ip->i_devvp, FFS_FSBTODB(fs, cgtod(fs, cg)),
(int)fs->fs_cgsize, 0, &bp);
if (error) {
return (error);
}
cgp = (struct cg *)bp->b_data;
if (!cg_chkmagic(cgp, ns)) {
brelse(bp, 0);
return (EIO);
}
ACTIVECG_SET(fs, cg);
memcpy(data, bp->b_data, fs->fs_cgsize);
brelse(bp, 0);
if (fs->fs_cgsize < fs->fs_bsize)
memset((char *)data + fs->fs_cgsize, 0,
fs->fs_bsize - fs->fs_cgsize);
numblks = howmany(fs->fs_size, fs->fs_frag);
len = howmany(fs->fs_fpg, fs->fs_frag);
base = cgbase(fs, cg) / fs->fs_frag;
if (base + len >= numblks)
len = numblks - base - 1;
loc = 0;
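/*
 * Blocks the cylinder group map records as free are marked
 * BLK_NOCOPY in the snapshot so ffs_copyonwrite() never copies
 * them. On pass 2 a block that was BLK_NOCOPY but is no longer
 * free is reset to 0 so that it will be copied after all.
 */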
if (base < UFS_NDADDR) {
for ( ; loc < UFS_NDADDR; loc++) {
if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
db_assign(ip, loc, BLK_NOCOPY);
else if (db_get(ip, loc) == BLK_NOCOPY) {
if (passno == 2)
db_assign(ip, loc, 0);
else if (passno == 1)
panic("ffs_snapshot: lost direct block");
}
}
}
if ((error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)(base + loc)),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
return (error);
indiroff = (base + loc - UFS_NDADDR) % FFS_NINDIR(fs);
for ( ; loc < len; loc++, indiroff++) {
if (indiroff >= FFS_NINDIR(fs)) {
bawrite(ibp);
if ((error = ffs_balloc(vp,
ffs_lblktosize(fs, (off_t)(base + loc)),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp)) != 0)
return (error);
indiroff = 0;
}
if (ffs_isblock(fs, cg_blksfree(cgp, ns), loc))
idb_assign(ip, ibp->b_data, indiroff, BLK_NOCOPY);
else if (idb_get(ip, ibp->b_data, indiroff) == BLK_NOCOPY) {
if (passno == 2)
idb_assign(ip, ibp->b_data, indiroff, 0);
else if (passno == 1)
panic("ffs_snapshot: lost indirect block");
}
}
bdwrite(ibp);
return (0);
}
/*
* Before expunging a snapshot inode, note all the
* blocks that it claims with BLK_SNAP so that fsck will
* be able to account for those blocks properly and so
* that this snapshot knows that it need not copy them
* if the other snapshot holding them is freed.
*/
static int
expunge(struct vnode *snapvp, struct inode *cancelip, struct fs *fs,
acctfunc_t acctfunc, int expungetype)
{
int i, error, ns __unused;
daddr_t lbn, rlbn;
daddr_t len, blkno, numblks, blksperindir;
struct ufs1_dinode *dip1;
struct ufs2_dinode *dip2;
struct lwp *l = curlwp;
void *bap;
struct buf *bp;
struct mount *mp;
ns = UFS_FSNEEDSWAP(fs);
mp = snapvp->v_mount;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
/*
* Prepare to expunge the inode. If its inode block has not
* yet been copied, then allocate and fill the copy.
*/
lbn = ffs_fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
error = snapblkaddr(snapvp, lbn, &blkno);
if (error)
return error;
if (blkno != 0) {
error = bread(snapvp, lbn, fs->fs_bsize,
B_MODIFY, &bp);
} else {
error = ffs_balloc(snapvp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, l->l_cred, 0, &bp);
if (! error)
error = rwfsblk(snapvp, B_READ, bp->b_data, lbn);
}
if (error) {
UFS_WAPBL_END(mp);
return error;
}
/*
* Set a snapshot inode to be a zero length file, and regular
* files or unlinked snapshots to be completely unallocated.
*/
if (fs->fs_magic == FS_UFS1_MAGIC) {
dip1 = (struct ufs1_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
if (cancelip->i_flags & SF_SNAPSHOT) {
dip1->di_flags =
ufs_rw32(ufs_rw32(dip1->di_flags, ns) |
SF_SNAPINVAL, ns);
}
if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
dip1->di_mode = 0;
dip1->di_size = 0;
dip1->di_blocks = 0;
memset(&dip1->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int32_t));
} else {
dip2 = (struct ufs2_dinode *)bp->b_data +
ino_to_fsbo(fs, cancelip->i_number);
if (cancelip->i_flags & SF_SNAPSHOT) {
dip2->di_flags =
ufs_rw32(ufs_rw32(dip2->di_flags, ns) |
SF_SNAPINVAL, ns);
}
if (expungetype == BLK_NOCOPY || cancelip->i_nlink == 0)
dip2->di_mode = 0;
dip2->di_size = 0;
dip2->di_blocks = 0;
memset(&dip2->di_db[0], 0, (UFS_NDADDR + UFS_NIADDR) * sizeof(int64_t));
}
bdwrite(bp);
UFS_WAPBL_END(mp);
/*
* Now go through and expunge all the blocks in the file
* using the function requested.
*/
numblks = howmany(cancelip->i_size, fs->fs_bsize);
if (fs->fs_magic == FS_UFS1_MAGIC)
bap = &cancelip->i_ffs1_db[0];
else
bap = &cancelip->i_ffs2_db[0];
error = (*acctfunc)(snapvp, bap, 0, UFS_NDADDR, fs, 0, expungetype);
if (error)
return (error);
if (fs->fs_magic == FS_UFS1_MAGIC)
bap = &cancelip->i_ffs1_ib[0];
else
bap = &cancelip->i_ffs2_ib[0];
error = (*acctfunc)(snapvp, bap, 0, UFS_NIADDR, fs, -1, expungetype);
if (error)
return (error);
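/*
 * Walk each level of indirection: rlbn is the first logical block
 * mapped at this level, lbn the (negative) logical block number of
 * the indirect block itself, and blksperindir the number of file
 * blocks covered by one pointer at this level.
 */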
blksperindir = 1;
lbn = -UFS_NDADDR;
len = numblks - UFS_NDADDR;
rlbn = UFS_NDADDR;
for (i = 0; len > 0 && i < UFS_NIADDR; i++) {
error = indiracct(snapvp, ITOV(cancelip), i,
ib_get(cancelip, i), lbn, rlbn, len,
blksperindir, fs, acctfunc, expungetype);
if (error)
return (error);
blksperindir *= FFS_NINDIR(fs);
lbn -= blksperindir + 1;
len -= blksperindir;
rlbn += blksperindir;
}
return (0);
}
/*
* Descend an indirect block chain for vnode cancelvp accounting for all
* its indirect blocks in snapvp.
*/
static int
indiracct(struct vnode *snapvp, struct vnode *cancelvp, int level,
daddr_t blkno, daddr_t lbn, daddr_t rlbn, daddr_t remblks,
daddr_t blksperindir, struct fs *fs, acctfunc_t acctfunc, int expungetype)
{
int error, num, i;
daddr_t subblksperindir;
struct indir indirs[UFS_NIADDR + 2];
daddr_t last;
void *bap;
struct buf *bp;
if (blkno == 0) {
if (expungetype == BLK_NOCOPY)
return (0);
panic("indiracct: missing indir");
}
if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
return (error);
if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
panic("indiracct: botched params");
/*
* We have to expand bread here since it will deadlock looking
* up the block number for any blocks that are not in the cache.
*/
error = ffs_getblk(cancelvp, lbn, FFS_FSBTODB(fs, blkno), fs->fs_bsize,
false, &bp);
if (error)
return error;
if ((bp->b_oflags & (BO_DONE | BO_DELWRI)) == 0 && (error =
rwfsblk(bp->b_vp, B_READ, bp->b_data, ffs_fragstoblks(fs, blkno)))) {
brelse(bp, 0);
return (error);
}
/*
* Account for the block pointers in this indirect block.
*/
last = howmany(remblks, blksperindir);
if (last > FFS_NINDIR(fs))
last = FFS_NINDIR(fs);
bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK | M_ZERO);
memcpy((void *)bap, bp->b_data, fs->fs_bsize);
brelse(bp, 0);
error = (*acctfunc)(snapvp, bap, 0, last,
fs, level == 0 ? rlbn : -1, expungetype);
if (error || level == 0)
goto out;
/*
* Account for the block pointers in each of the indirect blocks
* in the levels below us.
*/
subblksperindir = blksperindir / FFS_NINDIR(fs);
for (lbn++, level--, i = 0; i < last; i++) {
error = indiracct(snapvp, cancelvp, level,
idb_get(VTOI(snapvp), bap, i), lbn, rlbn, remblks,
subblksperindir, fs, acctfunc, expungetype);
if (error)
goto out;
rlbn += blksperindir;
lbn -= blksperindir;
remblks -= blksperindir;
}
out:
free(bap, M_DEVBUF);
return (error);
}
/*
* Do both snap accounting and map accounting.
*/
static int
fullacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno,
int exptype /* BLK_SNAP or BLK_NOCOPY */)
{
int error;
if ((error = snapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype)))
return (error);
return (mapacct(vp, bap, oldblkp, lastblkp, fs, lblkno, exptype));
}
/*
* Identify a set of blocks allocated in a snapshot inode.
*/
static int
snapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno,
int expungetype /* BLK_SNAP or BLK_NOCOPY */)
{
struct inode *ip = VTOI(vp);
struct lwp *l = curlwp;
struct mount *mp = vp->v_mount;
daddr_t blkno;
daddr_t lbn;
struct buf *ibp;
int error, n;
const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
for ( n = 0; oldblkp < lastblkp; oldblkp++) {
blkno = idb_get(ip, bap, oldblkp);
if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
continue;
lbn = ffs_fragstoblks(fs, blkno);
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
} else {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
break;
blkno = idb_get(ip, ibp->b_data,
(lbn - UFS_NDADDR) % FFS_NINDIR(fs));
}
/*
* If we are expunging a snapshot vnode and we
* find a block marked BLK_NOCOPY, then it is
* one that has been allocated to this snapshot after
* we took our current snapshot and can be ignored.
*/
if (expungetype == BLK_SNAP && blkno == BLK_NOCOPY) {
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
} else {
if (blkno != 0)
panic("snapacct: bad block");
if (lbn < UFS_NDADDR)
db_assign(ip, lbn, expungetype);
else {
idb_assign(ip, ibp->b_data,
(lbn - UFS_NDADDR) % FFS_NINDIR(fs), expungetype);
bdwrite(ibp);
}
}
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
UFS_WAPBL_END(mp);
return error;
}
/*
* Account for a set of blocks allocated in a snapshot inode.
*/
static int
mapacct(struct vnode *vp, void *bap, int oldblkp, int lastblkp,
struct fs *fs, daddr_t lblkno, int expungetype)
{
daddr_t blkno;
struct inode *ip;
struct mount *mp = vp->v_mount;
ino_t inum;
int acctit, error, n;
const int wbreak = blocks_in_journal(VFSTOUFS(mp)->um_fs)/8;
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
ip = VTOI(vp);
inum = ip->i_number;
if (lblkno == -1)
acctit = 0;
else
acctit = 1;
for ( n = 0; oldblkp < lastblkp; oldblkp++, lblkno++) {
blkno = idb_get(ip, bap, oldblkp);
if (blkno == 0 || blkno == BLK_NOCOPY)
continue;
if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
*ip->i_snapblklist++ = lblkno;
if (blkno == BLK_SNAP)
blkno = ffs_blkstofrags(fs, lblkno);
ffs_blkfree_snap(fs, vp, blkno, fs->fs_bsize, inum);
if (wbreak > 0 && (++n % wbreak) == 0) {
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
if (error)
return error;
}
}
UFS_WAPBL_END(mp);
return (0);
}
/*
* Number of blocks that fit into the journal or zero if not logging.
*/
static int
blocks_in_journal(struct fs *fs)
{
off_t bpj;
if ((fs->fs_flags & FS_DOWAPBL) == 0)
return 0;
bpj = 1;
if (fs->fs_journal_version == UFS_WAPBL_VERSION) {
switch (fs->fs_journal_location) {
case UFS_WAPBL_JOURNALLOC_END_PARTITION:
bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_EPART_BLKSZ]*
fs->fs_journallocs[UFS_WAPBL_EPART_COUNT];
break;
case UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM:
bpj = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ]*
fs->fs_journallocs[UFS_WAPBL_INFS_COUNT];
break;
}
}
bpj /= fs->fs_bsize;
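/*
 * The size computed above is in bytes; convert to filesystem blocks,
 * returning at least one block when logging is enabled.
 */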
return (bpj > 0 ? bpj : 1);
}
#endif /* defined(FFS_NO_SNAPSHOT) */
/*
* Decrement extra reference on snapshot when last name is removed.
* It will not be freed until the last open reference goes away.
*/
void
ffs_snapgone(struct vnode *vp)
{
struct inode *xp, *ip = VTOI(vp);
struct mount *mp = spec_node_getmountedfs(ip->i_devvp);
struct fs *fs;
struct snap_info *si;
int snaploc;
si = VFSTOUFS(mp)->um_snapinfo;
/*
* Find snapshot in incore list.
*/
mutex_enter(&si->si_lock);
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
if (xp == ip)
break;
mutex_exit(&si->si_lock);
if (xp != NULL)
vrele(ITOV(ip));
#ifdef DEBUG
else if (snapdebug)
printf("ffs_snapgone: lost snapshot vnode %llu\n",
(unsigned long long)ip->i_number);
#endif
/*
* Delete snapshot inode from superblock. Keep list dense.
*/
mutex_enter(&si->si_lock);
fs = ip->i_fs;
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
if (fs->fs_snapinum[snaploc] == ip->i_number)
break;
if (snaploc < FSMAXSNAP) {
for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
if (fs->fs_snapinum[snaploc] == 0)
break;
fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
}
fs->fs_snapinum[snaploc - 1] = 0;
}
si->si_gen++;
mutex_exit(&si->si_lock);
}
/*
* Prepare a snapshot file for being removed.
*/
void
ffs_snapremove(struct vnode *vp)
{
struct inode *ip = VTOI(vp), *xp;
struct vnode *devvp = ip->i_devvp;
struct fs *fs = ip->i_fs;
struct mount *mp = spec_node_getmountedfs(devvp);
struct buf *ibp;
struct snap_info *si;
struct lwp *l = curlwp;
daddr_t numblks, blkno, dblk;
int error, loc, last;
si = VFSTOUFS(mp)->um_snapinfo;
/*
* If active, delete from incore list (this snapshot may
* already have been in the process of being deleted, so
* would not have been active).
*
* Clear copy-on-write flag if last snapshot.
*/
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
if (is_active_snapshot(si, ip)) {
TAILQ_REMOVE(&si->si_snapshots, ip, i_nextsnap);
if (TAILQ_FIRST(&si->si_snapshots) != 0) {
/* Roll back the list of preallocated blocks. */
xp = TAILQ_LAST(&si->si_snapshots, inodelst);
si->si_snapblklist = xp->i_snapblklist;
si->si_gen++;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
} else {
si->si_snapblklist = 0;
si->si_gen++;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
fscow_disestablish(mp, ffs_copyonwrite, devvp);
}
if (ip->i_snapblklist != NULL) {
free(ip->i_snapblklist, M_UFSMNT);
ip->i_snapblklist = NULL;
}
} else {
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
}
/*
* Clear all BLK_NOCOPY fields. Pass any block claims to other
* snapshots that want them (see ffs_snapblkfree below).
*/
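/*
 * A claimed block is recognized by a disk address equal to
 * ffs_blkstofrags() of its own logical block number (see the
 * ffs_snapblkfree() comment below).
 */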
for (blkno = 1; blkno < UFS_NDADDR; blkno++) {
dblk = db_get(ip, blkno);
if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
db_assign(ip, blkno, 0);
else if ((dblk == ffs_blkstofrags(fs, blkno) &&
ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
ip->i_number))) {
DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
db_assign(ip, blkno, 0);
}
}
numblks = howmany(ip->i_size, fs->fs_bsize);
for (blkno = UFS_NDADDR; blkno < numblks; blkno += FFS_NINDIR(fs)) {
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)blkno),
fs->fs_bsize, l->l_cred, B_METAONLY, &ibp);
if (error)
continue;
if (fs->fs_size - blkno > FFS_NINDIR(fs))
last = FFS_NINDIR(fs);
else
last = fs->fs_size - blkno;
for (loc = 0; loc < last; loc++) {
dblk = idb_get(ip, ibp->b_data, loc);
if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
idb_assign(ip, ibp->b_data, loc, 0);
else if (dblk == ffs_blkstofrags(fs, blkno) &&
ffs_snapblkfree(fs, ip->i_devvp, dblk,
fs->fs_bsize, ip->i_number)) {
DIP_ADD(ip, blocks, -btodb(fs->fs_bsize));
idb_assign(ip, ibp->b_data, loc, 0);
}
}
bawrite(ibp);
UFS_WAPBL_END(mp);
error = UFS_WAPBL_BEGIN(mp);
KASSERT(error == 0);
}
/*
* Clear snapshot flag and drop reference.
*/
ip->i_flags &= ~(SF_SNAPSHOT | SF_SNAPINVAL);
DIP_ASSIGN(ip, flags, ip->i_flags);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
#if defined(QUOTA) || defined(QUOTA2)
chkdq(ip, DIP(ip, blocks), l->l_cred, FORCE);
chkiq(ip, 1, l->l_cred, FORCE);
#endif
}
/*
* Notification that a block is being freed. Return zero if the free
* should be allowed to proceed. Return non-zero if the snapshot file
* wants to claim the block. The block will be claimed if it is an
* uncopied part of one of the snapshots. It will be freed if it is
* either a BLK_NOCOPY or has already been copied in all of the snapshots.
* If a fragment is being freed, then all snapshots that care about
* it must make a copy since a snapshot file can only claim full sized
* blocks. Note that if more than one snapshot file maps the block,
* we can pick one at random to claim it. Since none of the snapshots
* can change, we are assured that they will all see the same unmodified
* image. When deleting a snapshot file (see ffs_snapremove above), we
* must push any of these claimed blocks to one of the other snapshots
* that maps it. These claimed blocks are easily identified as they will
* have a block number equal to their logical block number within the
* snapshot. A copied block can never have this property because it
* must always have been allocated from a BLK_NOCOPY location.
*/
int
ffs_snapblkfree(struct fs *fs, struct vnode *devvp, daddr_t bno,
long size, ino_t inum)
{
struct mount *mp = spec_node_getmountedfs(devvp);
struct buf *ibp;
struct inode *ip;
struct vnode *vp = NULL;
struct snap_info *si;
void *saved_data = NULL;
daddr_t lbn;
daddr_t blkno;
uint32_t gen;
int indiroff = 0, error = 0, claimedblk = 0;
si = VFSTOUFS(mp)->um_snapinfo;
lbn = ffs_fragstoblks(fs, bno);
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
si->si_owner = curlwp;
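/*
 * si_gen is bumped whenever the snapshot list changes. It is
 * re-checked after every point where si_lock was dropped; if it
 * moved, the scan restarts from the top.
 */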
retry:
gen = si->si_gen;
TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
vp = ITOV(ip);
/*
* Lookup block being written.
*/
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
} else {
mutex_exit(&si->si_lock);
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn),
fs->fs_bsize, FSCRED, B_METAONLY, &ibp);
if (error) {
mutex_enter(&si->si_lock);
break;
}
indiroff = (lbn - UFS_NDADDR) % FFS_NINDIR(fs);
blkno = idb_get(ip, ibp->b_data, indiroff);
mutex_enter(&si->si_lock);
if (gen != si->si_gen) {
brelse(ibp, 0);
goto retry;
}
}
/*
* Check to see if block needs to be copied.
*/
if (blkno == 0) {
/*
* A block that we map is being freed. If it has not
* been claimed yet, we will claim or copy it (below).
*/
claimedblk = 1;
} else if (blkno == BLK_SNAP) {
/*
* No previous snapshot claimed the block,
* so it will be freed and become a BLK_NOCOPY
* (don't care) for us.
*/
if (claimedblk)
panic("snapblkfree: inconsistent block type");
if (lbn < UFS_NDADDR) {
db_assign(ip, lbn, BLK_NOCOPY);
ip->i_flag |= IN_CHANGE | IN_UPDATE;
} else {
idb_assign(ip, ibp->b_data, indiroff,
BLK_NOCOPY);
mutex_exit(&si->si_lock);
if (ip->i_nlink > 0)
bwrite(ibp);
else
bdwrite(ibp);
mutex_enter(&si->si_lock);
if (gen != si->si_gen)
goto retry;
}
continue;
} else /* BLK_NOCOPY or default */ {
/*
* If the snapshot has already copied the block
* (default), or does not care about the block,
* it is not needed.
*/
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
continue;
}
/*
* If this is a full size block, we will just grab it
* and assign it to the snapshot inode. Otherwise we
* will proceed to copy it. See explanation for this
* routine as to why only a single snapshot needs to
* claim this block.
*/
if (size == fs->fs_bsize) {
#ifdef DEBUG
if (snapdebug)
printf("%s %llu lbn %" PRId64
"from inum %llu\n",
"Grabonremove: snapino",
(unsigned long long)ip->i_number,
lbn, (unsigned long long)inum);
#endif
mutex_exit(&si->si_lock);
if (lbn < UFS_NDADDR) {
db_assign(ip, lbn, bno);
} else {
idb_assign(ip, ibp->b_data, indiroff, bno);
if (ip->i_nlink > 0)
bwrite(ibp);
else
bdwrite(ibp);
}
DIP_ADD(ip, blocks, btodb(size));
ip->i_flag |= IN_CHANGE | IN_UPDATE;
if (ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
else
error = 0;
mutex_enter(&si->si_lock);
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
return (error == 0);
}
if (lbn >= UFS_NDADDR)
brelse(ibp, 0);
#ifdef DEBUG
if (snapdebug)
printf("%s%llu lbn %" PRId64 " %s %llu size %ld\n",
"Copyonremove: snapino ",
(unsigned long long)ip->i_number,
lbn, "for inum", (unsigned long long)inum, size);
#endif
/*
* If we have already read the old block contents, then
* simply copy them to the new block. Note that we need
* to synchronously write snapshots that have not been
* unlinked, and hence will be visible after a crash,
* to ensure their integrity.
*/
mutex_exit(&si->si_lock);
if (saved_data == NULL) {
saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
error = rwfsblk(vp, B_READ, saved_data, lbn);
if (error) {
free(saved_data, M_UFSMNT);
saved_data = NULL;
mutex_enter(&si->si_lock);
break;
}
}
error = wrsnapblk(vp, saved_data, lbn);
if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
mutex_enter(&si->si_lock);
if (error)
break;
if (gen != si->si_gen)
goto retry;
}
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
if (saved_data)
free(saved_data, M_UFSMNT);
/*
* If we have been unable to allocate a block in which to do
* the copy, then return non-zero so that the fragment will
* not be freed. Although space will be lost, the snapshot
* will stay consistent.
*/
return (error);
}
/*
* Associate snapshot files when mounting.
*/
void
ffs_snapshot_mount(struct mount *mp)
{
struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
struct fs *fs = VFSTOUFS(mp)->um_fs;
struct lwp *l = curlwp;
struct vnode *vp;
struct inode *ip, *xp;
struct snap_info *si;
daddr_t snaplistsize, *snapblklist;
int i, error, ns __unused, snaploc, loc;
/*
* No persistent snapshots on apple ufs file systems.
*/
if (UFS_MPISAPPLEUFS(VFSTOUFS(mp)))
return;
si = VFSTOUFS(mp)->um_snapinfo;
ns = UFS_FSNEEDSWAP(fs);
/*
* XXX The following needs to be set before ffs_truncate or
* VOP_READ can be called.
*/
mp->mnt_stat.f_iosize = fs->fs_bsize;
/*
* Process each snapshot listed in the superblock.
*/
vp = NULL;
mutex_enter(&si->si_lock);
for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
if (fs->fs_snapinum[snaploc] == 0)
break;
if ((error = VFS_VGET(mp, fs->fs_snapinum[snaploc],
LK_EXCLUSIVE, &vp)) != 0) {
printf("ffs_snapshot_mount: vget failed %d\n", error);
continue;
}
ip = VTOI(vp);
if ((ip->i_flags & (SF_SNAPSHOT | SF_SNAPINVAL)) !=
SF_SNAPSHOT) {
printf("ffs_snapshot_mount: non-snapshot inode %d\n",
fs->fs_snapinum[snaploc]);
vput(vp);
vp = NULL;
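/*
 * Drop the stale entry, keep fs_snapinum dense and retry this
 * slot (hence the snaploc-- below).
 */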
for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
if (fs->fs_snapinum[loc] == 0)
break;
fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
}
fs->fs_snapinum[loc - 1] = 0;
snaploc--;
continue;
}
/*
* Read the block hints list. Use an empty list on
* read errors.
*/
error = vn_rdwr(UIO_READ, vp,
(void *)&snaplistsize, sizeof(snaplistsize),
ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
l->l_cred, NULL, NULL);
if (error) {
printf("ffs_snapshot_mount: read_1 failed %d\n", error);
snaplistsize = 1;
} else
snaplistsize = ufs_rw64(snaplistsize, ns);
snapblklist = malloc(
snaplistsize * sizeof(daddr_t), M_UFSMNT, M_WAITOK);
if (error)
snapblklist[0] = 1;
else {
error = vn_rdwr(UIO_READ, vp, (void *)snapblklist,
snaplistsize * sizeof(daddr_t),
ffs_lblktosize(fs, howmany(fs->fs_size, fs->fs_frag)),
UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT|IO_ALTSEMANTICS,
l->l_cred, NULL, NULL);
for (i = 0; i < snaplistsize; i++)
snapblklist[i] = ufs_rw64(snapblklist[i], ns);
if (error) {
printf("ffs_snapshot_mount: read_2 failed %d\n",
error);
snapblklist[0] = 1;
}
}
ip->i_snapblklist = &snapblklist[0];
/*
* Link it onto the active snapshot list.
*/
if (is_active_snapshot(si, ip))
panic("ffs_snapshot_mount: %"PRIu64" already on list",
ip->i_number);
else
TAILQ_INSERT_TAIL(&si->si_snapshots, ip, i_nextsnap);
vp->v_vflag |= VV_SYSTEM;
VOP_UNLOCK(vp);
}
/*
* No usable snapshots found.
*/
if (vp == NULL) {
mutex_exit(&si->si_lock);
return;
}
/*
* Attach the block hints list. We always want to
* use the list from the newest snapshot.
*/
xp = TAILQ_LAST(&si->si_snapshots, inodelst);
si->si_snapblklist = xp->i_snapblklist;
fscow_establish(mp, ffs_copyonwrite, devvp);
si->si_gen++;
mutex_exit(&si->si_lock);
}
/*
* Disassociate snapshot files when unmounting.
*/
void
ffs_snapshot_unmount(struct mount *mp)
{
struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
struct inode *xp;
struct vnode *vp = NULL;
struct snap_info *si;
si = VFSTOUFS(mp)->um_snapinfo;
mutex_enter(&si->si_lock);
while ((xp = TAILQ_FIRST(&si->si_snapshots)) != 0) {
vp = ITOV(xp);
TAILQ_REMOVE(&si->si_snapshots, xp, i_nextsnap);
if (xp->i_snapblklist == si->si_snapblklist)
si->si_snapblklist = NULL;
free(xp->i_snapblklist, M_UFSMNT);
if (xp->i_nlink > 0) {
si->si_gen++;
mutex_exit(&si->si_lock);
vrele(vp);
mutex_enter(&si->si_lock);
}
}
si->si_gen++;
mutex_exit(&si->si_lock);
if (vp)
fscow_disestablish(mp, ffs_copyonwrite, devvp);
}
/*
* Check for need to copy block that is about to be written,
* copying the block if necessary.
*/
static int
ffs_copyonwrite(void *v, struct buf *bp, bool data_valid)
{
struct fs *fs;
struct inode *ip;
struct vnode *devvp = v, *vp = NULL;
struct mount *mp = spec_node_getmountedfs(devvp);
struct snap_info *si;
void *saved_data = NULL;
daddr_t lbn, blkno, *snapblklist;
uint32_t gen;
int lower, upper, mid, snapshot_locked = 0, error = 0;
/*
* Check for valid snapshots.
*/
si = VFSTOUFS(mp)->um_snapinfo;
mutex_enter(&si->si_lock);
ip = TAILQ_FIRST(&si->si_snapshots);
if (ip == NULL) {
mutex_exit(&si->si_lock);
return 0;
}
/*
* First check to see if it is after the file system,
* in the journal or in the preallocated list.
* By doing these checks we avoid several potential deadlocks.
*/
fs = ip->i_fs;
lbn = ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno));
if (bp->b_blkno >= FFS_FSBTODB(fs, fs->fs_size)) {
mutex_exit(&si->si_lock);
return 0;
}
if ((fs->fs_flags & FS_DOWAPBL) &&
fs->fs_journal_location == UFS_WAPBL_JOURNALLOC_IN_FILESYSTEM) {
off_t blk_off, log_start, log_end;
log_start = (off_t)fs->fs_journallocs[UFS_WAPBL_INFS_ADDR] *
fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
log_end = log_start + fs->fs_journallocs[UFS_WAPBL_INFS_COUNT] *
fs->fs_journallocs[UFS_WAPBL_INFS_BLKSZ];
blk_off = dbtob(bp->b_blkno);
if (blk_off >= log_start && blk_off < log_end) {
mutex_exit(&si->si_lock);
return 0;
}
}
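/*
 * Binary search the sorted list of preallocated snapshot blocks;
 * anything found there never needs to be copied.
 */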
snapblklist = si->si_snapblklist;
upper = (snapblklist != NULL ? snapblklist[0] - 1 : 0);
lower = 1;
while (lower <= upper) {
mid = (lower + upper) / 2;
if (snapblklist[mid] == lbn)
break;
if (snapblklist[mid] < lbn)
lower = mid + 1;
else
upper = mid - 1;
}
if (lower <= upper) {
mutex_exit(&si->si_lock);
return 0;
}
/*
* Not in the precomputed list, so check the snapshots.
*/
if (si->si_owner != curlwp) {
if (!mutex_tryenter(&si->si_snaplock)) {
mutex_exit(&si->si_lock);
mutex_enter(&si->si_snaplock);
mutex_enter(&si->si_lock);
}
si->si_owner = curlwp;
snapshot_locked = 1;
}
if (data_valid && bp->b_bcount == fs->fs_bsize)
saved_data = bp->b_data;
retry:
gen = si->si_gen;
TAILQ_FOREACH(ip, &si->si_snapshots, i_nextsnap) {
vp = ITOV(ip);
/*
* We ensure that everything of our own that needs to be
* copied will be done at the time that ffs_snapshot is
* called. Thus we can skip the check here which can
* deadlock in doing the lookup in ffs_balloc.
*/
if (bp->b_vp == vp)
continue;
/*
* Check to see if block needs to be copied.
*/
if (lbn < UFS_NDADDR) {
blkno = db_get(ip, lbn);
} else {
mutex_exit(&si->si_lock);
blkno = 0; /* XXX: GCC */
if ((error = snapblkaddr(vp, lbn, &blkno)) != 0) {
mutex_enter(&si->si_lock);
break;
}
mutex_enter(&si->si_lock);
if (gen != si->si_gen)
goto retry;
}
KASSERTMSG((blkno != BLK_SNAP || bp->b_lblkno < 0),
"ffs_copyonwrite: bad copy block: blkno %jd, lblkno %jd",
(intmax_t)blkno, (intmax_t)bp->b_lblkno);
if (blkno != 0)
continue;
if (curlwp == uvm.pagedaemon_lwp) {
error = ENOMEM;
break;
}
/* Only one level of recursion allowed. */
KASSERT(snapshot_locked);
/*
* Allocate the block into which to do the copy. Since
* multiple processes may all try to copy the same block,
* we have to recheck our need to do a copy if we sleep
* waiting for the lock.
*
* Because all snapshots on a filesystem share a single
* lock, we ensure that we will never be in competition
* with another process to allocate a block.
*/
#ifdef DEBUG
if (snapdebug) {
printf("Copyonwrite: snapino %llu lbn %" PRId64 " for ",
(unsigned long long)ip->i_number, lbn);
if (bp->b_vp == devvp)
printf("fs metadata");
else
printf("inum %llu", (unsigned long long)
VTOI(bp->b_vp)->i_number);
printf(" lblkno %" PRId64 "\n", bp->b_lblkno);
}
#endif
/*
* If we have already read the old block contents, then
* simply copy them to the new block. Note that we need
* to synchronously write snapshots that have not been
* unlinked, and hence will be visible after a crash,
* to ensure their integrity.
*/
mutex_exit(&si->si_lock);
if (saved_data == NULL) {
saved_data = malloc(fs->fs_bsize, M_UFSMNT, M_WAITOK);
error = rwfsblk(vp, B_READ, saved_data, lbn);
if (error) {
free(saved_data, M_UFSMNT);
saved_data = NULL;
mutex_enter(&si->si_lock);
break;
}
}
error = wrsnapblk(vp, saved_data, lbn);
if (error == 0 && ip->i_nlink > 0 && mp->mnt_wapbl)
error = syncsnap(vp);
mutex_enter(&si->si_lock);
if (error)
break;
if (gen != si->si_gen)
goto retry;
}
/*
* Note that we need to synchronously write snapshots that
* have not been unlinked, and hence will be visible after
* a crash, to ensure their integrity.
*/
if (snapshot_locked) {
si->si_owner = NULL;
mutex_exit(&si->si_lock);
mutex_exit(&si->si_snaplock);
} else
mutex_exit(&si->si_lock);
if (saved_data && saved_data != bp->b_data)
free(saved_data, M_UFSMNT);
return error;
}
/*
* Read from a snapshot.
*/
int
ffs_snapshot_read(struct vnode *vp, struct uio *uio, int ioflag)
{
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct snap_info *si = VFSTOUFS(vp->v_mount)->um_snapinfo;
struct buf *bp;
daddr_t lbn, nextlbn;
off_t fsbytes, bytesinfile;
long size, xfersize, blkoffset;
int error;
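/*
 * Take the snapshot lock to serialize with ffs_copyonwrite() and
 * ffs_snapblkfree(), which modify the snapshot under the same lock.
 */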
mutex_enter(&si->si_snaplock);
if (ioflag & IO_ALTSEMANTICS)
fsbytes = ip->i_size;
else
fsbytes = ffs_lfragtosize(fs, fs->fs_size);
for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
bytesinfile = fsbytes - uio->uio_offset;
if (bytesinfile <= 0)
break;
lbn = ffs_lblkno(fs, uio->uio_offset);
nextlbn = lbn + 1;
size = fs->fs_bsize;
blkoffset = ffs_blkoff(fs, uio->uio_offset);
xfersize = MIN(MIN(fs->fs_bsize - blkoffset, uio->uio_resid),
bytesinfile);
if (ffs_lblktosize(fs, nextlbn + 1) >= fsbytes) {
if (ffs_lblktosize(fs, lbn) + size > fsbytes)
size = ffs_fragroundup(fs,
fsbytes - ffs_lblktosize(fs, lbn));
error = bread(vp, lbn, size, 0, &bp);
} else {
int nextsize = fs->fs_bsize;
error = breadn(vp, lbn,
size, &nextlbn, &nextsize, 1, 0, &bp);
}
if (error)
break;
/*
* We should only get non-zero b_resid when an I/O error
* has occurred, which should cause us to break above.
* However, if the short read did not cause an error,
* then we want to ensure that we do not uiomove bad
* or uninitialized data.
*/
size -= bp->b_resid;
if (size < blkoffset + xfersize) {
xfersize = size - blkoffset;
if (xfersize <= 0)
break;
}
error = uiomove((char *)bp->b_data + blkoffset, xfersize, uio);
if (error)
break;
brelse(bp, BC_AGE);
}
if (bp != NULL)
brelse(bp, BC_AGE);
mutex_exit(&si->si_snaplock);
return error;
}
/*
* Look up a snapshot's data block address.
* Simpler than UFS_BALLOC() as we know all metadata is already allocated
* and safe even for the pagedaemon where we cannot bread().
*/
static int
snapblkaddr(struct vnode *vp, daddr_t lbn, daddr_t *res)
{
struct indir indirs[UFS_NIADDR + 2];
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *bp;
int error, num;
KASSERT(lbn >= 0);
if (lbn < UFS_NDADDR) {
*res = db_get(ip, lbn);
return 0;
}
if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
return error;
if (curlwp == uvm.pagedaemon_lwp) {
mutex_enter(&bufcache_lock);
bp = incore(vp, indirs[num-1].in_lbn);
if (bp && (bp->b_oflags & (BO_DONE | BO_DELWRI))) {
*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
error = 0;
} else
error = ENOMEM;
mutex_exit(&bufcache_lock);
return error;
}
error = bread(vp, indirs[num-1].in_lbn, fs->fs_bsize, 0, &bp);
if (error == 0) {
*res = idb_get(ip, bp->b_data, indirs[num-1].in_off);
brelse(bp, 0);
}
return error;
}
/*
* Read or write the specified block of the filesystem vp resides on
* from or to the disk bypassing the buffer cache.
*/
static int
rwfsblk(struct vnode *vp, int flags, void *data, daddr_t lbn)
{
int error;
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *nbp;
nbp = getiobuf(NULL, true);
nbp->b_flags = flags;
nbp->b_bcount = nbp->b_bufsize = fs->fs_bsize;
nbp->b_error = 0;
nbp->b_data = data;
nbp->b_blkno = nbp->b_rawblkno = FFS_FSBTODB(fs, ffs_blkstofrags(fs, lbn));
nbp->b_proc = NULL;
nbp->b_dev = ip->i_devvp->v_rdev;
SET(nbp->b_cflags, BC_BUSY); /* mark buffer busy */
bdev_strategy(nbp);
error = biowait(nbp);
putiobuf(nbp);
return error;
}
/*
* Write all dirty buffers to disk and invalidate them.
*/
static int
syncsnap(struct vnode *vp)
{
int error;
buf_t *bp;
struct fs *fs = VTOI(vp)->i_fs;
mutex_enter(&bufcache_lock);
while ((bp = LIST_FIRST(&vp->v_dirtyblkhd))) {
error = bbusy(bp, false, 0, NULL);
if (error == EPASSTHROUGH)
continue;
else if (error != 0) {
mutex_exit(&bufcache_lock);
return error;
}
KASSERT(bp->b_bcount == fs->fs_bsize);
mutex_exit(&bufcache_lock);
error = rwfsblk(vp, B_WRITE, bp->b_data,
ffs_fragstoblks(fs, FFS_DBTOFSB(fs, bp->b_blkno)));
brelse(bp, BC_INVAL | BC_VFLUSH);
if (error)
return error;
mutex_enter(&bufcache_lock);
}
mutex_exit(&bufcache_lock);
return 0;
}
/*
* Write the specified block to a snapshot.
*/
static int
wrsnapblk(struct vnode *vp, void *data, daddr_t lbn)
{
struct inode *ip = VTOI(vp);
struct fs *fs = ip->i_fs;
struct buf *bp;
int error;
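/*
 * Snapshots that are still linked, and hence visible after a crash,
 * are written synchronously to keep them consistent on disk.
 */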
error = ffs_balloc(vp, ffs_lblktosize(fs, (off_t)lbn), fs->fs_bsize,
FSCRED, (ip->i_nlink > 0 ? B_SYNC : 0), &bp);
if (error)
return error;
memcpy(bp->b_data, data, fs->fs_bsize);
if (ip->i_nlink > 0)
error = bwrite(bp);
else
bawrite(bp);
return error;
}
/*
* Check if this inode is present on the active snapshot list.
* Must be called with snapinfo locked.
*/
static inline bool
is_active_snapshot(struct snap_info *si, struct inode *ip)
{
struct inode *xp;
KASSERT(mutex_owned(&si->si_lock));
TAILQ_FOREACH(xp, &si->si_snapshots, i_nextsnap)
if (xp == ip)
return true;
return false;
}
/*
* Get/put direct blocks from the inode or a buffer containing disk addresses.
* Take care of fs type (UFS1/UFS2) and byte swapping. These functions should go
* into a global include.
*/
static inline daddr_t
db_get(struct inode *ip, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(ip->i_ffs1_db[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(ip->i_ffs2_db[loc], UFS_IPNEEDSWAP(ip));
}
static inline void
db_assign(struct inode *ip, int loc, daddr_t val)
{
if (ip->i_ump->um_fstype == UFS1)
ip->i_ffs1_db[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
else
ip->i_ffs2_db[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}
__unused static inline daddr_t
ib_get(struct inode *ip, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(ip->i_ffs1_ib[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(ip->i_ffs2_ib[loc], UFS_IPNEEDSWAP(ip));
}
static inline daddr_t
idb_get(struct inode *ip, void *bf, int loc)
{
if (ip->i_ump->um_fstype == UFS1)
return ufs_rw32(((int32_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
else
return ufs_rw64(((int64_t *)(bf))[loc], UFS_IPNEEDSWAP(ip));
}
static inline void
idb_assign(struct inode *ip, void *bf, int loc, daddr_t val)
{
if (ip->i_ump->um_fstype == UFS1)
((int32_t *)(bf))[loc] = ufs_rw32(val, UFS_IPNEEDSWAP(ip));
else
((int64_t *)(bf))[loc] = ufs_rw64(val, UFS_IPNEEDSWAP(ip));
}
/* $NetBSD: subr_kobj_vfs.c,v 1.12 2021/06/29 22:40:53 dholland Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software developed for The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1998-2000 Doug Rabson
* Copyright (c) 2004 Peter Wemm
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Kernel loader vfs routines.
*/
#include <sys/kobj_impl.h>
#ifdef _KERNEL_OPT
#include "opt_modular.h"
#endif
#ifdef MODULAR
#include <sys/param.h>
#include <sys/fcntl.h>
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_kobj_vfs.c,v 1.12 2021/06/29 22:40:53 dholland Exp $");
static void
kobj_close_vfs(kobj_t ko)
{
VOP_UNLOCK(ko->ko_source);
vn_close(ko->ko_source, FREAD, kauth_cred_get());
}
/*
* kobj_read:
*
* Utility function: read from the object.
*/
static int
kobj_read_vfs(kobj_t ko, void **basep, size_t size, off_t off,
bool allocate)
{
size_t resid;
void *base;
int error;
KASSERT(ko->ko_source != NULL);
if (allocate) {
base = kmem_alloc(size, KM_SLEEP);
} else {
base = *basep;
#ifdef DIAGNOSTIC
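/*
 * When reading into caller-supplied storage, check that the
 * destination lies entirely within one of the module's text,
 * data or rodata segments.
 */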
bool ok = false;
if ((uintptr_t)base >= (uintptr_t)ko->ko_text_address &&
(uintptr_t)base + size <=
(uintptr_t)ko->ko_text_address + ko->ko_text_size)
ok = true;
if ((uintptr_t)base >= (uintptr_t)ko->ko_data_address &&
(uintptr_t)base + size <=
(uintptr_t)ko->ko_data_address + ko->ko_data_size)
ok = true;
if ((uintptr_t)base >= (uintptr_t)ko->ko_rodata_address &&
(uintptr_t)base + size <=
(uintptr_t)ko->ko_rodata_address + ko->ko_rodata_size)
ok = true;
if (!ok)
panic("kobj_read_vfs: not in a dedicated segment");
#endif
}
error = vn_rdwr(UIO_READ, ko->ko_source, base, size, off,
UIO_SYSSPACE, IO_NODELOCKED, curlwp->l_cred, &resid,
curlwp);
if (error == 0 && resid != 0) {
error = EINVAL;
}
if (allocate && error != 0) {
kmem_free(base, size);
base = NULL;
}
if (allocate)
*basep = base;
return error;
}
/*
* kobj_load_vfs:
*
* Load an object located in the file system.
*/
int
kobj_load_vfs(kobj_t *kop, const char *path, const bool nochroot)
{
struct pathbuf *pb;
struct vnode *vp;
int error;
kobj_t ko;
KASSERT(path != NULL);
if (strchr(path, '/') == NULL)
return ENOENT;
ko = kmem_zalloc(sizeof(*ko), KM_SLEEP);
pb = pathbuf_create(path);
if (pb == NULL) {
kmem_free(ko, sizeof(*ko));
return ENOMEM;
}
error = vn_open(NULL, pb, (nochroot ? NOCHROOT : 0), FREAD, 0,
&vp, NULL, NULL);
if (error != 0) {
pathbuf_destroy(pb);
kmem_free(ko, sizeof(*ko));
return error;
}
ko->ko_type = KT_VNODE;
kobj_setname(ko, path);
ko->ko_source = vp;
ko->ko_read = kobj_read_vfs;
ko->ko_close = kobj_close_vfs;
pathbuf_destroy(pb);
*kop = ko;
return kobj_load(ko);
}
#else /* MODULAR */
int
kobj_load_vfs(kobj_t *kop, const char *path, const bool nochroot)
{
return ENOSYS;
}
#endif
/* $NetBSD: mfs_vfsops.c,v 1.116 2022/03/19 13:53:33 hannken Exp $ */
/*
* Copyright (c) 1989, 1990, 1993, 1994
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)mfs_vfsops.c 8.11 (Berkeley) 6/19/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: mfs_vfsops.c,v 1.116 2022/03/19 13:53:33 hannken Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/mount.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include <ufs/mfs/mfsnode.h>
#include <ufs/mfs/mfs_extern.h>
MODULE(MODULE_CLASS_VFS, mfs, "ffs");
kmutex_t mfs_lock; /* global lock */
/* used for building internal dev_t, minor == 0 reserved for miniroot */
static devminor_t mfs_minor = 1;
static int mfs_initcnt;
extern int (**mfs_vnodeop_p)(void *);
/*
* mfs vfs operations.
*/
extern const struct vnodeopv_desc mfs_vnodeop_opv_desc;
const struct vnodeopv_desc * const mfs_vnodeopv_descs[] = {
&mfs_vnodeop_opv_desc,
NULL,
};
struct vfsops mfs_vfsops = {
.vfs_name = MOUNT_MFS,
.vfs_min_mount_data = sizeof (struct mfs_args),
.vfs_mount = mfs_mount,
.vfs_start = mfs_start,
.vfs_unmount = ffs_unmount,
.vfs_root = ufs_root,
.vfs_quotactl = ufs_quotactl,
.vfs_statvfs = mfs_statvfs,
.vfs_sync = ffs_sync,
.vfs_vget = ufs_vget,
.vfs_loadvnode = ffs_loadvnode,
.vfs_newvnode = ffs_newvnode,
.vfs_fhtovp = ffs_fhtovp,
.vfs_vptofh = ffs_vptofh,
.vfs_init = mfs_init,
.vfs_reinit = mfs_reinit,
.vfs_done = mfs_done,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = mfs_vnodeopv_descs
};
SYSCTL_SETUP(mfs_sysctl_setup, "mfs sysctl")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_ALIAS,
CTLTYPE_NODE, "mfs",
SYSCTL_DESCR("Memory based file system"),
NULL, 1, NULL, 0,
CTL_VFS, 3, CTL_EOL);
/*
* XXX the "1" and the "3" above could be dynamic, thereby
* eliminating one more instance of the "number to vfs"
* mapping problem, but they are in order as taken from
* sys/mount.h
*/
}
static int
mfs_modcmd(modcmd_t cmd, void *arg)
{
int error;
switch (cmd) {
case MODULE_CMD_INIT:
error = vfs_attach(&mfs_vfsops);
if (error != 0)
break;
break;
case MODULE_CMD_FINI:
error = vfs_detach(&mfs_vfsops);
if (error != 0)
break;
break;
default:
error = ENOTTY;
break;
}
return (error);
}
/*
* Memory based filesystem initialization.
*/
void
mfs_init(void)
{
if (mfs_initcnt++ == 0) {
mutex_init(&mfs_lock, MUTEX_DEFAULT, IPL_NONE);
ffs_init();
}
}
void
mfs_reinit(void)
{
ffs_reinit();
}
void
mfs_done(void)
{
if (--mfs_initcnt == 0) {
ffs_done();
mutex_destroy(&mfs_lock);
}
}
/*
* Called by main() when mfs is going to be mounted as root.
*/
int
mfs_mountroot(void)
{
struct fs *fs;
struct mount *mp;
struct lwp *l = curlwp; /* XXX */
struct ufsmount *ump;
struct mfsnode *mfsp;
int error = 0;
if ((error = vfs_rootmountalloc(MOUNT_MFS, "mfs_root", &mp))) {
vrele(rootvp);
return (error);
}
mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP);
rootvp->v_data = mfsp;
rootvp->v_op = mfs_vnodeop_p;
rootvp->v_tag = VT_MFS;
mfsp->mfs_baseoff = mfs_rootbase;
mfsp->mfs_size = mfs_rootsize;
mfsp->mfs_vnode = rootvp;
mfsp->mfs_proc = NULL; /* indicate kernel space */
mfsp->mfs_shutdown = 0;
cv_init(&mfsp->mfs_cv, "mfs");
mfsp->mfs_refcnt = 1;
bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0);
if ((error = ffs_mountfs(rootvp, mp, l)) != 0) {
vfs_unbusy(mp);
bufq_free(mfsp->mfs_buflist);
vfs_rele(mp);
kmem_free(mfsp, sizeof(*mfsp));
return (error);
}
mountlist_append(mp);
mp->mnt_vnodecovered = NULLVP;
ump = VFSTOUFS(mp);
fs = ump->um_fs;
(void) copystr(mp->mnt_stat.f_mntonname, fs->fs_fsmnt, MNAMELEN - 1, 0);
(void)ffs_statvfs(mp, &mp->mnt_stat);
vfs_unbusy(mp);
return (0);
}
/*
* VFS Operations.
*
* mount system call
*/
/* ARGSUSED */
int
mfs_mount(struct mount *mp, const char *path, void *data, size_t *data_len)
{
struct lwp *l = curlwp;
struct vnode *devvp;
struct mfs_args *args = data;
struct ufsmount *ump;
struct fs *fs;
struct mfsnode *mfsp;
struct proc *p;
devminor_t minor;
int flags, error = 0;
if (args == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
p = l->l_proc;
if (mp->mnt_flag & MNT_GETARGS) {
struct vnode *vp;
ump = VFSTOUFS(mp);
if (ump == NULL)
return EIO;
vp = ump->um_devvp;
if (vp == NULL)
return EIO;
mfsp = VTOMFS(vp);
if (mfsp == NULL)
return EIO;
args->fspec = NULL;
args->base = mfsp->mfs_baseoff;
args->size = mfsp->mfs_size;
*data_len = sizeof *args;
return 0;
}
/*
* XXX turn off async to avoid hangs when writing lots of data.
* the problem is that MFS needs to allocate pages to clean pages,
* so if we wait until the last minute to clean pages then there
* may not be any pages available to do the cleaning.
* ... and since the default partially-synchronous mode turns out
* to not be sufficient under heavy load, make it full synchronous.
*/
mp->mnt_flag &= ~MNT_ASYNC;
mp->mnt_flag |= MNT_SYNCHRONOUS;
/*
* If updating, check whether changing from read-only to
* read/write; if there is no device name, that's all we do.
*/
if (mp->mnt_flag & MNT_UPDATE) {
ump = VFSTOUFS(mp);
fs = ump->um_fs;
if (fs->fs_ronly == 0 && (mp->mnt_flag & MNT_RDONLY)) {
flags = WRITECLOSE;
if (mp->mnt_flag & MNT_FORCE)
flags |= FORCECLOSE;
error = ffs_flushfiles(mp, flags, l);
if (error)
return (error);
}
		if (fs->fs_ronly && (mp->mnt_iflag & IMNT_WANTRDWR))
			fs->fs_ronly = 0;
if (args->fspec == NULL)
return EINVAL;
return (0);
}
mutex_enter(&mfs_lock);
minor = mfs_minor++;
mutex_exit(&mfs_lock);
error = bdevvp(makedev(255, minor), &devvp);
if (error)
return (error);
mfsp = kmem_alloc(sizeof(*mfsp), KM_SLEEP);
/*
* Changing v_op and v_data here is safe as we are
* the exclusive owner of this device node.
*/
	KASSERT(devvp->v_op == spec_vnodeop_p);
	KASSERT(devvp->v_data == NULL);
devvp->v_op = mfs_vnodeop_p;
devvp->v_data = mfsp;
mfsp->mfs_baseoff = args->base;
mfsp->mfs_size = args->size;
mfsp->mfs_vnode = devvp;
mfsp->mfs_proc = p;
mfsp->mfs_shutdown = 0;
cv_init(&mfsp->mfs_cv, "mfsidl");
mfsp->mfs_refcnt = 1;
bufq_alloc(&mfsp->mfs_buflist, "fcfs", 0);
if ((error = ffs_mountfs(devvp, mp, l)) != 0) {
mfsp->mfs_shutdown = 1;
vrele(devvp);
return (error);
}
ump = VFSTOUFS(mp);
fs = ump->um_fs;
error = set_statvfs_info(path, UIO_USERSPACE, args->fspec,
UIO_USERSPACE, mp->mnt_op->vfs_name, mp, l);
if (error)
return error;
(void)strncpy(fs->fs_fsmnt, mp->mnt_stat.f_mntonname,
sizeof(fs->fs_fsmnt));
fs->fs_fsmnt[sizeof(fs->fs_fsmnt) - 1] = '\0';
/* XXX: cleanup on error */
return 0;
}
/*
* Used to grab the process and keep it in the kernel to service
* memory filesystem I/O requests.
*
* Loop servicing I/O requests.
* Copy the requested data into or out of the memory filesystem
* address space.
*/
/* ARGSUSED */
int
mfs_start(struct mount *mp, int flags)
{
struct vnode *vp;
struct mfsnode *mfsp;
struct proc *p;
struct buf *bp;
void *base;
int sleepreturn = 0, refcnt, error;
ksiginfoq_t kq;
/*
* Ensure that file system is still mounted when getting mfsnode.
	 * Add a reference to the mfsnode to prevent it from disappearing in
* this routine.
*/
if ((error = vfs_busy(mp)) != 0)
return error;
vp = VFSTOUFS(mp)->um_devvp;
mfsp = VTOMFS(vp);
mutex_enter(&mfs_lock);
mfsp->mfs_refcnt++;
mutex_exit(&mfs_lock);
vfs_unbusy(mp);
base = mfsp->mfs_baseoff;
mutex_enter(&mfs_lock);
while (mfsp->mfs_shutdown != 1) {
while ((bp = bufq_get(mfsp->mfs_buflist)) != NULL) {
mutex_exit(&mfs_lock);
mfs_doio(bp, base);
mutex_enter(&mfs_lock);
}
/*
* If a non-ignored signal is received, try to unmount.
* If that fails, or the filesystem is already in the
* process of being unmounted, clear the signal (it has been
* "processed"), otherwise we will loop here, as tsleep
* will always return EINTR/ERESTART.
*/
if (sleepreturn != 0) {
mutex_exit(&mfs_lock);
if (dounmount(mp, 0, curlwp) != 0) {
p = curproc;
ksiginfo_queue_init(&kq);
mutex_enter(p->p_lock);
sigclearall(p, NULL, &kq);
mutex_exit(p->p_lock);
ksiginfo_queue_drain(&kq);
}
sleepreturn = 0;
mutex_enter(&mfs_lock);
continue;
}
sleepreturn = cv_wait_sig(&mfsp->mfs_cv, &mfs_lock);
}
KASSERT(bufq_peek(mfsp->mfs_buflist) == NULL);
refcnt = --mfsp->mfs_refcnt;
mutex_exit(&mfs_lock);
if (refcnt == 0) {
bufq_free(mfsp->mfs_buflist);
cv_destroy(&mfsp->mfs_cv);
kmem_free(mfsp, sizeof(*mfsp));
}
return (sleepreturn);
}
/*
* Get file system statistics.
*/
int
mfs_statvfs(struct mount *mp, struct statvfs *sbp)
{
int error;
error = ffs_statvfs(mp, sbp);
if (error)
return error;
(void)strncpy(sbp->f_fstypename, mp->mnt_op->vfs_name,
sizeof(sbp->f_fstypename));
sbp->f_fstypename[sizeof(sbp->f_fstypename) - 1] = '\0';
return 0;
}
/* $NetBSD: kern_fork.c,v 1.230 2023/02/25 08:22:00 skrll Exp $ */
/*-
* Copyright (c) 1999, 2001, 2004, 2006, 2007, 2008, 2019
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_fork.c 8.8 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_fork.c,v 1.230 2023/02/25 08:22:00 skrll Exp $");
#include "opt_ktrace.h"
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/pool.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/ras.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#include <sys/syscall.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/syscallargs.h>
#include <sys/uidinfo.h>
#include <sys/sdt.h>
#include <sys/ptrace.h>
/*
* DTrace SDT provider definitions
*/
SDT_PROVIDER_DECLARE(proc);
SDT_PROBE_DEFINE3(proc, kernel, , create,
"struct proc *", /* new process */
"struct proc *", /* parent process */
"int" /* flags */);
u_int nprocs __cacheline_aligned = 1; /* process 0 */
/*
* Number of ticks to sleep if fork() would fail due to process hitting
 * limits. Exported in milliseconds to userland via sysctl.
*/
int forkfsleep = 0;
int
sys_fork(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, 0, SIGCHLD, NULL, 0, NULL, NULL, retval);
}
/*
* vfork(2) system call compatible with 4.4BSD (i.e. BSD with Mach VM).
* Address space is not shared, but parent is blocked until child exit.
*/
int
sys_vfork(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, FORK_PPWAIT, SIGCHLD, NULL, 0, NULL, NULL,
retval);
}
/*
* New vfork(2) system call for NetBSD, which implements original 3BSD vfork(2)
* semantics. Address space is shared, and parent is blocked until child exit.
*/
int
sys___vfork14(struct lwp *l, const void *v, register_t *retval)
{
return fork1(l, FORK_PPWAIT|FORK_SHAREVM, SIGCHLD, NULL, 0,
NULL, NULL, retval);
}
/*
* Linux-compatible __clone(2) system call.
*/
int
sys___clone(struct lwp *l, const struct sys___clone_args *uap,
register_t *retval)
{
/* {
syscallarg(int) flags;
syscallarg(void *) stack;
} */
int flags, sig;
/*
* We don't support the CLONE_PTRACE flag.
*/
if (SCARG(uap, flags) & (CLONE_PTRACE))
return EINVAL;
/*
* Linux enforces CLONE_VM with CLONE_SIGHAND, do same.
*/
if (SCARG(uap, flags) & CLONE_SIGHAND
&& (SCARG(uap, flags) & CLONE_VM) == 0)
return EINVAL;
flags = 0;
if (SCARG(uap, flags) & CLONE_VM)
flags |= FORK_SHAREVM;
if (SCARG(uap, flags) & CLONE_FS)
flags |= FORK_SHARECWD;
if (SCARG(uap, flags) & CLONE_FILES)
flags |= FORK_SHAREFILES;
if (SCARG(uap, flags) & CLONE_SIGHAND)
flags |= FORK_SHARESIGS;
if (SCARG(uap, flags) & CLONE_VFORK)
flags |= FORK_PPWAIT;
sig = SCARG(uap, flags) & CLONE_CSIGNAL;
if (sig < 0 || sig >= _NSIG)
return EINVAL;
/*
* Note that the Linux API does not provide a portable way of
* specifying the stack area; the caller must know if the stack
* grows up or down. So, we pass a stack size of 0, so that the
* code that makes this adjustment is a noop.
*/
return fork1(l, flags, sig, SCARG(uap, stack), 0,
NULL, NULL, retval);
}
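/*
 * Illustrative mapping (not from the original source): a Linux-style
 * clone(CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|SIGCHLD, sp) call
 * is translated by sys___clone() above into
 * fork1(l, FORK_SHAREVM|FORK_SHARECWD|FORK_SHAREFILES|FORK_SHARESIGS,
 * SIGCHLD, sp, 0, NULL, NULL, retval), i.e. a fully shared,
 * thread-like child that raises SIGCHLD when it exits.
 */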
/*
* Print the 'table full' message once per 10 seconds.
*/
static struct timeval fork_tfmrate = { 10, 0 };
/*
* Check if a process is traced and shall inform about FORK events.
*/
static inline bool
tracefork(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEFORK|PSL_TRACED)) ==
(PSL_TRACEFORK|PSL_TRACED) && (flags & FORK_PPWAIT) == 0;
}
/*
* Check if a process is traced and shall inform about VFORK events.
*/
static inline bool
tracevfork(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEVFORK|PSL_TRACED)) ==
(PSL_TRACEVFORK|PSL_TRACED) && (flags & FORK_PPWAIT) != 0;
}
/*
* Check if a process is traced and shall inform about VFORK_DONE events.
*/
static inline bool
tracevforkdone(struct proc *p, int flags)
{
return (p->p_slflag & (PSL_TRACEVFORK_DONE|PSL_TRACED)) ==
(PSL_TRACEVFORK_DONE|PSL_TRACED) && (flags & FORK_PPWAIT);
}
/*
* General fork call. Note that another LWP in the process may call exec()
* or exit() while we are forking. It's safe to continue here, because
* neither operation will complete until all LWPs have exited the process.
*/
int
fork1(struct lwp *l1, int flags, int exitsig, void *stack, size_t stacksize,
void (*func)(void *), void *arg, register_t *retval)
{
struct proc *p1, *p2, *parent;
struct plimit *p1_lim;
uid_t uid;
struct lwp *l2;
int count;
vaddr_t uaddr;
int tnprocs;
int error = 0;
p1 = l1->l_proc;
uid = kauth_cred_getuid(l1->l_cred);
tnprocs = atomic_inc_uint_nv(&nprocs);
/*
* Although process entries are dynamically created, we still keep
* a global limit on the maximum number we will create.
*/
if (__predict_false(tnprocs >= maxproc))
error = -1;
else
error = kauth_authorize_process(l1->l_cred,
KAUTH_PROCESS_FORK, p1, KAUTH_ARG(tnprocs), NULL, NULL);
if (error) {
static struct timeval lasttfm;
atomic_dec_uint(&nprocs);
		if (ratecheck(&lasttfm, &fork_tfmrate))
			tablefull("proc", "increase kern.maxproc or NPROC");
		if (forkfsleep)
			kpause("forkmx", false, forkfsleep, NULL);
return EAGAIN;
}
/*
* Enforce limits.
*/
count = chgproccnt(uid, 1);
	if (__predict_false(count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur)) {
		if (kauth_authorize_process(l1->l_cred, KAUTH_PROCESS_RLIMIT,
p1, KAUTH_ARG(KAUTH_REQ_PROCESS_RLIMIT_BYPASS),
&p1->p_rlimit[RLIMIT_NPROC], KAUTH_ARG(RLIMIT_NPROC)) != 0) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
			if (forkfsleep)
				kpause("forkulim", false, forkfsleep, NULL);
return EAGAIN;
}
}
/*
* Allocate virtual address space for the U-area now, while it
* is still easy to abort the fork operation if we're out of
* kernel virtual address space.
*/
uaddr = uvm_uarea_alloc();
if (__predict_false(uaddr == 0)) {
(void)chgproccnt(uid, -1);
atomic_dec_uint(&nprocs);
return ENOMEM;
}
/* Allocate new proc. */
p2 = proc_alloc();
if (p2 == NULL) {
/* We were unable to allocate a process ID. */
uvm_uarea_free(uaddr);
mutex_enter(p1->p_lock);
uid = kauth_cred_getuid(p1->p_cred);
(void)chgproccnt(uid, -1);
mutex_exit(p1->p_lock);
atomic_dec_uint(&nprocs);
return EAGAIN;
}
/*
* We are now committed to the fork. From here on, we may
* block on resources, but resource allocation may NOT fail.
*/
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
* then copy the section that is copied directly from the parent.
*/
memset(&p2->p_startzero, 0,
(unsigned) ((char *)&p2->p_endzero - (char *)&p2->p_startzero));
memcpy(&p2->p_startcopy, &p1->p_startcopy,
(unsigned) ((char *)&p2->p_endcopy - (char *)&p2->p_startcopy));
TAILQ_INIT(&p2->p_sigpend.sp_info);
LIST_INIT(&p2->p_lwps);
LIST_INIT(&p2->p_sigwaiters);
/*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
* Inherit flags we want to keep. The flags related to SIGCHLD
* handling are important in order to keep a consistent behaviour
* for the child after the fork. If we are a 32-bit process, the
* child will be too.
*/
p2->p_flag =
p1->p_flag & (PK_SUGID | PK_NOCLDWAIT | PK_CLDSIGIGN | PK_32);
p2->p_emul = p1->p_emul;
p2->p_execsw = p1->p_execsw;
if (flags & FORK_SYSTEM) {
/*
* Mark it as a system process. Set P_NOCLDWAIT so that
* children are reparented to init(8) when they exit.
* init(8) can easily wait them out for us.
*/
p2->p_flag |= (PK_SYSTEM | PK_NOCLDWAIT);
}
mutex_init(&p2->p_stmutex, MUTEX_DEFAULT, IPL_HIGH);
mutex_init(&p2->p_auxlock, MUTEX_DEFAULT, IPL_NONE);
rw_init(&p2->p_reflock);
cv_init(&p2->p_waitcv, "wait");
cv_init(&p2->p_lwpcv, "lwpwait");
/*
* Share a lock between the processes if they are to share signal
* state: we must synchronize access to it.
*/
if (flags & FORK_SHARESIGS) {
p2->p_lock = p1->p_lock;
mutex_obj_hold(p1->p_lock);
} else
p2->p_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
kauth_proc_fork(p1, p2);
p2->p_raslist = NULL;
#if defined(__HAVE_RAS)
ras_fork(p1, p2);
#endif
/* bump references to the text vnode (for procfs) */
p2->p_textvp = p1->p_textvp;
	if (p2->p_textvp)
		vref(p2->p_textvp);
	if (p1->p_path)
		p2->p_path = kmem_strdupsize(p1->p_path, NULL, KM_SLEEP);
else
p2->p_path = NULL;
if (flags & FORK_SHAREFILES)
fd_share(p2);
else if (flags & FORK_CLEANFILES)
p2->p_fd = fd_init(NULL);
else
p2->p_fd = fd_copy();
/* XXX racy */
p2->p_mqueue_cnt = p1->p_mqueue_cnt;
if (flags & FORK_SHARECWD)
cwdshare(p2);
else
p2->p_cwdi = cwdinit();
/*
* Note: p_limit (rlimit stuff) is copy-on-write, so normally
 * we just need to increase pl_refcnt.
*/
p1_lim = p1->p_limit;
if (!p1_lim->pl_writeable) {
lim_addref(p1_lim);
p2->p_limit = p1_lim;
} else {
p2->p_limit = lim_copy(p1_lim);
}
if (flags & FORK_PPWAIT) {
/* Mark ourselves as waiting for a child. */
p2->p_lflag = PL_PPWAIT;
l1->l_vforkwaiting = true;
p2->p_vforklwp = l1;
} else {
p2->p_lflag = 0;
l1->l_vforkwaiting = false;
}
p2->p_sflag = 0;
p2->p_slflag = 0;
parent = (flags & FORK_NOWAIT) ? initproc : p1;
p2->p_pptr = parent;
p2->p_ppid = parent->p_pid;
LIST_INIT(&p2->p_children);
p2->p_aio = NULL;
#ifdef KTRACE
/*
* Copy traceflag and tracefile if enabled.
* If not inherited, these were zeroed above.
*/
if (p1->p_traceflag & KTRFAC_INHERIT) {
mutex_enter(&ktrace_lock);
p2->p_traceflag = p1->p_traceflag;
		if ((p2->p_tracep = p1->p_tracep) != NULL)
			ktradref(p2);
mutex_exit(&ktrace_lock);
}
#endif
/*
* Create signal actions for the child process.
*/
p2->p_sigacts = sigactsinit(p1, flags & FORK_SHARESIGS);
mutex_enter(p1->p_lock);
p2->p_sflag |=
(p1->p_sflag & (PS_STOPFORK | PS_STOPEXEC | PS_NOCLDSTOP));
sched_proc_fork(p1, p2);
mutex_exit(p1->p_lock);
p2->p_stflag = p1->p_stflag;
/*
* p_stats.
* Copy parts of p_stats, and zero out the rest.
*/
p2->p_stats = pstatscopy(p1->p_stats);
/*
* Set up the new process address space.
*/
uvm_proc_fork(p1, p2, (flags & FORK_SHAREVM) ? true : false);
/*
* Finish creating the child process.
* It will return through a different path later.
*/
lwp_create(l1, p2, uaddr, (flags & FORK_PPWAIT) ? LWP_VFORK : 0,
stack, stacksize, (func != NULL) ? func : child_return, arg, &l2,
l1->l_class, &l1->l_sigmask, &l1->l_sigstk);
/*
* Inherit l_private from the parent.
* Note that we cannot use lwp_setprivate() here since that
* also sets the CPU TLS register, which is incorrect if the
* process has changed that without letting the kernel know.
*/
l2->l_private = l1->l_private;
/*
* If emulation has a process fork hook, call it now.
*/
	if (p2->p_emul->e_proc_fork)
		(*p2->p_emul->e_proc_fork)(p2, l1, flags);
/*
* ...and finally, any other random fork hooks that subsystems
* might have registered.
*/
doforkhooks(p2, p1);
SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
/*
* It's now safe for the scheduler and other processes to see the
* child process.
*/
mutex_enter(&proc_lock);
	if (p1->p_session->s_ttyvp != NULL && p1->p_lflag & PL_CONTROLT)
		p2->p_lflag |= PL_CONTROLT;
	LIST_INSERT_HEAD(&parent->p_children, p2, p_sibling);
p2->p_exitsig = exitsig; /* signal for parent on exit */
/*
* Trace fork(2) and vfork(2)-like events on demand in a debugger.
*/
if (tracefork(p1, flags) || tracevfork(p1, flags)) {
proc_changeparent(p2, p1->p_pptr);
SET(p2->p_slflag, PSL_TRACEDCHILD);
}
p2->p_oppid = p1->p_pid; /* Remember the original parent id. */
	LIST_INSERT_AFTER(p1, p2, p_pglist);
	LIST_INSERT_HEAD(&allproc, p2, p_list);
p2->p_trace_enabled = trace_is_enabled(p2);
#ifdef __HAVE_SYSCALL_INTERN
(*p2->p_emul->e_syscall_intern)(p2);
#endif
/*
* Update stats now that we know the fork was successful.
*/
	KPREEMPT_DISABLE(l1);
	CPU_COUNT(CPU_COUNT_FORKS, 1);
	if (flags & FORK_PPWAIT)
		CPU_COUNT(CPU_COUNT_FORKS_PPWAIT, 1);
	if (flags & FORK_SHAREVM)
		CPU_COUNT(CPU_COUNT_FORKS_SHAREVM, 1);
	KPREEMPT_ENABLE(l1);

	if (ktrpoint(KTR_EMUL))
		p2->p_traceflag |= KTRFAC_TRC_EMUL;
/*
* Notify any interested parties about the new process.
*/
	if (!SLIST_EMPTY(&p1->p_klist)) {
		mutex_exit(&proc_lock);
knote_proc_fork(p1, p2);
mutex_enter(&proc_lock);
}
/*
* Make child runnable, set start time, and add to run queue except
* if the parent requested the child to start in SSTOP state.
*/
mutex_enter(p2->p_lock);
/*
* Start profiling.
*/
	if ((p2->p_stflag & PST_PROFIL) != 0) {
		mutex_spin_enter(&p2->p_stmutex);
startprofclock(p2);
mutex_spin_exit(&p2->p_stmutex);
}
getmicrotime(&p2->p_stats->p_start);
p2->p_acflag = AFORK;
lwp_lock(l2);
	KASSERT(p2->p_nrlwps == 1);
	KASSERT(l2->l_stat == LSIDL);
if (p2->p_sflag & PS_STOPFORK) {
p2->p_nrlwps = 0;
p2->p_stat = SSTOP;
p2->p_waited = 0;
p1->p_nstopchild++;
l2->l_stat = LSSTOP;
KASSERT(l2->l_wchan == NULL);
lwp_unlock(l2);
} else {
p2->p_nrlwps = 1;
p2->p_stat = SACTIVE;
setrunnable(l2);
/* LWP now unlocked */
}
/*
* Return child pid to parent process,
* marking us as parent via retval[1].
*/
	if (retval != NULL) {
		retval[0] = p2->p_pid;
retval[1] = 0;
}
mutex_exit(p2->p_lock);
/*
* Let the parent know that we are tracing its child.
*/
if (tracefork(p1, flags) || tracevfork(p1, flags)) {
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD,
tracefork(p1, flags) ? PTRACE_FORK : PTRACE_VFORK,
retval[0]);
mutex_enter(&proc_lock);
}
/*
* Preserve synchronization semantics of vfork. If waiting for
* child to exec or exit, sleep until it clears p_vforkwaiting.
*/
while (l1->l_vforkwaiting)
cv_wait(&l1->l_waitcv, &proc_lock);
/*
* Let the parent know that we are tracing its child.
*/
if (tracevforkdone(p1, flags)) {
mutex_enter(p1->p_lock);
eventswitch(TRAP_CHLD, PTRACE_VFORK_DONE, retval[0]);
} else
mutex_exit(&proc_lock);
return 0;
}
/*
* MI code executed in each newly spawned process before returning to userland.
*/
void
child_return(void *arg)
{
struct lwp *l = curlwp;
struct proc *p = l->l_proc;
if ((p->p_slflag & (PSL_TRACED|PSL_TRACEDCHILD)) ==
(PSL_TRACED|PSL_TRACEDCHILD)) {
eventswitchchild(p, TRAP_CHLD,
ISSET(p->p_lflag, PL_PPWAIT) ? PTRACE_VFORK : PTRACE_FORK);
}
md_child_return(l);
/*
* Return SYS_fork for all fork types, including vfork(2) and clone(2).
*
* This approach simplifies the code and avoids extra locking.
*/
ktrsysret(SYS_fork, 0, 0);
}
/* $NetBSD: tsc.c,v 1.60 2024/02/19 20:10:09 mrg Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.60 2024/02/19 20:10:09 mrg Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/lwp.h>
#include <sys/atomic.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/lock.h>
#include <machine/cpu_counter.h>
#include <machine/cpuvar.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/cputypes.h>
#include "tsc.h"
#define TSC_SYNC_ROUNDS 1000
#define ABS(a) ((a) >= 0 ? (a) : -(a))
static u_int tsc_get_timecount(struct timecounter *);
static void tsc_delay(unsigned int);
static uint64_t tsc_dummy_cacheline __cacheline_aligned;
uint64_t tsc_freq __read_mostly; /* exported for sysctl */
static int64_t tsc_drift_max = 1000; /* max cycles */
static int64_t tsc_drift_observed;
uint64_t (*rdtsc)(void) = rdtsc_cpuid;
uint64_t (*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t (*cpu_counter32)(void) = cpu_counter32_cpuid;
int tsc_user_enabled = 1;
static volatile int64_t tsc_sync_val;
static volatile struct cpu_info *tsc_sync_cpu;
static struct timecounter tsc_timecounter = {
.tc_get_timecount = tsc_get_timecount,
.tc_counter_mask = ~0U,
.tc_name = "TSC",
.tc_quality = 3000,
};
bool
tsc_is_invariant(void)
{
struct cpu_info *ci;
uint32_t descs[4];
uint32_t family;
bool invariant;
if (!cpu_hascounter())
return false;
ci = curcpu();
invariant = false;
if (cpu_vendor == CPUVENDOR_INTEL) {
/*
* From Intel(tm) 64 and IA-32 Architectures Software
* Developer's Manual Volume 3A: System Programming Guide,
* Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
* where the TSC is known invariant:
*
* Pentium 4, Intel Xeon (family 0f, models 03 and higher)
* Core Solo and Core Duo processors (family 06, model 0e)
* Xeon 5100 series and Core 2 Duo (family 06, model 0f)
* Core 2 and Xeon (family 06, model 17)
* Atom (family 06, model 1c)
*
* We'll also assume that it's safe on the Pentium, and
* that it's safe on P-II and P-III Xeons due to the
* typical configuration of those systems.
*
*/
switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
case 0x05:
invariant = true;
break;
case 0x06:
invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
break;
case 0x0f:
invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
break;
}
} else if (cpu_vendor == CPUVENDOR_AMD) {
/*
* TSC and Power Management Events on AMD Processors
* Nov 2, 2005 Rich Brunner, AMD Fellow
* http://lkml.org/lkml/2005/11/4/173
*
* See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
* Management Features, AMD64 Architecture Programmer's
* Manual Volume 3: General-Purpose and System Instructions.
* The check is done below.
*/
/*
* AMD Errata 778: Processor Core Time Stamp Counters May
* Experience Drift
*
* This affects all family 15h and family 16h processors.
*/
switch (CPUID_TO_FAMILY(ci->ci_signature)) {
case 0x15:
case 0x16:
return false;
}
}
/*
* The best way to check whether the TSC counter is invariant or not
* is to check CPUID 80000007.
*/
family = CPUID_TO_BASEFAMILY(ci->ci_signature);
if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
&& ((family == 0x06) || (family == 0x0f))) {
x86_cpuid(0x80000000, descs);
if (descs[0] >= 0x80000007) {
x86_cpuid(0x80000007, descs);
invariant = (descs[3] & CPUID_APM_ITSC) != 0;
}
}
return invariant;
}
/* Setup function pointers for rdtsc() and timecounter(9). */
void
tsc_setfunc(struct cpu_info *ci)
{
bool use_lfence, use_mfence;
use_lfence = use_mfence = false;
/*
* XXX On AMD, we might be able to use lfence for some cases:
	 * a) if MSR_DE_CFG exists and bit 1 is set.
* b) family == 0x0f or 0x11. Those have no MSR_DE_CFG and
* lfence is always serializing.
*
	 * We don't use lfence here because test results showed mfence
	 * performed better than lfence with MSR_DE_CFG.
*/
if (cpu_vendor == CPUVENDOR_AMD)
use_mfence = true;
else if (cpu_vendor == CPUVENDOR_INTEL)
use_lfence = true;
/* LFENCE and MFENCE are applicable if SSE2 is set. */
if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
use_lfence = use_mfence = false;
#define TSC_SETFUNC(fence) \
do { \
rdtsc = rdtsc_##fence; \
cpu_counter = cpu_counter_##fence; \
cpu_counter32 = cpu_counter32_##fence; \
} while (/* CONSTCOND */ 0)
if (use_lfence)
TSC_SETFUNC(lfence);
else if (use_mfence)
TSC_SETFUNC(mfence);
else
TSC_SETFUNC(cpuid);
aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
}
/*
* Initialize timecounter(9) and DELAY() function of TSC.
*
* This function is called after all secondary processors were brought up
* and drift has been measured, and after any other potential delay funcs
* have been installed (e.g. lapic_delay()).
*/
void
tsc_tc_init(void)
{
struct cpu_info *ci;
bool invariant;
if (!cpu_hascounter())
return;
ci = curcpu();
tsc_freq = ci->ci_data.cpu_cc_freq;
invariant = tsc_is_invariant();
if (!invariant) {
aprint_debug("TSC not known invariant on this CPU\n");
tsc_timecounter.tc_quality = -100;
} else if (tsc_drift_observed > tsc_drift_max) {
aprint_error("ERROR: %lld cycle TSC drift observed\n",
(long long)tsc_drift_observed);
tsc_timecounter.tc_quality = -100;
invariant = false;
} else if (vm_guest == VM_GUEST_NO) {
delay_func = tsc_delay;
} else if (vm_guest == VM_GUEST_VIRTUALBOX) {
tsc_timecounter.tc_quality = -100;
}
if (tsc_freq != 0) {
tsc_timecounter.tc_frequency = tsc_freq;
tc_init(&tsc_timecounter);
}
}
/*
* Record drift (in clock cycles). Called during AP startup.
*/
void
tsc_sync_drift(int64_t drift)
{
if (drift < 0)
drift = -drift;
if (drift > tsc_drift_observed)
tsc_drift_observed = drift;
}
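/*
 * Overview of the BP/AP synchronization handshake implemented below
 * (summary added for readability; tsc_read_bp() and tsc_post_ap() are
 * the authoritative details):
 *
 *  1. The BP claims tsc_sync_cpu, primes tsc_dummy_cacheline so the AP
 *     will take a cache miss, and sets CPUF_SYNCTSC in the AP's ci_flags.
 *  2. The AP spins for CPUF_SYNCTSC, clears it, reads the dummy
 *     cacheline plus its own TSC, publishes the sum in tsc_sync_val and
 *     releases tsc_sync_cpu.
 *  3. The BP reads its own TSC as soon as CPUF_SYNCTSC is cleared, and
 *     waits for tsc_sync_cpu to be released before consuming
 *     tsc_sync_val.
 *  4. tsc_sync_bp() repeats this TSC_SYNC_ROUNDS times and records the
 *     smallest absolute difference as the AP's cpu_cc_skew.
 */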
/*
* Called during startup of APs, by the boot processor. Interrupts
* are disabled on entry.
*/
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
uint64_t bptsc;
if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
panic("tsc_sync_bp: 1");
}
/* Prepare a cache miss for the other side. */
(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);
/* Flag our readiness. */
atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);
/* Wait for other side then read our TSC. */
while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
__insn_barrier();
}
bptsc = rdtsc();
/* Wait for the results to come in. */
while (tsc_sync_cpu == ci) {
x86_pause();
}
if (tsc_sync_cpu != NULL) {
panic("tsc_sync_bp: 2");
}
*bptscp = bptsc;
*aptscp = tsc_sync_val;
}
void
tsc_sync_bp(struct cpu_info *ci)
{
int64_t bptsc, aptsc, val, diff;
if (!cpu_hascounter())
return;
val = INT64_MAX;
for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
tsc_read_bp(ci, &bptsc, &aptsc);
diff = bptsc - aptsc;
if (ABS(diff) < ABS(val)) {
val = diff;
}
}
ci->ci_data.cpu_cc_skew = val;
}
/*
* Called during startup of AP, by the AP itself. Interrupts are
* disabled on entry.
*/
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
uint64_t tsc;
/* Wait for go-ahead from primary. */
while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
__insn_barrier();
}
/* Instruct primary to read its counter. */
atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);
/* Suffer a cache miss, then read TSC. */
__insn_barrier();
tsc = tsc_dummy_cacheline;
__insn_barrier();
tsc += rdtsc();
/* Post result. Ensure the whole value goes out atomically. */
(void)atomic_swap_64(&tsc_sync_val, tsc);
if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
panic("tsc_sync_ap");
}
}
void
tsc_sync_ap(struct cpu_info *ci)
{
if (!cpu_hascounter())
return;
for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
tsc_post_ap(ci);
}
}
static void
tsc_apply_cpu(void *arg1, void *arg2)
{
bool enable = arg1 != NULL;
if (enable) {
lcr4(rcr4() & ~CR4_TSD);
} else {
lcr4(rcr4() | CR4_TSD);
}
}
void
tsc_user_enable(void)
{
uint64_t xc;
xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
xc_wait(xc);
}
void
tsc_user_disable(void)
{
uint64_t xc;
xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
xc_wait(xc);
}
uint64_t
cpu_frequency(struct cpu_info *ci)
{
return ci->ci_data.cpu_cc_freq;
}
int
cpu_hascounter(void)
{
return cpu_feature[0] & CPUID_TSC;
}
static void
tsc_delay(unsigned int us)
{
uint64_t start, delta;
start = cpu_counter();
delta = (uint64_t)us * tsc_freq / 1000000;
while ((cpu_counter() - start) < delta) {
x86_pause();
}
}
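/*
 * Worked example for tsc_delay() above (illustrative only): with
 * tsc_freq = 2,000,000,000 Hz, a request of us = 50 gives
 * delta = 50 * 2e9 / 1e6 = 100,000, so the loop spins until the
 * counter has advanced by 100,000 cycles.
 */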
static u_int
tsc_get_timecount(struct timecounter *tc)
{
#if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */
static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
static int lastwarn;
uint64_t cur, prev;
lwp_t *l = curlwp;
int ticks;
/*
* Previous value must be read before the counter and stored to
* after, because this routine can be called from interrupt context
* and may run over the top of an existing invocation. Ordering is
* guaranteed by "volatile" on md_tsc.
*/
prev = l->l_md.md_tsc;
cur = cpu_counter();
if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
__cpu_simple_lock_try(&lock)) {
ticks = getticks();
if (ticks - lastwarn >= hz) {
printf(
"WARNING: %s TSC went backwards by %u - "
"change sysctl(7) kern.timecounter?\n",
cpu_name(curcpu()), (unsigned)(prev - cur));
lastwarn = ticks;
}
__cpu_simple_unlock(&lock);
}
l->l_md.md_tsc = cur;
return (uint32_t)cur;
#else
return cpu_counter32();
#endif
}
/*
* tsc has been reset; zero the cached tsc of every lwp in the system
* so we don't spuriously report that the tsc has gone backward.
* Caller must ensure all LWPs are quiescent (except the current one,
* obviously) and interrupts are blocked while we update this.
*/
void
tsc_tc_reset(void)
{
struct lwp *l;
LIST_FOREACH(l, &alllwp, l_list)
l->l_md.md_tsc = 0;
}
/* $NetBSD: subr_psref.c,v 1.18 2022/02/12 16:31:06 macallan Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Passive references
*
* Passive references are references to objects that guarantee the
* object will not be destroyed until the reference is released.
*
* Passive references require no interprocessor synchronization to
* acquire or release. However, destroying the target of passive
* references requires expensive interprocessor synchronization --
* xcalls to determine on which CPUs the object is still in use.
*
* Passive references may be held only on a single CPU and by a
* single LWP. They require the caller to allocate a little stack
* space, a struct psref object. Sleeping while a passive
* reference is held is allowed, provided that the owner's LWP is
* bound to a CPU -- e.g., the owner is a softint or a bound
* kthread. However, sleeping should be kept to a short duration,
* e.g. sleeping on an adaptive lock.
*
* Passive references serve as an intermediate stage between
* reference counting and passive serialization (pserialize(9)):
*
* - If you need references to transfer from CPU to CPU or LWP to
* LWP, or if you need long-term references, you must use
* reference counting, e.g. with atomic operations or locks,
* which incurs interprocessor synchronization for every use --
* cheaper than an xcall, but not scalable.
*
* - If all users *guarantee* that they will not sleep, then it is
* not necessary to use passive references: you may as well just
* use the even cheaper pserialize(9), because you have
* satisfied the requirements of a pserialize read section.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_psref.c,v 1.18 2022/02/12 16:31:06 macallan Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/psref.h>
#include <sys/queue.h>
#include <sys/xcall.h>
#include <sys/lwp.h>
SLIST_HEAD(psref_head, psref);
static bool _psref_held(const struct psref_target *, struct psref_class *,
bool);
/*
* struct psref_class
*
* Private global state for a class of passive reference targets.
* Opaque to callers.
*/
struct psref_class {
kmutex_t prc_lock;
kcondvar_t prc_cv;
struct percpu *prc_percpu; /* struct psref_cpu */
ipl_cookie_t prc_iplcookie;
unsigned int prc_xc_flags;
};
/*
* struct psref_cpu
*
* Private per-CPU state for a class of passive reference targets.
* Not exposed by the API.
*/
struct psref_cpu {
struct psref_head pcpu_head;
};
/*
* Data structures and functions for debugging.
*/
#ifndef PSREF_DEBUG_NITEMS
#define PSREF_DEBUG_NITEMS 16
#endif
struct psref_debug_item {
void *prdi_caller;
struct psref *prdi_psref;
};
struct psref_debug {
int prd_refs_peek;
struct psref_debug_item prd_items[PSREF_DEBUG_NITEMS];
};
#ifdef PSREF_DEBUG
static void psref_debug_acquire(struct psref *);
static void psref_debug_release(struct psref *);
static void psref_debug_lwp_free(void *);
static specificdata_key_t psref_debug_lwp_key;
#endif
/*
* psref_init()
*/
void
psref_init(void)
{
#ifdef PSREF_DEBUG
lwp_specific_key_create(&psref_debug_lwp_key, psref_debug_lwp_free);
#endif
}
/*
* psref_class_create(name, ipl)
*
* Create a new passive reference class, with the given wchan name
* and ipl.
*/
struct psref_class *
psref_class_create(const char *name, int ipl)
{
struct psref_class *class;
ASSERT_SLEEPABLE();
class = kmem_alloc(sizeof(*class), KM_SLEEP);
class->prc_percpu = percpu_alloc(sizeof(struct psref_cpu));
mutex_init(&class->prc_lock, MUTEX_DEFAULT, ipl);
cv_init(&class->prc_cv, name);
class->prc_iplcookie = makeiplcookie(ipl);
class->prc_xc_flags = XC_HIGHPRI_IPL(ipl);
return class;
}
static void __diagused
psref_cpu_drained_p(void *p, void *cookie, struct cpu_info *ci __unused)
{
const struct psref_cpu *pcpu = p;
bool *retp = cookie;
if (!SLIST_EMPTY(&pcpu->pcpu_head))
*retp = false;
}
static bool __diagused
psref_class_drained_p(const struct psref_class *prc)
{
bool ret = true;
percpu_foreach(prc->prc_percpu, &psref_cpu_drained_p, &ret);
return ret;
}
/*
* psref_class_destroy(class)
*
* Destroy a passive reference class and free memory associated
* with it. All targets in this class must have been drained and
* destroyed already.
*/
void
psref_class_destroy(struct psref_class *class)
{
KASSERT(psref_class_drained_p(class));
cv_destroy(&class->prc_cv);
mutex_destroy(&class->prc_lock);
percpu_free(class->prc_percpu, sizeof(struct psref_cpu));
kmem_free(class, sizeof(*class));
}
/*
* psref_target_init(target, class)
*
* Initialize a passive reference target in the specified class.
* The caller is responsible for issuing a membar_producer after
* psref_target_init and before exposing a pointer to the target
* to other CPUs.
*/
void
psref_target_init(struct psref_target *target,
struct psref_class *class)
{
target->prt_class = class;
target->prt_draining = false;
}
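/*
 * Illustrative sketch (not part of the original source): how a caller
 * might initialize and publish a target per the comment above.  The
 * "frobber" structure, its list, lock, and class are hypothetical
 * names that exist only for this example.
 */
#ifdef notyet
struct frobber {
	struct psref_target	f_target;
	LIST_ENTRY(frobber)	f_entry;
};
LIST_HEAD(frobber_list, frobber);

static void
frobber_publish(struct frobber *f, struct psref_class *frobber_class,
    kmutex_t *frobber_list_lock, struct frobber_list *frobbers)
{

	psref_target_init(&f->f_target, frobber_class);
	membar_producer();	/* initialization before publication */
	mutex_enter(frobber_list_lock);
	LIST_INSERT_HEAD(frobbers, f, f_entry);
	mutex_exit(frobber_list_lock);
}
#endif	/* notyet -- example only */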
#ifdef DEBUG
static bool
psref_exist(struct psref_cpu *pcpu, struct psref *psref)
{
struct psref *_psref;
SLIST_FOREACH(_psref, &pcpu->pcpu_head, psref_entry) {
if (_psref == psref)
return true;
}
return false;
}
static void
psref_check_duplication(struct psref_cpu *pcpu, struct psref *psref,
const struct psref_target *target)
{
bool found = false;
found = psref_exist(pcpu, psref);
if (found) {
panic("The psref is already in the list (acquiring twice?): "
"psref=%p target=%p", psref, target);
}
}
static void
psref_check_existence(struct psref_cpu *pcpu, struct psref *psref,
const struct psref_target *target)
{
bool found = false;
found = psref_exist(pcpu, psref);
if (!found) {
panic("The psref isn't in the list (releasing unused psref?): "
"psref=%p target=%p", psref, target);
}
}
#endif /* DEBUG */
/*
* psref_acquire(psref, target, class)
*
* Acquire a passive reference to the specified target, which must
* be in the specified class.
*
* The caller must guarantee that the target will not be destroyed
* before psref_acquire returns.
*
* The caller must additionally guarantee that it will not switch
* CPUs before releasing the passive reference, either by
* disabling kpreemption and avoiding sleeps, or by being in a
* softint or in an LWP bound to a CPU.
*/
void
psref_acquire(struct psref *psref, const struct psref_target *target,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p",
target);
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
#ifdef DEBUG
/* Sanity-check if the target is already acquired with the same psref. */
psref_check_duplication(pcpu, psref, target);
#endif
/* Record our reference. */
SLIST_INSERT_HEAD(&pcpu->pcpu_head, psref, psref_entry);
psref->psref_target = target;
psref->psref_lwp = curlwp;
psref->psref_cpu = curcpu();
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
curlwp->l_psrefs++;
#endif
#ifdef PSREF_DEBUG
psref_debug_acquire(psref);
#endif
}
/*
* psref_release(psref, target, class)
*
* Release a passive reference to the specified target, which must
* be in the specified class.
*
* The caller must not have switched CPUs or LWPs since acquiring
* the passive reference.
*/
void
psref_release(struct psref *psref, const struct psref_target *target,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Make sure the psref looks sensible. */
KASSERTMSG((psref->psref_target == target),
"passive reference target mismatch: %p (ref) != %p (expected)",
psref->psref_target, target);
KASSERTMSG((psref->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
psref->psref_lwp, curlwp);
KASSERTMSG((psref->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(psref->psref_cpu), cpu_index(curcpu()));
/*
* Block interrupts and remove the psref from the current CPU's
	 * list.  The caller guarantees that we are bound to a CPU (as
	 * does blocking interrupts), so the per-CPU list fetched with
	 * percpu_getref below is the one the reference was recorded on.
*/
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
#ifdef DEBUG
	/* Sanity-check that the reference was actually acquired earlier. */
psref_check_existence(pcpu, psref, target);
#endif
SLIST_REMOVE(&pcpu->pcpu_head, psref, psref, psref_entry);
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
KASSERT(curlwp->l_psrefs > 0);
curlwp->l_psrefs--;
#endif
#ifdef PSREF_DEBUG
psref_debug_release(psref);
#endif
/* If someone is waiting for users to drain, notify 'em. */
	if (__predict_false(target->prt_draining))
		cv_broadcast(&class->prc_cv);
}
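/*
 * Illustrative sketch (not part of the original source): the typical
 * acquire/use/release pattern for the functions above, using the
 * hypothetical struct frobber from the sketch after psref_target_init().
 * The lookup is assumed to happen in a pserialize(9) read section or
 * with the list otherwise stabilized, and the caller is assumed to stay
 * bound to its CPU (softint or LP_BOUND lwp) until psref_release().
 */
#ifdef notyet
static int
frobber_do_work(struct frobber *f, struct psref_class *frobber_class)
{
	struct psref psref;
	int error;

	psref_acquire(&psref, &f->f_target, frobber_class);
	/*
	 * The reference pins f against psref_target_destroy(); short
	 * sleeps (e.g. on an adaptive lock) are permitted here.
	 */
	error = 0;		/* ... do the real work with f ... */
	psref_release(&psref, &f->f_target, frobber_class);
	return error;
}
#endif	/* notyet -- example only */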
/*
* psref_copy(pto, pfrom, class)
*
* Copy a passive reference from pfrom, which must be in the
* specified class, to pto. Both pfrom and pto must later be
* released with psref_release.
*
* The caller must not have switched CPUs or LWPs since acquiring
* pfrom, and must not switch CPUs or LWPs before releasing both
* pfrom and pto.
*/
void
psref_copy(struct psref *pto, const struct psref *pfrom,
struct psref_class *class)
{
struct psref_cpu *pcpu;
int s;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((pto != pfrom),
"can't copy passive reference to itself: %p",
pto);
/* Make sure the pfrom reference looks sensible. */
KASSERTMSG((pfrom->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
pfrom->psref_lwp, curlwp);
KASSERTMSG((pfrom->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(pfrom->psref_cpu), cpu_index(curcpu()));
KASSERTMSG((pfrom->psref_target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
pfrom->psref_target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
/* Record the new reference. */
SLIST_INSERT_HEAD(&pcpu->pcpu_head, pto, psref_entry);
pto->psref_target = pfrom->psref_target;
pto->psref_lwp = curlwp;
pto->psref_cpu = curcpu();
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
#if defined(DIAGNOSTIC) || defined(PSREF_DEBUG)
curlwp->l_psrefs++;
#endif
}
/*
* struct psreffed
*
* Global state for draining a psref target.
*/
struct psreffed {
struct psref_class *class;
struct psref_target *target;
bool ret;
};
static void
psreffed_p_xc(void *cookie0, void *cookie1 __unused)
{
struct psreffed *P = cookie0;
/*
* If we hold a psref to the target, then answer true.
*
* This is the only dynamic decision that may be made with
* psref_held.
*
* No need to lock anything here: every write transitions from
* false to true, so there can be no conflicting writes. No
* need for a memory barrier here because P->ret is read only
* after xc_wait, which has already issued any necessary memory
* barriers.
*/
if (_psref_held(P->target, P->class, true))
P->ret = true;
}
static bool
psreffed_p(struct psref_target *target, struct psref_class *class)
{
struct psreffed P = {
.class = class,
.target = target,
.ret = false,
};
if (__predict_true(mp_online)) {
/*
* Ask all CPUs to say whether they hold a psref to the
* target.
*/
xc_wait(xc_broadcast(class->prc_xc_flags, &psreffed_p_xc, &P,
NULL));
} else
psreffed_p_xc(&P, NULL);
return P.ret;
}
/*
* psref_target_destroy(target, class)
*
* Destroy a passive reference target. Waits for all existing
* references to drain. Caller must guarantee no new references
* will be acquired once it calls psref_target_destroy, e.g. by
* removing the target from a global list first. May sleep.
*/
void
psref_target_destroy(struct psref_target *target, struct psref_class *class)
{
ASSERT_SLEEPABLE();
KASSERTMSG(!target->prt_draining, "psref target already destroyed: %p",
target);
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Request psref_release to notify us when done. */
target->prt_draining = true;
/* Wait until there are no more references on any CPU. */
while (psreffed_p(target, class)) {
/*
* This enter/wait/exit business looks wrong, but it is
* both necessary, because psreffed_p performs a
* low-priority xcall and hence cannot run while a
* mutex is locked, and OK, because the wait is timed
* -- explicit wakeups are only an optimization.
*/
mutex_enter(&class->prc_lock);
(void)cv_timedwait(&class->prc_cv, &class->prc_lock, 1);
mutex_exit(&class->prc_lock);
}
/* No more references. Cause subsequent psref_acquire to kassert. */
target->prt_class = NULL;
}
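/*
 * Illustrative sketch (not part of the original source): the matching
 * teardown for the hypothetical "frobber" example -- unpublish the
 * object first so no new references can be taken, then let
 * psref_target_destroy() wait for existing references to drain.
 */
#ifdef notyet
static void
frobber_unpublish(struct frobber *f, struct psref_class *frobber_class,
    kmutex_t *frobber_list_lock)
{

	mutex_enter(frobber_list_lock);
	LIST_REMOVE(f, f_entry);
	mutex_exit(frobber_list_lock);

	/* Waits (possibly sleeping) until no CPU holds a reference. */
	psref_target_destroy(&f->f_target, frobber_class);
}
#endif	/* notyet -- example only */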
static bool
_psref_held(const struct psref_target *target, struct psref_class *class,
bool lwp_mismatch_ok)
{
const struct psref_cpu *pcpu;
const struct psref *psref;
int s;
bool held = false;
KASSERTMSG((kpreempt_disabled() || cpu_softintr_p() ||
ISSET(curlwp->l_pflag, LP_BOUND)),
"passive references are CPU-local,"
" but preemption is enabled and the caller is not"
" in a softint or CPU-bound LWP");
KASSERTMSG((target->prt_class == class),
"mismatched psref target class: %p (ref) != %p (expected)",
target->prt_class, class);
/* Block interrupts and acquire the current CPU's reference list. */
s = splraiseipl(class->prc_iplcookie);
pcpu = percpu_getref(class->prc_percpu);
/* Search through all the references on this CPU. */
SLIST_FOREACH(psref, &pcpu->pcpu_head, psref_entry) {
/* Sanity-check the reference's CPU. */
KASSERTMSG((psref->psref_cpu == curcpu()),
"passive reference transferred from CPU %u to CPU %u",
cpu_index(psref->psref_cpu), cpu_index(curcpu()));
/* If it doesn't match, skip it and move on. */
if (psref->psref_target != target)
continue;
/*
* Sanity-check the reference's LWP if we are asserting
* via psref_held that this LWP holds it, but not if we
* are testing in psref_target_destroy whether any LWP
* still holds it.
*/
KASSERTMSG((lwp_mismatch_ok || psref->psref_lwp == curlwp),
"passive reference transferred from lwp %p to lwp %p",
psref->psref_lwp, curlwp);
/* Stop here and report that we found it. */
held = true;
break;
}
/* Release the CPU list and restore interrupts. */
percpu_putref(class->prc_percpu);
splx(s);
return held;
}
/*
* psref_held(target, class)
*
* True if the current CPU holds a passive reference to target,
* false otherwise. May be used only inside assertions.
*/
bool
psref_held(const struct psref_target *target, struct psref_class *class)
{
return _psref_held(target, class, false);
}
#ifdef PSREF_DEBUG
void
psref_debug_init_lwp(struct lwp *l)
{
struct psref_debug *prd;
prd = kmem_zalloc(sizeof(*prd), KM_SLEEP);
lwp_setspecific_by_lwp(l, psref_debug_lwp_key, prd);
}
static void
psref_debug_lwp_free(void *arg)
{
struct psref_debug *prd = arg;
kmem_free(prd, sizeof(*prd));
}
static void
psref_debug_acquire(struct psref *psref)
{
struct psref_debug *prd;
struct lwp *l = curlwp;
int s, i;
prd = lwp_getspecific(psref_debug_lwp_key);
if (__predict_false(prd == NULL)) {
psref->psref_debug = NULL;
return;
}
s = splserial();
if (l->l_psrefs > prd->prd_refs_peek) {
prd->prd_refs_peek = l->l_psrefs;
if (__predict_false(prd->prd_refs_peek > PSREF_DEBUG_NITEMS))
panic("exceeded PSREF_DEBUG_NITEMS");
}
for (i = 0; i < prd->prd_refs_peek; i++) {
struct psref_debug_item *prdi = &prd->prd_items[i];
if (prdi->prdi_psref != NULL)
continue;
prdi->prdi_caller = psref->psref_debug;
prdi->prdi_psref = psref;
psref->psref_debug = prdi;
break;
}
if (__predict_false(i == prd->prd_refs_peek))
panic("out of range: %d", i);
splx(s);
}
static void
psref_debug_release(struct psref *psref)
{
int s;
s = splserial();
if (__predict_true(psref->psref_debug != NULL)) {
struct psref_debug_item *prdi = psref->psref_debug;
prdi->prdi_psref = NULL;
}
splx(s);
}
void
psref_debug_barrier(void)
{
struct psref_debug *prd;
struct lwp *l = curlwp;
int s, i;
prd = lwp_getspecific(psref_debug_lwp_key);
if (__predict_false(prd == NULL))
return;
s = splserial();
for (i = 0; i < prd->prd_refs_peek; i++) {
struct psref_debug_item *prdi = &prd->prd_items[i];
if (__predict_true(prdi->prdi_psref == NULL))
continue;
panic("psref leaked: lwp(%p) acquired at %p", l, prdi->prdi_caller);
}
prd->prd_refs_peek = 0; /* Reset the counter */
splx(s);
}
#endif /* PSREF_DEBUG */
/* $NetBSD: pslist.h,v 1.7 2019/12/01 15:28:19 riastradh Exp $ */
/*-
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_PSLIST_H
#define _SYS_PSLIST_H
#include <sys/param.h>
#include <sys/atomic.h>
struct pslist_head;
struct pslist_entry;
struct pslist_head {
struct pslist_entry *plh_first;
};
struct pslist_entry {
struct pslist_entry **ple_prevp;
struct pslist_entry *ple_next;
};
#ifdef _KERNEL
#define _PSLIST_ASSERT KASSERT
#else
#include <assert.h>
#define _PSLIST_ASSERT assert
#endif
#define _PSLIST_POISON ((void *)1ul)
/*
* Initialization. Allowed only when the caller has exclusive access,
* excluding writers and readers.
*/
static __inline void
pslist_init(struct pslist_head *head)
{
head->plh_first = NULL; /* not yet published, so no atomic */
}
static __inline void
pslist_destroy(struct pslist_head *head __diagused)
{
_PSLIST_ASSERT(head->plh_first == NULL);
}
static __inline void
pslist_entry_init(struct pslist_entry *entry)
{
entry->ple_next = NULL;
entry->ple_prevp = NULL;
}
static __inline void
pslist_entry_destroy(struct pslist_entry *entry)
{
_PSLIST_ASSERT(entry->ple_prevp == NULL);
/*
* Poison the next entry. If we used NULL here, then readers
* would think they were simply at the end of the list.
* Instead, cause readers to crash.
*/
atomic_store_relaxed(&entry->ple_next, _PSLIST_POISON);
}
/*
* Writer operations. Caller must exclude other writers, but not
* necessarily readers.
*
* Writes to initialize a new entry must precede its publication by
* writing to plh_first / ple_next / *ple_prevp.
*
* The ple_prevp field is serialized by the caller's exclusive lock and
* not read by readers, and hence its ordering relative to the internal
* memory barriers is inconsequential.
*/
static __inline void
pslist_writer_insert_head(struct pslist_head *head, struct pslist_entry *new)
{
	_PSLIST_ASSERT(head->plh_first == NULL ||
	    head->plh_first->ple_prevp == &head->plh_first);
	_PSLIST_ASSERT(new->ple_next == NULL);
	_PSLIST_ASSERT(new->ple_prevp == NULL);

	new->ple_prevp = &head->plh_first;
	new->ple_next = head->plh_first; /* not yet published, so no atomic */
	if (head->plh_first != NULL)
		head->plh_first->ple_prevp = &new->ple_next;
	atomic_store_release(&head->plh_first, new);
}
static __inline void
pslist_writer_insert_before(struct pslist_entry *entry,
struct pslist_entry *new)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
_PSLIST_ASSERT(entry->ple_prevp != NULL);
_PSLIST_ASSERT(*entry->ple_prevp == entry);
_PSLIST_ASSERT(new->ple_next == NULL);
_PSLIST_ASSERT(new->ple_prevp == NULL);
new->ple_prevp = entry->ple_prevp;
new->ple_next = entry; /* not yet published, so no atomic */
/*
* Pairs with atomic_load_consume in pslist_reader_first or
* pslist_reader_next.
*/
atomic_store_release(entry->ple_prevp, new);
entry->ple_prevp = &new->ple_next;
}
static __inline void
pslist_writer_insert_after(struct pslist_entry *entry,
struct pslist_entry *new)
{
	_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
	_PSLIST_ASSERT(entry->ple_prevp != NULL);
	_PSLIST_ASSERT(*entry->ple_prevp == entry);
	_PSLIST_ASSERT(new->ple_next == NULL);
	_PSLIST_ASSERT(new->ple_prevp == NULL);

	new->ple_prevp = &entry->ple_next;
	new->ple_next = entry->ple_next; /* not yet published, so no atomic */
	if (new->ple_next != NULL)
		new->ple_next->ple_prevp = &new->ple_next;
/* Pairs with atomic_load_consume in pslist_reader_next. */
atomic_store_release(&entry->ple_next, new);
}
static __inline void
pslist_writer_remove(struct pslist_entry *entry)
{
	_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
	_PSLIST_ASSERT(entry->ple_prevp != NULL);
	_PSLIST_ASSERT(*entry->ple_prevp == entry);
	if (entry->ple_next != NULL)
		entry->ple_next->ple_prevp = entry->ple_prevp;
/*
* No need for atomic_store_release because there's no
* initialization that this must happen after -- the store
* transitions from a good state with the entry to a good state
* without the entry, both of which are valid for readers to
* witness.
*/
atomic_store_relaxed(entry->ple_prevp, entry->ple_next);
entry->ple_prevp = NULL;
/*
* Leave entry->ple_next intact so that any extant readers can
* continue iterating through the list. The caller must then
* wait for readers to drain, e.g. with pserialize_perform,
* before destroying and reusing the entry.
*/
}
static __inline struct pslist_entry *
pslist_writer_first(const struct pslist_head *head)
{
return head->plh_first;
}
static __inline struct pslist_entry *
pslist_writer_next(const struct pslist_entry *entry)
{
_PSLIST_ASSERT(entry->ple_next != _PSLIST_POISON);
return entry->ple_next;
}
static __inline void *
_pslist_writer_first_container(const struct pslist_head *head,
const ptrdiff_t offset)
{
struct pslist_entry *first = head->plh_first;
return (first == NULL ? NULL : (char *)first - offset);
}
static __inline void *
_pslist_writer_next_container(const struct pslist_entry *entry,
const ptrdiff_t offset)
{
struct pslist_entry *next = entry->ple_next;
_PSLIST_ASSERT(next != _PSLIST_POISON);
return (next == NULL ? NULL : (char *)next - offset);
}
/*
* Reader operations. Caller must block pserialize_perform or
* equivalent and be bound to a CPU. Only plh_first/ple_next may be
* read, and only with consuming memory order so that data-dependent
* loads happen afterward.
*/
static __inline struct pslist_entry *
pslist_reader_first(const struct pslist_head *head)
{
/*
* Pairs with atomic_store_release in pslist_writer_insert_head
* or pslist_writer_insert_before.
*/
return atomic_load_consume(&head->plh_first);
}
static __inline struct pslist_entry *
pslist_reader_next(const struct pslist_entry *entry)
{
/*
* Pairs with atomic_store_release in
* pslist_writer_insert_before or pslist_writer_insert_after.
*/
	struct pslist_entry *next = atomic_load_consume(&entry->ple_next);

	_PSLIST_ASSERT(next != _PSLIST_POISON);
return next;
}
static __inline void *
_pslist_reader_first_container(const struct pslist_head *head,
const ptrdiff_t offset)
{
struct pslist_entry *first = pslist_reader_first(head);
if (first == NULL)
return NULL;
return (char *)first - offset;
}
static __inline void *
_pslist_reader_next_container(const struct pslist_entry *entry,
const ptrdiff_t offset)
{
struct pslist_entry *next = pslist_reader_next(entry);
if (next == NULL)
return NULL;
return (char *)next - offset;
}
/*
* Type-safe macros for convenience.
*/
#if defined(__COVERITY__) || defined(__LGTM_BOT__)
#define _PSLIST_VALIDATE_PTRS(P, Q) 0
#define _PSLIST_VALIDATE_CONTAINER(P, T, F) 0
#else
#define _PSLIST_VALIDATE_PTRS(P, Q) \
(0 * sizeof((P) - (Q)) * sizeof(*(P)) * sizeof(*(Q)))
#define _PSLIST_VALIDATE_CONTAINER(P, T, F) \
(0 * sizeof((P) - &((T *)(((char *)(P)) - offsetof(T, F)))->F))
#endif
#define PSLIST_INITIALIZER { .plh_first = NULL }
#define PSLIST_ENTRY_INITIALIZER { .ple_next = NULL, .ple_prevp = NULL }
#define PSLIST_INIT(H) pslist_init((H))
#define PSLIST_DESTROY(H) pslist_destroy((H))
#define PSLIST_ENTRY_INIT(E, F) pslist_entry_init(&(E)->F)
#define PSLIST_ENTRY_DESTROY(E, F) pslist_entry_destroy(&(E)->F)
#define PSLIST_WRITER_INSERT_HEAD(H, V, F) \
pslist_writer_insert_head((H), &(V)->F)
#define PSLIST_WRITER_INSERT_BEFORE(E, N, F) \
pslist_writer_insert_before(&(E)->F + _PSLIST_VALIDATE_PTRS(E, N), \
&(N)->F)
#define PSLIST_WRITER_INSERT_AFTER(E, N, F) \
pslist_writer_insert_after(&(E)->F + _PSLIST_VALIDATE_PTRS(E, N), \
&(N)->F)
#define PSLIST_WRITER_REMOVE(E, F) \
pslist_writer_remove(&(E)->F)
#define PSLIST_WRITER_FIRST(H, T, F) \
((T *)(_pslist_writer_first_container((H), offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_writer_first(H), T, F))
#define PSLIST_WRITER_NEXT(V, T, F) \
((T *)(_pslist_writer_next_container(&(V)->F, offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_writer_next(&(V)->F), T, F))
#define PSLIST_WRITER_FOREACH(V, H, T, F) \
for ((V) = PSLIST_WRITER_FIRST((H), T, F); \
(V) != NULL; \
(V) = PSLIST_WRITER_NEXT((V), T, F))
#define PSLIST_READER_FIRST(H, T, F) \
((T *)(_pslist_reader_first_container((H), offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_reader_first(H), T, F))
#define PSLIST_READER_NEXT(V, T, F) \
((T *)(_pslist_reader_next_container(&(V)->F, offsetof(T, F))) + \
_PSLIST_VALIDATE_CONTAINER(pslist_reader_next(&(V)->F), T, F))
#define PSLIST_READER_FOREACH(V, H, T, F) \
for ((V) = PSLIST_READER_FIRST((H), T, F); \
(V) != NULL; \
(V) = PSLIST_READER_NEXT((V), T, F))
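/*
 * Illustrative sketch (not part of this header): a hypothetical
 * "struct frobnitz" list using the type-safe macros above, with a
 * pserialize(9) reader and a mutex-serialized writer.  The names
 * frobnitz, frob_lock, frob_psz and frob_list are assumptions made
 * only for the example; frob_psz is presumed to have been created
 * with pserialize_create().
 */
#if 0
struct frobnitz {
	int			f_datum;
	struct pslist_entry	f_entry;
};

static kmutex_t frob_lock;		/* serializes writers */
static pserialize_t frob_psz;		/* drains readers on removal */
static struct pslist_head frob_list = PSLIST_INITIALIZER;

static void
frob_insert(struct frobnitz *f)
{

	PSLIST_ENTRY_INIT(f, f_entry);
	mutex_enter(&frob_lock);
	PSLIST_WRITER_INSERT_HEAD(&frob_list, f, f_entry);
	mutex_exit(&frob_lock);
}

static void
frob_remove(struct frobnitz *f)
{

	mutex_enter(&frob_lock);
	PSLIST_WRITER_REMOVE(f, f_entry);
	pserialize_perform(frob_psz);	/* wait for extant readers */
	mutex_exit(&frob_lock);
	PSLIST_ENTRY_DESTROY(f, f_entry);
}

static bool
frob_present(int datum)
{
	struct frobnitz *f;
	int s;

	s = pserialize_read_enter();
	PSLIST_READER_FOREACH(f, &frob_list, struct frobnitz, f_entry) {
		if (f->f_datum == datum)
			break;
	}
	pserialize_read_exit(s);
	return f != NULL;
}
#endif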
#endif /* _SYS_PSLIST_H */
/* $NetBSD: sys_pipe.c,v 1.167 2024/02/10 09:21:54 andvar Exp $ */
/*-
* Copyright (c) 2003, 2007, 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Paul Kranenburg, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1996 John S. Dyson
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice immediately at the beginning of the file, without modification,
* this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Absolutely no warranty of function or purpose is made by the author
* John S. Dyson.
* 4. Modifications may be freely made to this file if the above conditions
* are met.
*/
/*
* This file contains a high-performance replacement for the socket-based
* pipes scheme originally used. It does not support all features of
* sockets, but does do everything that pipes normally do.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_pipe.c,v 1.167 2024/02/10 09:21:54 andvar Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/kernel.h>
#include <sys/ttycom.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/uio.h>
#include <sys/select.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/pipe.h>
static int pipe_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int pipe_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
static int pipe_close(file_t *);
static int pipe_poll(file_t *, int);
static int pipe_kqfilter(file_t *, struct knote *);
static int pipe_stat(file_t *, struct stat *);
static int pipe_ioctl(file_t *, u_long, void *);
static void pipe_restart(file_t *);
static int pipe_fpathconf(file_t *, int, register_t *);
static int pipe_posix_fadvise(file_t *, off_t, off_t, int);
static const struct fileops pipeops = {
.fo_name = "pipe",
.fo_read = pipe_read,
.fo_write = pipe_write,
.fo_ioctl = pipe_ioctl,
.fo_fcntl = fnullop_fcntl,
.fo_poll = pipe_poll,
.fo_stat = pipe_stat,
.fo_close = pipe_close,
.fo_kqfilter = pipe_kqfilter,
.fo_restart = pipe_restart,
.fo_fpathconf = pipe_fpathconf,
.fo_posix_fadvise = pipe_posix_fadvise,
};
/*
* Default pipe buffer size(s); this can be kind-of large now because pipe
* space is pageable. The pipe code will try to maintain locality of
* reference for performance reasons, so small amounts of outstanding I/O
* will not wipe the cache.
*/
#define MINPIPESIZE (PIPE_SIZE / 3)
#define MAXPIPESIZE (2 * PIPE_SIZE / 3)
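/*
 * Worked example (assuming the usual PIPE_SIZE of 16384 bytes, which is
 * machine-configurable): MINPIPESIZE is 5461 and MAXPIPESIZE is 10922,
 * so a reader wakes a waiting writer once fewer than roughly one third
 * of the buffer's bytes remain unread.
 */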
/*
* Limit the number of "big" pipes
*/
#define LIMITBIGPIPES 32
static u_int maxbigpipes __read_mostly = LIMITBIGPIPES;
static u_int nbigpipe = 0;
/*
* Amount of KVA consumed by pipe buffers.
*/
static u_int amountpipekva = 0;
static void pipeclose(struct pipe *);
static void pipe_free_kmem(struct pipe *);
static int pipe_create(struct pipe **, pool_cache_t, struct timespec *);
static int pipelock(struct pipe *, bool);
static inline void pipeunlock(struct pipe *);
static void pipeselwakeup(struct pipe *, struct pipe *, int);
static int pipespace(struct pipe *, int);
static int pipe_ctor(void *, void *, int);
static void pipe_dtor(void *, void *);
static pool_cache_t pipe_wr_cache;
static pool_cache_t pipe_rd_cache;
void
pipe_init(void)
{
/* Writer side is not automatically allocated KVA. */
pipe_wr_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "pipewr",
NULL, IPL_NONE, pipe_ctor, pipe_dtor, NULL);
KASSERT(pipe_wr_cache != NULL);
/* Reader side gets preallocated KVA. */
pipe_rd_cache = pool_cache_init(sizeof(struct pipe), 0, 0, 0, "piperd",
NULL, IPL_NONE, pipe_ctor, pipe_dtor, (void *)1);
KASSERT(pipe_rd_cache != NULL);
}
static int
pipe_ctor(void *arg, void *obj, int flags)
{
struct pipe *pipe;
vaddr_t va;
pipe = obj;
memset(pipe, 0, sizeof(struct pipe));
if (arg != NULL) {
/* Preallocate space. */
va = uvm_km_alloc(kernel_map, PIPE_SIZE, 0,
UVM_KMF_PAGEABLE | UVM_KMF_WAITVA);
KASSERT(va != 0);
pipe->pipe_kmem = va;
atomic_add_int(&amountpipekva, PIPE_SIZE);
}
cv_init(&pipe->pipe_rcv, "pipe_rd");
cv_init(&pipe->pipe_wcv, "pipe_wr");
cv_init(&pipe->pipe_draincv, "pipe_drn");
cv_init(&pipe->pipe_lkcv, "pipe_lk");
selinit(&pipe->pipe_sel);
pipe->pipe_state = PIPE_SIGNALR;
return 0;
}
static void
pipe_dtor(void *arg, void *obj)
{
struct pipe *pipe;
pipe = obj;
cv_destroy(&pipe->pipe_rcv);
cv_destroy(&pipe->pipe_wcv);
cv_destroy(&pipe->pipe_draincv);
cv_destroy(&pipe->pipe_lkcv);
seldestroy(&pipe->pipe_sel);
	if (pipe->pipe_kmem != 0) {
		uvm_km_free(kernel_map, pipe->pipe_kmem, PIPE_SIZE,
UVM_KMF_PAGEABLE);
atomic_add_int(&amountpipekva, -PIPE_SIZE);
}
}
/*
* The pipe system call for the DTYPE_PIPE type of pipes
*/
int
pipe1(struct lwp *l, int *fildes, int flags)
{
struct pipe *rpipe, *wpipe;
struct timespec nt;
file_t *rf, *wf;
int fd, error;
proc_t *p;
if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
return EINVAL;
p = curproc;
rpipe = wpipe = NULL;
getnanotime(&nt);
	if ((error = pipe_create(&rpipe, pipe_rd_cache, &nt)) ||
	    (error = pipe_create(&wpipe, pipe_wr_cache, &nt))) {
goto free2;
}
rpipe->pipe_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
wpipe->pipe_lock = rpipe->pipe_lock;
mutex_obj_hold(wpipe->pipe_lock);
error = fd_allocfile(&rf, &fd);
if (error)
goto free2;
fildes[0] = fd;
error = fd_allocfile(&wf, &fd);
if (error)
goto free3;
fildes[1] = fd;
rf->f_flag = FREAD | flags;
rf->f_type = DTYPE_PIPE;
rf->f_pipe = rpipe;
rf->f_ops = &pipeops;
fd_set_exclose(l, fildes[0], (flags & O_CLOEXEC) != 0);
wf->f_flag = FWRITE | flags;
wf->f_type = DTYPE_PIPE;
wf->f_pipe = wpipe;
wf->f_ops = &pipeops;
fd_set_exclose(l, fildes[1], (flags & O_CLOEXEC) != 0);
rpipe->pipe_peer = wpipe;
wpipe->pipe_peer = rpipe;
fd_affix(p, rf, fildes[0]);
fd_affix(p, wf, fildes[1]);
return (0);
free3:
fd_abort(p, rf, fildes[0]);
free2:
pipeclose(wpipe);
pipeclose(rpipe);
return (error);
}
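/*
 * Illustrative sketch (userland, not part of this file): the flag mask
 * accepted by pipe1() above corresponds to what pipe2(2) passes down.
 * A minimal usage example:
 */
#if 0
#include <fcntl.h>
#include <unistd.h>
#include <err.h>

int
main(void)
{
	int fds[2];
	char buf[8];

	if (pipe2(fds, O_CLOEXEC | O_NONBLOCK) == -1)
		err(1, "pipe2");
	if (write(fds[1], "ping", 4) == -1)
		err(1, "write");
	if (read(fds[0], buf, sizeof(buf)) == -1)
		err(1, "read");	/* would be EAGAIN if the pipe were empty */
	return 0;
}
#endif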
/*
 * Allocate KVA for the pipe's circular buffer; the space is pageable.
 * This routine will 'realloc' the size of a pipe safely: if it fails,
 * it retains the old buffer and returns ENOMEM.
 */
static int
pipespace(struct pipe *pipe, int size)
{
void *buffer;
/*
* Allocate pageable virtual address space. Physical memory is
* allocated on demand.
*/
if (size == PIPE_SIZE && pipe->pipe_kmem != 0) {
buffer = (void *)pipe->pipe_kmem;
} else {
buffer = (void *)uvm_km_alloc(kernel_map, round_page(size),
0, UVM_KMF_PAGEABLE);
if (buffer == NULL)
return (ENOMEM);
atomic_add_int(&amountpipekva, size);
}
/* free old resources if we're resizing */
pipe_free_kmem(pipe);
pipe->pipe_buffer.buffer = buffer;
pipe->pipe_buffer.size = size;
pipe->pipe_buffer.in = 0;
pipe->pipe_buffer.out = 0;
pipe->pipe_buffer.cnt = 0;
return (0);
}
/*
* Initialize and allocate VM and memory for pipe.
*/
static int
pipe_create(struct pipe **pipep, pool_cache_t cache, struct timespec *nt)
{
struct pipe *pipe;
int error;
pipe = pool_cache_get(cache, PR_WAITOK);
KASSERT(pipe != NULL);
*pipep = pipe;
error = 0;
pipe->pipe_atime = pipe->pipe_mtime = pipe->pipe_btime = *nt;
pipe->pipe_lock = NULL;
if (cache == pipe_rd_cache) {
error = pipespace(pipe, PIPE_SIZE);
} else {
pipe->pipe_buffer.buffer = NULL;
pipe->pipe_buffer.size = 0;
pipe->pipe_buffer.in = 0;
pipe->pipe_buffer.out = 0;
pipe->pipe_buffer.cnt = 0;
}
return error;
}
/*
* Lock a pipe for I/O, blocking other access
* Called with pipe spin lock held.
*/
static int
pipelock(struct pipe *pipe, bool catch_p)
{
int error;
	KASSERT(mutex_owned(pipe->pipe_lock));

	while (pipe->pipe_state & PIPE_LOCKFL) {
if (catch_p) {
error = cv_wait_sig(&pipe->pipe_lkcv, pipe->pipe_lock);
if (error != 0) {
return error;
}
} else
cv_wait(&pipe->pipe_lkcv, pipe->pipe_lock);
}
pipe->pipe_state |= PIPE_LOCKFL;
return 0;
}
/*
* unlock a pipe I/O lock
*/
static inline void
pipeunlock(struct pipe *pipe)
{
KASSERT(pipe->pipe_state & PIPE_LOCKFL);
pipe->pipe_state &= ~PIPE_LOCKFL;
cv_signal(&pipe->pipe_lkcv);
}
/*
* Select/poll wakeup. This also sends SIGIO to peer connected to
* 'sigpipe' side of pipe.
*/
static void
pipeselwakeup(struct pipe *selp, struct pipe *sigp, int code)
{
int band;
switch (code) {
case POLL_IN:
band = POLLIN|POLLRDNORM;
break;
case POLL_OUT:
band = POLLOUT|POLLWRNORM;
break;
case POLL_HUP:
band = POLLHUP;
break;
case POLL_ERR:
band = POLLERR;
break;
default:
band = 0;
#ifdef DIAGNOSTIC
printf("bad siginfo code %d in pipe notification.\n", code);
#endif
break;
}
selnotify(&selp->pipe_sel, band, NOTE_SUBMIT);
if (sigp == NULL || (sigp->pipe_state & PIPE_ASYNC) == 0)
return;
fownsignal(sigp->pipe_pgid, SIGIO, code, band, selp);
}
static int
pipe_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct pipe *rpipe = fp->f_pipe;
struct pipebuf *bp = &rpipe->pipe_buffer;
kmutex_t *lock = rpipe->pipe_lock;
int error;
size_t nread = 0;
size_t size;
size_t ocnt;
unsigned int wakeup_state = 0;
/*
* Try to avoid locking the pipe if we have nothing to do.
*
* There are programs which share one pipe amongst multiple processes
* and perform non-blocking reads in parallel, even if the pipe is
* empty. This in particular is the case with BSD make, which when
* spawned with a high -j number can find itself with over half of the
* calls failing to find anything.
*/
	if ((fp->f_flag & FNONBLOCK) != 0) {
		if (__predict_false(uio->uio_resid == 0))
			return (0);
		if (atomic_load_relaxed(&bp->cnt) == 0 &&
		    (atomic_load_relaxed(&rpipe->pipe_state) & PIPE_EOF) == 0)
			return (EAGAIN);
	}
mutex_enter(lock);
++rpipe->pipe_busy;
ocnt = bp->cnt;
again:
error = pipelock(rpipe, true);
if (error)
goto unlocked_error;
while (uio->uio_resid) {
/*
* Normal pipe buffer receive.
*/
if (bp->cnt > 0) {
size = bp->size - bp->out;
if (size > bp->cnt)
size = bp->cnt;
if (size > uio->uio_resid)
size = uio->uio_resid;
mutex_exit(lock);
error = uiomove((char *)bp->buffer + bp->out, size, uio);
mutex_enter(lock);
if (error)
break;
bp->out += size;
if (bp->out >= bp->size)
bp->out = 0;
bp->cnt -= size;
/*
* If there is no more to read in the pipe, reset
* its pointers to the beginning. This improves
* cache hit stats.
*/
			if (bp->cnt == 0) {
				bp->in = 0;
bp->out = 0;
}
nread += size;
continue;
}
/*
* Break if some data was read.
*/
if (nread > 0)
break;
/*
* Detect EOF condition.
* Read returns 0 on EOF, no need to set error.
*/
if (rpipe->pipe_state & PIPE_EOF)
break;
/*
* Don't block on non-blocking I/O.
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
break;
}
/*
* Unlock the pipe buffer for our remaining processing.
* We will either break out with an error or we will
* sleep and relock to loop.
*/
pipeunlock(rpipe);
#if 1 /* XXX (dsl) I'm sure these aren't needed here ... */
/*
* We want to read more, wake up select/poll.
*/
pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
/*
* If the "write-side" is blocked, wake it up now.
*/
cv_broadcast(&rpipe->pipe_wcv);
#endif
if (wakeup_state & PIPE_RESTART) {
error = ERESTART;
goto unlocked_error;
}
/* Now wait until the pipe is filled */
error = cv_wait_sig(&rpipe->pipe_rcv, lock);
if (error != 0)
goto unlocked_error;
wakeup_state = rpipe->pipe_state;
goto again;
}
if (error == 0)
getnanotime(&rpipe->pipe_atime);
pipeunlock(rpipe);
unlocked_error:
--rpipe->pipe_busy;
	if (rpipe->pipe_busy == 0) {
		rpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&rpipe->pipe_draincv);
}
	if (bp->cnt < MINPIPESIZE) {
		cv_broadcast(&rpipe->pipe_wcv);
}
/*
* If anything was read off the buffer, signal to the writer it's
* possible to write more data. Also send signal if we are here for the
* first time after last write.
*/
	if ((bp->size - bp->cnt) >= PIPE_BUF &&
	    (ocnt != bp->cnt || (rpipe->pipe_state & PIPE_SIGNALR))) {
		pipeselwakeup(rpipe, rpipe->pipe_peer, POLL_OUT);
rpipe->pipe_state &= ~PIPE_SIGNALR;
}
mutex_exit(lock);
return (error);
}
static int
pipe_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct pipe *wpipe, *rpipe;
struct pipebuf *bp;
kmutex_t *lock;
int error;
unsigned int wakeup_state = 0;
/* We want to write to our peer */
rpipe = fp->f_pipe;
lock = rpipe->pipe_lock;
error = 0;
mutex_enter(lock);
wpipe = rpipe->pipe_peer;
/*
* Detect loss of pipe read side, issue SIGPIPE if lost.
*/
if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) != 0) {
mutex_exit(lock);
return EPIPE;
}
++wpipe->pipe_busy;
/* Acquire the long-term pipe lock */
if ((error = pipelock(wpipe, true)) != 0) {
--wpipe->pipe_busy;
		if (wpipe->pipe_busy == 0) {
			wpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&wpipe->pipe_draincv);
}
mutex_exit(lock);
return (error);
}
bp = &wpipe->pipe_buffer;
/*
* If it is advantageous to resize the pipe buffer, do so.
*/
	if ((uio->uio_resid > PIPE_SIZE) &&
	    (nbigpipe < maxbigpipes) &&
	    (bp->size <= PIPE_SIZE) && (bp->cnt == 0)) {
		if (pipespace(wpipe, BIG_PIPE_SIZE) == 0)
			atomic_inc_uint(&nbigpipe);
	}
while (uio->uio_resid) {
size_t space;
space = bp->size - bp->cnt;
/* Writes of size <= PIPE_BUF must be atomic. */
if ((space < uio->uio_resid) && (uio->uio_resid <= PIPE_BUF))
space = 0;
if (space > 0) {
int size; /* Transfer size */
int segsize; /* first segment to transfer */
/*
* Transfer size is minimum of uio transfer
* and free space in pipe buffer.
*/
if (space > uio->uio_resid)
size = uio->uio_resid;
else
size = space;
/*
* First segment to transfer is minimum of
* transfer size and contiguous space in
* pipe buffer. If first segment to transfer
* is less than the transfer size, we've got
* a wraparound in the buffer.
*/
segsize = bp->size - bp->in;
if (segsize > size)
segsize = size;
/* Transfer first segment */
mutex_exit(lock);
error = uiomove((char *)bp->buffer + bp->in, segsize,
uio);
if (error == 0 && segsize < size) {
/*
* Transfer remaining part now, to
* support atomic writes. Wraparound
* happened.
*/
KASSERT(bp->in + segsize == bp->size);
error = uiomove(bp->buffer,
size - segsize, uio);
}
mutex_enter(lock);
if (error)
break;
bp->in += size;
			if (bp->in >= bp->size) {
				KASSERT(bp->in == size - segsize + bp->size);
bp->in = size - segsize;
}
bp->cnt += size;
KASSERT(bp->cnt <= bp->size);
wakeup_state = 0;
} else {
/*
* If the "read-side" has been blocked, wake it up now.
*/
cv_broadcast(&wpipe->pipe_rcv);
/*
* Don't block on non-blocking I/O.
*/
if (fp->f_flag & FNONBLOCK) {
error = EAGAIN;
break;
}
/*
* We have no more space and have something to offer,
* wake up select/poll.
*/
			if (bp->cnt)
				pipeselwakeup(wpipe, wpipe, POLL_IN);

			if (wakeup_state & PIPE_RESTART) {
error = ERESTART;
break;
}
/*
* If read side wants to go away, we just issue a signal
* to ourselves.
*/
if (wpipe->pipe_state & PIPE_EOF) {
error = EPIPE;
break;
}
pipeunlock(wpipe);
error = cv_wait_sig(&wpipe->pipe_wcv, lock);
			(void)pipelock(wpipe, false);
			if (error != 0)
break;
wakeup_state = wpipe->pipe_state;
}
}
--wpipe->pipe_busy;
	if (wpipe->pipe_busy == 0) {
		wpipe->pipe_state &= ~PIPE_RESTART;
cv_broadcast(&wpipe->pipe_draincv);
}
	if (bp->cnt > 0) {
		cv_broadcast(&wpipe->pipe_rcv);
}
/*
* Don't return EPIPE if I/O was successful
*/
if (error == EPIPE && bp->cnt == 0 && uio->uio_resid == 0)
error = 0;
if (error == 0)
getnanotime(&wpipe->pipe_mtime);
/*
* We have something to offer, wake up select/poll.
*/
	if (bp->cnt)
		pipeselwakeup(wpipe, wpipe, POLL_IN);
/*
* Arrange for next read(2) to do a signal.
*/
wpipe->pipe_state |= PIPE_SIGNALR;
pipeunlock(wpipe);
mutex_exit(lock);
return (error);
}
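/*
 * Illustrative sketch (userland): the PIPE_BUF rule enforced above means
 * that a write of at most PIPE_BUF bytes (at least 512 per POSIX) is
 * either stored contiguously or not stored at all, so records from
 * concurrent writers are never interleaved.  The helper below is a
 * hypothetical example, not part of this file.
 */
#if 0
#include <string.h>
#include <unistd.h>

/* Each record is well under PIPE_BUF, so concurrent writers never mix. */
static void
write_record(int wfd, const char *tag)
{
	char rec[128];
	size_t len;

	len = strlcpy(rec, tag, sizeof(rec) - 1);
	rec[len++] = '\n';
	(void)write(wfd, rec, len);	/* atomic: len <= PIPE_BUF */
}
#endif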
/*
* We implement a very minimal set of ioctls for compatibility with sockets.
*/
int
pipe_ioctl(file_t *fp, u_long cmd, void *data)
{
struct pipe *pipe = fp->f_pipe;
kmutex_t *lock = pipe->pipe_lock;
switch (cmd) {
case FIONBIO:
return (0);
case FIOASYNC:
mutex_enter(lock);
if (*(int *)data) {
pipe->pipe_state |= PIPE_ASYNC;
} else {
pipe->pipe_state &= ~PIPE_ASYNC;
}
mutex_exit(lock);
return (0);
case FIONREAD:
mutex_enter(lock);
*(int *)data = pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case FIONWRITE:
/* Look at other side */
mutex_enter(lock);
pipe = pipe->pipe_peer;
if (pipe == NULL)
*(int *)data = 0;
else
*(int *)data = pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case FIONSPACE:
/* Look at other side */
mutex_enter(lock);
pipe = pipe->pipe_peer;
if (pipe == NULL)
*(int *)data = 0;
else
*(int *)data = pipe->pipe_buffer.size -
pipe->pipe_buffer.cnt;
mutex_exit(lock);
return (0);
case TIOCSPGRP:
case FIOSETOWN:
return fsetown(&pipe->pipe_pgid, cmd, data);
case TIOCGPGRP:
case FIOGETOWN:
return fgetown(pipe->pipe_pgid, cmd, data);
}
return (EPASSTHROUGH);
}
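/*
 * Illustrative sketch (userland): querying one of the ioctls handled
 * above.  FIONREAD reports the bytes buffered on the read side; the
 * helper name is an assumption for the example only.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/filio.h>

static int
pipe_bytes_ready(int rfd)
{
	int nread = 0;

	if (ioctl(rfd, FIONREAD, &nread) == -1)
		return -1;
	return nread;
}
#endif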
int
pipe_poll(file_t *fp, int events)
{
struct pipe *rpipe = fp->f_pipe;
struct pipe *wpipe;
int eof = 0;
int revents = 0;
mutex_enter(rpipe->pipe_lock);
wpipe = rpipe->pipe_peer;
	if (events & (POLLIN | POLLRDNORM))
		if ((rpipe->pipe_buffer.cnt > 0) ||
		    (rpipe->pipe_state & PIPE_EOF))
			revents |= events & (POLLIN | POLLRDNORM);
eof |= (rpipe->pipe_state & PIPE_EOF);
if (wpipe == NULL)
revents |= events & (POLLOUT | POLLWRNORM);
else {
		if (events & (POLLOUT | POLLWRNORM))
			if ((wpipe->pipe_state & PIPE_EOF) || (
			    (wpipe->pipe_buffer.size -
			     wpipe->pipe_buffer.cnt) >= PIPE_BUF))
				revents |= events & (POLLOUT | POLLWRNORM);
eof |= (wpipe->pipe_state & PIPE_EOF);
}
if (wpipe == NULL || eof)
revents |= POLLHUP;
	if (revents == 0) {
		if (events & (POLLIN | POLLRDNORM))
			selrecord(curlwp, &rpipe->pipe_sel);
		if (events & (POLLOUT | POLLWRNORM))
			selrecord(curlwp, &wpipe->pipe_sel);
}
mutex_exit(rpipe->pipe_lock);
return (revents);
}
static int
pipe_stat(file_t *fp, struct stat *ub)
{
struct pipe *pipe = fp->f_pipe;
mutex_enter(pipe->pipe_lock);
memset(ub, 0, sizeof(*ub));
ub->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
ub->st_blksize = pipe->pipe_buffer.size;
	if (ub->st_blksize == 0 && pipe->pipe_peer)
		ub->st_blksize = pipe->pipe_peer->pipe_buffer.size;
ub->st_size = pipe->pipe_buffer.cnt;
ub->st_blocks = (ub->st_size) ? 1 : 0;
ub->st_atimespec = pipe->pipe_atime;
ub->st_mtimespec = pipe->pipe_mtime;
ub->st_ctimespec = ub->st_birthtimespec = pipe->pipe_btime;
ub->st_uid = kauth_cred_geteuid(fp->f_cred);
ub->st_gid = kauth_cred_getegid(fp->f_cred);
/*
* Left as 0: st_dev, st_ino, st_nlink, st_rdev, st_flags, st_gen.
* XXX (st_dev, st_ino) should be unique.
*/
mutex_exit(pipe->pipe_lock);
return 0;
}
static int
pipe_close(file_t *fp)
{
struct pipe *pipe = fp->f_pipe;
fp->f_pipe = NULL;
pipeclose(pipe);
return (0);
}
static void
pipe_restart(file_t *fp)
{
struct pipe *pipe = fp->f_pipe;
/*
* Unblock blocked reads/writes in order to allow close() to complete.
* System calls return ERESTART so that the fd is revalidated.
* (Partial writes return the transfer length.)
*/
mutex_enter(pipe->pipe_lock);
pipe->pipe_state |= PIPE_RESTART;
	/*
	 * Wake up both cvs; maybe we only need one, but maybe there are
	 * some other paths where a wakeup is needed, and it saves deciding
	 * which!
	 */
cv_broadcast(&pipe->pipe_rcv);
cv_broadcast(&pipe->pipe_wcv);
mutex_exit(pipe->pipe_lock);
}
static int
pipe_fpathconf(struct file *fp, int name, register_t *retval)
{
switch (name) {
case _PC_PIPE_BUF:
*retval = PIPE_BUF;
return 0;
default:
return EINVAL;
}
}
static int
pipe_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
return ESPIPE;
}
static void
pipe_free_kmem(struct pipe *pipe)
{
	if (pipe->pipe_buffer.buffer != NULL) {
		if (pipe->pipe_buffer.size > PIPE_SIZE) {
			atomic_dec_uint(&nbigpipe);
		}
		if (pipe->pipe_buffer.buffer != (void *)pipe->pipe_kmem) {
			uvm_km_free(kernel_map,
(vaddr_t)pipe->pipe_buffer.buffer,
pipe->pipe_buffer.size, UVM_KMF_PAGEABLE);
atomic_add_int(&amountpipekva,
-pipe->pipe_buffer.size);
}
pipe->pipe_buffer.buffer = NULL;
}
}
/*
* Shutdown the pipe.
*/
static void
pipeclose(struct pipe *pipe)
{
kmutex_t *lock;
struct pipe *ppipe;
if (pipe == NULL)
return;
	KASSERT(cv_is_valid(&pipe->pipe_rcv));
	KASSERT(cv_is_valid(&pipe->pipe_wcv));
	KASSERT(cv_is_valid(&pipe->pipe_draincv));
	KASSERT(cv_is_valid(&pipe->pipe_lkcv));
lock = pipe->pipe_lock;
if (lock == NULL)
/* Must have failed during create */
goto free_resources;
mutex_enter(lock);
pipeselwakeup(pipe, pipe, POLL_HUP);
/*
* If the other side is blocked, wake it up saying that
* we want to close it down.
*/
pipe->pipe_state |= PIPE_EOF;
	if (pipe->pipe_busy) {
		while (pipe->pipe_busy) {
cv_broadcast(&pipe->pipe_wcv);
cv_wait_sig(&pipe->pipe_draincv, lock);
}
}
/*
* Disconnect from peer.
*/
	if ((ppipe = pipe->pipe_peer) != NULL) {
		pipeselwakeup(ppipe, ppipe, POLL_HUP);
ppipe->pipe_state |= PIPE_EOF;
cv_broadcast(&ppipe->pipe_rcv);
ppipe->pipe_peer = NULL;
}
/*
* Any knote objects still left in the list are
 * the ones attached by the peer.  Since no one will
* traverse this list, we just clear it.
*
* XXX Exposes select/kqueue internals.
*/
SLIST_INIT(&pipe->pipe_sel.sel_klist);
KASSERT((pipe->pipe_state & PIPE_LOCKFL) == 0);
mutex_exit(lock);
mutex_obj_free(lock);
/*
* Free resources.
*/
free_resources:
pipe->pipe_pgid = 0;
pipe->pipe_state = PIPE_SIGNALR;
pipe->pipe_peer = NULL;
pipe->pipe_lock = NULL;
pipe_free_kmem(pipe);
if (pipe->pipe_kmem != 0) {
pool_cache_put(pipe_rd_cache, pipe);
} else {
pool_cache_put(pipe_wr_cache, pipe);
}
}
static void
filt_pipedetach(struct knote *kn)
{
struct pipe *pipe;
kmutex_t *lock;
pipe = ((file_t *)kn->kn_obj)->f_pipe;
lock = pipe->pipe_lock;
mutex_enter(lock);
	switch (kn->kn_filter) {
case EVFILT_WRITE:
/* Need the peer structure, not our own. */
pipe = pipe->pipe_peer;
/* If reader end already closed, just return. */
if (pipe == NULL) {
mutex_exit(lock);
return;
}
break;
default:
/* Nothing to do. */
break;
}
KASSERT(kn->kn_hook == pipe);
selremove_knote(&pipe->pipe_sel, kn);
mutex_exit(lock);
}
static int
filt_piperead(struct knote *kn, long hint)
{
struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe;
struct pipe *wpipe;
int rv;
	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_enter(rpipe->pipe_lock);
}
wpipe = rpipe->pipe_peer;
kn->kn_data = rpipe->pipe_buffer.cnt;
if ((rpipe->pipe_state & PIPE_EOF) ||
(wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
knote_set_eof(kn, 0);
rv = 1;
} else {
rv = kn->kn_data > 0;
}
	if ((hint & NOTE_SUBMIT) == 0) {
		mutex_exit(rpipe->pipe_lock);
}
return rv;
}
static int
filt_pipewrite(struct knote *kn, long hint)
{
struct pipe *rpipe = ((file_t *)kn->kn_obj)->f_pipe;
struct pipe *wpipe;
int rv;
if ((hint & NOTE_SUBMIT) == 0) {
mutex_enter(rpipe->pipe_lock);
}
wpipe = rpipe->pipe_peer;
if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
kn->kn_data = 0;
knote_set_eof(kn, 0);
rv = 1;
} else {
kn->kn_data = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
rv = kn->kn_data >= PIPE_BUF;
}
if ((hint & NOTE_SUBMIT) == 0) {
mutex_exit(rpipe->pipe_lock);
}
return rv;
}
static const struct filterops pipe_rfiltops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_pipedetach,
.f_event = filt_piperead,
};
static const struct filterops pipe_wfiltops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_pipedetach,
.f_event = filt_pipewrite,
};
static int
pipe_kqfilter(file_t *fp, struct knote *kn)
{
struct pipe *pipe;
kmutex_t *lock;
pipe = ((file_t *)kn->kn_obj)->f_pipe;
lock = pipe->pipe_lock;
mutex_enter(lock);
switch (kn->kn_filter) {
case EVFILT_READ:
kn->kn_fop = &pipe_rfiltops;
break;
case EVFILT_WRITE:
kn->kn_fop = &pipe_wfiltops;
pipe = pipe->pipe_peer;
if (pipe == NULL) {
/* Other end of pipe has been closed. */
mutex_exit(lock);
return (EBADF);
}
break;
default:
mutex_exit(lock);
return (EINVAL);
}
kn->kn_hook = pipe;
selrecord_knote(&pipe->pipe_sel, kn);
mutex_exit(lock);
return (0);
}
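/*
 * Illustrative sketch (userland): registering the read filter attached
 * above via kqueue(2)/kevent(2).  The kq descriptor is assumed to have
 * been created with kqueue(); the helper name is hypothetical.
 */
#if 0
#include <sys/event.h>
#include <err.h>

static void
wait_for_pipe_data(int kq, int rfd)
{
	struct kevent kev;

	EV_SET(&kev, rfd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, 0);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent: wait");
	/* kev.data now holds the byte count, as set by filt_piperead(). */
}
#endif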
/*
* Handle pipe sysctls.
*/
SYSCTL_SETUP(sysctl_kern_pipe_setup, "sysctl kern.pipe subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "pipe",
SYSCTL_DESCR("Pipe settings"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_PIPE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxbigpipes",
SYSCTL_DESCR("Maximum number of \"big\" pipes"),
NULL, 0, &maxbigpipes, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_MAXBIGPIPES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "nbigpipes",
SYSCTL_DESCR("Number of \"big\" pipes"),
NULL, 0, &nbigpipe, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_NBIGPIPES, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "kvasize",
SYSCTL_DESCR("Amount of kernel memory consumed by pipe "
"buffers"),
NULL, 0, &amountpipekva, 0,
CTL_KERN, KERN_PIPE, KERN_PIPE_KVASIZE, CTL_EOL);
}
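/*
 * Illustrative sketch (userland): reading one of the sysctl nodes
 * created above with sysctlbyname(3); the helper name is hypothetical.
 */
#if 0
#include <sys/sysctl.h>
#include <stdio.h>

static void
show_maxbigpipes(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("kern.pipe.maxbigpipes", &val, &len, NULL, 0) == 0)
		printf("kern.pipe.maxbigpipes = %d\n", val);
}
#endif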
/* $NetBSD: sys_socket.c,v 1.81 2023/04/22 13:53:02 riastradh Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)sys_socket.c 8.3 (Berkeley) 2/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sys_socket.c,v 1.81 2023/04/22 13:53:02 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
static int soo_fpathconf(struct file *, int, register_t *);
static int soo_posix_fadvise(struct file *, off_t, off_t, int);
const struct fileops socketops = {
.fo_name = "socket",
.fo_read = soo_read,
.fo_write = soo_write,
.fo_ioctl = soo_ioctl,
.fo_fcntl = fnullop_fcntl,
.fo_poll = soo_poll,
.fo_stat = soo_stat,
.fo_close = soo_close,
.fo_kqfilter = soo_kqfilter,
.fo_restart = soo_restart,
.fo_fpathconf = soo_fpathconf,
.fo_posix_fadvise = soo_posix_fadvise,
};
int (*ifioctl)(struct socket *, u_long, void *, struct lwp *) = (void *)eopnotsupp;
/* ARGSUSED */
int
soo_read(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct socket *so = fp->f_socket;
int error;
error = (*so->so_receive)(so, NULL, uio, NULL, NULL, NULL);
return error;
}
/* ARGSUSED */
int
soo_write(file_t *fp, off_t *offset, struct uio *uio, kauth_cred_t cred,
int flags)
{
struct socket *so = fp->f_socket;
int error;
error = (*so->so_send)(so, NULL, uio, NULL, NULL, 0, curlwp);
return error;
}
int
soo_ioctl(file_t *fp, u_long cmd, void *data)
{
struct socket *so = fp->f_socket;
int error = 0;
switch (cmd) {
case FIONBIO:
solock(so);
if (*(int *)data)
so->so_state |= SS_NBIO;
else
so->so_state &= ~SS_NBIO;
sounlock(so);
break;
case FIOASYNC:
solock(so);
if (*(int *)data) {
so->so_rcv.sb_flags |= SB_ASYNC;
so->so_snd.sb_flags |= SB_ASYNC;
} else {
so->so_rcv.sb_flags &= ~SB_ASYNC;
so->so_snd.sb_flags &= ~SB_ASYNC;
}
sounlock(so);
break;
case FIONREAD:
*(int *)data = so->so_rcv.sb_cc;
break;
case FIONWRITE:
*(int *)data = so->so_snd.sb_cc;
break;
case FIONSPACE:
/*
* See the comment around sbspace()'s definition
		 * in sys/socketvar.h, about the maximum values of the
		 * counts, to understand the following test. We detect overflow
* and return zero.
*/
		solock(so);
		if ((so->so_snd.sb_hiwat < so->so_snd.sb_cc) ||
		    (so->so_snd.sb_mbmax < so->so_snd.sb_mbcnt))
*(int *)data = 0;
else
*(int *)data = sbspace(&so->so_snd);
sounlock(so);
break;
case SIOCSPGRP:
case FIOSETOWN:
case TIOCSPGRP:
error = fsetown(&so->so_pgid, cmd, data);
break;
case SIOCGPGRP:
case FIOGETOWN:
case TIOCGPGRP:
error = fgetown(so->so_pgid, cmd, data);
break;
case SIOCATMARK:
*(int *)data = (so->so_state&SS_RCVATMARK) != 0;
break;
case SIOCPEELOFF:
solock(so);
error = do_sys_peeloff(so, data);
sounlock(so);
break;
default:
/*
* Interface/routing/protocol specific ioctls:
* interface and routing ioctls should have a
		 * different entry since a socket is unnecessary.
*/
if (IOCGROUP(cmd) == 'i')
/*
* KERNEL_LOCK will be held later if if_ioctl() of the
* interface isn't MP-safe.
*/
error = ifioctl(so, cmd, data, curlwp);
else {
KERNEL_LOCK(1, NULL);
error = (*so->so_proto->pr_usrreqs->pr_ioctl)(so,
cmd, data, NULL);
KERNEL_UNLOCK_ONE(NULL);
}
break;
}
return error;
}
int
soo_poll(file_t *fp, int events)
{
return sopoll(fp->f_socket, events);
}
int
soo_stat(file_t *fp, struct stat *ub)
{
struct socket *so = fp->f_socket;
int error;
memset(ub, 0, sizeof(*ub));
ub->st_mode = S_IFSOCK;
solock(so);
error = (*so->so_proto->pr_usrreqs->pr_stat)(so, ub);
sounlock(so);
return error;
}
/* ARGSUSED */
int
soo_close(file_t *fp)
{
int error = 0;
	if (fp->f_socket)
		error = soclose(fp->f_socket);
fp->f_socket = NULL;
return error;
}
void
soo_restart(file_t *fp)
{
sorestart(fp->f_socket);
}
static int
soo_fpathconf(struct file *fp, int name, register_t *retval)
{

	switch (name) {
case _PC_PIPE_BUF:
*retval = PIPE_BUF;
return 0;
default:
return EINVAL;
}
}
static int
soo_posix_fadvise(struct file *fp, off_t offset, off_t len, int advice)
{
return ESPIPE;
}
/* $NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $ */
/*-
* Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.151 2023/09/02 17:44:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_dtrace.h"
#include "opt_gprof.h"
#include "opt_multiprocessor.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/timex.h>
#include <sys/sched.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/cpu.h>
#include <sys/atomic.h>
#include <sys/rndsource.h>
#include <sys/heartbeat.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>
#include <sys/cpu.h>
cyclic_clock_func_t cyclic_clock_func[MAXCPUS];
#endif
static int sysctl_kern_clockrate(SYSCTLFN_PROTO);
/*
* Clock handling routines.
*
* This code is written to operate with two timers that run independently of
* each other. The main clock, running hz times per second, is used to keep
* track of real time. The second timer handles kernel and user profiling,
* and does resource use estimation. If the second timer is programmable,
* it is randomized to avoid aliasing between the two clocks. For example,
* the randomization prevents an adversary from always giving up the CPU
* just before its quantum expires. Otherwise, it would never accumulate
* CPU ticks. The mean frequency of the second timer is stathz.
*
* If no second timer exists, stathz will be zero; in this case we drive
* profiling and statistics off the main clock. This WILL NOT be accurate;
* do not do it unless absolutely necessary.
*
* The statistics clock may (or may not) be run at a higher rate while
* profiling. This profile clock runs at profhz. We require that profhz
* be an integral multiple of stathz.
*
* If the statistics clock is running fast, it must be divided by the ratio
* profhz/stathz for statistics. (For profiling, every tick counts.)
*/
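/*
 * Worked example (hypothetical values): with stathz = 128 and
 * profhz = 1024, psratio = profhz / stathz = 8.  While any process is
 * being profiled, psdiv is set to 8, the statistics clock is driven at
 * profhz, and statclock() counts only every 8th tick toward statistics,
 * keeping the effective statistics rate at stathz.
 */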
int stathz;
int profhz;
int profsrc;
int schedhz;
int profprocs;
static int hardclock_ticks;
static int hardscheddiv; /* hard => sched divider (used if schedhz == 0) */
static int psdiv; /* prof => stat divider */
int psratio; /* ratio: prof / stat */
struct clockrnd {
struct krndsource source;
unsigned needed;
};
static struct clockrnd hardclockrnd __aligned(COHERENCY_UNIT);
static struct clockrnd statclockrnd __aligned(COHERENCY_UNIT);
static void
clockrnd_get(size_t needed, void *cookie)
{
struct clockrnd *C = cookie;
/* Start sampling. */
atomic_store_relaxed(&C->needed, 2*NBBY*needed);
}
static void
clockrnd_sample(struct clockrnd *C)
{
struct cpu_info *ci = curcpu();
/* If there's nothing needed right now, stop here. */
if (__predict_true(atomic_load_relaxed(&C->needed) == 0))
return;
/*
* If we're not the primary core of a package, we're probably
* driven by the same clock as the primary core, so don't
* bother.
*/
if (ci != ci->ci_package1st)
return;
/* Take a sample and enter it into the pool. */
rnd_add_uint32(&C->source, 0);
/*
* On the primary CPU, count down. Using an atomic decrement
* here isn't really necessary -- on every platform we care
* about, stores to unsigned int are atomic, and the only other
* memory operation that could happen here is for another CPU
* to store a higher value for needed. But using an atomic
* decrement avoids giving the impression of data races, and is
* unlikely to hurt because only one CPU will ever be writing
* to the location.
*/
if (CPU_IS_PRIMARY(curcpu())) {
unsigned needed __diagused;
needed = atomic_dec_uint_nv(&C->needed);
KASSERT(needed != UINT_MAX);
}
}
static u_int get_intr_timecount(struct timecounter *);
static struct timecounter intr_timecounter = {
.tc_get_timecount = get_intr_timecount,
.tc_poll_pps = NULL,
.tc_counter_mask = ~0u,
.tc_frequency = 0,
.tc_name = "clockinterrupt",
/* quality - minimum implementation level for a clock */
.tc_quality = 0,
.tc_priv = NULL,
};
static u_int
get_intr_timecount(struct timecounter *tc)
{
return (u_int)getticks();
}
int
getticks(void)
{
return atomic_load_relaxed(&hardclock_ticks);
}
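/*
 * Illustrative sketch: comparing getticks() samples.  Computing the
 * difference first keeps counter wraparound harmless; the helper name
 * and its use of mstohz() here are assumptions for the example only.
 */
#if 0
static bool
ticks_elapsed(int then, int ms)
{

	return (getticks() - then) >= mstohz(ms);
}
#endif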
/*
* Initialize clock frequencies and start both clocks running.
*/
void
initclocks(void)
{
static struct sysctllog *clog;
int i;
/*
* Set divisors to 1 (normal case) and let the machine-specific
* code do its bit.
*/
psdiv = 1;
/*
* Call cpu_initclocks() before registering the default
* timecounter, in case it needs to adjust hz.
*/
const int old_hz = hz;
cpu_initclocks();
if (old_hz != hz) {
tick = 1000000 / hz;
tickadj = (240000 / (60 * hz)) ? (240000 / (60 * hz)) : 1;
}
/*
	 * Provide a minimum default time counter;
	 * it will only run at interrupt resolution.
*/
intr_timecounter.tc_frequency = hz;
tc_init(&intr_timecounter);
/*
* Compute profhz and stathz, fix profhz if needed.
*/
i = stathz ? stathz : hz;
if (profhz == 0)
profhz = i;
psratio = profhz / i;
if (schedhz == 0) {
/* 16Hz is best */
hardscheddiv = hz / 16;
if (hardscheddiv <= 0)
panic("hardscheddiv");
}
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "clockrate",
SYSCTL_DESCR("Kernel clock rates"),
sysctl_kern_clockrate, 0, NULL,
sizeof(struct clockinfo),
CTL_KERN, KERN_CLOCKRATE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_INT, "hardclock_ticks",
SYSCTL_DESCR("Number of hardclock ticks"),
NULL, 0, &hardclock_ticks, sizeof(hardclock_ticks),
CTL_KERN, KERN_HARDCLOCK_TICKS, CTL_EOL);
rndsource_setcb(&hardclockrnd.source, clockrnd_get, &hardclockrnd);
rnd_attach_source(&hardclockrnd.source, "hardclock", RND_TYPE_SKEW,
RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|RND_FLAG_HASCB);
if (stathz) {
rndsource_setcb(&statclockrnd.source, clockrnd_get,
&statclockrnd);
rnd_attach_source(&statclockrnd.source, "statclock",
RND_TYPE_SKEW,
(RND_FLAG_COLLECT_TIME|RND_FLAG_ESTIMATE_TIME|
RND_FLAG_HASCB));
}
}
/*
* The real-time timer, interrupting hz times per second.
*/
void
hardclock(struct clockframe *frame)
{
struct lwp *l;
struct cpu_info *ci;
clockrnd_sample(&hardclockrnd);
ci = curcpu();
l = ci->ci_onproc;
ptimer_tick(l, CLKF_USERMODE(frame));
/*
* If no separate statistics clock is available, run it from here.
*/
if (stathz == 0)
statclock(frame);
/*
* If no separate schedclock is provided, call it here
* at about 16 Hz.
*/
if (schedhz == 0) {
if ((int)(--ci->ci_schedstate.spc_schedticks) <= 0) {
schedclock(l);
ci->ci_schedstate.spc_schedticks = hardscheddiv;
}
}
if ((--ci->ci_schedstate.spc_ticks) <= 0)
sched_tick(ci);
if (CPU_IS_PRIMARY(ci)) {
atomic_store_relaxed(&hardclock_ticks,
atomic_load_relaxed(&hardclock_ticks) + 1);
tc_ticktock();
}
/*
* Make sure the CPUs and timecounter are making progress.
*/
heartbeat();
/*
* Update real-time timeout queue.
*/
callout_hardclock();
}
/*
* Start profiling on a process.
*
* Kernel profiling passes proc0 which never exits and hence
* keeps the profile clock running constantly.
*/
void
startprofclock(struct proc *p)
{

	KASSERT(mutex_owned(&p->p_stmutex));

	if ((p->p_stflag & PST_PROFIL) == 0) {
p->p_stflag |= PST_PROFIL;
/*
* This is only necessary if using the clock as the
* profiling source.
*/
		if (++profprocs == 1 && stathz != 0)
			psdiv = psratio;
}
}
/*
* Stop profiling on a process.
*/
void
stopprofclock(struct proc *p)
{

	KASSERT(mutex_owned(&p->p_stmutex));

	if (p->p_stflag & PST_PROFIL) {
p->p_stflag &= ~PST_PROFIL;
/*
* This is only necessary if using the clock as the
* profiling source.
*/
		if (--profprocs == 0 && stathz != 0)
			psdiv = 1;
}
}
void
schedclock(struct lwp *l)
{
if ((l->l_flag & LW_IDLE) != 0)
return;
sched_schedclock(l);
}
/*
* Statistics clock. Grab profile sample, and if divider reaches 0,
* do process and kernel statistics.
*/
void
statclock(struct clockframe *frame)
{
#ifdef GPROF
struct gmonparam *g;
intptr_t i;
#endif
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
struct proc *p;
struct lwp *l;
if (stathz)
clockrnd_sample(&statclockrnd);
/*
* Notice changes in divisor frequency, and adjust clock
* frequency accordingly.
*/
if (spc->spc_psdiv != psdiv) {
spc->spc_psdiv = psdiv;
spc->spc_pscnt = psdiv;
if (psdiv == 1) {
setstatclockrate(stathz);
} else {
setstatclockrate(profhz);
}
}
l = ci->ci_onproc;
if ((l->l_flag & LW_IDLE) != 0) {
/*
* don't account idle lwps as swapper.
*/
p = NULL;
} else {
p = l->l_proc;
mutex_spin_enter(&p->p_stmutex);
}
if (CLKF_USERMODE(frame)) {
KASSERT(p != NULL);
if ((p->p_stflag & PST_PROFIL) && profsrc == PROFSRC_CLOCK)
addupc_intr(l, CLKF_PC(frame));
if (--spc->spc_pscnt > 0) {
mutex_spin_exit(&p->p_stmutex);
return;
}
/*
* Came from user mode; CPU was in user state.
* If this process is being profiled record the tick.
*/
p->p_uticks++;
if (p->p_nice > NZERO)
spc->spc_cp_time[CP_NICE]++;
else
spc->spc_cp_time[CP_USER]++;
} else {
#ifdef GPROF
/*
* Kernel statistics are just like addupc_intr, only easier.
*/
#if defined(MULTIPROCESSOR) && !defined(_RUMPKERNEL)
g = curcpu()->ci_gmon;
if (g != NULL &&
profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#else
g = &_gmonparam;
if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
#endif
i = CLKF_PC(frame) - g->lowpc;
if (i < g->textsize) {
i /= HISTFRACTION * sizeof(*g->kcount);
g->kcount[i]++;
}
}
#endif
#ifdef LWP_PC
if (p != NULL && profsrc == PROFSRC_CLOCK &&
(p->p_stflag & PST_PROFIL)) {
addupc_intr(l, LWP_PC(l));
}
#endif
if (--spc->spc_pscnt > 0) {
if (p != NULL)
mutex_spin_exit(&p->p_stmutex);
return;
}
/*
* Came from kernel mode, so we were:
* - handling an interrupt,
* - doing syscall or trap work on behalf of the current
* user process, or
* - spinning in the idle loop.
* Whichever it is, charge the time as appropriate.
* Note that we charge interrupts to the current process,
* regardless of whether they are ``for'' that process,
* so that we know how much of its real time was spent
* in ``non-process'' (i.e., interrupt) work.
*/
if (CLKF_INTR(frame) || (curlwp->l_pflag & LP_INTR) != 0) {
if (p != NULL) {
p->p_iticks++;
}
spc->spc_cp_time[CP_INTR]++;
} else if (p != NULL) {
p->p_sticks++;
spc->spc_cp_time[CP_SYS]++;
} else {
spc->spc_cp_time[CP_IDLE]++;
}
}
spc->spc_pscnt = psdiv;
if (p != NULL) {
atomic_inc_uint(&l->l_cpticks);
mutex_spin_exit(&p->p_stmutex);
}
#ifdef KDTRACE_HOOKS
cyclic_clock_func_t func = cyclic_clock_func[cpu_index(ci)];
if (func) {
(*func)((struct clockframe *)frame);
}
#endif
}
/*
* sysctl helper routine for kern.clockrate. Assembles a struct on
* the fly to be returned to the caller.
*/
static int
sysctl_kern_clockrate(SYSCTLFN_ARGS)
{
struct clockinfo clkinfo;
struct sysctlnode node;
clkinfo.tick = tick;
clkinfo.tickadj = tickadj;
clkinfo.hz = hz;
clkinfo.profhz = profhz;
clkinfo.stathz = stathz ? stathz : hz;
node = *rnode;
node.sysctl_data = &clkinfo;
return (sysctl_lookup(SYSCTLFN_CALL(&node)));
}
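/*
 * Illustrative sketch (userland): fetching the structure assembled
 * above via sysctl(3); the helper name is hypothetical.
 */
#if 0
#include <sys/sysctl.h>
#include <sys/time.h>
#include <stdio.h>

static void
show_clockrate(void)
{
	struct clockinfo ci;
	size_t len = sizeof(ci);
	int mib[2] = { CTL_KERN, KERN_CLOCKRATE };

	if (sysctl(mib, 2, &ci, &len, NULL, 0) == 0)
		printf("hz=%d stathz=%d profhz=%d\n",
		    ci.hz, ci.stathz, ci.profhz);
}
#endif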
/* $NetBSD: scope6.c,v 1.23 2020/06/16 17:12:18 maxv Exp $ */
/* $KAME$ */
/*
* Copyright (C) 2000 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: scope6.c,v 1.23 2020/06/16 17:12:18 maxv Exp $");
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/queue.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>
#include <netinet6/scope6_var.h>
#ifdef ENABLE_DEFAULT_SCOPE
int ip6_use_defzone = 1;
#else
int ip6_use_defzone = 0;
#endif
static struct scope6_id sid_default;
#define SID(ifp) \
((ifp)->if_afdata[AF_INET6] == NULL ? NULL : \
((struct in6_ifextra *)(ifp)->if_afdata[AF_INET6])->scope6_id)
void
scope6_init(void)
{
memset(&sid_default, 0, sizeof(sid_default));
}
struct scope6_id *
scope6_ifattach(struct ifnet *ifp)
{
struct scope6_id *sid;
sid = malloc(sizeof(*sid), M_IFADDR, M_WAITOK | M_ZERO);
/*
* XXX: IPV6_ADDR_SCOPE_xxx macros are not standard.
* Should we rather hardcode here?
*/
sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL] = ifp->if_index;
sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL] = ifp->if_index;
#ifdef MULTI_SCOPE
/* by default, we don't care about scope boundary for these scopes. */
sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL] = 1;
sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL] = 1;
#endif
return sid;
}
void
scope6_ifdetach(struct scope6_id *sid)
{
free(sid, M_IFADDR);
}
/*
* Get a scope of the address. Interface-local, link-local, site-local
* or global.
*/
int
in6_addrscope(const struct in6_addr *addr)
{
int scope;
	if (addr->s6_addr[0] == 0xfe) {
		scope = addr->s6_addr[1] & 0xc0;
		switch (scope) {
case 0x80:
return IPV6_ADDR_SCOPE_LINKLOCAL;
case 0xc0:
return IPV6_ADDR_SCOPE_SITELOCAL;
default:
return IPV6_ADDR_SCOPE_GLOBAL; /* just in case */
}
}
if (addr->s6_addr[0] == 0xff) {
scope = addr->s6_addr[1] & 0x0f;
/*
		 * Because of other scopes, such as reserved ones,
		 * simply returning the scope value doesn't work.
*/
switch (scope) {
case IPV6_ADDR_SCOPE_INTFACELOCAL:
return IPV6_ADDR_SCOPE_INTFACELOCAL;
case IPV6_ADDR_SCOPE_LINKLOCAL:
return IPV6_ADDR_SCOPE_LINKLOCAL;
case IPV6_ADDR_SCOPE_SITELOCAL:
return IPV6_ADDR_SCOPE_SITELOCAL;
default:
return IPV6_ADDR_SCOPE_GLOBAL;
}
}
	if (memcmp(&in6addr_loopback, addr, sizeof(*addr) - 1) == 0) {
		if (addr->s6_addr[15] == 1) /* loopback */
return IPV6_ADDR_SCOPE_LINKLOCAL;
if (addr->s6_addr[15] == 0) {
/*
* Regard the unspecified addresses as global,
* since it has no ambiguity.
* XXX: not sure if it's correct...
*/
return IPV6_ADDR_SCOPE_GLOBAL;
}
}
return IPV6_ADDR_SCOPE_GLOBAL;
}
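/*
 * Illustrative examples of how a few well-known addresses classify under
 * in6_addrscope(); the string forms are for the comment only, the
 * function itself operates on struct in6_addr.
 *
 *	fe80::1		-> IPV6_ADDR_SCOPE_LINKLOCAL	(fe80::/10)
 *	fec0::1		-> IPV6_ADDR_SCOPE_SITELOCAL	(deprecated fec0::/10)
 *	ff02::1		-> IPV6_ADDR_SCOPE_LINKLOCAL	(multicast, scope 0x2)
 *	::1		-> IPV6_ADDR_SCOPE_LINKLOCAL	(loopback)
 *	2001:db8::1	-> IPV6_ADDR_SCOPE_GLOBAL
 */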
uint32_t
scope6_addr2default(const struct in6_addr *addr)
{
uint32_t id;
/*
* special case: The loopback address should be considered as
* link-local, but there's no ambiguity in the syntax.
*/
if (IN6_IS_ADDR_LOOPBACK(addr))
return 0;
/*
* XXX: 32-bit read is atomic on all our platforms, is it OK
* not to lock here?
*/
id = sid_default.s6id_list[in6_addrscope(addr)];
return id;
}
/*
* Validate the specified scope zone ID in the sin6_scope_id field. If the ID
* is unspecified (=0), needs to be specified, and the default zone ID can be
* used, the default value will be used.
 * This routine then generates the kernel-internal form: if the scope of
 * the address is interface-local or link-local, embed the interface
 * index in the address.
*/
int
sa6_embedscope(struct sockaddr_in6 *sin6, int defaultok)
{
struct ifnet *ifp;
uint32_t zoneid;
	if ((zoneid = sin6->sin6_scope_id) == 0 && defaultok)
		zoneid = scope6_addr2default(&sin6->sin6_addr);

	if (zoneid != 0 &&
	    (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
	    IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr))) {
int s;
/*
* At this moment, we only check interface-local and
* link-local scope IDs, and use interface indices as the
* zone IDs assuming a one-to-one mapping between interfaces
* and links.
*/
s = pserialize_read_enter();
ifp = if_byindex(zoneid);
if (ifp == NULL) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
/* XXX assignment to 16bit from 32bit variable */
sin6->sin6_addr.s6_addr16[1] = htons(zoneid & 0xffff);
sin6->sin6_scope_id = 0;
}
return 0;
}
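/*
 * Illustrative sketch (not compiled): embedding the zone for a link-local
 * destination with sa6_embedscope() above.  The interface index 2 is a
 * hypothetical value for this sketch.
 */
#if 0
static void
sa6_embedscope_example(void)
{
	struct sockaddr_in6 sin6;

	memset(&sin6, 0, sizeof(sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_len = sizeof(sin6);
	sin6.sin6_addr.s6_addr[0] = 0xfe;	/* fe80::1 */
	sin6.sin6_addr.s6_addr[1] = 0x80;
	sin6.sin6_addr.s6_addr[15] = 1;
	sin6.sin6_scope_id = 2;

	if (sa6_embedscope(&sin6, ip6_use_defzone) == 0) {
		/*
		 * sin6_scope_id is now 0 and the index 2 is stored in
		 * sin6.sin6_addr.s6_addr16[1] (network byte order).
		 */
	}
}
#endif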
struct sockaddr *
sockaddr_in6_externalize(struct sockaddr *dst, socklen_t socklen,
const struct sockaddr *src)
{
struct sockaddr_in6 *sin6;
sin6 = satosin6(sockaddr_copy(dst, socklen, src));
if (sin6 == NULL || sa6_recoverscope(sin6) != 0)
return NULL;
return dst;
}
/*
* generate standard sockaddr_in6 from embedded form.
*/
int
sa6_recoverscope(struct sockaddr_in6 *sin6)
{
uint32_t zoneid;
char ip6buf[INET6_ADDRSTRLEN];
if (sin6->sin6_scope_id != 0) {
log(LOG_NOTICE,
"%s: assumption failure (non 0 ID): %s%%%d\n", __func__,
IN6_PRINT(ip6buf, &sin6->sin6_addr), sin6->sin6_scope_id);
/* XXX: proceed anyway... */
}
if (IN6_IS_SCOPE_LINKLOCAL(&sin6->sin6_addr) ||
IN6_IS_ADDR_MC_INTFACELOCAL(&sin6->sin6_addr)) {
/*
* KAME assumption: link id == interface id
*/
zoneid = ntohs(sin6->sin6_addr.s6_addr16[1]);
if (zoneid) {
int s = pserialize_read_enter();
if (!if_byindex(zoneid)) {
pserialize_read_exit(s);
return ENXIO;
}
pserialize_read_exit(s);
sin6->sin6_addr.s6_addr16[1] = 0;
sin6->sin6_scope_id = zoneid;
}
}
return 0;
}
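/*
 * Illustrative sketch (not compiled): sa6_recoverscope() above undoes the
 * embedding performed by sa6_embedscope() before an address is handed back
 * to userland.
 */
#if 0
static int
sa6_recoverscope_example(struct sockaddr_in6 *sin6)
{
	int error;

	/* Kernel-internal form: zone embedded in s6_addr16[1], scope_id 0. */
	error = sa6_recoverscope(sin6);
	/*
	 * On success the embedded index has been moved back into
	 * sin6_scope_id and s6_addr16[1] has been cleared.
	 */
	return error;
}
#endif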
int
in6_setzoneid(struct in6_addr *in6, uint32_t zoneid)
{
if (IN6_IS_SCOPE_EMBEDDABLE(in6))
in6->s6_addr16[1] = htons(zoneid & 0xffff); /* XXX */
return 0;
}
/*
* Determine the appropriate scope zone ID for in6 and ifp. If ret_id is
* non NULL, it is set to the zone ID. If the zone ID needs to be embedded
* in the in6_addr structure, in6 will be modified.
*/
int
in6_setscope(struct in6_addr *in6, const struct ifnet *ifp, uint32_t *ret_id)
{
int scope;
uint32_t zoneid = 0;
const struct scope6_id *sid = SID(ifp);
if (sid == NULL) {
log(LOG_NOTICE, "%s: no scope id for %s\n", __func__,
if_name(ifp));
return EINVAL;
}
/*
* special case: the loopback address can only belong to a loopback
* interface.
*/
if (IN6_IS_ADDR_LOOPBACK(in6)) {
if (!(ifp->if_flags & IFF_LOOPBACK)) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_NOTICE, "%s: can't set scope for not loopback "
"interface %s and loopback address %s\n",
__func__, if_name(ifp), IN6_PRINT(ip6buf, in6));
return EINVAL;
} else {
if (ret_id != NULL)
*ret_id = 0; /* there's no ambiguity */
return 0;
}
}
scope = in6_addrscope(in6);
switch (scope) {
case IPV6_ADDR_SCOPE_INTFACELOCAL: /* should be interface index */
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_INTFACELOCAL];
break;
case IPV6_ADDR_SCOPE_LINKLOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_LINKLOCAL];
break;
case IPV6_ADDR_SCOPE_SITELOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_SITELOCAL];
break;
case IPV6_ADDR_SCOPE_ORGLOCAL:
zoneid = sid->s6id_list[IPV6_ADDR_SCOPE_ORGLOCAL];
break;
default:
zoneid = 0; /* XXX: treat as global. */
break;
}
if (ret_id != NULL)
*ret_id = zoneid;
return in6_setzoneid(in6, zoneid);
}
const char *
in6_getscopename(const struct in6_addr *addr)
{
switch (in6_addrscope(addr)) {
case IPV6_ADDR_SCOPE_INTFACELOCAL:
return "interface";
#if IPV6_ADDR_SCOPE_INTFACELOCAL != IPV6_ADDR_SCOPE_NODELOCAL
case IPV6_ADDR_SCOPE_NODELOCAL:
return "node";
#endif
case IPV6_ADDR_SCOPE_LINKLOCAL:
return "link";
case IPV6_ADDR_SCOPE_SITELOCAL:
return "site";
case IPV6_ADDR_SCOPE_ORGLOCAL:
return "organization";
case IPV6_ADDR_SCOPE_GLOBAL:
return "global";
default:
return "unknown";
}
}
/*
* Just clear the embedded scope identifier. Return 0 if the original address
* is intact; return non 0 if the address is modified.
*/
int
in6_clearscope(struct in6_addr *in6)
{
int modified = 0;
if (IN6_IS_SCOPE_LINKLOCAL(in6) || IN6_IS_ADDR_MC_INTFACELOCAL(in6)) {
if (in6->s6_addr16[1] != 0)
modified = 1;
in6->s6_addr16[1] = 0;
}
return modified;
}
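/*
 * Illustrative sketch (not compiled): in6_clearscope() above reports
 * whether it actually removed an embedded zone, which a caller can use to
 * detect kernel-internal addresses about to leak into the wire format.
 */
#if 0
static void
in6_clearscope_example(struct in6_addr *in6)
{
	if (in6_clearscope(in6) != 0) {
		/*
		 * The address carried an embedded zone ID; it has been
		 * stripped and the address is now in wire format.
		 */
	}
}
#endif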
/* $NetBSD: uipc_usrreq.c,v 1.203 2022/05/28 22:08:46 andvar Exp $ */
/*-
* Copyright (c) 1998, 2000, 2004, 2008, 2009, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
*/
/*
* Copyright (c) 1997 Christopher G. Demetriou. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uipc_usrreq.c,v 1.203 2022/05/28 22:08:46 andvar Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/filedesc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/unpcb.h>
#include <sys/un.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/mbuf.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/compat_stub.h>
#include <compat/sys/socket.h>
#include <compat/net/route_70.h>
/*
* Unix communications domain.
*
* TODO:
* RDM
* rethink name space problems
* need a proper out-of-band
*
* Notes on locking:
*
* The generic rules noted in uipc_socket2.c apply. In addition:
*
* o We have a global lock, uipc_lock.
*
* o All datagram sockets are locked by uipc_lock.
*
* o For stream socketpairs, the two endpoints are created sharing the same
* independent lock. Sockets presented to PRU_CONNECT2 must already have
* matching locks.
*
* o Stream sockets created via socket() start life with their own
* independent lock.
*
* o Stream connections to a named endpoint are slightly more complicated.
* Sockets that have called listen() have their lock pointer mutated to
* the global uipc_lock. When establishing a connection, the connecting
* socket also has its lock mutated to uipc_lock, which matches the head
* (listening socket). We create a new socket for accept() to return, and
* that also shares the head's lock. Until the connection is completely
* done on both ends, all three sockets are locked by uipc_lock. Once the
* connection is complete, the association with the head's lock is broken.
* The connecting socket and the socket returned from accept() have their
* lock pointers mutated away from uipc_lock, and back to the connecting
* socket's original, independent lock. The head continues to be locked
* by uipc_lock.
*
* o If uipc_lock is determined to be a significant source of contention,
* it could easily be hashed out. It is difficult to simply make it an
* independent lock because of visibility / garbage collection issues:
* if a socket has been associated with a lock at any point, that lock
* must remain valid until the socket is no longer visible in the system.
* The lock must not be freed or otherwise destroyed until any sockets
* that had referenced it have also been destroyed.
*/
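/*
 * Illustrative sketch (not compiled) of the invariant described above:
 * once a stream connection is fully established, both endpoints share a
 * single lock, so locking either side locks the pair.  "so" and "so2"
 * stand for the two endpoints of such a connection.
 */
#if 0
static void
uipc_lock_example(struct socket *so, struct socket *so2)
{
	solock(so);
	KASSERT(solocked2(so, so2));	/* the same lock covers both ends */
	sounlock(so);
}
#endif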
const struct sockaddr_un sun_noname = {
.sun_len = offsetof(struct sockaddr_un, sun_path),
.sun_family = AF_LOCAL,
};
ino_t unp_ino; /* prototype for fake inode numbers */
static struct mbuf * unp_addsockcred(struct lwp *, struct mbuf *);
static void unp_discard_later(file_t *);
static void unp_discard_now(file_t *);
static void unp_disconnect1(struct unpcb *);
static bool unp_drop(struct unpcb *, int);
static int unp_internalize(struct mbuf **);
static void unp_mark(file_t *);
static void unp_scan(struct mbuf *, void (*)(file_t *), int);
static void unp_shutdown1(struct unpcb *);
static void unp_thread(void *);
static void unp_thread_kick(void);
static kmutex_t *uipc_lock;
static kcondvar_t unp_thread_cv;
static lwp_t *unp_thread_lwp;
static SLIST_HEAD(,file) unp_thread_discard;
static int unp_defer;
static struct sysctllog *usrreq_sysctllog;
static void unp_sysctl_create(void);
/* Compat interface */
struct mbuf * stub_compat_70_unp_addsockcred(lwp_t *, struct mbuf *);
struct mbuf * stub_compat_70_unp_addsockcred(struct lwp *lwp,
struct mbuf *control)
{
/* just copy our initial argument */
return control;
}
bool compat70_ocreds_valid = false;
/*
* Initialize Unix protocols.
*/
void
uipc_init(void)
{
int error;
unp_sysctl_create();
uipc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
cv_init(&unp_thread_cv, "unpgc");
error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, unp_thread,
NULL, &unp_thread_lwp, "unpgc");
if (error != 0)
panic("uipc_init %d", error);
}
static void
unp_connid(struct lwp *l, struct unpcb *unp, int flags)
{
unp->unp_connid.unp_pid = l->l_proc->p_pid;
unp->unp_connid.unp_euid = kauth_cred_geteuid(l->l_cred);
unp->unp_connid.unp_egid = kauth_cred_getegid(l->l_cred);
unp->unp_flags |= flags;
}
/*
* A connection succeeded: disassociate both endpoints from the head's
* lock, and make them share their own lock. There is a race here: for
* a very brief time one endpoint will be locked by a different lock
* than the other end. However, since the current thread holds the old
* lock (the listening socket's lock, the head) access can still only be
* made to one side of the connection.
*/
static void
unp_setpeerlocks(struct socket *so, struct socket *so2)
{
struct unpcb *unp;
kmutex_t *lock;
KASSERT(solocked2(so, so2));
/*
* Bail out if either end of the socket is not yet fully
* connected or accepted. We only break the lock association
* with the head when the pair of sockets stand completely
* on their own.
*/
KASSERT(so->so_head == NULL);
if (so2->so_head != NULL)
return;
/*
* Drop references to old lock. A third reference (from the
* queue head) must be held as we still hold its lock. Bonus:
* we don't need to worry about garbage collecting the lock.
*/
lock = so->so_lock;
KASSERT(lock == uipc_lock);
mutex_obj_free(lock);
mutex_obj_free(lock);
/*
* Grab stream lock from the initiator and share between the two
* endpoints. Issue memory barrier to ensure all modifications
* become globally visible before the lock change. so2 is
* assumed not to have a stream lock, because it was created
* purely for the server side to accept this connection and
* started out life using the domain-wide lock.
*/
unp = sotounpcb(so);
KASSERT(unp->unp_streamlock != NULL);
KASSERT(sotounpcb(so2)->unp_streamlock == NULL);
lock = unp->unp_streamlock;
unp->unp_streamlock = NULL;
mutex_obj_hold(lock);
/*
* Ensure lock is initialized before publishing it with
* solockreset. Pairs with atomic_load_consume in solock and
* various loops to reacquire lock after wakeup.
*/
membar_release();
/*
* possible race if lock is not held - see comment in
* uipc_usrreq(PRU_ACCEPT).
*/
KASSERT(mutex_owned(lock));
solockreset(so, lock);
solockreset(so2, lock);
}
/*
* Reset a socket's lock back to the domain-wide lock.
*/
static void
unp_resetlock(struct socket *so)
{
kmutex_t *olock, *nlock;
struct unpcb *unp;
KASSERT(solocked(so));
olock = so->so_lock;
nlock = uipc_lock;
if (olock == nlock)
return;
unp = sotounpcb(so);
KASSERT(unp->unp_streamlock == NULL);
unp->unp_streamlock = olock;
mutex_obj_hold(nlock);
mutex_enter(nlock);
solockreset(so, nlock);
mutex_exit(olock);
}
static void
unp_free(struct unpcb *unp)
{
if (unp->unp_addr)
free(unp->unp_addr, M_SONAME);
if (unp->unp_streamlock != NULL)
mutex_obj_free(unp->unp_streamlock);
kmem_free(unp, sizeof(*unp));
}
static int
unp_output(struct mbuf *m, struct mbuf *control, struct unpcb *unp)
{
struct socket *so2;
const struct sockaddr_un *sun;
/* XXX: server side closed the socket */
if (unp->unp_conn == NULL)
return ECONNREFUSED;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked(so2));
if (unp->unp_addr)
sun = unp->unp_addr;
else
sun = &sun_noname;
if (unp->unp_conn->unp_flags & UNP_WANTCRED)
control = unp_addsockcred(curlwp, control);
if (unp->unp_conn->unp_flags & UNP_OWANTCRED)
MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
stub_compat_70_unp_addsockcred(curlwp, control), control);
if (sbappendaddr(&so2->so_rcv, (const struct sockaddr *)sun, m,
control) == 0) {
unp_dispose(control);
m_freem(control);
m_freem(m);
/* Don't call soroverflow because we're returning this
* error directly to the sender. */
so2->so_rcv.sb_overflowed++;
return ENOBUFS;
} else {
sorwakeup(so2);
return 0;
}
}
static void
unp_setaddr(struct socket *so, struct sockaddr *nam, bool peeraddr)
{
const struct sockaddr_un *sun = NULL;
struct unpcb *unp;
KASSERT(solocked(so));
unp = sotounpcb(so);
if (peeraddr) {
if (unp->unp_conn && unp->unp_conn->unp_addr)
sun = unp->unp_conn->unp_addr;
} else {
if (unp->unp_addr)
sun = unp->unp_addr;
}
if (sun == NULL)
sun = &sun_noname;
memcpy(nam, sun, sun->sun_len);
}
static int
unp_rcvd(struct socket *so, int flags, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
struct socket *so2;
u_int newhiwat;
KASSERT(solocked(so));
KASSERT(unp != NULL);
switch (so->so_type) {
case SOCK_DGRAM:
panic("uipc 1");
/*NOTREACHED*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
#define rcv (&so->so_rcv)
#define snd (&so2->so_snd)
if (unp->unp_conn == 0)
break;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked2(so, so2));
/*
* Adjust backpressure on sender
* and wakeup any waiting to write.
*/
snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt;
unp->unp_mbcnt = rcv->sb_mbcnt;
newhiwat = snd->sb_hiwat + unp->unp_cc - rcv->sb_cc;
(void)chgsbsize(so2->so_uidinfo,
&snd->sb_hiwat, newhiwat, RLIM_INFINITY);
unp->unp_cc = rcv->sb_cc;
sowwakeup(so2);
#undef snd
#undef rcv
break;
default:
panic("uipc 2");
}
return 0;
}
static int
unp_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
unp_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
int error = 0;
u_int newhiwat;
struct socket *so2;
KASSERT(solocked(so));
KASSERT(unp != NULL);
KASSERT(m != NULL);
/*
* Note: unp_internalize() rejects any control message
* other than SCM_RIGHTS, and only allows one. This
* has the side-effect of preventing a caller from
* forging SCM_CREDS.
*/
if (control) {
sounlock(so);
error = unp_internalize(&control);
solock(so);
if (error != 0) {
m_freem(control);
m_freem(m);
return error;
}
}
switch (so->so_type) {
case SOCK_DGRAM: {
KASSERT(so->so_lock == uipc_lock);
if (nam) {
if ((so->so_state & SS_ISCONNECTED) != 0)
error = EISCONN;
else {
/*
* Note: once connected, the
* socket's lock must not be
* dropped until we have sent
* the message and disconnected.
* This is necessary to prevent
* intervening control ops, like
* another connection.
*/
error = unp_connect(so, nam, l);
}
} else {
if ((so->so_state & SS_ISCONNECTED) == 0)
error = ENOTCONN;
}
if (error) {
unp_dispose(control);
m_freem(control);
m_freem(m);
return error;
}
error = unp_output(m, control, unp);
if (nam)
unp_disconnect1(unp);
break;
}
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
#define rcv (&so2->so_rcv)
#define snd (&so->so_snd)
if (unp->unp_conn == NULL) {
error = ENOTCONN;
break;
}
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked2(so, so2));
if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
/*
* Credentials are passed only once on
* SOCK_STREAM and SOCK_SEQPACKET.
*/
unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
control = unp_addsockcred(l, control);
}
if (unp->unp_conn->unp_flags & UNP_OWANTCRED) {
/*
* Credentials are passed only once on
* SOCK_STREAM and SOCK_SEQPACKET.
*/
unp->unp_conn->unp_flags &= ~UNP_OWANTCRED;
MODULE_HOOK_CALL(uipc_unp_70_hook, (curlwp, control),
stub_compat_70_unp_addsockcred(curlwp, control),
control);
}
/*
* Send to paired receive port, and then reduce
* send buffer hiwater marks to maintain backpressure.
* Wake up readers.
*/
if (control) {
if (sbappendcontrol(rcv, m, control) != 0)
control = NULL;
} else {
switch(so->so_type) {
case SOCK_SEQPACKET:
sbappendrecord(rcv, m);
break;
case SOCK_STREAM:
sbappend(rcv, m);
break;
default:
panic("uipc_usrreq");
break;
}
}
snd->sb_mbmax -=
rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
newhiwat = snd->sb_hiwat -
(rcv->sb_cc - unp->unp_conn->unp_cc);
(void)chgsbsize(so->so_uidinfo,
&snd->sb_hiwat, newhiwat, RLIM_INFINITY);
unp->unp_conn->unp_cc = rcv->sb_cc;
sorwakeup(so2);
#undef snd
#undef rcv
if (control != NULL) {
unp_dispose(control);
m_freem(control);
}
break;
default:
panic("uipc 4");
}
return error;
}
static int
unp_sendoob(struct socket *so, struct mbuf *m, struct mbuf * control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
/*
* Unix domain socket option processing.
*/
int
uipc_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
struct unpcb *unp = sotounpcb(so);
int optval = 0, error = 0;
KASSERT(solocked(so));
if (sopt->sopt_level != SOL_LOCAL) {
error = ENOPROTOOPT;
} else switch (op) {
case PRCO_SETOPT:
switch (sopt->sopt_name) {
case LOCAL_OCREDS:
if (!compat70_ocreds_valid) {
error = ENOPROTOOPT;
break;
}
/* FALLTHROUGH */
case LOCAL_CREDS:
case LOCAL_CONNWAIT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch (sopt->sopt_name) {
#define OPTSET(bit) \
if (optval) \
unp->unp_flags |= (bit); \
else \
unp->unp_flags &= ~(bit);
case LOCAL_CREDS:
OPTSET(UNP_WANTCRED);
break;
case LOCAL_CONNWAIT:
OPTSET(UNP_CONNWAIT);
break;
case LOCAL_OCREDS:
OPTSET(UNP_OWANTCRED);
break;
}
break;
#undef OPTSET
default:
error = ENOPROTOOPT;
break;
}
break;
case PRCO_GETOPT:
sounlock(so);
switch (sopt->sopt_name) {
case LOCAL_PEEREID:
if (unp->unp_flags & UNP_EIDSVALID) {
error = sockopt_set(sopt, &unp->unp_connid,
sizeof(unp->unp_connid));
} else {
error = EINVAL;
}
break;
case LOCAL_CREDS:
#define OPTBIT(bit) (unp->unp_flags & (bit) ? 1 : 0)
optval = OPTBIT(UNP_WANTCRED);
error = sockopt_setint(sopt, optval);
break;
case LOCAL_OCREDS:
if (compat70_ocreds_valid) {
optval = OPTBIT(UNP_OWANTCRED);
error = sockopt_setint(sopt, optval);
break;
}
#undef OPTBIT
/* FALLTHROUGH */
default:
error = ENOPROTOOPT;
break;
}
solock(so);
break;
}
return (error);
}
/*
* Both send and receive buffers are allocated PIPSIZ bytes of buffering
* for stream sockets, although the total for sender and receiver is
* actually only PIPSIZ.
* Datagram sockets really use the sendspace as the maximum datagram size,
* and don't really want to reserve the sendspace. Their recvspace should
* be large enough for at least one max-size datagram plus address.
*/
#ifndef PIPSIZ
#define PIPSIZ 8192
#endif
u_long unpst_sendspace = PIPSIZ;
u_long unpst_recvspace = PIPSIZ;
u_long unpdg_sendspace = 2*1024; /* really max datagram size */
u_long unpdg_recvspace = 16*1024;
u_int unp_rights; /* files in flight */
u_int unp_rights_ratio = 2; /* limit, fraction of maxfiles */
static int
unp_attach(struct socket *so, int proto)
{
struct unpcb *unp = sotounpcb(so);
u_long sndspc, rcvspc;
int error;
KASSERT(unp == NULL);
switch (so->so_type) {
case SOCK_SEQPACKET:
/* FALLTHROUGH */
case SOCK_STREAM:
if (so->so_lock == NULL) {
so->so_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
solock(so);
}
sndspc = unpst_sendspace;
rcvspc = unpst_recvspace;
break;
case SOCK_DGRAM:
if (so->so_lock == NULL) {
mutex_obj_hold(uipc_lock);
so->so_lock = uipc_lock;
solock(so);
}
sndspc = unpdg_sendspace;
rcvspc = unpdg_recvspace;
break;
default:
panic("unp_attach");
}
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
error = soreserve(so, sndspc, rcvspc);
if (error) {
return error;
}
}
unp = kmem_zalloc(sizeof(*unp), KM_SLEEP);
nanotime(&unp->unp_ctime);
unp->unp_socket = so;
so->so_pcb = unp;
KASSERT(solocked(so));
return 0;
}
static void
unp_detach(struct socket *so)
{
struct unpcb *unp;
vnode_t *vp;
unp = sotounpcb(so);
KASSERT(unp != NULL); KASSERT(solocked(so));
retry:
if ((vp = unp->unp_vnode) != NULL) {
sounlock(so);
/* Acquire v_interlock to protect against unp_connect(). */
/* XXXAD racy */
mutex_enter(vp->v_interlock);
vp->v_socket = NULL;
mutex_exit(vp->v_interlock);
vrele(vp);
solock(so);
unp->unp_vnode = NULL;
}
if (unp->unp_conn)
unp_disconnect1(unp);
while (unp->unp_refs) {
KASSERT(solocked2(so, unp->unp_refs->unp_socket));
if (unp_drop(unp->unp_refs, ECONNRESET)) {
solock(so);
goto retry;
}
}
soisdisconnected(so);
so->so_pcb = NULL;
if (unp_rights) {
/*
* Normally the receive buffer is flushed later, in sofree,
* but if our receive buffer holds references to files that
* are now garbage, we will enqueue those file references to
* the garbage collector and kick it into action.
*/
sorflush(so);
unp_free(unp);
unp_thread_kick();
} else
unp_free(unp);
}
static int
unp_accept(struct socket *so, struct sockaddr *nam)
{
struct unpcb *unp = sotounpcb(so);
struct socket *so2;
KASSERT(solocked(so));
KASSERT(nam != NULL);
/* XXX code review required to determine if unp can ever be NULL */
if (unp == NULL)
return EINVAL;
KASSERT(so->so_lock == uipc_lock);
/*
* Mark the initiating STREAM socket as connected *ONLY*
* after it's been accepted. This prevents a client from
* overrunning a server and receiving ECONNREFUSED.
*/
if (unp->unp_conn == NULL) {
/*
* This will use the empty socket and will not
* allocate.
*/
unp_setaddr(so, nam, true);
return 0;
}
so2 = unp->unp_conn->unp_socket;
if (so2->so_state & SS_ISCONNECTING) {
KASSERT(solocked2(so, so->so_head));
KASSERT(solocked2(so2, so->so_head));
soisconnected(so2);
}
/*
* If the connection is fully established, break the
* association with uipc_lock and give the connected
* pair a separate lock to share.
* There is a race here: sotounpcb(so2)->unp_streamlock
* is not locked, so when changing so2->so_lock
* another thread can grab it while so->so_lock is still
* pointing to the (locked) uipc_lock.
* This should be harmless, except that it makes
* solocked2() and solocked() unreliable.
* Another problem is that unp_setaddr() expects
* the socket locked. Grabbing sotounpcb(so2)->unp_streamlock
* fixes both issues.
*/
mutex_enter(sotounpcb(so2)->unp_streamlock);
unp_setpeerlocks(so2, so);
/*
* Only now return peer's address, as we may need to
* block in order to allocate memory.
*
* XXX Minor race: connection can be broken while
* lock is dropped in unp_setaddr(). We will return
* error == 0 and sun_noname as the peer address.
*/
unp_setaddr(so, nam, true);
/* so_lock now points to unp_streamlock */
mutex_exit(so2->so_lock);
return 0;
}
static int
unp_ioctl(struct socket *so, u_long cmd, void *nam, struct ifnet *ifp)
{
return EOPNOTSUPP;
}
static int
unp_stat(struct socket *so, struct stat *ub)
{
struct unpcb *unp;
struct socket *so2;
KASSERT(solocked(so));
unp = sotounpcb(so);
if (unp == NULL)
return EINVAL;
ub->st_blksize = so->so_snd.sb_hiwat;
switch (so->so_type) {
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
if (unp->unp_conn == 0)
break;
so2 = unp->unp_conn->unp_socket;
KASSERT(solocked2(so, so2));
ub->st_blksize += so2->so_rcv.sb_cc;
break;
default:
break;
}
ub->st_dev = NODEV;
if (unp->unp_ino == 0)
unp->unp_ino = unp_ino++;
ub->st_atimespec = ub->st_mtimespec = ub->st_ctimespec = unp->unp_ctime;
ub->st_ino = unp->unp_ino;
ub->st_uid = so->so_uidinfo->ui_uid;
ub->st_gid = so->so_egid;
return (0);
}
static int
unp_peeraddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
KASSERT(nam != NULL);
unp_setaddr(so, nam, true);
return 0;
}
static int
unp_sockaddr(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
KASSERT(nam != NULL);
unp_setaddr(so, nam, false);
return 0;
}
/*
* we only need to perform this allocation until syscalls other than
* bind are adjusted to use sockaddr_big.
*/
static struct sockaddr_un *
makeun_sb(struct sockaddr *nam, size_t *addrlen)
{
struct sockaddr_un *sun;
*addrlen = nam->sa_len + 1;
sun = malloc(*addrlen, M_SONAME, M_WAITOK);
memcpy(sun, nam, nam->sa_len);
*(((char *)sun) + nam->sa_len) = '\0';
return sun;
}
static int
unp_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct sockaddr_un *sun;
struct unpcb *unp;
vnode_t *vp;
struct vattr vattr;
size_t addrlen;
int error;
struct pathbuf *pb;
struct nameidata nd;
proc_t *p;
unp = sotounpcb(so);
KASSERT(solocked(so));
KASSERT(unp != NULL);
KASSERT(nam != NULL);
if (unp->unp_vnode != NULL)
return (EINVAL);
if ((unp->unp_flags & UNP_BUSY) != 0) {
/*
* EALREADY may not be strictly accurate, but since this
* is a major application error it's hardly a big deal.
*/
return (EALREADY);
}
unp->unp_flags |= UNP_BUSY;
sounlock(so);
p = l->l_proc;
sun = makeun_sb(nam, &addrlen);
pb = pathbuf_create(sun->sun_path);
if (pb == NULL) {
error = ENOMEM;
goto bad;
}
NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT | TRYEMULROOT, pb);
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto bad;
}
vp = nd.ni_vp;
if (vp != NULL) {
VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
if (nd.ni_dvp == vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(vp);
pathbuf_destroy(pb);
error = EADDRINUSE;
goto bad;
}
vattr_null(&vattr);
vattr.va_type = VSOCK;
vattr.va_mode = ACCESSPERMS & ~(p->p_cwdi->cwdi_cmask);
error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
if (error) {
vput(nd.ni_dvp);
pathbuf_destroy(pb);
goto bad;
}
vp = nd.ni_vp;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
solock(so);
vp->v_socket = unp->unp_socket;
unp->unp_vnode = vp;
unp->unp_addrlen = addrlen;
unp->unp_addr = sun;
VOP_UNLOCK(vp);
vput(nd.ni_dvp);
unp->unp_flags &= ~UNP_BUSY;
pathbuf_destroy(pb);
return (0);
bad:
free(sun, M_SONAME);
solock(so);
unp->unp_flags &= ~UNP_BUSY;
return (error);
}
static int
unp_listen(struct socket *so, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
KASSERT(solocked(so));
KASSERT(unp != NULL);
/*
* If the socket can accept a connection, it must be
* locked by uipc_lock.
*/
unp_resetlock(so);
if (unp->unp_vnode == NULL)
return EINVAL;
unp_connid(l, unp, UNP_EIDSBIND);
return 0;
}
static int
unp_disconnect(struct socket *so)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
unp_disconnect1(sotounpcb(so));
return 0;
}
static int
unp_shutdown(struct socket *so)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
socantsendmore(so);
unp_shutdown1(sotounpcb(so));
return 0;
}
static int
unp_abort(struct socket *so)
{
KASSERT(solocked(so));
KASSERT(sotounpcb(so) != NULL);
(void)unp_drop(sotounpcb(so), ECONNABORTED);
KASSERT(so->so_head == NULL);
KASSERT(so->so_pcb != NULL);
unp_detach(so);
return 0;
}
static int
unp_connect1(struct socket *so, struct socket *so2, struct lwp *l)
{
struct unpcb *unp = sotounpcb(so);
struct unpcb *unp2;
if (so2->so_type != so->so_type)
return EPROTOTYPE;
/*
* All three sockets involved must be locked by same lock:
*
* local endpoint (so)
* remote endpoint (so2)
* queue head (so2->so_head, only if PR_CONNREQUIRED)
*/
KASSERT(solocked2(so, so2));
KASSERT(so->so_head == NULL);
if (so2->so_head != NULL) {
KASSERT(so2->so_lock == uipc_lock);
KASSERT(solocked2(so2, so2->so_head));
}
unp2 = sotounpcb(so2);
unp->unp_conn = unp2;
switch (so->so_type) {
case SOCK_DGRAM:
unp->unp_nextref = unp2->unp_refs;
unp2->unp_refs = unp;
soisconnected(so);
break;
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
/*
* SOCK_SEQPACKET and SOCK_STREAM cases are handled by callers
* which are unp_connect() or unp_connect2().
*/
break;
default:
panic("unp_connect1");
}
return 0;
}
int
unp_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct sockaddr_un *sun;
vnode_t *vp;
struct socket *so2, *so3;
struct unpcb *unp, *unp2, *unp3;
size_t addrlen;
int error;
struct pathbuf *pb;
struct nameidata nd;
unp = sotounpcb(so);
if ((unp->unp_flags & UNP_BUSY) != 0) {
/*
* EALREADY may not be strictly accurate, but since this
* is a major application error it's hardly a big deal.
*/
return (EALREADY);
}
unp->unp_flags |= UNP_BUSY;
sounlock(so);
sun = makeun_sb(nam, &addrlen);
pb = pathbuf_create(sun->sun_path);
if (pb == NULL) {
error = ENOMEM;
goto bad2;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | TRYEMULROOT, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto bad2;
}
vp = nd.ni_vp;
pathbuf_destroy(pb);
if (vp->v_type != VSOCK) {
error = ENOTSOCK;
goto bad;
}
if ((error = VOP_ACCESS(vp, VWRITE, l->l_cred)) != 0)
goto bad;
/* Acquire v_interlock to protect against unp_detach(). */
mutex_enter(vp->v_interlock);
so2 = vp->v_socket;
if (so2 == NULL) {
mutex_exit(vp->v_interlock);
error = ECONNREFUSED;
goto bad;
}
if (so->so_type != so2->so_type) {
mutex_exit(vp->v_interlock);
error = EPROTOTYPE;
goto bad;
}
solock(so);
unp_resetlock(so);
mutex_exit(vp->v_interlock);
if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
/*
* This may seem somewhat fragile but is OK: if we can
* see SO_ACCEPTCONN set on the endpoint, then it must
* be locked by the domain-wide uipc_lock.
*/
KASSERT((so2->so_options & SO_ACCEPTCONN) == 0 ||
so2->so_lock == uipc_lock);
if ((so2->so_options & SO_ACCEPTCONN) == 0 ||
(so3 = sonewconn(so2, false)) == NULL) {
error = ECONNREFUSED;
sounlock(so);
goto bad;
}
unp2 = sotounpcb(so2);
unp3 = sotounpcb(so3);
if (unp2->unp_addr) {
unp3->unp_addr = malloc(unp2->unp_addrlen,
M_SONAME, M_WAITOK);
memcpy(unp3->unp_addr, unp2->unp_addr,
unp2->unp_addrlen);
unp3->unp_addrlen = unp2->unp_addrlen;
}
unp3->unp_flags = unp2->unp_flags;
so2 = so3;
/*
* The connector's (client's) credentials are copied from its
* process structure at the time of connect() (which is now).
*/
unp_connid(l, unp3, UNP_EIDSVALID);
/*
* The receiver's (server's) credentials are copied from the
* unp_peercred member of socket on which the former called
* listen(); unp_listen() cached that process's credentials
* at that time so we can use them now.
*/
if (unp2->unp_flags & UNP_EIDSBIND) {
memcpy(&unp->unp_connid, &unp2->unp_connid,
sizeof(unp->unp_connid));
unp->unp_flags |= UNP_EIDSVALID;
}
}
error = unp_connect1(so, so2, l);
if (error) {
sounlock(so);
goto bad;
}
unp2 = sotounpcb(so2);
switch (so->so_type) {
/*
* SOCK_DGRAM and default cases are handled in prior call to
* unp_connect1(), do not add a default case without fixing
* unp_connect1().
*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
unp2->unp_conn = unp;
if ((unp->unp_flags | unp2->unp_flags) & UNP_CONNWAIT)
soisconnecting(so);
else
soisconnected(so);
soisconnected(so2);
/*
* If the connection is fully established, break the
* association with uipc_lock and give the connected
* pair a separate lock to share.
*/
KASSERT(so2->so_head != NULL);
unp_setpeerlocks(so, so2);
break;
}
sounlock(so);
bad:
vput(vp);
bad2:
free(sun, M_SONAME);
solock(so);
unp->unp_flags &= ~UNP_BUSY;
return (error);
}
int
unp_connect2(struct socket *so, struct socket *so2)
{
struct unpcb *unp = sotounpcb(so);
struct unpcb *unp2;
int error = 0;
KASSERT(solocked2(so, so2));
error = unp_connect1(so, so2, curlwp);
if (error)
return error;
unp2 = sotounpcb(so2);
switch (so->so_type) {
/*
* SOCK_DGRAM and default cases are handled in prior call to
* unp_connect1(), do not add a default case without fixing
* unp_connect1().
*/
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
unp2->unp_conn = unp;
soisconnected(so);
soisconnected(so2);
break;
}
return error;
}
static void
unp_disconnect1(struct unpcb *unp)
{
struct unpcb *unp2 = unp->unp_conn;
struct socket *so;
if (unp2 == 0)
return;
unp->unp_conn = 0;
so = unp->unp_socket;
switch (so->so_type) {
case SOCK_DGRAM:
if (unp2->unp_refs == unp)
unp2->unp_refs = unp->unp_nextref;
else {
unp2 = unp2->unp_refs;
for (;;) {
KASSERT(solocked2(so, unp2->unp_socket));
if (unp2 == 0)
panic("unp_disconnect1");
if (unp2->unp_nextref == unp)
break;
unp2 = unp2->unp_nextref;
}
unp2->unp_nextref = unp->unp_nextref;
}
unp->unp_nextref = 0;
so->so_state &= ~SS_ISCONNECTED;
break;
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
KASSERT(solocked2(so, unp2->unp_socket));
soisdisconnected(so);
unp2->unp_conn = 0;
soisdisconnected(unp2->unp_socket);
break;
}
}
static void
unp_shutdown1(struct unpcb *unp)
{
struct socket *so;
switch(unp->unp_socket->so_type) {
case SOCK_SEQPACKET: /* FALLTHROUGH */
case SOCK_STREAM:
if (unp->unp_conn && (so = unp->unp_conn->unp_socket))
socantrcvmore(so);
break;
default:
break;
}
}
static bool
unp_drop(struct unpcb *unp, int errno)
{
struct socket *so = unp->unp_socket;
KASSERT(solocked(so));
so->so_error = errno;
unp_disconnect1(unp);
if (so->so_head) {
so->so_pcb = NULL;
/* sofree() drops the socket lock */
sofree(so);
unp_free(unp);
return true;
}
return false;
}
#ifdef notdef
unp_drain(void)
{
}
#endif
int
unp_externalize(struct mbuf *rights, struct lwp *l, int flags)
{
struct cmsghdr * const cm = mtod(rights, struct cmsghdr *);
struct proc * const p = l->l_proc;
file_t **rp;
int error = 0;
const size_t nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
sizeof(file_t *);
if (nfds == 0)
goto noop;
int * const fdp = kmem_alloc(nfds * sizeof(int), KM_SLEEP);
rw_enter(&p->p_cwdi->cwdi_lock, RW_READER);
/* Make sure the recipient should be able to see the files. */
rp = (file_t **)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++) {
file_t * const fp = *rp++;
if (fp == NULL) {
error = EINVAL;
goto out;
}
/*
* If we are in a chroot'ed directory, and
* someone wants to pass us a directory, make
* sure it's inside the subtree we're allowed
* to access.
*/
if (p->p_cwdi->cwdi_rdir != NULL && fp->f_type == DTYPE_VNODE) {
vnode_t *vp = fp->f_vnode;
if ((vp->v_type == VDIR) &&
!vn_isunder(vp, p->p_cwdi->cwdi_rdir, l)) {
error = EPERM;
goto out;
}
}
}
restart:
/*
* First loop -- allocate file descriptor table slots for the
* new files.
*/
for (size_t i = 0; i < nfds; i++) {
if ((error = fd_alloc(p, 0, &fdp[i])) != 0) {
/*
* Back out what we've done so far.
*/
while (i-- > 0) {
fd_abort(p, NULL, fdp[i]);
}
if (error == ENOSPC) {
fd_tryexpand(p);
error = 0;
goto restart;
}
/*
* This is the error that has historically
* been returned, and some callers may
* expect it.
*/
error = EMSGSIZE;
goto out;
}
}
/*
* Now that adding them has succeeded, update all of the
* file passing state and affix the descriptors.
*/
rp = (file_t **)CMSG_DATA(cm);
int *ofdp = (int *)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++) {
file_t * const fp = *rp++;
const int fd = fdp[i];
atomic_dec_uint(&unp_rights);
fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
fd_affix(p, fp, fd);
/*
* Done with this file pointer, replace it with a fd;
*/
*ofdp++ = fd;
mutex_enter(&fp->f_lock);
fp->f_msgcount--;
mutex_exit(&fp->f_lock);
/*
* Note that fd_affix() adds a reference to the file.
* The file may already have been closed by another
* LWP in the process, so we must drop the reference
* added by unp_internalize() with closef().
*/
closef(fp);
}
/*
* Adjust length, in case of transition from large file_t
* pointers to ints.
*/
if (sizeof(file_t *) != sizeof(int)) {
cm->cmsg_len = CMSG_LEN(nfds * sizeof(int));
rights->m_len = CMSG_SPACE(nfds * sizeof(int));
}
out:
if (__predict_false(error != 0)) {
file_t **const fpp = (file_t **)CMSG_DATA(cm);
for (size_t i = 0; i < nfds; i++)
unp_discard_now(fpp[i]);
/*
* Truncate the array so that nobody will try to interpret
* what is now garbage in it.
*/
cm->cmsg_len = CMSG_LEN(0);
rights->m_len = CMSG_SPACE(0);
}
rw_exit(&p->p_cwdi->cwdi_lock);
kmem_free(fdp, nfds * sizeof(int));
noop:
/*
* Don't disclose kernel memory in the alignment space.
*/
KASSERT(cm->cmsg_len <= rights->m_len);
memset(&mtod(rights, char *)[cm->cmsg_len], 0, rights->m_len -
cm->cmsg_len);
return error;
}
static int
unp_internalize(struct mbuf **controlp)
{
filedesc_t *fdescp = curlwp->l_fd;
fdtab_t *dt;
struct mbuf *control = *controlp;
struct cmsghdr *newcm, *cm = mtod(control, struct cmsghdr *);
file_t **rp, **files;
file_t *fp;
int i, fd, *fdp;
int nfds, error;
u_int maxmsg;
error = 0;
newcm = NULL;
/* Sanity check the control message header. */
if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || cm->cmsg_len > control->m_len ||
cm->cmsg_len < CMSG_ALIGN(sizeof(*cm)))
return (EINVAL);
/*
* Verify that the file descriptors are valid, and acquire
* a reference to each.
*/
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) / sizeof(int);
fdp = (int *)CMSG_DATA(cm);
maxmsg = maxfiles / unp_rights_ratio;
for (i = 0; i < nfds; i++) {
fd = *fdp++;
if (atomic_inc_uint_nv(&unp_rights) > maxmsg) {
atomic_dec_uint(&unp_rights);
nfds = i;
error = EAGAIN;
goto out;
}
if ((fp = fd_getfile(fd)) == NULL
|| fp->f_type == DTYPE_KQUEUE) {
if (fp)
fd_putfile(fd);
atomic_dec_uint(&unp_rights);
nfds = i;
error = EBADF;
goto out;
}
}
/* Allocate new space and copy header into it. */
newcm = malloc(CMSG_SPACE(nfds * sizeof(file_t *)), M_MBUF, M_WAITOK);
if (newcm == NULL) {
error = E2BIG;
goto out;
}
memcpy(newcm, cm, sizeof(struct cmsghdr));
memset(newcm + 1, 0, CMSG_LEN(0) - sizeof(struct cmsghdr));
files = (file_t **)CMSG_DATA(newcm);
/*
* Transform the file descriptors into file_t pointers, in
* reverse order so that if pointers are bigger than ints, the
* int won't get overwritten until we're done. No need to lock, as we have
* already validated the descriptors with fd_getfile().
*/
fdp = (int *)CMSG_DATA(cm) + nfds;
rp = files + nfds;
for (i = 0; i < nfds; i++) {
dt = atomic_load_consume(&fdescp->fd_dt);
fp = atomic_load_consume(&dt->dt_ff[*--fdp]->ff_file);
KASSERT(fp != NULL);
mutex_enter(&fp->f_lock);
*--rp = fp;
fp->f_count++;
fp->f_msgcount++;
mutex_exit(&fp->f_lock);
}
out:
/* Release descriptor references. */
fdp = (int *)CMSG_DATA(cm);
for (i = 0; i < nfds; i++) {
fd_putfile(*fdp++);
if (error != 0) {
atomic_dec_uint(&unp_rights);
}
}
if (error == 0) {
if (control->m_flags & M_EXT) {
m_freem(control);
*controlp = control = m_get(M_WAIT, MT_CONTROL);
}
MEXTADD(control, newcm, CMSG_SPACE(nfds * sizeof(file_t *)),
M_MBUF, NULL, NULL);
cm = newcm;
/*
* Adjust message & mbuf to note amount of space
* actually used.
*/
cm->cmsg_len = CMSG_LEN(nfds * sizeof(file_t *));
control->m_len = CMSG_SPACE(nfds * sizeof(file_t *));
}
return error;
}
struct mbuf *
unp_addsockcred(struct lwp *l, struct mbuf *control)
{
struct sockcred *sc;
struct mbuf *m;
void *p;
m = sbcreatecontrol1(&p, SOCKCREDSIZE(kauth_cred_ngroups(l->l_cred)),
SCM_CREDS, SOL_SOCKET, M_WAITOK);
if (m == NULL)
return control;
sc = p;
sc->sc_pid = l->l_proc->p_pid;
sc->sc_uid = kauth_cred_getuid(l->l_cred);
sc->sc_euid = kauth_cred_geteuid(l->l_cred);
sc->sc_gid = kauth_cred_getgid(l->l_cred);
sc->sc_egid = kauth_cred_getegid(l->l_cred);
sc->sc_ngroups = kauth_cred_ngroups(l->l_cred);
for (int i = 0; i < sc->sc_ngroups; i++)
sc->sc_groups[i] = kauth_cred_group(l->l_cred, i);
return m_add(control, m);
}
/*
* Do a mark-sweep GC of files in the system, to free up any which are
* caught in flight to an about-to-be-closed socket. Additionally,
* process deferred file closures.
*/
static void
unp_gc(file_t *dp)
{
extern struct domain unixdomain;
file_t *fp, *np;
struct socket *so, *so1;
u_int i, oflags, rflags;
bool didwork;
KASSERT(curlwp == unp_thread_lwp);
KASSERT(mutex_owned(&filelist_lock));
/*
* First, process deferred file closures.
*/
while (!SLIST_EMPTY(&unp_thread_discard)) {
fp = SLIST_FIRST(&unp_thread_discard);
KASSERT(fp->f_unpcount > 0);
KASSERT(fp->f_count > 0);
KASSERT(fp->f_msgcount > 0);
KASSERT(fp->f_count >= fp->f_unpcount);
KASSERT(fp->f_count >= fp->f_msgcount);
KASSERT(fp->f_msgcount >= fp->f_unpcount);
SLIST_REMOVE_HEAD(&unp_thread_discard, f_unplist);
i = fp->f_unpcount;
fp->f_unpcount = 0;
mutex_exit(&filelist_lock);
for (; i != 0; i--) {
unp_discard_now(fp);
}
mutex_enter(&filelist_lock);
}
/*
* Clear mark bits. Ensure that we don't consider new files
* entering the file table during this loop (they will not have
* FSCAN set).
*/
unp_defer = 0;
LIST_FOREACH(fp, &filehead, f_list) {
for (oflags = fp->f_flag;; oflags = rflags) {
rflags = atomic_cas_uint(&fp->f_flag, oflags,
(oflags | FSCAN) & ~(FMARK|FDEFER));
if (__predict_true(oflags == rflags)) {
break;
}
}
}
/*
* Iterate over the set of sockets, marking ones believed (based on
* refcount) to be referenced from a process, and marking for rescan
* sockets which are queued on a socket. Rescan continues descending
* and searching for sockets referenced by sockets (FDEFER), until
* there are no more socket->socket references to be discovered.
*/
do {
didwork = false;
for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
KASSERT(mutex_owned(&filelist_lock));
np = LIST_NEXT(fp, f_list);
mutex_enter(&fp->f_lock);
if ((fp->f_flag & FDEFER) != 0) {
atomic_and_uint(&fp->f_flag, ~FDEFER);
unp_defer--;
if (fp->f_count == 0) {
/*
* XXX: closef() doesn't pay attention
* to FDEFER
*/
mutex_exit(&fp->f_lock);
continue;
}
} else {
if (fp->f_count == 0 ||
(fp->f_flag & FMARK) != 0 ||
fp->f_count == fp->f_msgcount ||
fp->f_unpcount != 0) {
mutex_exit(&fp->f_lock);
continue;
}
}
atomic_or_uint(&fp->f_flag, FMARK);
if (fp->f_type != DTYPE_SOCKET ||
(so = fp->f_socket) == NULL ||
so->so_proto->pr_domain != &unixdomain ||
(so->so_proto->pr_flags & PR_RIGHTS) == 0) {
mutex_exit(&fp->f_lock);
continue;
}
/* Gain file ref, mark our position, and unlock. */
didwork = true;
LIST_INSERT_AFTER(fp, dp, f_list);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&filelist_lock);
/*
* Mark files referenced from sockets queued on the
* accept queue as well.
*/
solock(so);
unp_scan(so->so_rcv.sb_mb, unp_mark, 0);
if ((so->so_options & SO_ACCEPTCONN) != 0) {
TAILQ_FOREACH(so1, &so->so_q0, so_qe) {
unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
}
TAILQ_FOREACH(so1, &so->so_q, so_qe) {
unp_scan(so1->so_rcv.sb_mb, unp_mark, 0);
}
}
sounlock(so);
/* Re-lock and restart from where we left off. */
closef(fp);
mutex_enter(&filelist_lock);
np = LIST_NEXT(dp, f_list);
LIST_REMOVE(dp, f_list);
}
/*
* Bail early if we did nothing in the loop above. Could
* happen because of concurrent activity causing unp_defer
* to get out of sync.
*/
} while (unp_defer != 0 && didwork);
/*
* Sweep pass.
*
* We grab an extra reference to each of the files that are
* not otherwise accessible and then free the rights that are
* stored in messages on them.
*/
for (fp = LIST_FIRST(&filehead); fp != NULL; fp = np) {
KASSERT(mutex_owned(&filelist_lock));
np = LIST_NEXT(fp, f_list);
mutex_enter(&fp->f_lock);
/*
* Ignore non-sockets.
* Ignore dead sockets, or sockets with pending close.
* Ignore sockets obviously referenced elsewhere.
* Ignore sockets marked as referenced by our scan.
* Ignore new sockets that did not exist during the scan.
*/
if (fp->f_type != DTYPE_SOCKET ||
fp->f_count == 0 || fp->f_unpcount != 0 ||
fp->f_count != fp->f_msgcount ||
(fp->f_flag & (FMARK | FSCAN)) != FSCAN) {
mutex_exit(&fp->f_lock);
continue;
}
/* Gain file ref, mark our position, and unlock. */
LIST_INSERT_AFTER(fp, dp, f_list);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&filelist_lock);
/*
* Flush all data from the socket's receive buffer.
* This will cause files referenced only by the
* socket to be queued for close.
*/
so = fp->f_socket;
solock(so);
sorflush(so);
sounlock(so);
/* Re-lock and restart from where we left off. */
closef(fp);
mutex_enter(&filelist_lock);
np = LIST_NEXT(dp, f_list);
LIST_REMOVE(dp, f_list);
}
}
/*
* Garbage collector thread. While SCM_RIGHTS messages are in transit,
* wake once per second to garbage collect. Run continually while we
* have deferred closes to process.
*/
static void
unp_thread(void *cookie)
{
file_t *dp;
/* Allocate a dummy file for our scans. */
if ((dp = fgetdummy()) == NULL) {
panic("unp_thread");
}
mutex_enter(&filelist_lock);
for (;;) {
KASSERT(mutex_owned(&filelist_lock));
if (SLIST_EMPTY(&unp_thread_discard)) {
if (unp_rights != 0) {
(void)cv_timedwait(&unp_thread_cv,
&filelist_lock, hz);
} else {
cv_wait(&unp_thread_cv, &filelist_lock);
}
}
unp_gc(dp);
}
/* NOTREACHED */
}
/*
* Kick the garbage collector into action if there is something for
* it to process.
*/
static void
unp_thread_kick(void)
{
if (!SLIST_EMPTY(&unp_thread_discard) || unp_rights != 0) {
mutex_enter(&filelist_lock);
cv_signal(&unp_thread_cv);
mutex_exit(&filelist_lock);
}
}
void
unp_dispose(struct mbuf *m)
{
if (m)
unp_scan(m, unp_discard_later, 1);
}
void
unp_scan(struct mbuf *m0, void (*op)(file_t *), int discard)
{
struct mbuf *m;
file_t **rp, *fp;
struct cmsghdr *cm;
int i, qfds;
while (m0) {
for (m = m0; m; m = m->m_next) {
if (m->m_type != MT_CONTROL ||
m->m_len < sizeof(*cm)) {
continue;
}
cm = mtod(m, struct cmsghdr *);
if (cm->cmsg_level != SOL_SOCKET ||
cm->cmsg_type != SCM_RIGHTS)
continue;
qfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm)))
/ sizeof(file_t *);
rp = (file_t **)CMSG_DATA(cm);
for (i = 0; i < qfds; i++) {
fp = *rp;
if (discard) {
*rp = 0;
}
(*op)(fp);
rp++;
}
}
m0 = m0->m_nextpkt;
}
}
void
unp_mark(file_t *fp)
{
if (fp == NULL)
return;
/* If we're already deferred, don't screw up the defer count */
mutex_enter(&fp->f_lock);
if (fp->f_flag & (FMARK | FDEFER)) {
mutex_exit(&fp->f_lock);
return;
}
/*
* Minimize the number of deferrals... Sockets are the only type of
* file which can hold references to another file, so just mark
* other files, and defer unmarked sockets for the next pass.
*/
if (fp->f_type == DTYPE_SOCKET) {
unp_defer++;
KASSERT(fp->f_count != 0);
atomic_or_uint(&fp->f_flag, FDEFER);
} else {
atomic_or_uint(&fp->f_flag, FMARK);
}
mutex_exit(&fp->f_lock);
}
static void
unp_discard_now(file_t *fp)
{
if (fp == NULL)
return;
KASSERT(fp->f_count > 0);
KASSERT(fp->f_msgcount > 0);
mutex_enter(&fp->f_lock);
fp->f_msgcount--;
mutex_exit(&fp->f_lock);
atomic_dec_uint(&unp_rights);
(void)closef(fp);
}
static void
unp_discard_later(file_t *fp)
{
if (fp == NULL)
return;
KASSERT(fp->f_count > 0);
KASSERT(fp->f_msgcount > 0);
mutex_enter(&filelist_lock);
if (fp->f_unpcount++ == 0) {
SLIST_INSERT_HEAD(&unp_thread_discard, fp, f_unplist);
}
mutex_exit(&filelist_lock);
}
static void
unp_sysctl_create(void)
{
KASSERT(usrreq_sysctllog == NULL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "sendspace",
SYSCTL_DESCR("Default stream send space"),
NULL, 0, &unpst_sendspace, 0,
CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "recvspace",
SYSCTL_DESCR("Default stream recv space"),
NULL, 0, &unpst_recvspace, 0,
CTL_NET, PF_LOCAL, SOCK_STREAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "sendspace",
SYSCTL_DESCR("Default datagram send space"),
NULL, 0, &unpdg_sendspace, 0,
CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "recvspace",
SYSCTL_DESCR("Default datagram recv space"),
NULL, 0, &unpdg_recvspace, 0,
CTL_NET, PF_LOCAL, SOCK_DGRAM, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "inflight",
SYSCTL_DESCR("File descriptors in flight"),
NULL, 0, &unp_rights, 0,
CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL);
sysctl_createv(&usrreq_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_INT, "deferred",
SYSCTL_DESCR("File descriptors deferred for close"),
NULL, 0, &unp_defer, 0,
CTL_NET, PF_LOCAL, CTL_CREATE, CTL_EOL);
}
const struct pr_usrreqs unp_usrreqs = {
.pr_attach = unp_attach,
.pr_detach = unp_detach,
.pr_accept = unp_accept,
.pr_bind = unp_bind,
.pr_listen = unp_listen,
.pr_connect = unp_connect,
.pr_connect2 = unp_connect2,
.pr_disconnect = unp_disconnect,
.pr_shutdown = unp_shutdown,
.pr_abort = unp_abort,
.pr_ioctl = unp_ioctl,
.pr_stat = unp_stat,
.pr_peeraddr = unp_peeraddr,
.pr_sockaddr = unp_sockaddr,
.pr_rcvd = unp_rcvd,
.pr_recvoob = unp_recvoob,
.pr_send = unp_send,
.pr_sendoob = unp_sendoob,
};
/* $NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $ */
/*
* Copyright (c) 1999 National Aeronautics & Space Administration
* All rights reserved.
*
* This software was written by William Studenmund of the
* Numerical Aerospace Simulation Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the National Aeronautics & Space Administration
* nor the names of its contributors may be used to endorse or promote
* products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NATIONAL AERONAUTICS & SPACE ADMINISTRATION
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE ADMINISTRATION OR CONTRIB-
* UTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY,
* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software donated to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Id: lofs_subr.c,v 1.11 1992/05/30 10:05:43 jsp Exp
* @(#)null_subr.c 8.7 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: layer_subr.c,v 1.39 2022/04/10 09:50:46 andvar Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/kmem.h>
#include <miscfs/genfs/layer.h>
#include <miscfs/genfs/layer_extern.h>
#ifdef LAYERFS_DIAGNOSTIC
int layerfs_debug = 1;
#endif
/*
* layer cache:
* Each cache entry holds a reference to the lower vnode
* along with a pointer to the alias vnode. When an
* entry is added the lower vnode is VREF'd. When the
* alias is removed the lower vnode is vrele'd.
*/
void
layerfs_init(void)
{
/* Nothing. */
}
void
layerfs_done(void)
{
/* Nothing. */
}
/*
* layer_node_create: try to find an existing layerfs vnode referring to it,
* otherwise make a new vnode which contains a reference to the lower vnode.
*/
int
layer_node_create(struct mount *mp, struct vnode *lowervp, struct vnode **nvpp)
{
int error;
struct vnode *aliasvp;
error = vcache_get(mp, &lowervp, sizeof(lowervp), &aliasvp);
if (error)
return error;
/*
* Now that we acquired a reference on the upper vnode, release one
* on the lower node. The existence of the layer_node retains one
* reference to the lower node.
*/
vrele(lowervp);
KASSERT(vrefcnt(lowervp) > 0);
#ifdef LAYERFS_DIAGNOSTIC
if (layerfs_debug)
vprint("layer_node_create: alias", aliasvp);
#endif
*nvpp = aliasvp;
return 0;
}
#ifdef LAYERFS_DIAGNOSTIC
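/*
 * layer_checkvp: diagnostic helper; sanity-check a layer vnode and
 * return its lower vnode.  (fil, lno) identify the calling site.
 */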
struct vnode *
layer_checkvp(struct vnode *vp, const char *fil, int lno)
{
struct layer_node *a = VTOLAYER(vp);
#ifdef notyet
/*
* Can't do this check because vop_reclaim runs
* with a funny vop vector.
*
* WRS - no it doesn't...
*/
if (vp->v_op != layer_vnodeop_p) {
printf("layer_checkvp: on non-layer-node\n");
#ifdef notyet
while (layer_checkvp_barrier) /*WAIT*/ ;
#endif
panic("layer_checkvp");
}
#endif
if (a->layer_lowervp == NULL) {
/* Should never happen */
int i; u_long *p;
printf("vp = %p, ZERO ptr\n", vp);
for (p = (u_long *) a, i = 0; i < 8; i++)
printf(" %lx", p[i]);
printf("\n");
/* wait for debugger */
panic("layer_checkvp");
}
if (vrefcnt(a->layer_lowervp) < 1) {
int i; u_long *p;
printf("vp = %p, unref'ed lowervp\n", vp);
for (p = (u_long *) a, i = 0; i < 8; i++)
printf(" %lx", p[i]);
printf("\n");
/* wait for debugger */
panic("layer with unref'ed lowervp");
}
#ifdef notnow
printf("layer %p/%d -> %p/%d [%s, %d]\n",
LAYERTOV(a), vrefcnt(LAYERTOV(a)),
a->layer_lowervp, vrefcnt(a->layer_lowervp),
fil, lno);
#endif
return a->layer_lowervp;
}
#endif
/* $NetBSD: copystr.c,v 1.1 2020/06/30 16:20:02 maxv Exp $ */
/*
* Copyright (c) 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Maxime Villard.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/systm.h>
#include <sys/errno.h>
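/*
 * copystr: copy a NUL-terminated string of at most len bytes from the
 * kernel address kfaddr to the kernel address kdaddr.  If done is not
 * NULL, it is set to the number of bytes copied, including the
 * terminating NUL.  Returns 0 on success or ENAMETOOLONG if the string
 * does not fit.
 */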
int
copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done)
{
const char *src = kfaddr;
char *dst = kdaddr;
size_t i;
for (i = 0; i < len; i++) {
if ((*dst++ = *src++) == '\0') {
if (done)
*done = i + 1;
return 0;
}
}
if (done)
*done = i;
return ENAMETOOLONG;
}
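/*
 * Usage sketch (identifiers here are hypothetical, not from this file):
 * given a kernel string fstype, copy it into a bounded buffer.
 *
 *	char name[_VFS_NAMELEN];
 *	size_t done;
 *	int error = copystr(fstype, name, sizeof(name), &done);
 */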
/* $NetBSD: sleepq.h,v 1.42 2023/10/15 10:30:00 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_SLEEPQ_H_
#define _SYS_SLEEPQ_H_
#include <sys/param.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/wchan.h>
struct syncobj;
/*
* Generic sleep queues.
*/
typedef struct sleepq sleepq_t;
void sleepq_init(sleepq_t *);
void sleepq_remove(sleepq_t *, lwp_t *, bool);
int sleepq_enter(sleepq_t *, lwp_t *, kmutex_t *);
void sleepq_enqueue(sleepq_t *, wchan_t, const char *,
const struct syncobj *, bool);
void sleepq_transfer(lwp_t *, sleepq_t *, sleepq_t *, wchan_t, const char *,
const struct syncobj *, kmutex_t *, bool);
void sleepq_uncatch(lwp_t *);
void sleepq_unsleep(lwp_t *, bool);
void sleepq_timeout(void *);
void sleepq_wake(sleepq_t *, wchan_t, u_int, kmutex_t *);
int sleepq_abort(kmutex_t *, int);
void sleepq_changepri(lwp_t *, pri_t);
void sleepq_lendpri(lwp_t *, pri_t);
int sleepq_block(int, bool, const struct syncobj *, int);
#ifdef _KERNEL
#include <sys/kernel.h>
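/*
 * Sleep queue lock, padded to COHERENCY_UNIT so that adjacent locks in
 * an array do not share a cache line.
 */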
typedef union {
kmutex_t lock;
uint8_t pad[COHERENCY_UNIT];
} sleepqlock_t;
/*
* Return non-zero if it is unsafe to sleep.
*
* XXX This only exists because panic() is broken.
*/
static __inline bool
sleepq_dontsleep(lwp_t *l)
{
return cold || (doing_shutdown && (panicstr || CURCPU_IDLE_P()));
}
#endif /* _KERNEL */
#include <sys/sleeptab.h>
#endif /* _SYS_SLEEPQ_H_ */
/* $NetBSD: compat_50_quota.c,v 1.4 2022/09/21 07:15:24 dholland Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christos Zoulas.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: compat_50_quota.c,v 1.4 2022/09/21 07:15:24 dholland Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/module.h>
#include <sys/namei.h>
#include <sys/param.h>
#include <sys/quota.h>
#include <sys/quotactl.h>
#include <sys/systm.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <sys/vfs_syscalls.h>
#include <sys/vnode.h>
#include <ufs/ufs/quota1.h>
static const struct syscall_package vfs_syscalls_50_quota_syscalls[] = {
{ SYS_compat_50_quotactl, 0, (sy_call_t *)compat_50_sys_quotactl },
{ 0, 0, NULL }
};
/* ARGSUSED */
int
compat_50_sys_quotactl(struct lwp *l, const struct compat_50_sys_quotactl_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) path;
syscallarg(int) cmd;
syscallarg(int) uid;
syscallarg(void *) arg;
} */
struct vnode *vp;
struct mount *mp;
int q1cmd;
int idtype;
char *qfile;
struct dqblk dqblk;
struct quotakey key;
struct quotaval blocks, files;
struct quotastat qstat;
int error;
error = namei_simple_user(SCARG(uap, path),
NSM_FOLLOW_TRYEMULROOT, &vp);
if (error != 0)
return (error);
mp = vp->v_mount;
q1cmd = SCARG(uap, cmd);
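/*
 * The old-style quotactl command word packs the operation in the bits
 * above SUBCMDSHIFT and the UFS id type (user or group quota) in the
 * low SUBCMDMASK bits; split it apart here.
 */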
idtype = quota_idtype_from_ufs(q1cmd & SUBCMDMASK);
if (idtype == -1) {
/* Drop the reference taken by namei_simple_user() before failing. */
vrele(vp);
return EINVAL;
}
switch ((q1cmd & ~SUBCMDMASK) >> SUBCMDSHIFT) {
case Q_QUOTAON:
qfile = PNBUF_GET();
error = copyinstr(SCARG(uap, arg), qfile, PATH_MAX, NULL);
if (error != 0) {
PNBUF_PUT(qfile);
break;
}
error = vfs_quotactl_quotaon(mp, idtype, qfile);
PNBUF_PUT(qfile);
break;
case Q_QUOTAOFF:
error = vfs_quotactl_quotaoff(mp, idtype);
break;
case Q_GETQUOTA:
key.qk_idtype = idtype;
key.qk_id = SCARG(uap, uid);
key.qk_objtype = QUOTA_OBJTYPE_BLOCKS;
error = vfs_quotactl_get(mp, &key, &blocks);
if (error) {
break;
}
key.qk_objtype = QUOTA_OBJTYPE_FILES;
error = vfs_quotactl_get(mp, &key, &files);
if (error) {
break;
}
quotavals_to_dqblk(&blocks, &files, &dqblk);
error = copyout(&dqblk, SCARG(uap, arg), sizeof(dqblk));
break;
case Q_SETQUOTA:
error = copyin(SCARG(uap, arg), &dqblk, sizeof(dqblk));
if (error) {
break;
}
dqblk_to_quotavals(&dqblk, &blocks, &files);
key.qk_idtype = idtype;
key.qk_id = SCARG(uap, uid);
key.qk_objtype = QUOTA_OBJTYPE_BLOCKS;
error = vfs_quotactl_put(mp, &key, &blocks);
if (error) {
break;
}
key.qk_objtype = QUOTA_OBJTYPE_FILES;
error = vfs_quotactl_put(mp, &key, &files);
break;
case Q_SYNC:
/*
* Q_SYNC is not supported, but callers use it only to check whether
* quota is supported at all; emulate it with a stat request.
*
* XXX should probably be supported
*/
(void)idtype; /* not used */
error = vfs_quotactl_stat(mp, &qstat);
break;
case Q_SETUSE:
default:
error = EOPNOTSUPP;
break;
}
vrele(vp);
return error;
}
MODULE(MODULE_CLASS_EXEC, compat_50_quota, "compat_50,ufs");
static int
compat_50_quota_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return syscall_establish(NULL, vfs_syscalls_50_quota_syscalls);
case MODULE_CMD_FINI:
return syscall_disestablish(NULL, vfs_syscalls_50_quota_syscalls);
default:
return ENOTTY;
}
}
/* $NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $ */
/*
* Copyright (c) 1994
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_subr.c 8.20 (Berkeley) 5/20/95
*/
/*
* Copyright (c) 1994 Jan-Simon Pendry
*
* This code is derived from software contributed to Berkeley by
* Jan-Simon Pendry.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)union_subr.c 8.20 (Berkeley) 5/20/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/kauth.h>
#include <uvm/uvm_extern.h>
#include <fs/union/union.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
static LIST_HEAD(uhashhead, union_node) *uhashtbl;
static u_long uhash_mask; /* size of hash table - 1 */
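/*
 * Hash the (upper, lower) vnode pointer pair into an index into
 * uhashtbl.  NOHASH marks a slot in the probe list built by
 * union_allocvp() that should be skipped.
 */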
#define UNION_HASH(u, l) \
((((u_long) (u) + (u_long) (l)) >> 8) & uhash_mask)
#define NOHASH ((u_long)-1)
static kmutex_t uhash_lock;
static void union_newupper(struct union_node *, struct vnode *);
static void union_newlower(struct union_node *, struct vnode *);
static void union_ref(struct union_node *);
static void union_rele(struct union_node *);
static int union_do_lookup(struct vnode *, struct componentname *, kauth_cred_t, const char *);
int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *);
static void union_dircache_r(struct vnode *, struct vnode ***, int *);
struct vnode *union_dircache(struct vnode *, struct lwp *);
void
union_init(void)
{
mutex_init(&uhash_lock, MUTEX_DEFAULT, IPL_NONE);
uhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &uhash_mask);
}
void
union_reinit(void)
{
struct union_node *un;
struct uhashhead *oldhash, *hash;
u_long oldmask, mask, val;
int i;
hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
mutex_enter(&uhash_lock);
oldhash = uhashtbl;
oldmask = uhash_mask;
uhashtbl = hash;
uhash_mask = mask;
for (i = 0; i <= oldmask; i++) {
while ((un = LIST_FIRST(&oldhash[i])) != NULL) {
LIST_REMOVE(un, un_cache);
val = UNION_HASH(un->un_uppervp, un->un_lowervp);
LIST_INSERT_HEAD(&hash[val], un, un_cache);
}
}
mutex_exit(&uhash_lock);
hashdone(oldhash, HASH_LIST, oldmask);
}
/*
* Free global unionfs resources.
*/
void
union_done(void)
{
hashdone(uhashtbl, HASH_LIST, uhash_mask);
mutex_destroy(&uhash_lock);
/* Make sure to unset the readdir hook. */
vn_union_readdir_hook = NULL;
}
void
union_newlower(struct union_node *un, struct vnode *lowervp)
{
int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
int nhash = UNION_HASH(un->un_uppervp, lowervp);
if (un->un_lowervp == lowervp)
return;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
KASSERT(un->un_lowervp == NULL);
mutex_enter(&uhash_lock);
if (ohash != nhash && (un->un_cflags & UN_CACHED)) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_enter(&un->un_lock);
un->un_lowervp = lowervp;
un->un_lowersz = VNOVAL;
mutex_exit(&un->un_lock);
if (ohash != nhash) {
LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
un->un_cflags |= UN_CACHED;
}
mutex_exit(&uhash_lock);
}
void
union_newupper(struct union_node *un, struct vnode *uppervp)
{
int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
int nhash = UNION_HASH(uppervp, un->un_lowervp);
struct vop_lock_args lock_ap;
struct vop_unlock_args unlock_ap;
int error __diagused;
if (un->un_uppervp == uppervp)
return;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
KASSERT(un->un_uppervp == NULL);
/*
* We have to transfer the vnode lock from the union vnode to
* the upper vnode. Lock the upper vnode first. We cannot use
* VOP_LOCK() here as it would break the fstrans state.
*/
lock_ap.a_desc = VDESC(vop_lock);
lock_ap.a_vp = uppervp;
lock_ap.a_flags = LK_EXCLUSIVE;
error = VCALL(lock_ap.a_vp, VOFFSET(vop_lock), &lock_ap);
KASSERT(error == 0);
mutex_enter(&uhash_lock);
if (ohash != nhash && (un->un_cflags & UN_CACHED)) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_enter(&un->un_lock);
un->un_uppervp = uppervp;
un->un_uppersz = VNOVAL;
/*
* With the upper vnode in place unlock the union vnode to
* finalize the lock transfer.
*/
unlock_ap.a_desc = VDESC(vop_unlock);
unlock_ap.a_vp = UNIONTOV(un);
genfs_unlock(&unlock_ap);
/* Update union vnode interlock, vmobjlock, & klist. */
vshareilock(UNIONTOV(un), uppervp);
rw_obj_hold(uppervp->v_uobj.vmobjlock);
uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock);
vshareklist(UNIONTOV(un), uppervp);
mutex_exit(&un->un_lock);
if (ohash != nhash) {
LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
un->un_cflags |= UN_CACHED;
}
mutex_exit(&uhash_lock);
}
/*
* Keep track of size changes in the underlying vnodes.
* If the size changes, call back to the VM layer,
* giving priority to the upper layer size.
*
* Mutex un_lock is held on entry and released on return.
*/
void
union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz)
{
struct union_node *un = VTOUNION(vp);
off_t sz;
KASSERT(mutex_owned(&un->un_lock));
/* only interested in regular files */
if (vp->v_type != VREG) {
mutex_exit(&un->un_lock);
uvm_vnp_setsize(vp, 0);
return;
}
sz = VNOVAL;
if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
un->un_uppersz = uppersz;
if (sz == VNOVAL)
sz = un->un_uppersz;
}
if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
un->un_lowersz = lowersz;
if (sz == VNOVAL)
sz = un->un_lowersz;
}
mutex_exit(&un->un_lock);
if (sz != VNOVAL) {
#ifdef UNION_DIAGNOSTIC
printf("union: %s size now %qd\n",
uppersz != VNOVAL ? "upper" : "lower", sz);
#endif
uvm_vnp_setsize(vp, sz);
}
}
static void
union_ref(struct union_node *un)
{
KASSERT(mutex_owned(&uhash_lock));
un->un_refs++;
}
static void
union_rele(struct union_node *un)
{
mutex_enter(&uhash_lock);
un->un_refs--;
if (un->un_refs > 0) {
mutex_exit(&uhash_lock);
return;
}
if (un->un_cflags & UN_CACHED) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_exit(&uhash_lock);
if (un->un_pvp != NULLVP)
vrele(un->un_pvp);
if (un->un_uppervp != NULLVP)
vrele(un->un_uppervp);
if (un->un_lowervp != NULLVP)
vrele(un->un_lowervp);
if (un->un_dirvp != NULLVP)
vrele(un->un_dirvp);
if (un->un_path)
free(un->un_path, M_TEMP);
mutex_destroy(&un->un_lock);
free(un, M_TEMP);
}
/*
* allocate a union_node/vnode pair. the vnode is
* referenced and unlocked. the new vnode is returned
* via (vpp). (mp) is the mountpoint of the union filesystem,
* (dvp) is the parent directory where the upper layer object
* should exist (but doesn't) and (cnp) is the componentname
* information which is partially copied to allow the upper
* layer object to be created at a later time. (uppervp)
* and (lowervp) reference the upper and lower layer objects
* being mapped. either, but not both, can be nil.
* both, if supplied, are unlocked.
* the reference is either maintained in the new union_node
* object which is allocated, or they are vrele'd.
*
* all union_nodes are maintained on a hash
* list. new nodes are only allocated when they cannot
* be found on this list. entries on the list are
* removed when the vfs reclaim entry is called.
*
* the vnode gets attached or referenced with vcache_get().
*/
int
union_allocvp(
struct vnode **vpp,
struct mount *mp,
struct vnode *undvp, /* parent union vnode */
struct vnode *dvp, /* may be null */
struct componentname *cnp, /* may be null */
struct vnode *uppervp, /* may be null */
struct vnode *lowervp, /* may be null */
int docache)
{
int error;
struct union_node *un = NULL, *un1;
struct vnode *vp, *xlowervp = NULLVP;
u_long hash[3];
int try;
bool is_dotdot;
is_dotdot = (dvp != NULL && cnp != NULL && (cnp->cn_flags & ISDOTDOT));
if (uppervp == NULLVP && lowervp == NULLVP)
panic("union: unidentifiable allocation");
if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
xlowervp = lowervp;
lowervp = NULLVP;
}
/*
* If both uppervp and lowervp are not NULL we have to
* search union nodes with one vnode as NULL too.
*/
hash[0] = UNION_HASH(uppervp, lowervp);
if (uppervp == NULL || lowervp == NULL) {
hash[1] = hash[2] = NOHASH;
} else {
hash[1] = UNION_HASH(uppervp, NULLVP);
hash[2] = UNION_HASH(NULLVP, lowervp);
}
if (!docache) {
un = NULL;
goto found;
}
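/*
 * Look for an existing node on the hash chains.  We take a reference,
 * drop the hash lock and attach the vnode with vcache_get(); ENOENT
 * means the node was reclaimed underneath us, so retry from here.
 */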
loop:
mutex_enter(&uhash_lock);
for (try = 0; try < 3; try++) {
if (hash[try] == NOHASH)
continue;
LIST_FOREACH(un, &uhashtbl[hash[try]], un_cache) {
if ((un->un_lowervp && un->un_lowervp != lowervp) ||
    (un->un_uppervp && un->un_uppervp != uppervp) ||
un->un_mount != mp)
continue;
union_ref(un);
mutex_exit(&uhash_lock);
error = vcache_get(mp, &un, sizeof(un), &vp);
KASSERT(error != 0 || UNIONTOV(un) == vp);
union_rele(un);
if (error == ENOENT)
goto loop;
else if (error)
goto out;
goto found;
}
}
mutex_exit(&uhash_lock);
found:
if (un) {
if (uppervp != dvp) {
if (is_dotdot)
VOP_UNLOCK(dvp);
vn_lock(UNIONTOV(un), LK_EXCLUSIVE | LK_RETRY);
if (is_dotdot)
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
}
/*
* Save information about the upper layer.
*/
if (uppervp != un->un_uppervp) {
union_newupper(un, uppervp);
} else if (uppervp) {
vrele(uppervp);
}
/*
* Save information about the lower layer.
* This needs to keep track of pathname
* and directory information which union_vn_create
* might need.
*/
if (lowervp != un->un_lowervp) {
union_newlower(un, lowervp);
if (cnp && (lowervp != NULLVP)) {
un->un_path = malloc(cnp->cn_namelen+1,
M_TEMP, M_WAITOK);
memcpy(un->un_path, cnp->cn_nameptr,
cnp->cn_namelen);
un->un_path[cnp->cn_namelen] = '\0';
vref(dvp);
un->un_dirvp = dvp;
}
} else if (lowervp) {
vrele(lowervp);
}
*vpp = UNIONTOV(un);
if (uppervp != dvp)
VOP_UNLOCK(*vpp);
error = 0;
goto out;
}
un = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK);
mutex_init(&un->un_lock, MUTEX_DEFAULT, IPL_NONE);
un->un_refs = 1;
un->un_mount = mp;
un->un_vnode = NULL;
un->un_uppervp = uppervp;
un->un_lowervp = lowervp;
un->un_pvp = undvp;
if (undvp != NULLVP)
vref(undvp);
un->un_dircache = 0;
un->un_openl = 0;
un->un_cflags = 0;
un->un_hooknode = false;
un->un_uppersz = VNOVAL;
un->un_lowersz = VNOVAL;
if (dvp && cnp && (lowervp != NULLVP)) {
un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen);
un->un_path[cnp->cn_namelen] = '\0';
vref(dvp);
un->un_dirvp = dvp;
} else {
un->un_path = 0;
un->un_dirvp = 0;
}
if (docache) {
mutex_enter(&uhash_lock);
LIST_FOREACH(un1, &uhashtbl[hash[0]], un_cache) {
if (un1->un_lowervp == lowervp &&
    un1->un_uppervp == uppervp &&
un1->un_mount == mp) {
/*
* Another thread beat us, push back freshly
* allocated node and retry.
*/
mutex_exit(&uhash_lock);
union_rele(un);
goto loop;
}
}
LIST_INSERT_HEAD(&uhashtbl[hash[0]], un, un_cache);
un->un_cflags |= UN_CACHED;
mutex_exit(&uhash_lock);
}
error = vcache_get(mp, &un, sizeof(un), vpp);
KASSERT(error != 0 || UNIONTOV(un) == *vpp);
union_rele(un);
if (error == ENOENT)
goto loop;
out:
if (xlowervp)
vrele(xlowervp);
return error;
}
int
union_freevp(struct vnode *vp)
{
struct union_node *un = VTOUNION(vp);
/* Detach vnode from union node. */
un->un_vnode = NULL;
un->un_uppersz = VNOVAL;
un->un_lowersz = VNOVAL;
/* Detach union node from vnode. */
mutex_enter(vp->v_interlock);
vp->v_data = NULL;
mutex_exit(vp->v_interlock);
union_rele(un);
return 0;
}
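/*
 * union_loadvnode: initialize a union vnode from the union_node passed
 * as the vcache key: share the interlock, vmobjlock and klist with the
 * preferred (upper, else lower) layer vnode, detect the root vnode and
 * record the layer sizes.
 */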
int
union_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct vattr va;
struct vnode *svp;
struct union_node *un;
struct union_mount *um;
voff_t uppersz, lowersz;
KASSERT(key_len == sizeof(un));
memcpy(&un, key, key_len);
um = MOUNTTOUNIONMOUNT(mp);
svp = (un->un_uppervp != NULLVP) ? un->un_uppervp : un->un_lowervp;
vp->v_tag = VT_UNION;
vp->v_op = union_vnodeop_p;
vp->v_data = un;
un->un_vnode = vp;
vp->v_type = svp->v_type;
if (svp->v_type == VCHR || svp->v_type == VBLK)
spec_node_init(vp, svp->v_rdev);
vshareilock(vp, svp);
rw_obj_hold(svp->v_uobj.vmobjlock);
uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock);
vshareklist(vp, svp);
/* detect the root vnode (and aliases) */
if ((un->un_uppervp == um->um_uppervp) &&
((un->un_lowervp == NULLVP) || un->un_lowervp == um->um_lowervp)) {
if (un->un_lowervp == NULLVP) {
un->un_lowervp = um->um_lowervp;
if (un->un_lowervp != NULLVP)
vref(un->un_lowervp);
}
vp->v_vflag |= VV_ROOT;
}
uppersz = lowersz = VNOVAL;
if (un->un_uppervp != NULLVP) {
if (vn_lock(un->un_uppervp, LK_SHARED) == 0) {
if (VOP_GETATTR(un->un_uppervp, &va, FSCRED) == 0)
uppersz = va.va_size;
VOP_UNLOCK(un->un_uppervp);
}
}
if (un->un_lowervp != NULLVP) {
if (vn_lock(un->un_lowervp, LK_SHARED) == 0) {
if (VOP_GETATTR(un->un_lowervp, &va, FSCRED) == 0)
lowersz = va.va_size;
VOP_UNLOCK(un->un_lowervp);
}
}
mutex_enter(&un->un_lock);
union_newsize(vp, uppersz, lowersz);
mutex_enter(&uhash_lock);
union_ref(un);
mutex_exit(&uhash_lock);
*new_key = &vp->v_data;
return 0;
}
/*
* copyfile. copy the vnode (fvp) to the vnode (tvp)
* using a sequence of reads and writes. both (fvp)
* and (tvp) are locked on entry and exit.
*/
int
union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred,
struct lwp *l)
{
char *tbuf;
struct uio uio;
struct iovec iov;
int error = 0;
/*
* strategy:
* allocate a buffer of size MAXBSIZE.
* loop doing reads and writes, keeping track
* of the current uio offset.
* give up at the first sign of trouble.
*/
uio.uio_offset = 0;
UIO_SETUP_SYSSPACE(&uio);
tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);
/* ugly loop follows... */
do {
off_t offset = uio.uio_offset;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
iov.iov_base = tbuf;
iov.iov_len = MAXBSIZE;
uio.uio_resid = iov.iov_len;
uio.uio_rw = UIO_READ;
error = VOP_READ(fvp, &uio, 0, cred);
if (error == 0) {
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
iov.iov_base = tbuf;
iov.iov_len = MAXBSIZE - uio.uio_resid;
uio.uio_offset = offset;
uio.uio_rw = UIO_WRITE;
uio.uio_resid = iov.iov_len;
if (uio.uio_resid == 0)
break;
do {
error = VOP_WRITE(tvp, &uio, 0, cred);
} while ((uio.uio_resid > 0) && (error == 0));
}
} while (error == 0);
free(tbuf, M_TEMP);
return (error);
}
/*
* (un) is assumed to be locked on entry and remains
* locked on exit.
*/
int
union_copyup(struct union_node *un, int docopy, kauth_cred_t cred,
struct lwp *l)
{
int error;
struct vnode *lvp, *uvp;
struct vattr lvattr, uvattr;
error = union_vn_create(&uvp, un, l);
if (error)
return (error);
union_newupper(un, uvp);
lvp = un->un_lowervp;
if (docopy) {
/*
* XXX - should not ignore errors
* from VOP_CLOSE
*/
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_GETATTR(lvp, &lvattr, cred);
if (error == 0)
error = VOP_OPEN(lvp, FREAD, cred);
if (error == 0) {
error = union_copyfile(lvp, uvp, cred, l);
(void) VOP_CLOSE(lvp, FREAD, cred);
}
if (error == 0) {
/* Copy permissions up too */
vattr_null(&uvattr);
uvattr.va_mode = lvattr.va_mode;
uvattr.va_flags = lvattr.va_flags;
error = VOP_SETATTR(uvp, &uvattr, cred);
}
VOP_UNLOCK(lvp);
#ifdef UNION_DIAGNOSTIC
if (error == 0)
uprintf("union: copied up %s\n", un->un_path);
#endif
}
union_vn_close(uvp, FWRITE, cred, l);
/*
* Subsequent IOs will go to the top layer, so
* call close on the lower vnode and open on the
* upper vnode to ensure that the filesystem keeps
* its reference counts right. This doesn't do
* the right thing with (cred) and (FREAD) though.
* Ignoring error returns is not right, either.
*/
if (error == 0) {
int i;
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
for (i = 0; i < un->un_openl; i++) {
(void) VOP_CLOSE(lvp, FREAD, cred);
(void) VOP_OPEN(uvp, FREAD, cred);
}
un->un_openl = 0;
VOP_UNLOCK(lvp);
}
return (error);
}
/*
* Prepare the creation of a new node in the upper layer.
*
* (dvp) is the directory in which to create the new node.
* it is locked on entry and exit.
* (cnp) is the componentname to be created.
* (cred, path) are the credentials and path used to fill (cnp).
*/
static int
union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred,
const char *path)
{
int error;
struct vnode *vp;
cnp->cn_nameiop = CREATE;
cnp->cn_flags = LOCKPARENT | ISLASTCN;
cnp->cn_cred = cred;
cnp->cn_nameptr = path;
cnp->cn_namelen = strlen(path);
error = VOP_LOOKUP(dvp, &vp, cnp);
if (error == 0) {
KASSERT(vp != NULL);
VOP_ABORTOP(dvp, cnp);
vrele(vp);
error = EEXIST;
} else if (error == EJUSTRETURN) {
error = 0;
}
return error;
}
/*
* Create a shadow directory in the upper layer.
* The new vnode is returned locked.
*
* (um) points to the union mount structure for access to the
* mounting process's credentials.
* (dvp) is the directory in which to create the shadow directory.
* it is unlocked on entry and exit.
* (cnp) is the componentname to be created.
* (vpp) is the returned newly created shadow directory, which
* is returned locked.
*
* N.B. We still attempt to create shadow directories even if the union
* is mounted read-only, which is a little nonintuitive.
*/
int
union_mkshadow(struct union_mount *um, struct vnode *dvp,
struct componentname *cnp, struct vnode **vpp)
{
int error;
struct vattr va;
struct componentname cn;
char *pnbuf;
if (cnp->cn_namelen + 1 > MAXPATHLEN)
return ENAMETOOLONG;
pnbuf = PNBUF_GET();
memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen);
pnbuf[cnp->cn_namelen] = '\0';
vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
error = union_do_lookup(dvp, &cn,
(um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf);
if (error) {
VOP_UNLOCK(dvp);
PNBUF_PUT(pnbuf);
return error;
}
/*
* policy: when creating the shadow directory in the
* upper layer, create it owned by the user who did
* the mount, group from parent directory, and mode
* 777 modified by umask (ie mostly identical to the
* mkdir syscall). (jsp, kb)
*/
vattr_null(&va);
va.va_type = VDIR;
va.va_mode = um->um_cmode;
KASSERT(*vpp == NULL);
error = VOP_MKDIR(dvp, vpp, &cn, &va);
VOP_UNLOCK(dvp);
PNBUF_PUT(pnbuf);
return error;
}
/*
* Create a whiteout entry in the upper layer.
*
* (um) points to the union mount structure for access to the
* mounting process's credentials.
* (dvp) is the directory in which to create the whiteout.
* it is locked on entry and exit.
* (cnp) is the componentname to be created.
* (un) holds the path to be created.
*/
int
union_mkwhiteout(struct union_mount *um, struct vnode *dvp,
struct componentname *cnp, struct union_node *un)
{
int error;
struct componentname cn;
error = union_do_lookup(dvp, &cn,
(um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred),
un->un_path);
if (error)
return error;
error = VOP_WHITEOUT(dvp, &cn, CREATE);
return error;
}
/*
* union_vn_create: creates and opens a new shadow file
* on the upper union layer. this function is similar
* in spirit to calling vn_open but it avoids calling namei().
* the problem with calling namei is that a) it locks too many
* things, and b) it doesn't start at the "right" directory,
* whereas union_do_lookup is told where to start.
*/
int
union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l)
{
struct vnode *vp;
kauth_cred_t cred = l->l_cred;
struct vattr vat;
struct vattr *vap = &vat;
int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
int error;
int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask;
struct componentname cn;
*vpp = NULLVP;
vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY);
error = union_do_lookup(un->un_dirvp, &cn, l->l_cred,
un->un_path);
if (error) {
VOP_UNLOCK(un->un_dirvp);
return error;
}
/*
* Good - there was no race to create the file
* so go ahead and create it. The permissions
* on the file will be 0666 modified by the
* current user's umask. Access to the file, while
* it is unioned, will require access to the top *and*
* bottom files. Access when not unioned will simply
* require access to the top-level file.
* TODO: confirm choice of access permissions.
*/
vattr_null(vap);
vap->va_type = VREG;
vap->va_mode = cmode;
vp = NULL;
error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
if (error) {
VOP_UNLOCK(un->un_dirvp);
return error;
}
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VOP_UNLOCK(un->un_dirvp);
error = VOP_OPEN(vp, fmode, cred);
if (error) {
vput(vp);
return error;
}
vp->v_writecount++;
VOP_UNLOCK(vp);
*vpp = vp;
return 0;
}
int
union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l)
{
if (fmode & FWRITE)
--vp->v_writecount;
return (VOP_CLOSE(vp, fmode, cred));
}
void
union_removed_upper(struct union_node *un)
{
struct vnode *vp = UNIONTOV(un);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#if 1
/*
* We do not set the uppervp to NULLVP here, because lowervp
* may also be NULLVP, so this routine would end up creating
* a bogus union node with no upper or lower VP (that causes
* pain in many places that assume at least one VP exists).
* Since we've removed this node from the cache hash chains,
* it won't be found again. When all current holders
* release it, union_inactive() will vgone() it.
*/
union_diruncache(un);
#else
union_newupper(un, NULLVP);
#endif
VOP_UNLOCK(vp);
mutex_enter(&uhash_lock);
if (un->un_cflags & UN_CACHED) {
un->un_cflags &= ~UN_CACHED;
LIST_REMOVE(un, un_cache);
}
mutex_exit(&uhash_lock);
}
#if 0
struct vnode *
union_lowervp(struct vnode *vp)
{
struct union_node *un = VTOUNION(vp);
if ((un->un_lowervp != NULLVP) &&
(vp->v_type == un->un_lowervp->v_type)) {
if (vget(un->un_lowervp, 0, true /* wait */) == 0)
return (un->un_lowervp);
}
return (NULLVP);
}
#endif
/*
* determine whether a whiteout is needed
* during a remove/rmdir operation.
*/
int
union_dowhiteout(struct union_node *un, kauth_cred_t cred)
{
struct vattr va;
if (un->un_lowervp != NULLVP)
return (1);
if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 &&
(va.va_flags & OPAQUE))
return (1);
return (0);
}
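/*
 * union_dircache_r: recursively collect the upper and lower vnodes that
 * make up a union directory.  With vppp == NULL this only counts them;
 * otherwise referenced vnodes are stored into the caller's array.
 */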
static void
union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp)
{
struct union_node *un;
if (vp->v_op != union_vnodeop_p) {
if (vppp) {
vref(vp);
*(*vppp)++ = vp;
if (--(*cntp) == 0)
panic("union: dircache table too small");
} else {
(*cntp)++;
}
return;
}
un = VTOUNION(vp);
if (un->un_uppervp != NULLVP)
union_dircache_r(un->un_uppervp, vppp, cntp);
if (un->un_lowervp != NULLVP)
union_dircache_r(un->un_lowervp, vppp, cntp);
}
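/*
 * union_dircache: build (or step through) a NULL-terminated array of
 * the constituent directory vnodes of a union directory and return a
 * hook node for the next lower directory; used by the readdir hook
 * below.
 */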
struct vnode *
union_dircache(struct vnode *vp, struct lwp *l)
{
int cnt;
struct vnode *nvp = NULLVP;
struct vnode **vpp;
struct vnode **dircache;
int error;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
dircache = VTOUNION(vp)->un_dircache;
nvp = NULLVP;
if (dircache == 0) {
cnt = 0;
union_dircache_r(vp, 0, &cnt);
cnt++;
dircache = (struct vnode **)
malloc(cnt * sizeof(struct vnode *),
M_TEMP, M_WAITOK);
vpp = dircache;
union_dircache_r(vp, &vpp, &cnt);
VTOUNION(vp)->un_dircache = dircache;
*vpp = NULLVP;
vpp = dircache + 1;
} else {
vpp = dircache;
do {
if (*vpp++ == VTOUNION(vp)->un_lowervp)
break;
} while (*vpp != NULLVP);
}
if (*vpp == NULLVP)
goto out;
vref(*vpp);
error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0,
NULLVP, *vpp, 0);
if (!error) {
vn_lock(nvp, LK_EXCLUSIVE | LK_RETRY);
VTOUNION(vp)->un_dircache = 0;
VTOUNION(nvp)->un_hooknode = true;
VTOUNION(nvp)->un_dircache = dircache;
}
out:
VOP_UNLOCK(vp);
return (nvp);
}
void
union_diruncache(struct union_node *un)
{
struct vnode **vpp;
KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
if (un->un_dircache != 0) {
for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
vrele(*vpp);
free(un->un_dircache, M_TEMP);
un->un_dircache = 0;
}
}
/*
* Check whether node can rmdir (check empty).
*/
int
union_check_rmdir(struct union_node *un, kauth_cred_t cred)
{
int dirlen, eofflag, error;
char *dirbuf;
struct vattr va;
struct vnode *tvp;
struct dirent *dp, *edp;
struct componentname cn;
struct iovec aiov;
struct uio auio;
KASSERT(un->un_uppervp != NULL);
/* Check upper for being opaque. */
KASSERT(VOP_ISLOCKED(un->un_uppervp));
error = VOP_GETATTR(un->un_uppervp, &va, cred);
if (error || (va.va_flags & OPAQUE))
return error;
if (un->un_lowervp == NULL)
return 0;
/* Check lower for being empty. */
vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(un->un_lowervp, &va, cred);
if (error) {
VOP_UNLOCK(un->un_lowervp);
return error;
}
dirlen = va.va_blocksize;
dirbuf = kmem_alloc(dirlen, KM_SLEEP);
/* error = 0; */
eofflag = 0;
auio.uio_offset = 0;
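/*
 * Read the lower directory a block at a time.  Any entry that is not
 * a whiteout, "." or "..", and is not covered by a whiteout in the
 * upper layer, makes the directory non-empty.
 */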
do {
aiov.iov_len = dirlen;
aiov.iov_base = dirbuf;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_resid = aiov.iov_len;
auio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&auio);
error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag,
NULL, NULL);
if (error)
break;
edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid];
for (dp = (struct dirent *)dirbuf;
error == 0 && dp < edp;
dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
if (dp->d_reclen == 0) {
error = ENOTEMPTY;
break;
}
if (dp->d_type == DT_WHT ||
    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
(dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2)))
continue;
/* Check for presence in the upper layer. */
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN | RDONLY;
cn.cn_cred = cred;
cn.cn_nameptr = dp->d_name;
cn.cn_namelen = dp->d_namlen;
error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn);
if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) {
error = 0;
continue;
}
if (error == 0)
vrele(tvp);
error = ENOTEMPTY;
}
} while (error == 0 && !eofflag);
kmem_free(dirbuf, dirlen);
VOP_UNLOCK(un->un_lowervp);
return error;
}
/*
* This hook is called from vn_readdir() to switch to lower directory
* entry after the upper directory is read.
*/
int
union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l)
{
struct vnode *vp = *vpp, *lvp;
struct vattr va;
int error;
if (vp->v_op != union_vnodeop_p)
return (0);
/*
* If the directory is opaque,
* then don't show lower entries
*/
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, fp->f_cred);
VOP_UNLOCK(vp);
if (error || (va.va_flags & OPAQUE))
return error;
if ((lvp = union_dircache(vp, l)) == NULLVP)
return (0);
error = VOP_OPEN(lvp, FREAD, fp->f_cred);
if (error) {
vput(lvp);
return (error);
}
VOP_UNLOCK(lvp);
fp->f_vnode = lvp;
fp->f_offset = 0;
error = vn_close(vp, FREAD, fp->f_cred);
if (error)
return (error);
*vpp = lvp;
return (0);
}
/* $NetBSD: puffs_vfsops.c,v 1.126 2021/04/01 19:00:33 christos Exp $ */
/*
* Copyright (c) 2005, 2006 Antti Kantee. All Rights Reserved.
*
* Development of this software was supported by the
* Google Summer of Code program and the Ulla Tuominen Foundation.
* The Google SoC project was mentored by Bill Studenmund.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: puffs_vfsops.c,v 1.126 2021/04/01 19:00:33 christos Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/extattr.h>
#include <sys/queue.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/kauth.h>
#include <sys/proc.h>
#include <sys/module.h>
#include <sys/kthread.h>
#include <uvm/uvm.h>
#include <dev/putter/putter_sys.h>
#include <miscfs/genfs/genfs.h>
#include <fs/puffs/puffs_msgif.h>
#include <fs/puffs/puffs_sys.h>
#include <lib/libkern/libkern.h>
#include <nfs/nfsproto.h> /* for fh sizes */
MODULE(MODULE_CLASS_VFS, puffs, "putter");
VFS_PROTOS(puffs_vfsop);
static struct putter_ops puffs_putter = {
.pop_getout = puffs_msgif_getout,
.pop_releaseout = puffs_msgif_releaseout,
.pop_waitcount = puffs_msgif_waitcount,
.pop_dispatch = puffs_msgif_dispatch,
.pop_close = puffs_msgif_close,
};
static const struct genfs_ops puffs_genfsops = {
.gop_size = puffs_gop_size,
.gop_write = genfs_gop_write,
.gop_markupdate = puffs_gop_markupdate,
#if 0
.gop_alloc, should ask userspace
#endif
.gop_putrange = genfs_gop_putrange,
};
/*
* Try to ensure data structures used by the puffs protocol
* do not unexpectedly change.
*/
#if defined(__i386__) && defined(__ELF__)
CTASSERT(sizeof(struct puffs_kargs) == 3928);
CTASSERT(sizeof(struct vattr) == 136);
CTASSERT(sizeof(struct puffs_req) == 44);
#endif
int
puffs_vfsop_mount(struct mount *mp, const char *path, void *data,
size_t *data_len)
{
struct puffs_mount *pmp = NULL;
struct puffs_kargs *args;
char fstype[_VFS_NAMELEN];
char *p;
int error = 0, i;
pid_t mntpid = curlwp->l_proc->p_pid;
if (data == NULL)
return EINVAL;
if (*data_len < sizeof *args)
return EINVAL;
if (mp->mnt_flag & MNT_GETARGS) {
pmp = MPTOPUFFSMP(mp);
*(struct puffs_kargs *)data = pmp->pmp_args;
*data_len = sizeof *args;
return 0;
}
/* update is not supported currently */
if (mp->mnt_flag & MNT_UPDATE)
return EOPNOTSUPP;
args = (struct puffs_kargs *)data;
if (args->pa_vers != PUFFSVERSION) {
printf("puffs_mount: development version mismatch: "
"kernel %d, lib %d\n", PUFFSVERSION, args->pa_vers);
error = EINVAL;
goto out;
}
if ((args->pa_flags & ~PUFFS_KFLAG_MASK) != 0) {
printf("puffs_mount: invalid KFLAGs 0x%x\n", args->pa_flags);
error = EINVAL;
goto out;
}
if ((args->pa_fhflags & ~PUFFS_FHFLAG_MASK) != 0) {
printf("puffs_mount: invalid FHFLAGs 0x%x\n", args->pa_fhflags);
error = EINVAL;
goto out;
}
for (i = 0; i < __arraycount(args->pa_spare); i++) {
if (args->pa_spare[i] != 0) {
printf("puffs_mount: pa_spare[%d] = 0x%x\n",
i, args->pa_spare[i]);
error = EINVAL;
goto out;
}
}
/* use dummy value for passthrough */
if (args->pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH)
args->pa_fhsize = sizeof(struct fid);
/* sanitize file handle length */
if (PUFFS_TOFHSIZE(args->pa_fhsize) > FHANDLE_SIZE_MAX) {
printf("puffs_mount: handle size %zu too large\n",
args->pa_fhsize);
error = EINVAL;
goto out;
}
/* sanity check file handle max sizes */
if (args->pa_fhsize && args->pa_fhflags & PUFFS_FHFLAG_PROTOMASK) {
size_t kfhsize = PUFFS_TOFHSIZE(args->pa_fhsize);
if (args->pa_fhflags & PUFFS_FHFLAG_NFSV2) {
if (NFSX_FHTOOBIG_P(kfhsize, 0)) {
printf("puffs_mount: fhsize larger than "
"NFSv2 max %d\n",
PUFFS_FROMFHSIZE(NFSX_V2FH));
error = EINVAL;
goto out;
}
}
if (args->pa_fhflags & PUFFS_FHFLAG_NFSV3) {
if (NFSX_FHTOOBIG_P(kfhsize, 1)) {
printf("puffs_mount: fhsize larger than "
"NFSv3 max %d\n",
PUFFS_FROMFHSIZE(NFSX_V3FHMAX));
error = EINVAL;
goto out;
}
}
}
/* don't allow non-printing characters (like my sweet umlauts.. snif) */
args->pa_typename[sizeof(args->pa_typename)-1] = '\0';
for (p = args->pa_typename; *p; p++)
if (*p < ' ' || *p > '~')
*p = '.';
args->pa_mntfromname[sizeof(args->pa_mntfromname)-1] = '\0';
for (p = args->pa_mntfromname; *p; p++)
if (*p < ' ' || *p > '~')
*p = '.';
/* build real name */
(void)strlcpy(fstype, PUFFS_TYPEPREFIX, sizeof(fstype));
(void)strlcat(fstype, args->pa_typename, sizeof(fstype));
/* clamp the requested max message size and inform the user server of the value it actually got */
if (args->pa_maxmsglen == 0 || args->pa_maxmsglen > PUFFS_MSG_MAXSIZE)
args->pa_maxmsglen = PUFFS_MSG_MAXSIZE;
else if (args->pa_maxmsglen < 2*PUFFS_MSGSTRUCT_MAX)
args->pa_maxmsglen = 2*PUFFS_MSGSTRUCT_MAX;
(void)strlcpy(args->pa_typename, fstype, sizeof(args->pa_typename));
error = set_statvfs_info(path, UIO_USERSPACE, args->pa_mntfromname,
UIO_SYSSPACE, fstype, mp, curlwp);
if (error)
goto out;
mp->mnt_stat.f_iosize = DEV_BSIZE;
mp->mnt_stat.f_namemax = args->pa_svfsb.f_namemax;
/*
* We can't handle the VFS_STATVFS() mount_domount() does
* after VFS_MOUNT() because we'd deadlock, so handle it
* here already.
*/
struct statvfs *sb = STATVFSBUF_GET();
puffs_statvfs_to_statvfs(&args->pa_svfsb, sb);
copy_statvfs_info(sb, mp);
STATVFSBUF_PUT(sb);
statvfs_to_puffs_statvfs(&mp->mnt_stat, &args->pa_svfsb);
KASSERT(curlwp != uvm.pagedaemon_lwp);
pmp = kmem_zalloc(sizeof(struct puffs_mount), KM_SLEEP);
mp->mnt_fs_bshift = DEV_BSHIFT;
mp->mnt_dev_bshift = DEV_BSHIFT;
mp->mnt_flag &= ~MNT_LOCAL; /* we don't really know, so ... */
mp->mnt_data = pmp;
#if 0
/*
* XXX: puffs code is MPSAFE. However, VFS really isn't.
* Currently, there is nothing which protects an inode from
* reclaim while there are threads inside the file system.
* This means that in the event of a server crash, an MPSAFE
* mount is likely to end up accessing invalid memory. For the
* non-mpsafe case, the kernel lock, general structure of
* puffs and pmp_refcount protect the threads during escape.
*
* Fixing this will require:
* a) fixing vfs
* OR
* b) adding a small sleep to puffs_msgif_close() between
* userdead() and dounmount().
* (well, this isn't really a fix, but would solve
* 99.999% of the race conditions).
*
* Also, in the event of "b", unmount -f should be used,
* like with any other file system, sparingly and only when
* it is "known" to be safe.
*/
mp->mnt_iflags |= IMNT_MPSAFE;
#endif
pmp->pmp_status = PUFFSTAT_MOUNTING;
pmp->pmp_mp = mp;
pmp->pmp_msg_maxsize = args->pa_maxmsglen;
pmp->pmp_args = *args;
/*
* Inform the fileops processing code that we have a mountpoint.
* If it doesn't know about anyone with our pid/fd having the
* device open, punt
*/
if ((pmp->pmp_pi
= putter_attach(mntpid, args->pa_fd, pmp, &puffs_putter)) == NULL) {
error = ENOENT;
goto out;
}
/* XXX: check parameters */
pmp->pmp_root_cookie = args->pa_root_cookie;
switch (args->pa_root_vtype) {
case VNON: case VREG: case VDIR: case VBLK:
case VCHR: case VLNK: case VSOCK: case VFIFO:
break;
default:
error = EINVAL;
goto out;
}
pmp->pmp_root_vtype = args->pa_root_vtype;
if (args->pa_root_vsize < 0) {
error = EINVAL;
goto out;
}
pmp->pmp_root_vsize = args->pa_root_vsize;
pmp->pmp_root_rdev = args->pa_root_rdev;
pmp->pmp_docompat = args->pa_time32;
mutex_init(&pmp->pmp_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&pmp->pmp_sopmtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&pmp->pmp_msg_waiter_cv, "puffsget");
cv_init(&pmp->pmp_refcount_cv, "puffsref");
cv_init(&pmp->pmp_unmounting_cv, "puffsum");
cv_init(&pmp->pmp_sopcv, "puffsop");
TAILQ_INIT(&pmp->pmp_msg_touser);
TAILQ_INIT(&pmp->pmp_msg_replywait);
TAILQ_INIT(&pmp->pmp_sopfastreqs);
TAILQ_INIT(&pmp->pmp_sopnodereqs);
if ((error = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL,
puffs_sop_thread, pmp, NULL, "puffsop")) != 0)
goto out;
pmp->pmp_sopthrcount = 1;
DPRINTF(("puffs_mount: mount point at %p, puffs specific at %p\n",
mp, MPTOPUFFSMP(mp)));
vfs_getnewfsid(mp);
out:
if (error && pmp && pmp->pmp_pi)
putter_detach(pmp->pmp_pi);
if (error && pmp)
kmem_free(pmp, sizeof(struct puffs_mount));
return error;
}
int
puffs_vfsop_start(struct mount *mp, int flags)
{
struct puffs_mount *pmp = MPTOPUFFSMP(mp);
KASSERT(pmp->pmp_status == PUFFSTAT_MOUNTING);
pmp->pmp_status = PUFFSTAT_RUNNING;
return 0;
}
int
puffs_vfsop_unmount(struct mount *mp, int mntflags)
{
PUFFS_MSG_VARS(vfs, unmount);
struct puffs_mount *pmp;
int error, force;
error = 0;
force = mntflags & MNT_FORCE;
pmp = MPTOPUFFSMP(mp);
DPRINTF(("puffs_unmount: detach filesystem from vfs, current "
"status 0x%x\n", pmp->pmp_status));
/*
* flush all the vnodes. VOP_RECLAIM() takes care that the
* root vnode does not get flushed until unmount. The
* userspace root node cookie is stored in the mount
* structure, so we can always re-instantiate a root vnode,
* should userspace unmount decide it doesn't want to
* cooperate.
*/
error = vflush(mp, NULLVP, force ? FORCECLOSE : 0);
if (error)
goto out;
/*
* If we are not DYING, we should ask userspace's opinion
* about the situation
*/
mutex_enter(&pmp->pmp_lock);
if (pmp->pmp_status != PUFFSTAT_DYING) {
pmp->pmp_unmounting = 1;
mutex_exit(&pmp->pmp_lock);
PUFFS_MSG_ALLOC(vfs, unmount);
puffs_msg_setinfo(park_unmount,
PUFFSOP_VFS, PUFFS_VFS_UNMOUNT, NULL);
unmount_msg->pvfsr_flags = mntflags;
PUFFS_MSG_ENQUEUEWAIT(pmp, park_unmount, error);
PUFFS_MSG_RELEASE(unmount);
error = checkerr(pmp, error, __func__);
DPRINTF(("puffs_unmount: error %d force %d\n", error, force));
mutex_enter(&pmp->pmp_lock);
pmp->pmp_unmounting = 0;
cv_broadcast(&pmp->pmp_unmounting_cv);
}
/*
* if userspace cooperated or we really need to die,
* screw what userland thinks and just die.
*/
if (error == 0 || force) {
struct puffs_sopreq *psopr;
/* tell waiters & other resources to go unwait themselves */
puffs_userdead(pmp);
putter_detach(pmp->pmp_pi);
/*
* Wait until there are no more users for the mount resource.
* Notice that this is hooked against transport_close
* and return from touser. In an ideal world, it would
* be hooked against final return from all operations.
* But currently it works well enough, since nobody
* does weird blocking voodoo after return from touser().
*/
while (pmp->pmp_refcount != 0)
cv_wait(&pmp->pmp_refcount_cv, &pmp->pmp_lock);
mutex_exit(&pmp->pmp_lock);
/*
* Release kernel thread now that there is nothing
* it would be wanting to lock.
*/
KASSERT(curlwp != uvm.pagedaemon_lwp);
psopr = kmem_alloc(sizeof(*psopr), KM_SLEEP);
psopr->psopr_sopreq = PUFFS_SOPREQSYS_EXIT;
mutex_enter(&pmp->pmp_sopmtx);
if (pmp->pmp_sopthrcount == 0) {
mutex_exit(&pmp->pmp_sopmtx);
kmem_free(psopr, sizeof(*psopr));
mutex_enter(&pmp->pmp_sopmtx);
KASSERT(pmp->pmp_sopthrcount == 0);
} else {
TAILQ_INSERT_TAIL(&pmp->pmp_sopfastreqs,
psopr, psopr_entries);
cv_signal(&pmp->pmp_sopcv);
}
while (pmp->pmp_sopthrcount > 0)
cv_wait(&pmp->pmp_sopcv, &pmp->pmp_sopmtx);
mutex_exit(&pmp->pmp_sopmtx);
/* free resources now that we hopefully have no waiters left */
cv_destroy(&pmp->pmp_unmounting_cv);
cv_destroy(&pmp->pmp_refcount_cv);
cv_destroy(&pmp->pmp_msg_waiter_cv);
cv_destroy(&pmp->pmp_sopcv);
mutex_destroy(&pmp->pmp_lock);
mutex_destroy(&pmp->pmp_sopmtx);
kmem_free(pmp, sizeof(struct puffs_mount));
error = 0;
} else {
mutex_exit(&pmp->pmp_lock);
}
out:
DPRINTF(("puffs_unmount: return %d\n", error));
return error;
}
/*
* This doesn't need to travel to userspace
*/
int
puffs_vfsop_root(struct mount *mp, int lktype, struct vnode **vpp)
{
struct puffs_mount *pmp = MPTOPUFFSMP(mp);
int rv;
rv = puffs_cookie2vnode(pmp, pmp->pmp_root_cookie, vpp);
KASSERT(rv != PUFFS_NOSUCHCOOKIE);
if (rv != 0)
return rv;
rv = vn_lock(*vpp, lktype);
if (rv != 0) {
vrele(*vpp);
*vpp = NULL;
return rv;
}
return 0;
}
int
puffs_vfsop_statvfs(struct mount *mp, struct statvfs *sbp)
{
PUFFS_MSG_VARS(vfs, statvfs);
struct puffs_mount *pmp;
int error = 0;
pmp = MPTOPUFFSMP(mp);
/*
* If we are mounting, it means that the userspace counterpart
* is calling mount(2), but mount(2) also calls statvfs. So
* requesting statvfs from userspace would mean a deadlock.
* Compensate.
*/
if (__predict_false(pmp->pmp_status == PUFFSTAT_MOUNTING))
return EINPROGRESS;
PUFFS_MSG_ALLOC(vfs, statvfs);
puffs_msg_setinfo(park_statvfs, PUFFSOP_VFS, PUFFS_VFS_STATVFS, NULL);
PUFFS_MSG_ENQUEUEWAIT(pmp, park_statvfs, error);
error = checkerr(pmp, error, __func__);
statvfs_msg->pvfsr_sb.f_iosize = DEV_BSIZE;
/*
* Try to produce a sensible result even in the event
* of userspace error.
*
* XXX: cache the copy in non-error case
*/
if (!error) {
puffs_statvfs_to_statvfs(&statvfs_msg->pvfsr_sb, sbp);
}
copy_statvfs_info(sbp, mp);
if (!error) {
statvfs_to_puffs_statvfs(sbp, &statvfs_msg->pvfsr_sb);
}
PUFFS_MSG_RELEASE(statvfs);
return error;
}
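/*
 * Vnode iterator selector for pageflush(): pick regular files that
 * have dirty buffers or are on the syncer worklist.
 */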
static bool
pageflush_selector(void *cl, struct vnode *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
return vp->v_type == VREG &&
!(LIST_EMPTY(&vp->v_dirtyblkhd) &&
(vp->v_iflag & VI_ONWORKLST) == 0);
}
static int
pageflush(struct mount *mp, kauth_cred_t cred, int waitfor)
{
struct puffs_node *pn;
struct vnode *vp;
struct vnode_iterator *marker;
int error, rv, fsyncwait;
error = 0;
fsyncwait = (waitfor == MNT_WAIT) ? FSYNC_WAIT : 0;
/*
* Sync all cached data from regular vnodes (which are not
* currently locked, see below). After this we call VFS_SYNC
* for the fs server, which should handle data and metadata for
* all the nodes it knows to exist.
*/
vfs_vnode_iterator_init(mp, &marker);
while ((vp = vfs_vnode_iterator_next(marker, pageflush_selector,
NULL)))
{
/*
* Here we try to get a reference to the vnode and to
* lock it. This is mostly cargo-culted, but I will
* offer an explanation of why I believe this might
* actually do the right thing.
*
* If the vnode is a goner, we quite obviously don't need
* to sync it.
*
* If the vnode was busy, we don't need to sync it because
* this is never called with MNT_WAIT except from
* dounmount(), when we are wait-flushing all the dirty
* vnodes through other routes in any case. So there,
* sync() doesn't actually sync. Happy now?
*/
error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
if (error) {
vrele(vp);
continue;
}
pn = VPTOPP(vp);
/* hmm.. is the FAF thing entirely sensible? */
if (waitfor == MNT_LAZY) {
mutex_enter(vp->v_interlock);
pn->pn_stat |= PNODE_FAF;
mutex_exit(vp->v_interlock);
}
rv = VOP_FSYNC(vp, cred, fsyncwait, 0, 0);
if (waitfor == MNT_LAZY) {
mutex_enter(vp->v_interlock);
pn->pn_stat &= ~PNODE_FAF;
mutex_exit(vp->v_interlock);
}
if (rv)
error = rv;
vput(vp);
}
vfs_vnode_iterator_destroy(marker);
return error;
}
int
puffs_vfsop_sync(struct mount *mp, int waitfor, struct kauth_cred *cred)
{
PUFFS_MSG_VARS(vfs, sync);
struct puffs_mount *pmp = MPTOPUFFSMP(mp);
int error, rv;
error = pageflush(mp, cred, waitfor);
/* sync fs */
PUFFS_MSG_ALLOC(vfs, sync);
sync_msg->pvfsr_waitfor = waitfor;
puffs_credcvt(&sync_msg->pvfsr_cred, cred);
puffs_msg_setinfo(park_sync, PUFFSOP_VFS, PUFFS_VFS_SYNC, NULL);
PUFFS_MSG_ENQUEUEWAIT(pmp, park_sync, rv);
rv = checkerr(pmp, rv, __func__);
if (rv)
error = rv;
PUFFS_MSG_RELEASE(sync);
return error;
}
int
puffs_vfsop_fhtovp(struct mount *mp, struct fid *fhp, int lktype,
struct vnode **vpp)
{
PUFFS_MSG_VARS(vfs, fhtonode);
struct puffs_mount *pmp = MPTOPUFFSMP(mp);
struct vnode *vp;
void *fhdata;
size_t argsize, fhlen;
int error;
if (pmp->pmp_args.pa_fhsize == 0)
return EOPNOTSUPP;
if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) {
fhlen = fhp->fid_len;
fhdata = fhp;
} else {
fhlen = PUFFS_FROMFHSIZE(fhp->fid_len);
fhdata = fhp->fid_data;
if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_DYNAMIC) {
if (pmp->pmp_args.pa_fhsize < fhlen)
return EINVAL;
} else {
if (pmp->pmp_args.pa_fhsize != fhlen)
return EINVAL;
}
}
argsize = sizeof(struct puffs_vfsmsg_fhtonode) + fhlen;
puffs_msgmem_alloc(argsize, &park_fhtonode, (void *)&fhtonode_msg, 1);
fhtonode_msg->pvfsr_dsize = fhlen;
memcpy(fhtonode_msg->pvfsr_data, fhdata, fhlen);
puffs_msg_setinfo(park_fhtonode, PUFFSOP_VFS, PUFFS_VFS_FHTOVP, NULL);
PUFFS_MSG_ENQUEUEWAIT(pmp, park_fhtonode, error);
error = checkerr(pmp, error, __func__);
if (error)
goto out;
error = puffs_getvnode(mp, fhtonode_msg->pvfsr_fhcookie,
fhtonode_msg->pvfsr_vtype, fhtonode_msg->pvfsr_size,
fhtonode_msg->pvfsr_rdev, &vp);
if (error)
goto out;
vn_lock(vp, lktype | LK_RETRY);
*vpp = vp;
out:
puffs_msgmem_release(park_fhtonode);
return error;
}
int
puffs_vfsop_vptofh(struct vnode *vp, struct fid *fhp, size_t *fh_size)
{
PUFFS_MSG_VARS(vfs, nodetofh);
struct puffs_mount *pmp = MPTOPUFFSMP(vp->v_mount);
size_t argsize, fhlen;
int error;
if (pmp->pmp_args.pa_fhsize == 0)
return EOPNOTSUPP;
/* if file handles are static len, we can test len immediately */
if (((pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_DYNAMIC) == 0)
&& ((pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) == 0)
&& (PUFFS_FROMFHSIZE(*fh_size) < pmp->pmp_args.pa_fhsize)) {
*fh_size = PUFFS_TOFHSIZE(pmp->pmp_args.pa_fhsize);
return E2BIG;
}
if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH)
fhlen = *fh_size;
else
fhlen = PUFFS_FROMFHSIZE(*fh_size);
argsize = sizeof(struct puffs_vfsmsg_nodetofh) + fhlen;
puffs_msgmem_alloc(argsize, &park_nodetofh, (void *)&nodetofh_msg, 1);
nodetofh_msg->pvfsr_fhcookie = VPTOPNC(vp);
nodetofh_msg->pvfsr_dsize = fhlen;
puffs_msg_setinfo(park_nodetofh, PUFFSOP_VFS, PUFFS_VFS_VPTOFH, NULL);
PUFFS_MSG_ENQUEUEWAIT(pmp, park_nodetofh, error);
error = checkerr(pmp, error, __func__);
if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH)
fhlen = nodetofh_msg->pvfsr_dsize;
else if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_DYNAMIC)
fhlen = PUFFS_TOFHSIZE(nodetofh_msg->pvfsr_dsize);
else
fhlen = PUFFS_TOFHSIZE(pmp->pmp_args.pa_fhsize);
if (error) {
if (error == E2BIG)
*fh_size = fhlen;
goto out;
}
if (fhlen > FHANDLE_SIZE_MAX) {
puffs_senderr(pmp, PUFFS_ERR_VPTOFH, E2BIG,
"file handle too big", VPTOPNC(vp));
error = EPROTO;
goto out;
}
if (*fh_size < fhlen) {
*fh_size = fhlen;
error = E2BIG;
goto out;
}
*fh_size = fhlen;
if (fhp) {
if (pmp->pmp_args.pa_fhflags & PUFFS_FHFLAG_PASSTHROUGH) {
memcpy(fhp, nodetofh_msg->pvfsr_data, fhlen);
} else {
fhp->fid_len = *fh_size;
memcpy(fhp->fid_data, nodetofh_msg->pvfsr_data,
nodetofh_msg->pvfsr_dsize);
}
}
out:
puffs_msgmem_release(park_nodetofh);
return error;
}
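/*
 * Illustrative sketch, not part of the kernel sources.  The E2BIG
 * convention used by puffs_vfsop_vptofh() above (the caller supplies a
 * buffer size, the callee writes back the size it actually needs and
 * returns E2BIG when the buffer is too small) is easiest to see from
 * the caller's side.  The userland-style example below shows that
 * call-twice pattern under assumed names: get_handle() and its buffer
 * are hypothetical, not a kernel or libpuffs interface.  Wrapped in
 * #if 0 so it stays out of any build.
 */
#if 0	/* illustrative sketch only */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical provider following the same size-negotiation convention. */
static int
get_handle(void *buf, size_t *sizep)
{
	static const char handle[] = "example-file-handle";

	if (buf == NULL || *sizep < sizeof(handle)) {
		*sizep = sizeof(handle);	/* report the required size */
		return E2BIG;
	}
	memcpy(buf, handle, sizeof(handle));
	*sizep = sizeof(handle);
	return 0;
}

int
main(void)
{
	size_t sz = 0;
	void *buf;

	/* First call with no buffer: learn the required size. */
	if (get_handle(NULL, &sz) != E2BIG)
		return 1;
	buf = malloc(sz);
	if (buf == NULL)
		return 1;
	/* Second call with a buffer of the advertised size succeeds. */
	if (get_handle(buf, &sz) != 0)
		return 1;
	printf("handle of %zu bytes retrieved\n", sz);
	free(buf);
	return 0;
}
#endif	/* illustrative sketch */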
int
puffs_vfsop_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
struct puffs_mount *pmp;
struct puffs_node *pnode;
KASSERT(key_len == sizeof(puffs_cookie_t));
pmp = MPTOPUFFSMP(mp);
/* Allocate and initialize the pnode. */
pnode = pool_get(&puffs_pnpool, PR_WAITOK);
memset(pnode, 0, sizeof(struct puffs_node));
pnode->pn_vp = vp;
memcpy(&pnode->pn_cookie, key, key_len);
pnode->pn_refcount = 1;
mutex_init(&pnode->pn_mtx, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&pnode->pn_sizemtx, MUTEX_DEFAULT, IPL_NONE);
selinit(&pnode->pn_sel);
vp->v_tag = VT_PUFFS;
vp->v_type = VNON;
vp->v_op = puffs_vnodeop_p;
if (pnode->pn_cookie == pmp->pmp_root_cookie)
vp->v_vflag |= VV_ROOT;
vp->v_data = pnode;
genfs_node_init(vp, &puffs_genfsops);
uvm_vnp_setsize(vp, 0);
*new_key = &pnode->pn_cookie;
return 0;
}
void
puffs_vfsop_init(void)
{
/* some checks depend on this */
KASSERT(VNOVAL == VSIZENOTSET);
pool_init(&puffs_pnpool, sizeof(struct puffs_node), 0, 0, 0,
"puffpnpl", &pool_allocator_nointr, IPL_NONE);
pool_init(&puffs_vapool, sizeof(struct vattr), 0, 0, 0,
"puffvapl", &pool_allocator_nointr, IPL_NONE);
puffs_msgif_init();
}
void
puffs_vfsop_done(void)
{
puffs_msgif_destroy();
pool_destroy(&puffs_pnpool);
pool_destroy(&puffs_vapool);
}
int
puffs_vfsop_snapshot(struct mount *mp, struct vnode *vp, struct timespec *ts)
{
return EOPNOTSUPP;
}
int
puffs_vfsop_extattrctl(struct mount *mp, int cmd, struct vnode *vp,
int attrnamespace, const char *attrname)
{
PUFFS_MSG_VARS(vfs, extattrctl);
struct puffs_mount *pmp = MPTOPUFFSMP(mp);
struct puffs_node *pnp;
puffs_cookie_t pnc;
int error, flags;
if (vp) {
/* doesn't make sense for puffs servers */
if (vp->v_mount != mp)
return EXDEV;
pnp = vp->v_data;
pnc = pnp->pn_cookie;
flags = PUFFS_EXTATTRCTL_HASNODE;
} else {
pnp = pnc = NULL;
flags = 0;
}
PUFFS_MSG_ALLOC(vfs, extattrctl);
extattrctl_msg->pvfsr_cmd = cmd;
extattrctl_msg->pvfsr_attrnamespace = attrnamespace;
extattrctl_msg->pvfsr_flags = flags;
if (attrname) {
strlcpy(extattrctl_msg->pvfsr_attrname, attrname,
sizeof(extattrctl_msg->pvfsr_attrname));
extattrctl_msg->pvfsr_flags |= PUFFS_EXTATTRCTL_HASATTRNAME;
}
puffs_msg_setinfo(park_extattrctl,
PUFFSOP_VFS, PUFFS_VFS_EXTATTRCTL, pnc);
puffs_msg_enqueue(pmp, park_extattrctl);
if (vp) {
mutex_enter(&pnp->pn_mtx);
puffs_referencenode(pnp);
mutex_exit(&pnp->pn_mtx);
VOP_UNLOCK(vp);
}
error = puffs_msg_wait2(pmp, park_extattrctl, pnp, NULL);
PUFFS_MSG_RELEASE(extattrctl);
if (vp) {
puffs_releasenode(pnp);
}
return checkerr(pmp, error, __func__);
}
const struct vnodeopv_desc * const puffs_vnodeopv_descs[] = {
&puffs_vnodeop_opv_desc,
&puffs_specop_opv_desc,
&puffs_fifoop_opv_desc,
&puffs_msgop_opv_desc,
NULL,
};
struct vfsops puffs_vfsops = {
.vfs_name = MOUNT_PUFFS,
.vfs_min_mount_data = sizeof (struct puffs_kargs),
.vfs_mount = puffs_vfsop_mount,
.vfs_start = puffs_vfsop_start,
.vfs_unmount = puffs_vfsop_unmount,
.vfs_root = puffs_vfsop_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = puffs_vfsop_statvfs,
.vfs_sync = puffs_vfsop_sync,
.vfs_vget = (void *)eopnotsupp,
.vfs_loadvnode = puffs_vfsop_loadvnode,
.vfs_fhtovp = puffs_vfsop_fhtovp,
.vfs_vptofh = puffs_vfsop_vptofh,
.vfs_init = puffs_vfsop_init,
.vfs_done = puffs_vfsop_done,
.vfs_snapshot = puffs_vfsop_snapshot,
.vfs_extattrctl = puffs_vfsop_extattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = puffs_vnodeopv_descs
};
static int
puffs_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return vfs_attach(&puffs_vfsops);
case MODULE_CMD_FINI:
return vfs_detach(&puffs_vfsops);
default:
return ENOTTY;
}
}
/* $NetBSD: vnd_30.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $ */
/*-
* Copyright (c) 1996, 1997, 1998, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* the Systems Programming Group of the University of Utah Computer
* Science Department.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Utah $Hdr: vn.c 1.13 94/04/02$
*
* @(#)vn.c 8.9 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vnd_30.c,v 1.4 2019/12/12 02:15:42 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/malloc.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/compat_stub.h>
#include <net/zlib.h>
#include <dev/vndvar.h>
#include <compat/common/compat_mod.h>
static int compat_30_vndioctl(u_long, struct lwp *, void *, int, struct vattr *,
int (*)(struct lwp *, void *, int, struct vattr *));
static int
compat_30_vndioctl(u_long cmd, struct lwp *l, void *data, int unit,
struct vattr *vattr_p,
int (*get)(struct lwp *, void *, int, struct vattr *))
{
struct vnd_user30 *vnu = data;
int error;
if (cmd != VNDIOCGET30)
return EPASSTHROUGH;
error = (*get)(l, data, unit, vattr_p);
if (error != 0)
return error;
vnu->vnu_dev = vattr_p->va_fsid;
vnu->vnu_ino = vattr_p->va_fileid;
return 0;
}
void
vnd_30_init(void)
{
MODULE_HOOK_SET(compat_vndioctl_30_hook, compat_30_vndioctl);
}
void
vnd_30_fini(void)
{
MODULE_HOOK_UNSET(compat_vndioctl_30_hook);
}
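/*
 * Illustrative sketch, not part of the kernel sources.  vnd_30_init()
 * and vnd_30_fini() above publish and retract the compat handler so
 * that the base vnd(4) driver can reach it only while this module is
 * loaded.  The example below shows the same publish/call/retract idea
 * with a plain function pointer and no locking; hook_fn, hook_set(),
 * hook_unset() and caller() are assumed names, not the NetBSD
 * MODULE_HOOK machinery.  Wrapped in #if 0 so it stays out of any
 * build.
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>

static int (*hook_fn)(int) = NULL;	/* published handler, if any */

static int
compat_handler(int arg)
{
	return arg * 2;			/* stand-in for the real work */
}

static void
hook_set(int (*fn)(int))
{
	hook_fn = fn;
}

static void
hook_unset(void)
{
	hook_fn = NULL;
}

/* The base driver calls through the hook only when it is present. */
static int
caller(int arg)
{
	if (hook_fn == NULL)
		return -1;		/* compat module not loaded */
	return hook_fn(arg);
}

int
main(void)
{
	printf("before: %d\n", caller(21));	/* prints -1 */
	hook_set(compat_handler);
	printf("after:  %d\n", caller(21));	/* prints 42 */
	hook_unset();
	return 0;
}
#endif	/* illustrative sketch */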
/* $NetBSD: ip6_mroute.c,v 1.132 2020/06/12 11:04:45 roy Exp $ */
/* $KAME: ip6_mroute.c,v 1.49 2001/07/25 09:21:18 jinmei Exp $ */
/*
* Copyright (C) 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/* BSDI ip_mroute.c,v 2.10 1996/11/14 00:29:52 jch Exp */
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
*/
/*
* Copyright (c) 1989 Stephen Deering
*
* This code is derived from software contributed to Berkeley by
* Stephen Deering of Stanford University.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93
*/
/*
* IP multicast forwarding procedures
*
* Written by David Waitzman, BBN Labs, August 1988.
* Modified by Steve Deering, Stanford, February 1989.
* Modified by Mark J. Steiglitz, Stanford, May, 1991
* Modified by Van Jacobson, LBL, January 1993
* Modified by Ajit Thyagarajan, PARC, August 1993
* Modified by Bill Fenner, PARC, April 1994
*
* MROUTING Revision: 3.5.1.2 + PIM-SMv2 (pimd) Support
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip6_mroute.c,v 1.132 2020/06/12 11:04:45 roy Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_mrouting.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <net/if.h>
#include <net/route.h>
#include <net/raw_cb.h>
#include <net/net_stats.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/icmp6.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/ip6_mroute.h>
#include <netinet6/scope6_var.h>
#include <netinet6/pim6.h>
#include <netinet6/pim6_var.h>
#include <netinet6/nd6.h>
static int ip6_mdq(struct mbuf *, struct ifnet *, struct mf6c *);
static void phyint_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
static int set_pim6(int *);
static int socket_send(struct socket *, struct mbuf *, struct sockaddr_in6 *);
static int register_send(struct ip6_hdr *, struct mif6 *, struct mbuf *);
/*
* Globals. All but ip6_mrouter, ip6_mrtproto and mrt6stat could be static,
* except for netstat or debugging purposes.
*/
struct socket *ip6_mrouter = NULL;
int ip6_mrouter_ver = 0;
int ip6_mrtproto = IPPROTO_PIM; /* for netstat only */
struct mrt6stat mrt6stat;
#define NO_RTE_FOUND 0x1
#define RTE_FOUND 0x2
struct mf6c *mf6ctable[MF6CTBLSIZ];
u_char n6expire[MF6CTBLSIZ];
struct mif6 mif6table[MAXMIFS];
#ifdef MRT6DEBUG
u_int mrt6debug = 0; /* debug level */
#define DEBUG_MFC 0x02
#define DEBUG_FORWARD 0x04
#define DEBUG_EXPIRE 0x08
#define DEBUG_XMIT 0x10
#define DEBUG_REG 0x20
#define DEBUG_PIM 0x40
#define __mrt6debugused /* empty */
#else
#define __mrt6debugused __unused
#endif
static void expire_upcalls(void *);
#define EXPIRE_TIMEOUT (hz / 4) /* 4x / second */
#define UPCALL_EXPIRE 6 /* number of timeouts */
#ifdef INET
#ifdef MROUTING
extern struct socket *ip_mrouter;
#endif
#endif
/*
* 'Interfaces' associated with decapsulator (so we can tell
* packets that went through it from ones that get reflected
* by a broken gateway). These interfaces are never linked into
* the system ifnet list & no routes point to them. I.e., packets
* can't be sent this way. They only exist as a placeholder for
* multicast source verification.
*/
struct ifnet multicast_register_if6;
#define ENCAP_HOPS 64
/*
* Private variables.
*/
static mifi_t nummifs = 0;
static mifi_t reg_mif_num = (mifi_t)-1;
static percpu_t *pim6stat_percpu;
#define PIM6_STATINC(x) _NET_STATINC(pim6stat_percpu, x)
static int pim6;
/*
* Hash function for a source, group entry
*/
#define MF6CHASH(a, g) MF6CHASHMOD((a).s6_addr32[0] ^ (a).s6_addr32[1] ^ \
(a).s6_addr32[2] ^ (a).s6_addr32[3] ^ \
(g).s6_addr32[0] ^ (g).s6_addr32[1] ^ \
(g).s6_addr32[2] ^ (g).s6_addr32[3])
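/*
 * Illustrative sketch, not part of the kernel sources.  MF6CHASH()
 * above XOR-folds all eight 32-bit words of the (source, group) pair
 * and then reduces the result with MF6CHASHMOD (defined elsewhere in
 * the headers).  The example below shows the same fold-and-reduce idea
 * with a local 128-bit address type and a plain modulo; addr128,
 * TBLSIZ and fold_hash() are assumed names only.  Wrapped in #if 0 so
 * it stays out of any build.
 */
#if 0	/* illustrative sketch only */
#include <stdint.h>
#include <stdio.h>

#define TBLSIZ	256		/* stand-in for MF6CTBLSIZ */

struct addr128 {		/* stand-in for the in6_addr word view */
	uint32_t w[4];
};

static unsigned
fold_hash(const struct addr128 *src, const struct addr128 *grp)
{
	uint32_t h;
	int i;

	h = 0;
	for (i = 0; i < 4; i++)
		h ^= src->w[i] ^ grp->w[i];	/* XOR-fold all words */
	return h % TBLSIZ;			/* reduce to a bucket index */
}

int
main(void)
{
	struct addr128 s = { { 0x20010db8, 0, 0, 1 } };
	struct addr128 g = { { 0xff0e0000, 0, 0, 0x1234 } };

	printf("bucket %u\n", fold_hash(&s, &g));
	return 0;
}
#endif	/* illustrative sketch */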
/*
* Find a route for a given origin IPv6 address and Multicast group address.
* Quality of service parameter to be added in the future!!!
*/
#define MF6CFIND(o, g, rt) do { \
struct mf6c *_rt = mf6ctable[MF6CHASH(o,g)]; \
rt = NULL; \
mrt6stat.mrt6s_mfc_lookups++; \
while (_rt) { \
if (IN6_ARE_ADDR_EQUAL(&_rt->mf6c_origin.sin6_addr, &(o)) && \
IN6_ARE_ADDR_EQUAL(&_rt->mf6c_mcastgrp.sin6_addr, &(g)) && \
(_rt->mf6c_stall == NULL)) { \
rt = _rt; \
break; \
} \
_rt = _rt->mf6c_next; \
} \
if (rt == NULL) { \
mrt6stat.mrt6s_mfc_misses++; \
} \
} while (/*CONSTCOND*/ 0)
/*
* Macros to compute elapsed time efficiently
* Borrowed from Van Jacobson's scheduling code
*/
#define TV_DELTA(a, b, delta) do { \
int xxs; \
\
delta = (a).tv_usec - (b).tv_usec; \
if ((xxs = (a).tv_sec - (b).tv_sec)) { \
switch (xxs) { \
case 2: \
delta += 1000000; \
/* FALLTHROUGH */ \
case 1: \
delta += 1000000; \
break; \
default: \
delta += (1000000 * xxs); \
} \
} \
} while (/*CONSTCOND*/ 0)
#define TV_LT(a, b) (((a).tv_usec < (b).tv_usec && \
(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
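/*
 * Illustrative sketch, not part of the kernel sources.  TV_DELTA()
 * above is an unrolled form of ordinary timeval subtraction that
 * special-cases second differences of 1 and 2 to avoid a multiply in
 * the common case.  The example below shows the straightforward
 * computation it is equivalent to, assuming a >= b; tv_delta_usec() is
 * an assumed name, not a kernel interface.  Wrapped in #if 0 so it
 * stays out of any build.
 */
#if 0	/* illustrative sketch only */
#include <sys/time.h>
#include <stdio.h>

/* Elapsed time from b to a, in microseconds (a is assumed >= b). */
static long
tv_delta_usec(const struct timeval *a, const struct timeval *b)
{
	return (a->tv_sec - b->tv_sec) * 1000000L +
	    (a->tv_usec - b->tv_usec);
}

int
main(void)
{
	struct timeval a = { .tv_sec = 12, .tv_usec = 250000 };
	struct timeval b = { .tv_sec = 10, .tv_usec = 900000 };

	printf("%ld usec elapsed\n", tv_delta_usec(&a, &b));	/* 1350000 */
	return 0;
}
#endif	/* illustrative sketch */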
#ifdef UPCALL_TIMING
#define UPCALL_MAX 50
u_long upcall_data[UPCALL_MAX + 1];
static void collate();
#endif /* UPCALL_TIMING */
static int get_sg_cnt(struct sioc_sg_req6 *);
static int get_mif6_cnt(struct sioc_mif_req6 *);
static int ip6_mrouter_init(struct socket *, int, int);
static int add_m6if(struct mif6ctl *);
static int del_m6if(mifi_t *);
static int add_m6fc(struct mf6cctl *);
static int del_m6fc(struct mf6cctl *);
static void sysctl_net_inet6_pim6_setup(struct sysctllog **);
static callout_t expire_upcalls_ch;
void
pim6_init(void)
{
sysctl_net_inet6_pim6_setup(NULL);
pim6stat_percpu = percpu_alloc(sizeof(uint64_t) * PIM6_NSTATS);
}
/*
* Handle MRT setsockopt commands to modify the multicast routing tables.
*/
int
ip6_mrouter_set(struct socket *so, struct sockopt *sopt)
{
int error, optval;
struct mif6ctl mifc;
struct mf6cctl mfcc;
mifi_t mifi;
if (sopt->sopt_name != MRT6_INIT && so != ip6_mrouter)
return (EACCES);
error = 0;
switch (sopt->sopt_name) {
#ifdef MRT6_OINIT
case MRT6_OINIT:
#endif
case MRT6_INIT:
error = sockopt_getint(sopt, &optval);
if (error)
break;
return (ip6_mrouter_init(so, optval, sopt->sopt_name));
case MRT6_DONE:
return (ip6_mrouter_done());
case MRT6_ADD_MIF:
error = sockopt_get(sopt, &mifc, sizeof(mifc));
if (error)
break;
return (add_m6if(&mifc));
case MRT6_DEL_MIF:
error = sockopt_get(sopt, &mifi, sizeof(mifi));
if (error)
break;
return (del_m6if(&mifi));
case MRT6_ADD_MFC:
error = sockopt_get(sopt, &mfcc, sizeof(mfcc));
if (error)
break;
return (add_m6fc(&mfcc));
case MRT6_DEL_MFC:
error = sockopt_get(sopt, &mfcc, sizeof(mfcc));
if (error)
break;
return (del_m6fc(&mfcc));
case MRT6_PIM:
error = sockopt_getint(sopt, &optval);
if (error)
break;
return (set_pim6(&optval));
default:
error = EOPNOTSUPP;
}
return (error);
}
/*
* Handle MRT getsockopt commands
*/
int
ip6_mrouter_get(struct socket *so, struct sockopt *sopt)
{
int error;
if (so != ip6_mrouter)
return EACCES;
error = 0;
switch (sopt->sopt_name) {
case MRT6_PIM:
error = sockopt_set(sopt, &pim6, sizeof(pim6));
break;
default:
error = EOPNOTSUPP;
break;
}
return (error);
}
/*
* Handle ioctl commands to obtain information from the cache
*/
int
mrt6_ioctl(u_long cmd, void *data)
{
switch (cmd) {
case SIOCGETSGCNT_IN6:
return (get_sg_cnt((struct sioc_sg_req6 *)data));
case SIOCGETMIFCNT_IN6:
return (get_mif6_cnt((struct sioc_mif_req6 *)data));
default:
return (EINVAL);
}
}
/*
* returns the packet, byte, rpf-failure count for the source group provided
*/
static int
get_sg_cnt(struct sioc_sg_req6 *req)
{
struct mf6c *rt;
int s;
s = splsoftnet();
MF6CFIND(req->src.sin6_addr, req->grp.sin6_addr, rt);
splx(s);
if (rt != NULL) {
req->pktcnt = rt->mf6c_pkt_cnt;
req->bytecnt = rt->mf6c_byte_cnt;
req->wrong_if = rt->mf6c_wrong_if;
} else
return (ESRCH);
#if 0
req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
#endif
return 0;
}
/*
* returns the input and output packet and byte counts on the mif provided
*/
static int
get_mif6_cnt(struct sioc_mif_req6 *req)
{
mifi_t mifi = req->mifi;
if (mifi >= nummifs)
return EINVAL;
req->icount = mif6table[mifi].m6_pkt_in;
req->ocount = mif6table[mifi].m6_pkt_out;
req->ibytes = mif6table[mifi].m6_bytes_in;
req->obytes = mif6table[mifi].m6_bytes_out;
return 0;
}
static int
set_pim6(int *i)
{
if ((*i != 1) && (*i != 0))
return EINVAL;
pim6 = *i;
return 0;
}
/*
* Enable multicast routing
*/
static int
ip6_mrouter_init(struct socket *so, int v, int cmd)
{
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG,
"ip6_mrouter_init: so_type = %d, pr_protocol = %d\n",
so->so_type, so->so_proto->pr_protocol);
#endif
if (so->so_type != SOCK_RAW ||
so->so_proto->pr_protocol != IPPROTO_ICMPV6)
return EOPNOTSUPP;
if (v != 1)
return ENOPROTOOPT;
if (ip6_mrouter != NULL)
return EADDRINUSE;
ip6_mrouter = so;
ip6_mrouter_ver = cmd;
memset((void *)mf6ctable, 0, sizeof(mf6ctable));
memset((void *)n6expire, 0, sizeof(n6expire));
pim6 = 0; /* used for stubbing out/in pim stuff */
callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE);
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
expire_upcalls, NULL);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "ip6_mrouter_init\n");
#endif
return 0;
}
/*
* Disable multicast routing
*/
int
ip6_mrouter_done(void)
{
mifi_t mifi;
int i;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
struct mf6c *rt;
struct rtdetq *rte;
int s;
s = splsoftnet();
/*
* For each phyint in use, disable promiscuous reception of all IPv6
* multicasts.
*/
#ifdef INET
#ifdef MROUTING
/*
* If an IPv4 multicast routing daemon is still running, keep the
* interfaces receiving all multicast packets.
* XXX: there may be an interface in which the IPv4 multicast
* daemon is not interested...
*/
if (!ip_mrouter)
#endif
#endif
{
for (mifi = 0; mifi < nummifs; mifi++) {
if (mif6table[mifi].m6_ifp &&
!(mif6table[mifi].m6_flags & MIFF_REGISTER)) {
ifp = mif6table[mifi].m6_ifp;
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
if_mcast_op(ifp, SIOCDELMULTI,
sin6tocsa(&sin6));
}
}
}
memset((void *)mif6table, 0, sizeof(mif6table));
nummifs = 0;
pim6 = 0; /* used to stub out/in pim specific code */
callout_stop(&expire_upcalls_ch);
/*
* Free all multicast forwarding cache entries.
*/
for (i = 0; i < MF6CTBLSIZ; i++) {
rt = mf6ctable[i];
while (rt) {
struct mf6c *frt;
for (rte = rt->mf6c_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
}
frt = rt;
rt = rt->mf6c_next;
free(frt, M_MRTABLE);
}
}
memset((void *)mf6ctable, 0, sizeof(mf6ctable));
/*
* Reset register interface
*/
if (reg_mif_num != (mifi_t)-1) {
if_detach(&multicast_register_if6);
reg_mif_num = (mifi_t)-1;
}
ip6_mrouter = NULL;
ip6_mrouter_ver = 0;
splx(s);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "ip6_mrouter_done\n");
#endif
return 0;
}
void
ip6_mrouter_detach(struct ifnet *ifp)
{
struct rtdetq *rte;
struct mf6c *mfc;
mifi_t mifi;
int i;
if (ip6_mrouter == NULL)
return;
/*
* Delete a mif which points to ifp.
*/
for (mifi = 0; mifi < nummifs; mifi++)
if (mif6table[mifi].m6_ifp == ifp)
del_m6if(&mifi);
/*
* Clear rte->ifp of cache entries received on ifp.
*/
for (i = 0; i < MF6CTBLSIZ; i++) {
if (n6expire[i] == 0)
continue;
for (mfc = mf6ctable[i]; mfc != NULL; mfc = mfc->mf6c_next) {
for (rte = mfc->mf6c_stall; rte != NULL; rte = rte->next) {
if (rte->ifp == ifp)
rte->ifp = NULL;
}
}
}
}
/*
* Add a mif to the mif table
*/
static int
add_m6if(struct mif6ctl *mifcp)
{
struct mif6 *mifp;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
int error, s;
if (mifcp->mif6c_mifi >= MAXMIFS)
return EINVAL;
mifp = mif6table + mifcp->mif6c_mifi;
if (mifp->m6_ifp)
return EADDRINUSE; /* XXX: is it appropriate? */
if (!mifcp->mif6c_pifi || (ifp = if_byindex(mifcp->mif6c_pifi)) == NULL)
return ENXIO;
if (mifcp->mif6c_flags & MIFF_REGISTER) {
ifp = &multicast_register_if6;
if (reg_mif_num == (mifi_t)-1) {
strlcpy(ifp->if_xname, "register_mif",
sizeof(ifp->if_xname));
ifp->if_flags |= IFF_LOOPBACK;
ifp->if_index = mifcp->mif6c_mifi;
reg_mif_num = mifcp->mif6c_mifi;
if_attach(ifp);
}
} else {
/* Make sure the interface supports multicast */
if ((ifp->if_flags & IFF_MULTICAST) == 0)
return EOPNOTSUPP;
s = splsoftnet();
/*
* Enable promiscuous reception of all IPv6 multicasts
* from the interface.
*/
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
error = if_mcast_op(ifp, SIOCADDMULTI, sin6tosa(&sin6));
splx(s);
if (error)
return error;
}
s = splsoftnet();
mifp->m6_flags = mifcp->mif6c_flags;
mifp->m6_ifp = ifp;
/* initialize per mif pkt counters */
mifp->m6_pkt_in = 0;
mifp->m6_pkt_out = 0;
mifp->m6_bytes_in = 0;
mifp->m6_bytes_out = 0;
splx(s);
/* Adjust nummifs up if the mifi is higher than nummifs */
if (nummifs <= mifcp->mif6c_mifi)
nummifs = mifcp->mif6c_mifi + 1;
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG,
"add_mif #%d, phyint %s\n",
mifcp->mif6c_mifi, ifp->if_xname);
#endif
return 0;
}
/*
* Delete a mif from the mif table
*/
static int
del_m6if(mifi_t *mifip)
{
struct mif6 *mifp = mif6table + *mifip;
mifi_t mifi;
struct ifnet *ifp;
struct sockaddr_in6 sin6;
int s;
if (*mifip >= nummifs)
return EINVAL;
if (mifp->m6_ifp == NULL)
return EINVAL;
s = splsoftnet();
if (!(mifp->m6_flags & MIFF_REGISTER)) {
/*
* XXX: what if an IPv4 multicast daemon is still
* using the interface?
*/
ifp = mifp->m6_ifp;
sockaddr_in6_init(&sin6, &in6addr_any, 0, 0, 0);
if_mcast_op(ifp, SIOCDELMULTI, sin6tosa(&sin6));
} else {
if (reg_mif_num != (mifi_t)-1) {
if_detach(&multicast_register_if6);
reg_mif_num = (mifi_t)-1;
}
}
memset((void *)mifp, 0, sizeof (*mifp));
/* Adjust nummifs down */
for (mifi = nummifs; mifi > 0; mifi--)
if (mif6table[mifi - 1].m6_ifp)
break;
nummifs = mifi;
splx(s);
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_DEBUG, "del_m6if %d, nummifs %d\n", *mifip, nummifs);
#endif
return 0;
}
/*
* Add an mfc entry
*/
static int
add_m6fc(struct mf6cctl *mfccp)
{
struct mf6c *rt;
u_long hash;
struct rtdetq *rte;
u_short nstl;
int s;
char ip6bufo[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN];
MF6CFIND(mfccp->mf6cc_origin.sin6_addr,
mfccp->mf6cc_mcastgrp.sin6_addr, rt);
/* If an entry already exists, just update the fields */
if (rt) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,"add_m6fc update o %s g %s p %x\n",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent);
#endif
s = splsoftnet();
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
splx(s);
return 0;
}
/*
* Find the entry for which the upcall was made and update
*/
s = splsoftnet();
hash = MF6CHASH(mfccp->mf6cc_origin.sin6_addr,
mfccp->mf6cc_mcastgrp.sin6_addr);
for (rt = mf6ctable[hash], nstl = 0; rt; rt = rt->mf6c_next) {
if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
&mfccp->mf6cc_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
&mfccp->mf6cc_mcastgrp.sin6_addr) &&
(rt->mf6c_stall != NULL)) {
if (nstl++)
log(LOG_ERR,
"add_m6fc: %s o %s g %s p %x dbx %p\n",
"multiple kernel entries",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent, rt->mf6c_stall);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,
"add_m6fc o %s g %s p %x dbg %p\n",
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent, rt->mf6c_stall);
#endif
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
rt->mf6c_expire = 0; /* Don't clean this guy up */
n6expire[hash]--;
/* free packets Qed at the end of this entry */
for (rte = rt->mf6c_stall; rte != NULL; ) {
struct rtdetq *n = rte->next;
if (rte->ifp) {
ip6_mdq(rte->m, rte->ifp, rt);
}
m_freem(rte->m);
#ifdef UPCALL_TIMING
collate(&(rte->t));
#endif
free(rte, M_MRTABLE);
rte = n;
}
rt->mf6c_stall = NULL;
}
}
/*
* It is possible that an entry is being inserted without an upcall
*/
if (nstl == 0) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC)
log(LOG_DEBUG,
"add_mfc no upcall h %ld o %s g %s p %x\n",
hash,
IN6_PRINT(ip6bufo,
&mfccp->mf6cc_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfccp->mf6cc_mcastgrp.sin6_addr),
mfccp->mf6cc_parent);
#endif
for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
if (IN6_ARE_ADDR_EQUAL(&rt->mf6c_origin.sin6_addr,
&mfccp->mf6cc_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&rt->mf6c_mcastgrp.sin6_addr,
&mfccp->mf6cc_mcastgrp.sin6_addr)) {
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
if (rt->mf6c_expire)
n6expire[hash]--;
rt->mf6c_expire = 0;
}
}
if (rt == NULL) {
/* no upcall, so make a new entry */
rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
if (rt == NULL) {
splx(s);
return ENOBUFS;
}
/* insert new entry at head of hash chain */
rt->mf6c_origin = mfccp->mf6cc_origin;
rt->mf6c_mcastgrp = mfccp->mf6cc_mcastgrp;
rt->mf6c_parent = mfccp->mf6cc_parent;
rt->mf6c_ifset = mfccp->mf6cc_ifset;
/* initialize pkt counters per src-grp */
rt->mf6c_pkt_cnt = 0;
rt->mf6c_byte_cnt = 0;
rt->mf6c_wrong_if = 0;
rt->mf6c_expire = 0;
rt->mf6c_stall = NULL;
/* link into table */
rt->mf6c_next = mf6ctable[hash];
mf6ctable[hash] = rt;
}
}
splx(s);
return 0;
}
#ifdef UPCALL_TIMING
/*
* collect delay statistics on the upcalls
*/
static void
collate(struct timeval *t)
{
u_long d;
struct timeval tp;
u_long delta;
GET_TIME(tp);
if (TV_LT(*t, tp))
{
TV_DELTA(tp, *t, delta);
d = delta >> 10;
if (d > UPCALL_MAX)
d = UPCALL_MAX;
++upcall_data[d];
}
}
#endif /* UPCALL_TIMING */
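/*
 * Illustrative sketch, not part of the kernel sources.  collate()
 * above (compiled only under UPCALL_TIMING) buckets each upcall delay
 * by shifting the microsecond delta right by 10, i.e. into roughly
 * 1 ms buckets, and clamps at UPCALL_MAX.  The example below shows the
 * same clamp-and-count histogram in isolation; NBUCKETS and
 * record_delay() are assumed names only.  Wrapped in #if 0 so it stays
 * out of any build.
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>

#define NBUCKETS	50		/* stand-in for UPCALL_MAX */

static unsigned long histogram[NBUCKETS + 1];

static void
record_delay(unsigned long delta_usec)
{
	unsigned long d = delta_usec >> 10;	/* ~1 ms per bucket */

	if (d > NBUCKETS)
		d = NBUCKETS;			/* clamp the long tail */
	histogram[d]++;
}

int
main(void)
{
	record_delay(500);		/* lands in bucket 0 */
	record_delay(2500);		/* lands in bucket 2 */
	record_delay(1000000);		/* clamped into bucket 50 */
	printf("%lu %lu %lu\n", histogram[0], histogram[2],
	    histogram[NBUCKETS]);
	return 0;
}
#endif	/* illustrative sketch */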
/*
* Delete an mfc entry
*/
static int
del_m6fc(struct mf6cctl *mfccp)
{
struct sockaddr_in6 origin;
struct sockaddr_in6 mcastgrp;
struct mf6c *rt;
struct mf6c **nptr;
u_long hash;
int s;
origin = mfccp->mf6cc_origin;
mcastgrp = mfccp->mf6cc_mcastgrp;
hash = MF6CHASH(origin.sin6_addr, mcastgrp.sin6_addr);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_MFC) {
char ip6bufo[INET6_ADDRSTRLEN], ip6bufm[INET6_ADDRSTRLEN];
log(LOG_DEBUG,"del_m6fc orig %s mcastgrp %s\n",
IN6_PRINT(ip6bufo, &origin.sin6_addr),
IN6_PRINT(ip6bufm, &mcastgrp.sin6_addr));
}
#endif
s = splsoftnet();
nptr = &mf6ctable[hash];
while ((rt = *nptr) != NULL) {
if (IN6_ARE_ADDR_EQUAL(&origin.sin6_addr,
&rt->mf6c_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&mcastgrp.sin6_addr,
&rt->mf6c_mcastgrp.sin6_addr) &&
rt->mf6c_stall == NULL)
break;
nptr = &rt->mf6c_next;
}
if (rt == NULL) {
splx(s);
return EADDRNOTAVAIL;
}
*nptr = rt->mf6c_next;
free(rt, M_MRTABLE);
splx(s);
return 0;
}
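/*
 * Illustrative sketch, not part of the kernel sources.  del_m6fc()
 * above (and expire_upcalls() below) unlink hash-chain entries with
 * the pointer-to-pointer idiom: nptr always addresses the link that
 * leads to the current node, so removal is a single store and the
 * list head needs no special case.  The example below shows the idiom
 * on a plain singly-linked list; struct node and unlink_value() are
 * assumed names only.  Wrapped in #if 0 so it stays out of any build.
 */
#if 0	/* illustrative sketch only */
#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/* Remove the first node holding val; return 0 on success, -1 if absent. */
static int
unlink_value(struct node **head, int val)
{
	struct node **nptr, *n;

	for (nptr = head; (n = *nptr) != NULL; nptr = &n->next) {
		if (n->val == val) {
			*nptr = n->next;	/* splice the node out */
			free(n);
			return 0;
		}
	}
	return -1;
}

int
main(void)
{
	struct node *head = NULL, *n;
	int i;

	/* Build the list 1 -> 2 -> 3 by pushing onto the head. */
	for (i = 3; i >= 1; i--) {
		n = malloc(sizeof(*n));
		if (n == NULL)
			return 1;
		n->val = i;
		n->next = head;
		head = n;
	}
	unlink_value(&head, 2);
	for (n = head; n != NULL; n = n->next)
		printf("%d ", n->val);		/* prints: 1 3 */
	printf("\n");
	return 0;
}
#endif	/* illustrative sketch */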
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in6 *src)
{
if (s) {
if (sbappendaddr(&s->so_rcv, sin6tosa(src), mm, NULL) != 0) {
sorwakeup(s);
return 0;
}
soroverflow(s);
}
m_freem(mm);
return -1;
}
/*
* IPv6 multicast forwarding function. This function assumes that the packet
* pointed to by "ip6" has arrived on (or is about to be sent to) the interface
* pointed to by "ifp", and the packet is to be relayed to other networks
* that have members of the packet's destination IPv6 multicast group.
*
* The packet is returned unscathed to the caller, unless it is
* erroneous, in which case a non-zero return value tells the caller to
* discard it.
*/
int
ip6_mforward(struct ip6_hdr *ip6, struct ifnet *ifp, struct mbuf *m)
{
struct mf6c *rt;
struct mif6 *mifp;
struct mbuf *mm;
int s;
mifi_t mifi;
struct sockaddr_in6 sin6;
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG, "ip6_mforward: src %s, dst %s, ifindex %d\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
ifp->if_index);
#endif
/*
* Don't forward a packet with Hop limit of zero or one,
* or a packet destined to a local-only group.
*/
if (ip6->ip6_hlim <= 1 || IN6_IS_ADDR_MC_NODELOCAL(&ip6->ip6_dst) ||
IN6_IS_ADDR_MC_LINKLOCAL(&ip6->ip6_dst))
return 0;
ip6->ip6_hlim--;
/*
* Source address check: do not forward packets with unspecified
* source. It was discussed in July 2000, on ipngwg mailing list.
* This is rather more serious than unicast cases, because some
* MLD packets can be sent with the unspecified source address
* (although such packets must normally set the hop limit field to 1).
*/
if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
IP6_STATINC(IP6_STAT_CANTFORWARD);
if (ip6_log_time + ip6_log_interval < time_uptime) {
ip6_log_time = time_uptime;
log(LOG_DEBUG,
"cannot forward "
"from %s to %s nxt %d received on %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
ip6->ip6_nxt,
m->m_pkthdr.rcvif_index ?
if_name(m_get_rcvif_NOMPSAFE(m)) : "?");
}
return 0;
}
/*
* Determine forwarding mifs from the forwarding cache table
*/
s = splsoftnet();
MF6CFIND(ip6->ip6_src, ip6->ip6_dst, rt);
/* Entry exists, so forward if necessary */
if (rt) {
splx(s);
return ip6_mdq(m, ifp, rt);
} else {
/*
* If we don't have a route for packet's origin, make a copy
* of the packet and send message to routing daemon.
*/
struct mbuf *mb0;
struct rtdetq *rte;
u_long hash;
#ifdef UPCALL_TIMING
struct timeval tp;
GET_TIME(tp);
#endif
mrt6stat.mrt6s_no_route++;
#ifdef MRT6DEBUG
if (mrt6debug & (DEBUG_FORWARD | DEBUG_MFC))
log(LOG_DEBUG, "ip6_mforward: no rte s %s g %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
#endif
/*
* Allocate mbufs early so that we don't do extra work if we
* are just going to fail anyway.
*/
rte = malloc(sizeof(*rte), M_MRTABLE, M_NOWAIT);
if (rte == NULL) {
splx(s);
return ENOBUFS;
}
mb0 = m_copypacket(m, M_DONTWAIT);
/*
* Pullup packet header if needed before storing it,
* as other references may modify it in the meantime.
*/
if (mb0 && M_UNWRITABLE(mb0, sizeof(struct ip6_hdr)))
mb0 = m_pullup(mb0, sizeof(struct ip6_hdr));
if (mb0 == NULL) {
free(rte, M_MRTABLE);
splx(s);
return ENOBUFS;
}
/* is there an upcall waiting for this packet? */
hash = MF6CHASH(ip6->ip6_src, ip6->ip6_dst);
for (rt = mf6ctable[hash]; rt; rt = rt->mf6c_next) {
if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
&rt->mf6c_origin.sin6_addr) &&
IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
&rt->mf6c_mcastgrp.sin6_addr) &&
(rt->mf6c_stall != NULL))
break;
}
if (rt == NULL) {
struct mrt6msg *im;
struct omrt6msg *oim;
/* no upcall, so make a new entry */
rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT);
if (rt == NULL) {
free(rte, M_MRTABLE);
m_freem(mb0);
splx(s);
return ENOBUFS;
}
/*
* Make a copy of the header to send to the user
* level process
*/
mm = m_copym(mb0, 0, sizeof(struct ip6_hdr), M_DONTWAIT);
if (mm == NULL) {
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return ENOBUFS;
}
/*
* Send message to routing daemon
*/
sockaddr_in6_init(&sin6, &ip6->ip6_src, 0, 0, 0);
im = NULL;
oim = NULL;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim = mtod(mm, struct omrt6msg *);
oim->im6_msgtype = MRT6MSG_NOCACHE;
oim->im6_mbz = 0;
break;
case MRT6_INIT:
im = mtod(mm, struct mrt6msg *);
im->im6_msgtype = MRT6MSG_NOCACHE;
im->im6_mbz = 0;
break;
default:
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return EINVAL;
}
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG,
"getting the iif info in the kernel\n");
#endif
for (mifp = mif6table, mifi = 0;
mifi < nummifs && mifp->m6_ifp != ifp;
mifp++, mifi++)
;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim->im6_mif = mifi;
break;
case MRT6_INIT:
im->im6_mif = mifi;
break;
}
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
log(LOG_WARNING, "ip6_mforward: ip6_mrouter "
"socket queue full\n");
mrt6stat.mrt6s_upq_sockfull++;
free(rte, M_MRTABLE);
m_freem(mb0);
free(rt, M_MRTABLE);
splx(s);
return ENOBUFS;
}
mrt6stat.mrt6s_upcalls++;
/* insert new entry at head of hash chain */
memset(rt, 0, sizeof(*rt));
sockaddr_in6_init(&rt->mf6c_origin, &ip6->ip6_src,
0, 0, 0);
sockaddr_in6_init(&rt->mf6c_mcastgrp, &ip6->ip6_dst,
0, 0, 0);
rt->mf6c_expire = UPCALL_EXPIRE;
n6expire[hash]++;
rt->mf6c_parent = MF6C_INCOMPLETE_PARENT;
/* link into table */
rt->mf6c_next = mf6ctable[hash];
mf6ctable[hash] = rt;
/* Add this entry to the end of the queue */
rt->mf6c_stall = rte;
} else {
/* determine if q has overflowed */
struct rtdetq **p;
int npkts = 0;
for (p = &rt->mf6c_stall; *p != NULL; p = &(*p)->next) {
if (++npkts > MAX_UPQ6) {
mrt6stat.mrt6s_upq_ovflw++;
free(rte, M_MRTABLE);
m_freem(mb0);
splx(s);
return 0;
}
}
/* Add this entry to the end of the queue */
*p = rte;
}
rte->next = NULL;
rte->m = mb0;
rte->ifp = ifp;
#ifdef UPCALL_TIMING
rte->t = tp;
#endif
splx(s);
return 0;
}
}
/*
* Clean up cache entries if upcalls are not serviced
* Call from the Slow Timeout mechanism, every 0.25 seconds.
*/
static void
expire_upcalls(void *unused)
{
struct rtdetq *rte;
struct mf6c *mfc, **nptr;
int i;
/* XXX NOMPSAFE still need softnet_lock */
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
for (i = 0; i < MF6CTBLSIZ; i++) {
if (n6expire[i] == 0)
continue;
nptr = &mf6ctable[i];
while ((mfc = *nptr) != NULL) {
rte = mfc->mf6c_stall;
/*
* Skip real cache entries
* Make sure it wasn't marked to not expire (shouldn't happen)
* If it expires now
*/
if (rte != NULL &&
mfc->mf6c_expire != 0 &&
--mfc->mf6c_expire == 0) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_EXPIRE) {
char ip6bufo[INET6_ADDRSTRLEN];
char ip6bufm[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"expire_upcalls: expiring (%s %s)\n",
IN6_PRINT(ip6bufo,
&mfc->mf6c_origin.sin6_addr),
IN6_PRINT(ip6bufm,
&mfc->mf6c_mcastgrp.sin6_addr));
}
#endif
/*
* drop all the packets
* free the mbuf with the pkt, if, timing info
*/
do {
struct rtdetq *n = rte->next;
m_freem(rte->m);
free(rte, M_MRTABLE);
rte = n;
} while (rte != NULL);
mrt6stat.mrt6s_cache_cleanups++;
n6expire[i]--;
*nptr = mfc->mf6c_next;
free(mfc, M_MRTABLE);
} else {
nptr = &mfc->mf6c_next;
}
}
}
callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT,
expire_upcalls, NULL);
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Macro to send packet on mif. Since RSVP packets don't get counted on
* input, they shouldn't get counted on output, so statistics keeping is
* separate.
*/
#define MC6_SEND(ip6, mifp, m) do { \
if ((mifp)->m6_flags & MIFF_REGISTER) \
register_send((ip6), (mifp), (m)); \
else \
phyint_send((ip6), (mifp), (m)); \
} while (/*CONSTCOND*/ 0)
/*
* Packet forwarding routine once entry in the cache is made
*/
static int
ip6_mdq(struct mbuf *m, struct ifnet *ifp, struct mf6c *rt)
{
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
mifi_t mifi, iif;
struct mif6 *mifp;
int plen = m->m_pkthdr.len;
struct in6_addr src0, dst0; /* copies for local work */
u_int32_t iszone, idzone, oszone, odzone;
int error = 0;
/*
* Don't forward if it didn't arrive from the parent mif
* for its origin.
*/
mifi = rt->mf6c_parent;
if ((mifi >= nummifs) || (mif6table[mifi].m6_ifp != ifp)) {
/* came in the wrong interface */
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_FORWARD)
log(LOG_DEBUG,
"wrong if: ifid %d mifi %d mififid %x\n",
ifp->if_index, mifi,
mif6table[mifi].m6_ifp ?
mif6table[mifi].m6_ifp->if_index : -1);
#endif
mrt6stat.mrt6s_wrong_if++;
rt->mf6c_wrong_if++;
/*
* If we are doing PIM processing, and we are forwarding
* packets on this interface, send a message to the
* routing daemon.
*/
/* have to make sure this is a valid mif */
if (mifi < nummifs && mif6table[mifi].m6_ifp) {
if (pim6 && (m->m_flags & M_LOOP) == 0) {
/*
* Check the M_LOOP flag to avoid an
* unnecessary PIM assert.
* XXX: M_LOOP is an ad-hoc hack...
*/
struct sockaddr_in6 sin6;
struct mbuf *mm;
struct mrt6msg *im;
struct omrt6msg *oim;
mm = m_copym(m, 0, sizeof(struct ip6_hdr), M_DONTWAIT);
if (mm && M_UNWRITABLE(mm, sizeof(struct ip6_hdr)))
mm = m_pullup(mm, sizeof(struct ip6_hdr));
if (mm == NULL)
return ENOBUFS;
oim = NULL;
im = NULL;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim = mtod(mm, struct omrt6msg *);
oim->im6_msgtype = MRT6MSG_WRONGMIF;
oim->im6_mbz = 0;
break;
case MRT6_INIT:
im = mtod(mm, struct mrt6msg *);
im->im6_msgtype = MRT6MSG_WRONGMIF;
im->im6_mbz = 0;
break;
default:
m_freem(mm);
return EINVAL;
}
for (mifp = mif6table, iif = 0;
iif < nummifs && mifp &&
mifp->m6_ifp != ifp;
mifp++, iif++)
;
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
switch (ip6_mrouter_ver) {
case MRT6_OINIT:
oim->im6_mif = iif;
sin6.sin6_addr = oim->im6_src;
break;
case MRT6_INIT:
im->im6_mif = iif;
sin6.sin6_addr = im->im6_src;
break;
}
mrt6stat.mrt6s_upcalls++;
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_WARNING, "mdq, ip6_mrouter socket queue full\n");
#endif
++mrt6stat.mrt6s_upq_sockfull;
return ENOBUFS;
}
}
}
return 0;
}
/* If I sourced this packet, it counts as output, else it was input. */
if (m->m_pkthdr.rcvif_index == 0) {
/* XXX: is rcvif really NULL when output?? */
mif6table[mifi].m6_pkt_out++;
mif6table[mifi].m6_bytes_out += plen;
} else {
mif6table[mifi].m6_pkt_in++;
mif6table[mifi].m6_bytes_in += plen;
}
rt->mf6c_pkt_cnt++;
rt->mf6c_byte_cnt += plen;
/*
* For each mif, forward a copy of the packet if there are group
* members downstream on the interface.
*/
src0 = ip6->ip6_src;
dst0 = ip6->ip6_dst;
if ((error = in6_setscope(&src0, ifp, &iszone)) != 0 ||
(error = in6_setscope(&dst0, ifp, &idzone)) != 0) {
IP6_STATINC(IP6_STAT_BADSCOPE);
return error;
}
for (mifp = mif6table, mifi = 0; mifi < nummifs; mifp++, mifi++) {
if (IF_ISSET(mifi, &rt->mf6c_ifset)) {
if (mif6table[mifi].m6_ifp == NULL)
continue;
/*
* check if the outgoing packet is going to break
* a scope boundary.
* XXX: For packets through PIM register tunnel
* interface, we believe the routing daemon.
*/
if ((mif6table[rt->mf6c_parent].m6_flags &
MIFF_REGISTER) == 0 &&
(mif6table[mifi].m6_flags & MIFF_REGISTER) == 0) {
if (in6_setscope(&src0, mif6table[mifi].m6_ifp,
&oszone) ||
in6_setscope(&dst0, mif6table[mifi].m6_ifp,
&odzone) ||
iszone != oszone || idzone != odzone) {
IP6_STATINC(IP6_STAT_BADSCOPE);
continue;
}
}
mifp->m6_pkt_out++;
mifp->m6_bytes_out += plen;
MC6_SEND(ip6, mifp, m);
}
}
return 0;
}
static void
phyint_send(struct ip6_hdr *ip6, struct mif6 *mifp, struct mbuf *m)
{
struct mbuf *mb_copy;
struct ifnet *ifp = mifp->m6_ifp;
int error __mrt6debugused = 0;
int s;
static struct route ro;
bool ingroup;
struct sockaddr_in6 dst6;
s = splsoftnet();
/*
* Make a new reference to the packet; make sure that
* the IPv6 header is actually copied, not just referenced,
* so that ip6_output() only scribbles on the copy.
*/
mb_copy = m_copypacket(m, M_DONTWAIT);
if (mb_copy && M_UNWRITABLE(mb_copy, sizeof(struct ip6_hdr)))
mb_copy = m_pullup(mb_copy, sizeof(struct ip6_hdr));
if (mb_copy == NULL) {
splx(s);
return;
}
/* set MCAST flag to the outgoing packet */
mb_copy->m_flags |= M_MCAST;
/*
* If we sourced the packet, call ip6_output since we may divide
* the packet into fragments when the packet is too big for the
* outgoing interface.
* Otherwise, we can simply send the packet to the interface
* sending queue.
*/
if (m->m_pkthdr.rcvif_index == 0) {
struct ip6_moptions im6o;
im6o.im6o_multicast_if_index = if_get_index(ifp);
/* XXX: ip6_output will override ip6->ip6_hlim */
im6o.im6o_multicast_hlim = ip6->ip6_hlim;
im6o.im6o_multicast_loop = 1;
error = ip6_output(mb_copy, NULL, &ro, IPV6_FORWARDING,
&im6o, NULL, NULL);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT)
log(LOG_DEBUG, "phyint_send on mif %td err %d\n",
mifp - mif6table, error);
#endif
splx(s);
return;
}
/*
* If we belong to the destination multicast group
* on the outgoing interface, loop back a copy.
*/
/*
* Does not have to check source info, as it's already covered by
* ip6_input
*/
sockaddr_in6_init(&dst6, &ip6->ip6_dst, 0, 0, 0);
ingroup = in6_multi_group(&ip6->ip6_dst, ifp);
if (ingroup) {
ip6_mloopback(ifp, m,
satocsin6(rtcache_getdst(&ro)));
}
/*
* Put the packet into the sending queue of the outgoing interface
* if it would fit in the MTU of the interface.
*/
if (mb_copy->m_pkthdr.len <= ifp->if_mtu || ifp->if_mtu < IPV6_MMTU) {
error = ip6_if_output(ifp, ifp, mb_copy, &dst6, NULL);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT)
log(LOG_DEBUG, "phyint_send on mif %td err %d\n",
mifp - mif6table, error);
#endif
} else {
/*
* pMTU discovery is intentionally disabled by default, since
* various routers may send pMTU notifications for multicast
* traffic, which can amount to a DDoS against a router.
*/
if (ip6_mcast_pmtu) {
icmp6_error(mb_copy, ICMP6_PACKET_TOO_BIG, 0,
ifp->if_mtu);
} else {
/* simply discard the packet */
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_XMIT) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"phyint_send: packet too big on %s o %s g %s"
" size %d(discarded)\n",
if_name(ifp),
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst),
mb_copy->m_pkthdr.len);
}
#endif
m_freem(mb_copy);
}
}
splx(s);
}
static int
register_send(struct ip6_hdr *ip6, struct mif6 *mif, struct mbuf *m)
{
struct mbuf *mm;
int i, len = m->m_pkthdr.len;
struct sockaddr_in6 sin6;
struct mrt6msg *im6;
#ifdef MRT6DEBUG
if (mrt6debug) {
char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG, "** IPv6 register_send **\n src %s dst %s\n",
IN6_PRINT(ip6bufs, &ip6->ip6_src),
IN6_PRINT(ip6bufd, &ip6->ip6_dst));
}
#endif
PIM6_STATINC(PIM6_STAT_SND_REGISTERS);
/* Make a copy of the packet to send to the user level process */
MGETHDR(mm, M_DONTWAIT, MT_HEADER);
if (mm == NULL)
return ENOBUFS;
mm->m_data += max_linkhdr;
mm->m_len = sizeof(struct ip6_hdr);
if ((mm->m_next = m_copypacket(m, M_DONTWAIT)) == NULL) {
m_freem(mm);
return ENOBUFS;
}
i = MHLEN - M_LEADINGSPACE(mm);
if (i > len)
i = len;
mm = m_pullup(mm, i);
if (mm == NULL)
return ENOBUFS;
mm->m_pkthdr.len = len + sizeof(struct ip6_hdr);
/*
* Send message to routing daemon
*/
sockaddr_in6_init(&sin6, &ip6->ip6_src, 0, 0, 0);
im6 = mtod(mm, struct mrt6msg *);
im6->im6_msgtype = MRT6MSG_WHOLEPKT;
im6->im6_mbz = 0;
im6->im6_mif = mif - mif6table;
/* iif info is not given for reg. encap. */
mrt6stat.mrt6s_upcalls++;
if (socket_send(ip6_mrouter, mm, &sin6) < 0) {
#ifdef MRT6DEBUG
if (mrt6debug)
log(LOG_WARNING,
"register_send: ip6_mrouter socket queue full\n");
#endif
++mrt6stat.mrt6s_upq_sockfull;
return ENOBUFS;
}
return 0;
}
/*
* PIM sparse mode hook. Receives the pim control messages, and passes them up
* to the listening socket, using rip6_input.
*
* The only message processed is the REGISTER pim message; the pim header
* is stripped off, and the inner packet is passed to register_mforward.
*/
int
pim6_input(struct mbuf **mp, int *offp, int proto)
{
struct pim *pim;
struct ip6_hdr *ip6 __mrt6debugused;
int pimlen;
struct mbuf *m = *mp;
int minlen;
int off = *offp;
PIM6_STATINC(PIM6_STAT_RCV_TOTAL);
ip6 = mtod(m, struct ip6_hdr *);
pimlen = m->m_pkthdr.len - off;
/*
* Validate lengths
*/
if (pimlen < PIM_MINLEN) {
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,"pim6_input: PIM packet too short\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* If the packet is at least as big as a REGISTER, go ahead
* and grab the PIM REGISTER header size, to avoid another
* possible m_pullup() later.
*
* PIM_MINLEN == pimhdr + u_int32 == 8
* PIM6_REG_MINLEN == pimhdr + reghdr + eip6hdr == 4 + 4 + 40
*/
minlen = (pimlen >= PIM6_REG_MINLEN) ? PIM6_REG_MINLEN : PIM_MINLEN;
/*
* Make sure that the IP6 and PIM headers are in contiguous memory,
* and possibly the PIM REGISTER header as well.
*/
IP6_EXTHDR_GET(pim, struct pim *, m, off, minlen);
if (pim == NULL) {
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
return IPPROTO_DONE;
}
/* PIM version check */
if (pim->pim_ver != PIM_VERSION) {
PIM6_STATINC(PIM6_STAT_RCV_BADVERSION);
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: incorrect version %d, expecting %d\n",
pim->pim_ver, PIM_VERSION);
#endif
m_freem(m);
return IPPROTO_DONE;
}
#define PIM6_CHECKSUM
#ifdef PIM6_CHECKSUM
{
int cksumlen;
/*
* Validate checksum.
* If PIM REGISTER, exclude the data packet
*/
if (pim->pim_type == PIM_REGISTER)
cksumlen = PIM_MINLEN;
else
cksumlen = pimlen;
if (in6_cksum(m, IPPROTO_PIM, off, cksumlen)) {
PIM6_STATINC(PIM6_STAT_RCV_BADSUM);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: invalid checksum\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
}
#endif /* PIM6_CHECKSUM */
if (pim->pim_type == PIM_REGISTER) {
/*
* since this is a REGISTER, we'll make a copy of the register
* headers ip6+pim+u_int32_t+encap_ip6, to be passed up to the
* routing daemon.
*/
static const struct sockaddr_in6 dst = {
.sin6_len = sizeof(dst),
.sin6_family = AF_INET6,
};
struct mbuf *mcp;
struct ip6_hdr *eip6;
u_int32_t *reghdr;
PIM6_STATINC(PIM6_STAT_RCV_REGISTERS);
if ((reg_mif_num >= nummifs) || (reg_mif_num == (mifi_t) -1)) {
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM)
log(LOG_DEBUG,
"pim6_input: register mif not set: %d\n",
reg_mif_num);
#endif
m_freem(m);
return IPPROTO_DONE;
}
reghdr = (u_int32_t *)(pim + 1);
if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
goto pim6_input_to_daemon;
/*
* Validate length
*/
if (pimlen < PIM6_REG_MINLEN) {
#ifdef MRT6DEBUG
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_ERR,
"pim6_input: register packet size too "
"small %d from %s\n",
pimlen, IN6_PRINT(ip6buf, &ip6->ip6_src));
#endif
PIM6_STATINC(PIM6_STAT_RCV_TOOSHORT);
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
m_freem(m);
return IPPROTO_DONE;
}
eip6 = (struct ip6_hdr *)(reghdr + 1);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input[register], eip6: %s -> %s, "
"eip6 plen %d\n",
IN6_PRINT(ip6bufs, &eip6->ip6_src),
IN6_PRINT(ip6bufd, &eip6->ip6_dst),
ntohs(eip6->ip6_plen));
}
#endif
/* verify the version number of the inner packet */
if ((eip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
#ifdef MRT6DEBUG
log(LOG_DEBUG, "pim6_input: invalid IP version (%d) "
"of the inner packet\n",
(eip6->ip6_vfc & IPV6_VERSION));
#endif
m_freem(m);
return IPPROTO_DONE;
}
/* verify the inner packet is destined to a mcast group */
if (!IN6_IS_ADDR_MULTICAST(&eip6->ip6_dst)) {
PIM6_STATINC(PIM6_STAT_RCV_BADREGISTERS);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6buf[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input: inner packet of register "
"is not multicast %s\n",
IN6_PRINT(ip6buf, &eip6->ip6_dst));
}
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* make a copy of the whole header to pass to the daemon later.
*/
mcp = m_copym(m, 0, off + PIM6_REG_MINLEN, M_DONTWAIT);
if (mcp == NULL) {
#ifdef MRT6DEBUG
log(LOG_ERR,
"pim6_input: pim register: "
"could not copy register head\n");
#endif
m_freem(m);
return IPPROTO_DONE;
}
/*
* forward the inner ip6 packet; point m_data at the inner ip6.
*/
m_adj(m, off + PIM_MINLEN);
#ifdef MRT6DEBUG
if (mrt6debug & DEBUG_PIM) {
char ip6bufs[INET6_ADDRSTRLEN];
char ip6bufd[INET6_ADDRSTRLEN];
log(LOG_DEBUG,
"pim6_input: forwarding decapsulated register: "
"src %s, dst %s, mif %d\n",
IN6_PRINT(ip6bufs, &eip6->ip6_src),
IN6_PRINT(ip6bufd, &eip6->ip6_dst),
reg_mif_num);
}
#endif
looutput(mif6table[reg_mif_num].m6_ifp, m, sin6tocsa(&dst),
NULL);
/* prepare the register head to send to the mrouting daemon */
m = mcp;
}
/*
* Pass the PIM message up to the daemon; if it is a register message
* pass the 'head' only up to the daemon. This includes the
* encapsulator ip6 header, pim header, register header and the
* encapsulated ip6 header.
*/
pim6_input_to_daemon:
/*
* Currently, rip6_input() is always called holding softnet_lock
* by ipintr()(!NET_MPSAFE) or PR_INPUT_WRAP()(NET_MPSAFE).
*/
KASSERT(mutex_owned(softnet_lock));
rip6_input(&m, offp, proto);
return IPPROTO_DONE;
}
static int
sysctl_net_inet6_pim6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(pim6stat_percpu, PIM6_NSTATS));
}
static void
sysctl_net_inet6_pim6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "pim6",
SYSCTL_DESCR("PIMv6 settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_PIM, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("PIMv6 statistics"),
sysctl_net_inet6_pim6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_PIM, PIM6CTL_STATS,
CTL_EOL);
}
/* $NetBSD: kern_descrip.c,v 1.262 2023/10/04 22:17:09 ad Exp $ */
/*-
* Copyright (c) 2008, 2009, 2023 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
*/
/*
* File descriptor management.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_descrip.c,v 1.262 2023/10/04 22:17:09 ad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/fcntl.h>
#include <sys/pool.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/vnode.h>
#include <sys/sysctl.h>
#include <sys/ktrace.h>
/*
* A list (head) of open files, counter, and lock protecting them.
*/
struct filelist filehead __cacheline_aligned;
static u_int nfiles __cacheline_aligned;
kmutex_t filelist_lock __cacheline_aligned;
static pool_cache_t filedesc_cache __read_mostly;
static pool_cache_t file_cache __read_mostly;
static int file_ctor(void *, void *, int);
static void file_dtor(void *, void *);
static void fdfile_ctor(fdfile_t *);
static void fdfile_dtor(fdfile_t *);
static int filedesc_ctor(void *, void *, int);
static void filedesc_dtor(void *, void *);
static int filedescopen(dev_t, int, int, lwp_t *);
static int sysctl_kern_file(SYSCTLFN_PROTO);
static int sysctl_kern_file2(SYSCTLFN_PROTO);
static void fill_file(struct file *, const struct file *);
static void fill_file2(struct kinfo_file *, const file_t *, const fdfile_t *,
int, pid_t);
const struct cdevsw filedesc_cdevsw = {
.d_open = filedescopen,
.d_close = noclose,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = noioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
/* For ease of reading. */
__strong_alias(fd_putvnode,fd_putfile)
__strong_alias(fd_putsock,fd_putfile)
/*
* Initialize the descriptor system.
*/
void
fd_sys_init(void)
{
static struct sysctllog *clog;
mutex_init(&filelist_lock, MUTEX_DEFAULT, IPL_NONE);
LIST_INIT(&filehead);
file_cache = pool_cache_init(sizeof(file_t), coherency_unit, 0,
0, "file", NULL, IPL_NONE, file_ctor, file_dtor, NULL);
KASSERT(file_cache != NULL);
filedesc_cache = pool_cache_init(sizeof(filedesc_t), coherency_unit,
0, 0, "filedesc", NULL, IPL_NONE, filedesc_ctor, filedesc_dtor,
NULL);
KASSERT(filedesc_cache != NULL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "file",
SYSCTL_DESCR("System open file table"),
sysctl_kern_file, 0, NULL, 0,
CTL_KERN, KERN_FILE, CTL_EOL);
sysctl_createv(&clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "file2",
SYSCTL_DESCR("System open file table"),
sysctl_kern_file2, 0, NULL, 0,
CTL_KERN, KERN_FILE2, CTL_EOL);
}
static bool
fd_isused(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
return (fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0;
}
/*
* Verify that the bitmaps match the descriptor table.
*/
static inline void
fd_checkmaps(filedesc_t *fdp)
{
#ifdef DEBUG
fdtab_t *dt;
u_int fd;
KASSERT(fdp->fd_refcnt <= 1 || mutex_owned(&fdp->fd_lock));
dt = fdp->fd_dt;
if (fdp->fd_refcnt == -1) {
/*
* fd_free tears down the table without maintaining its bitmap.
*/
return;
}
	for (fd = 0; fd < dt->dt_nfiles; fd++) {
		if (fd < NDFDFILE) {
			KASSERT(dt->dt_ff[fd] ==
			    (fdfile_t *)fdp->fd_dfdfile[fd]);
		}
		if (dt->dt_ff[fd] == NULL) {
			KASSERT(!fd_isused(fdp, fd));
		} else if (dt->dt_ff[fd]->ff_file != NULL) {
			KASSERT(fd_isused(fdp, fd));
		}
	}
#endif
}
static int
fd_next_zero(filedesc_t *fdp, uint32_t *bitmap, int want, u_int bits)
{
int i, off, maxoff;
uint32_t sub;
KASSERT(mutex_owned(&fdp->fd_lock));
fd_checkmaps(fdp);
if (want > bits)
return -1;
off = want >> NDENTRYSHIFT;
i = want & NDENTRYMASK;
if (i) {
sub = bitmap[off] | ((u_int)~0 >> (NDENTRIES - i));
if (sub != ~0)
goto found;
off++;
}
maxoff = NDLOSLOTS(bits);
	while (off < maxoff) {
		if ((sub = bitmap[off]) != ~0)
goto found;
off++;
}
return -1;
found:
return (off << NDENTRYSHIFT) + ffs(~sub) - 1;
}
static int
fd_last_set(filedesc_t *fd, int last)
{
int off, i;
fdfile_t **ff = fd->fd_dt->dt_ff;
uint32_t *bitmap = fd->fd_lomap;
KASSERT(mutex_owned(&fd->fd_lock));
fd_checkmaps(fd);
off = (last - 1) >> NDENTRYSHIFT;
while (off >= 0 && !bitmap[off])
off--;
if (off < 0)
return -1;
i = ((off + 1) << NDENTRYSHIFT) - 1;
if (i >= last)
i = last - 1;
/* XXX should use bitmap */
while (i > 0 && (ff[i] == NULL || !ff[i]->ff_allocated))
i--;
return i;
}
static inline void
fd_used(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
fdfile_t *ff;
ff = fdp->fd_dt->dt_ff[fd];
	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) == 0);
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(!ff->ff_allocated);
ff->ff_allocated = true;
fdp->fd_lomap[off] |= 1U << (fd & NDENTRYMASK);
	if (__predict_false(fdp->fd_lomap[off] == ~0)) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1U << (off & NDENTRYMASK))) == 0);
fdp->fd_himap[off >> NDENTRYSHIFT] |= 1U << (off & NDENTRYMASK);
}
	if ((int)fd > fdp->fd_lastfile) {
		fdp->fd_lastfile = fd;
}
fd_checkmaps(fdp);
}
static inline void
fd_unused(filedesc_t *fdp, unsigned fd)
{
u_int off = fd >> NDENTRYSHIFT;
fdfile_t *ff;
ff = fdp->fd_dt->dt_ff[fd];
	KASSERT(mutex_owned(&fdp->fd_lock));
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);

	if (fd < fdp->fd_freefile) {
		fdp->fd_freefile = fd;
}
	if (fdp->fd_lomap[off] == ~0) {
		KASSERT((fdp->fd_himap[off >> NDENTRYSHIFT] &
		    (1U << (off & NDENTRYMASK))) != 0);
fdp->fd_himap[off >> NDENTRYSHIFT] &=
~(1U << (off & NDENTRYMASK));
}
KASSERT((fdp->fd_lomap[off] & (1U << (fd & NDENTRYMASK))) != 0);
fdp->fd_lomap[off] &= ~(1U << (fd & NDENTRYMASK));
ff->ff_allocated = false;
	KASSERT(fd <= fdp->fd_lastfile);
	if (fd == fdp->fd_lastfile) {
		fdp->fd_lastfile = fd_last_set(fdp, fd);
}
fd_checkmaps(fdp);
}
/*
* Look up the file structure corresponding to a file descriptor
* and return the file, holding a reference on the descriptor.
*/
file_t *
fd_getfile(unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
/*
* Look up the fdfile structure representing this descriptor.
* We are doing this unlocked. See fd_tryexpand().
*/
fdp = curlwp->l_fd;
	dt = atomic_load_consume(&fdp->fd_dt);
	if (__predict_false(fd >= dt->dt_nfiles)) {
return NULL;
}
ff = dt->dt_ff[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
	if (__predict_false(ff == NULL)) {
return NULL;
}
/* Now get a reference to the descriptor. */
if (fdp->fd_refcnt == 1) {
/*
* Single threaded: don't need to worry about concurrent
* access (other than earlier calls to kqueue, which may
* hold a reference to the descriptor).
*/
ff->ff_refcnt++;
} else {
/*
* Multi threaded: issue a memory barrier to ensure that we
* acquire the file pointer _after_ adding a reference. If
* no memory barrier, we could fetch a stale pointer.
*
* In particular, we must coordinate the following four
* memory operations:
*
* A. fd_close store ff->ff_file = NULL
* B. fd_close refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
* C. fd_getfile atomic_inc_uint(&ff->ff_refcnt)
* D. fd_getfile load fp = ff->ff_file
*
* If the order is D;A;B;C:
*
* 1. D: fp = ff->ff_file
* 2. A: ff->ff_file = NULL
* 3. B: refcnt = atomic_dec_uint_nv(&ff->ff_refcnt)
* 4. C: atomic_inc_uint(&ff->ff_refcnt)
*
* then fd_close determines that there are no more
* references and decides to free fp immediately, at
* the same that fd_getfile ends up with an fp that's
* about to be freed. *boom*
*
* By making B a release operation in fd_close, and by
* making C an acquire operation in fd_getfile, since
* they are atomic operations on the same object, which
* has a total modification order, we guarantee either:
*
* - B happens before C. Then since A is
* sequenced before B in fd_close, and C is
* sequenced before D in fd_getfile, we
* guarantee A happens before D, so fd_getfile
* reads a null fp and safely fails.
*
* - C happens before B. Then fd_getfile may read
* null or nonnull, but either way, fd_close
* will safely wait for references to drain.
*/
atomic_inc_uint(&ff->ff_refcnt);
membar_acquire();
}
/*
* If the file is not open or is being closed then put the
* reference back.
*/
	fp = atomic_load_consume(&ff->ff_file);
	if (__predict_true(fp != NULL)) {
return fp;
}
fd_putfile(fd);
return NULL;
}
/*
* Release a reference to a file descriptor acquired with fd_getfile().
*/
void
fd_putfile(unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
u_int u, v;
fdp = curlwp->l_fd;
	KASSERT(fd < atomic_load_consume(&fdp->fd_dt)->dt_nfiles);
	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];

	KASSERT(ff != NULL);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (fdp->fd_refcnt == 1) {
/*
* Single threaded: don't need to worry about concurrent
* access (other than earlier calls to kqueue, which may
* hold a reference to the descriptor).
*/
if (__predict_false((ff->ff_refcnt & FR_CLOSING) != 0)) {
fd_close(fd);
return;
}
ff->ff_refcnt--;
return;
}
/*
* Ensure that any use of the file is complete and globally
* visible before dropping the final reference. If no membar,
* the current CPU could still access memory associated with
* the file after it has been freed or recycled by another
* CPU.
*/
membar_release();
/*
* Be optimistic and start out with the assumption that no other
* threads are trying to close the descriptor. If the CAS fails,
* we lost a race and/or it's being closed.
*/
for (u = ff->ff_refcnt & FR_MASK;; u = v) {
v = atomic_cas_uint(&ff->ff_refcnt, u, u - 1);
if (__predict_true(u == v)) {
return;
}
if (__predict_false((v & FR_CLOSING) != 0)) {
break;
}
}
/* Another thread is waiting to close the file: join it. */
(void)fd_close(fd);
}
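/*
 * Illustrative sketch only (not part of the original file): the usual
 * pattern for code that looks up a descriptor with fd_getfile() and
 * releases the descriptor reference with fd_putfile().  The function
 * name example_fd_user() is hypothetical.
 */
#if 0
static int
example_fd_user(unsigned fd)
{
	file_t *fp;

	if ((fp = fd_getfile(fd)) == NULL)
		return EBADF;
	/* ... use fp (e.g. fp->f_ops, fp->f_data) while holding the ref ... */
	fd_putfile(fd);
	return 0;
}
#endif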
/*
* Convenience wrapper around fd_getfile() that returns reference
* to a vnode.
*/
int
fd_getvnode(unsigned fd, file_t **fpp)
{
vnode_t *vp;
file_t *fp;
fp = fd_getfile(fd);
if (__predict_false(fp == NULL)) {
return EBADF;
}
if (__predict_false(fp->f_type != DTYPE_VNODE)) {
fd_putfile(fd);
return EINVAL;
}
vp = fp->f_vnode;
if (__predict_false(vp->v_type == VBAD)) {
/* XXX Is this case really necessary? */
fd_putfile(fd);
return EBADF;
}
*fpp = fp;
return 0;
}
/*
* Convenience wrapper around fd_getfile() that returns reference
* to a socket.
*/
int
fd_getsock1(unsigned fd, struct socket **sop, file_t **fp)
{
*fp = fd_getfile(fd);
if (__predict_false(*fp == NULL)) {
return EBADF;
}
if (__predict_false((*fp)->f_type != DTYPE_SOCKET)) {
fd_putfile(fd);
return ENOTSOCK;
}
*sop = (*fp)->f_socket;
return 0;
}
int
fd_getsock(unsigned fd, struct socket **sop)
{
file_t *fp;
return fd_getsock1(fd, sop, &fp);
}
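/*
 * Illustrative sketch only (hypothetical, not in the original source):
 * looking up a socket descriptor with fd_getsock().  The descriptor
 * reference taken by fd_getfile() underneath is released with
 * fd_putfile(), exactly as for plain files.
 */
#if 0
static int
example_sock_user(unsigned fd)
{
	struct socket *so;
	int error;

	if ((error = fd_getsock(fd, &so)) != 0)
		return error;	/* EBADF or ENOTSOCK */
	/* ... operate on 'so' while the descriptor reference is held ... */
	fd_putfile(fd);
	return 0;
}
#endif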
/*
* Look up the file structure corresponding to a file descriptor
* and return it with a reference held on the file, not the
* descriptor.
*
* This is heavyweight and only used when accessing descriptors
* from a foreign process. The caller must ensure that `p' does
* not exit or fork across this call.
*
* To release the file (not descriptor) reference, use closef().
*/
file_t *
fd_getfile2(proc_t *p, unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
fdp = p->p_fd;
mutex_enter(&fdp->fd_lock);
dt = fdp->fd_dt;
if (fd >= dt->dt_nfiles) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
if ((ff = dt->dt_ff[fd]) == NULL) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
mutex_exit(&fdp->fd_lock);
return NULL;
}
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
mutex_exit(&fdp->fd_lock);
return fp;
}
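/*
 * Illustrative sketch only (hypothetical): inspecting a descriptor of a
 * foreign process.  fd_getfile2() returns a reference on the file, not
 * the descriptor, so it is released with closef() rather than
 * fd_putfile().  The caller must keep 'p' from exiting or forking.
 */
#if 0
static int
example_peek_foreign(proc_t *p, unsigned fd)
{
	file_t *fp;

	if ((fp = fd_getfile2(p, fd)) == NULL)
		return EBADF;
	/* ... inspect 'fp' ... */
	return closef(fp);
}
#endif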
/*
* Internal form of close. Must be called with a reference to the
* descriptor, and will drop the reference. When all descriptor
* references are dropped, releases the descriptor slot and a single
* reference to the file structure.
*/
int
fd_close(unsigned fd)
{
struct flock lf;
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
proc_t *p;
lwp_t *l;
u_int refcnt;
l = curlwp;
p = l->l_proc;
fdp = l->l_fd;
	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
mutex_enter(&fdp->fd_lock);
	KASSERT((ff->ff_refcnt & FR_MASK) > 0);
	fp = atomic_load_consume(&ff->ff_file);
if (__predict_false(fp == NULL)) {
/*
* Another user of the file is already closing, and is
* waiting for other users of the file to drain. Release
* our reference, and wake up the closer.
*/
membar_release();
atomic_dec_uint(&ff->ff_refcnt);
cv_broadcast(&ff->ff_closing);
mutex_exit(&fdp->fd_lock);
/*
* An application error, so pretend that the descriptor
* was already closed. We can't safely wait for it to
* be closed without potentially deadlocking.
*/
return (EBADF);
}
KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
/*
* There may be multiple users of this file within the process.
* Notify existing and new users that the file is closing. This
* will prevent them from adding additional uses to this file
* while we are closing it.
*/
atomic_store_relaxed(&ff->ff_file, NULL);
ff->ff_exclose = false;
/*
* We expect the caller to hold a descriptor reference - drop it.
* The reference count may increase beyond zero at this point due
* to an erroneous descriptor reference by an application, but
* fd_getfile() will notice that the file is being closed and drop
* the reference again.
*/
if (fdp->fd_refcnt == 1) {
/* Single threaded. */
refcnt = --(ff->ff_refcnt);
} else {
/* Multi threaded. */
membar_release();
refcnt = atomic_dec_uint_nv(&ff->ff_refcnt);
membar_acquire();
}
if (__predict_false(refcnt != 0)) {
/*
* Wait for other references to drain. This is typically
* an application error - the descriptor is being closed
* while still in use.
* (Or just a threaded application trying to unblock its
* thread that sleeps in (say) accept()).
*/
atomic_or_uint(&ff->ff_refcnt, FR_CLOSING);
/*
* Remove any knotes attached to the file. A knote
* attached to the descriptor can hold references on it.
*/
mutex_exit(&fdp->fd_lock);
		if (!SLIST_EMPTY(&ff->ff_knlist)) {
			knote_fdclose(fd);
}
/*
* Since the file system code doesn't know which fd
* each request came from (think dup()), we have to
* ask it to return ERESTART for any long-term blocks.
* The re-entry through read/write/etc will detect the
		 * closed fd and return EBADF.
* Blocked partial writes may return a short length.
*/
(*fp->f_ops->fo_restart)(fp);
mutex_enter(&fdp->fd_lock);
/*
* We need to see the count drop to zero at least once,
* in order to ensure that all pre-existing references
* have been drained. New references past this point are
* of no interest.
* XXX (dsl) this may need to call fo_restart() after a
* timeout to guarantee that all the system calls exit.
*/
while ((ff->ff_refcnt & FR_MASK) != 0) {
cv_wait(&ff->ff_closing, &fdp->fd_lock);
}
atomic_and_uint(&ff->ff_refcnt, ~FR_CLOSING);
} else {
/* If no references, there must be no knotes. */
KASSERT(SLIST_EMPTY(&ff->ff_knlist));
}
/*
* POSIX record locking dictates that any close releases ALL
* locks owned by this process. This is handled by setting
* a flag in the unlock to free ONLY locks obeying POSIX
* semantics, and not to free BSD-style file locks.
* If the descriptor was in a message, POSIX-style locks
* aren't passed with the descriptor.
*/
if (__predict_false((p->p_flag & PK_ADVLOCK) != 0) &&
fp->f_ops->fo_advlock != NULL) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
mutex_exit(&fdp->fd_lock);
(void)(*fp->f_ops->fo_advlock)(fp, p, F_UNLCK, &lf, F_POSIX);
mutex_enter(&fdp->fd_lock);
}
/* Free descriptor slot. */
fd_unused(fdp, fd);
mutex_exit(&fdp->fd_lock);
/* Now drop reference to the file itself. */
return closef(fp);
}
/*
* Duplicate a file descriptor.
*/
int
fd_dup(file_t *fp, int minfd, int *newp, bool exclose)
{
proc_t *p = curproc;
fdtab_t *dt;
int error;
while ((error = fd_alloc(p, minfd, newp)) != 0) {
if (error != ENOSPC) {
return error;
}
fd_tryexpand(p);
}
dt = atomic_load_consume(&curlwp->l_fd->fd_dt);
dt->dt_ff[*newp]->ff_exclose = exclose;
fd_affix(p, fp, *newp);
return 0;
}
/*
* dup2 operation.
*/
int
fd_dup2(file_t *fp, unsigned newfd, int flags)
{
filedesc_t *fdp = curlwp->l_fd;
fdfile_t *ff;
fdtab_t *dt;
if (flags & ~(O_CLOEXEC|O_NONBLOCK|O_NOSIGPIPE))
return EINVAL;
/*
* Ensure there are enough slots in the descriptor table,
* and allocate an fdfile_t up front in case we need it.
*/
	while (newfd >= atomic_load_consume(&fdp->fd_dt)->dt_nfiles) {
		fd_tryexpand(curproc);
}
ff = kmem_alloc(sizeof(*ff), KM_SLEEP);
fdfile_ctor(ff);
/*
* If there is already a file open, close it. If the file is
* half open, wait for it to be constructed before closing it.
* XXX Potential for deadlock here?
*/
mutex_enter(&fdp->fd_lock);
while (fd_isused(fdp, newfd)) {
mutex_exit(&fdp->fd_lock);
if (fd_getfile(newfd) != NULL) {
(void)fd_close(newfd);
} else {
/*
* Crummy, but unlikely to happen.
* Can occur if we interrupt another
* thread while it is opening a file.
*/
kpause("dup2", false, 1, NULL);
}
mutex_enter(&fdp->fd_lock);
}
dt = fdp->fd_dt;
	if (dt->dt_ff[newfd] == NULL) {
		KASSERT(newfd >= NDFDFILE);
dt->dt_ff[newfd] = ff;
ff = NULL;
}
fd_used(fdp, newfd);
mutex_exit(&fdp->fd_lock);
dt->dt_ff[newfd]->ff_exclose = (flags & O_CLOEXEC) != 0;
fp->f_flag |= flags & (FNONBLOCK|FNOSIGPIPE);
/* Slot is now allocated. Insert copy of the file. */
fd_affix(curproc, fp, newfd);
	if (ff != NULL) {
		cv_destroy(&ff->ff_closing);
kmem_free(ff, sizeof(*ff));
}
return 0;
}
/*
* Drop reference to a file structure.
*/
int
closef(file_t *fp)
{
struct flock lf;
int error;
/*
* Drop reference. If referenced elsewhere it's still open
* and we have nothing more to do.
*/
mutex_enter(&fp->f_lock);
	KASSERT(fp->f_count > 0);
	if (--fp->f_count > 0) {
mutex_exit(&fp->f_lock);
return 0;
}
KASSERT(fp->f_count == 0);
mutex_exit(&fp->f_lock);
/* We held the last reference - release locks, close and free. */
if (fp->f_ops->fo_advlock == NULL) {
		KASSERT((fp->f_flag & FHASLOCK) == 0);
	} else if (fp->f_flag & FHASLOCK) {
		lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void)(*fp->f_ops->fo_advlock)(fp, fp, F_UNLCK, &lf, F_FLOCK);
}
	if (fp->f_ops != NULL) {
		error = (*fp->f_ops->fo_close)(fp);
} else {
error = 0;
}
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_cred != NULL);
pool_cache_put(file_cache, fp);
return error;
}
/*
* Allocate a file descriptor for the process.
*
* Future idea for experimentation: replace all of this with radixtree.
*/
int
fd_alloc(proc_t *p, int want, int *result)
{
filedesc_t *fdp = p->p_fd;
int i, lim, last, error, hi;
u_int off;
fdtab_t *dt;
KASSERT(p == curproc || p == &proc0);
/*
* Search for a free descriptor starting at the higher
* of want or fd_freefile.
*/
mutex_enter(&fdp->fd_lock);
fd_checkmaps(fdp);
dt = fdp->fd_dt;
KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
lim = uimin((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
last = uimin(dt->dt_nfiles, lim);
for (;;) {
if ((i = want) < fdp->fd_freefile)
i = fdp->fd_freefile;
off = i >> NDENTRYSHIFT;
hi = fd_next_zero(fdp, fdp->fd_himap, off,
(last + NDENTRIES - 1) >> NDENTRYSHIFT);
if (hi == -1)
break;
i = fd_next_zero(fdp, &fdp->fd_lomap[hi],
hi > off ? 0 : i & NDENTRYMASK, NDENTRIES);
if (i == -1) {
/*
* Free file descriptor in this block was
* below want, try again with higher want.
*/
want = (hi + 1) << NDENTRYSHIFT;
continue;
}
i += (hi << NDENTRYSHIFT);
if (i >= last) {
break;
}
		if (dt->dt_ff[i] == NULL) {
			KASSERT(i >= NDFDFILE);
dt->dt_ff[i] = kmem_alloc(sizeof(fdfile_t), KM_SLEEP);
fdfile_ctor(dt->dt_ff[i]);
}
KASSERT(dt->dt_ff[i]->ff_file == NULL);
fd_used(fdp, i);
		if (want <= fdp->fd_freefile) {
			fdp->fd_freefile = i;
}
*result = i;
KASSERT(i >= NDFDFILE ||
dt->dt_ff[i] == (fdfile_t *)fdp->fd_dfdfile[i]);
fd_checkmaps(fdp);
mutex_exit(&fdp->fd_lock);
return 0;
}
/* No space in current array. Let the caller expand and retry. */
error = (dt->dt_nfiles >= lim) ? EMFILE : ENOSPC;
mutex_exit(&fdp->fd_lock);
return error;
}
/*
* Allocate memory for a descriptor table.
*/
static fdtab_t *
fd_dtab_alloc(int n)
{
fdtab_t *dt;
size_t sz;
KASSERT(n > NDFILE);
sz = sizeof(*dt) + (n - NDFILE) * sizeof(dt->dt_ff[0]);
dt = kmem_alloc(sz, KM_SLEEP);
#ifdef DIAGNOSTIC
memset(dt, 0xff, sz);
#endif
dt->dt_nfiles = n;
dt->dt_link = NULL;
return dt;
}
/*
* Free a descriptor table, and all tables linked for deferred free.
*/
static void
fd_dtab_free(fdtab_t *dt)
{
fdtab_t *next;
size_t sz;
do {
next = dt->dt_link;
KASSERT(dt->dt_nfiles > NDFILE);
sz = sizeof(*dt) +
(dt->dt_nfiles - NDFILE) * sizeof(dt->dt_ff[0]);
#ifdef DIAGNOSTIC
memset(dt, 0xff, sz);
#endif
kmem_free(dt, sz);
dt = next;
} while (dt != NULL);
}
/*
* Allocate descriptor bitmap.
*/
static void
fd_map_alloc(int n, uint32_t **lo, uint32_t **hi)
{
uint8_t *ptr;
size_t szlo, szhi;
KASSERT(n > NDENTRIES);
szlo = NDLOSLOTS(n) * sizeof(uint32_t);
szhi = NDHISLOTS(n) * sizeof(uint32_t);
ptr = kmem_alloc(szlo + szhi, KM_SLEEP);
*lo = (uint32_t *)ptr;
*hi = (uint32_t *)(ptr + szlo);
}
/*
* Free descriptor bitmap.
*/
static void
fd_map_free(int n, uint32_t *lo, uint32_t *hi)
{
size_t szlo, szhi;
KASSERT(n > NDENTRIES);
szlo = NDLOSLOTS(n) * sizeof(uint32_t);
szhi = NDHISLOTS(n) * sizeof(uint32_t);
KASSERT(hi == (uint32_t *)((uint8_t *)lo + szlo));
kmem_free(lo, szlo + szhi);
}
/*
* Expand a process' descriptor table.
*/
void
fd_tryexpand(proc_t *p)
{
filedesc_t *fdp;
int i, numfiles, oldnfiles;
fdtab_t *newdt, *dt;
uint32_t *newhimap, *newlomap;
KASSERT(p == curproc || p == &proc0);
fdp = p->p_fd;
newhimap = NULL;
newlomap = NULL;
oldnfiles = atomic_load_consume(&fdp->fd_dt)->dt_nfiles;
if (oldnfiles < NDEXTENT)
numfiles = NDEXTENT;
else
numfiles = 2 * oldnfiles;
newdt = fd_dtab_alloc(numfiles);
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
fd_map_alloc(numfiles, &newlomap, &newhimap);
}
mutex_enter(&fdp->fd_lock);
dt = fdp->fd_dt;
KASSERT(dt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
if (dt->dt_nfiles != oldnfiles) {
/* fdp changed; caller must retry */
mutex_exit(&fdp->fd_lock);
fd_dtab_free(newdt);
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
fd_map_free(numfiles, newlomap, newhimap);
}
return;
}
/* Copy the existing descriptor table and zero the new portion. */
i = sizeof(fdfile_t *) * oldnfiles;
memcpy(newdt->dt_ff, dt->dt_ff, i);
memset((uint8_t *)newdt->dt_ff + i, 0,
numfiles * sizeof(fdfile_t *) - i);
/*
* Link old descriptor array into list to be discarded. We defer
* freeing until the last reference to the descriptor table goes
* away (usually process exit). This allows us to do lockless
* lookups in fd_getfile().
*/
if (oldnfiles > NDFILE) {
if (fdp->fd_refcnt > 1) {
newdt->dt_link = dt;
} else {
fd_dtab_free(dt);
}
}
if (NDHISLOTS(numfiles) > NDHISLOTS(oldnfiles)) {
i = NDHISLOTS(oldnfiles) * sizeof(uint32_t);
memcpy(newhimap, fdp->fd_himap, i);
memset((uint8_t *)newhimap + i, 0,
NDHISLOTS(numfiles) * sizeof(uint32_t) - i);
i = NDLOSLOTS(oldnfiles) * sizeof(uint32_t);
memcpy(newlomap, fdp->fd_lomap, i);
memset((uint8_t *)newlomap + i, 0,
NDLOSLOTS(numfiles) * sizeof(uint32_t) - i);
if (NDHISLOTS(oldnfiles) > NDHISLOTS(NDFILE)) {
fd_map_free(oldnfiles, fdp->fd_lomap, fdp->fd_himap);
}
fdp->fd_himap = newhimap;
fdp->fd_lomap = newlomap;
}
/*
* All other modifications must become globally visible before
* the change to fd_dt. See fd_getfile().
*/
atomic_store_release(&fdp->fd_dt, newdt);
KASSERT(newdt->dt_ff[0] == (fdfile_t *)fdp->fd_dfdfile[0]);
fd_checkmaps(fdp);
mutex_exit(&fdp->fd_lock);
}
/*
* Create a new open file structure and allocate a file descriptor
* for the current process.
*/
int
fd_allocfile(file_t **resultfp, int *resultfd)
{
proc_t *p = curproc;
kauth_cred_t cred;
file_t *fp;
int error;
while ((error = fd_alloc(p, 0, resultfd)) != 0) {
if (error != ENOSPC) {
return error;
}
fd_tryexpand(p);
}
fp = pool_cache_get(file_cache, PR_WAITOK);
if (fp == NULL) {
fd_abort(p, NULL, *resultfd);
return ENFILE;
}
	KASSERT(fp->f_count == 0);
	KASSERT(fp->f_msgcount == 0);
	KASSERT(fp->f_unpcount == 0);
/* Replace cached credentials if not what we need. */
cred = curlwp->l_cred;
	if (__predict_false(cred != fp->f_cred)) {
		kauth_cred_free(fp->f_cred);
fp->f_cred = kauth_cred_hold(cred);
}
/*
* Don't allow recycled files to be scanned.
* See uipc_usrreq.c.
*/
	if (__predict_false((fp->f_flag & FSCAN) != 0)) {
		mutex_enter(&fp->f_lock);
atomic_and_uint(&fp->f_flag, ~FSCAN);
mutex_exit(&fp->f_lock);
}
fp->f_advice = 0;
fp->f_offset = 0;
*resultfp = fp;
return 0;
}
/*
* Successful creation of a new descriptor: make visible to the process.
*/
void
fd_affix(proc_t *p, file_t *fp, unsigned fd)
{
fdfile_t *ff;
filedesc_t *fdp;
fdtab_t *dt;
KASSERT(p == curproc || p == &proc0);
/* Add a reference to the file structure. */
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
/*
* Insert the new file into the descriptor slot.
*/
fdp = p->p_fd;
dt = atomic_load_consume(&fdp->fd_dt);
ff = dt->dt_ff[fd];
	KASSERT(ff != NULL);
	KASSERT(ff->ff_file == NULL);
	KASSERT(ff->ff_allocated);
	KASSERT(fd_isused(fdp, fd));
	KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
/* No need to lock in order to make file initially visible. */
atomic_store_release(&ff->ff_file, fp);
}
/*
* Abort creation of a new descriptor: free descriptor slot and file.
*/
void
fd_abort(proc_t *p, file_t *fp, unsigned fd)
{
filedesc_t *fdp;
fdfile_t *ff;
KASSERT(p == curproc || p == &proc0);
fdp = p->p_fd;
ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
ff->ff_exclose = false;
KASSERT(fd >= NDFDFILE || ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
mutex_enter(&fdp->fd_lock);
KASSERT(fd_isused(fdp, fd));
fd_unused(fdp, fd);
mutex_exit(&fdp->fd_lock);
	if (fp != NULL) {
		KASSERT(fp->f_count == 0);
		KASSERT(fp->f_cred != NULL);
pool_cache_put(file_cache, fp);
}
}
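/*
 * Illustrative sketch only (hypothetical): the canonical sequence used
 * by open-like code paths -- allocate a file and descriptor slot with
 * fd_allocfile(), initialise the file, then either publish it with
 * fd_affix() or back out with fd_abort().  example_setup() is an
 * assumed helper, not part of this file.
 */
#if 0
static int
example_open_like(int *fdout)
{
	file_t *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;
	error = example_setup(fp);	/* sets f_type, f_ops, f_data */
	if (error != 0) {
		fd_abort(curproc, fp, fd);
		return error;
	}
	fd_affix(curproc, fp, fd);
	*fdout = fd;
	return 0;
}
#endif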
static int
file_ctor(void *arg, void *obj, int flags)
{
/*
* It's easy to exhaust the open file limit on a system with many
* CPUs due to caching. Allow a bit of leeway to reduce the element
* of surprise.
*/
u_int slop = PCG_NOBJECTS_NORMAL * (ncpu - 1);
file_t *fp = obj;
memset(fp, 0, sizeof(*fp));
mutex_enter(&filelist_lock);
if (__predict_false(nfiles >= slop + maxfiles)) {
mutex_exit(&filelist_lock);
tablefull("file", "increase kern.maxfiles or MAXFILES");
return ENFILE;
}
nfiles++;
LIST_INSERT_HEAD(&filehead, fp, f_list);
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
fp->f_cred = kauth_cred_hold(curlwp->l_cred);
mutex_exit(&filelist_lock);
return 0;
}
static void
file_dtor(void *arg, void *obj)
{
file_t *fp = obj;
mutex_enter(&filelist_lock);
nfiles--;
LIST_REMOVE(fp, f_list);
mutex_exit(&filelist_lock);
KASSERT(fp->f_count == 0);
kauth_cred_free(fp->f_cred);
mutex_destroy(&fp->f_lock);
}
static void
fdfile_ctor(fdfile_t *ff)
{
memset(ff, 0, sizeof(*ff));
cv_init(&ff->ff_closing, "fdclose");
}
static void
fdfile_dtor(fdfile_t *ff)
{
cv_destroy(&ff->ff_closing);
}
file_t *
fgetdummy(void)
{
file_t *fp;
fp = kmem_zalloc(sizeof(*fp), KM_SLEEP);
mutex_init(&fp->f_lock, MUTEX_DEFAULT, IPL_NONE);
return fp;
}
void
fputdummy(file_t *fp)
{
mutex_destroy(&fp->f_lock);
kmem_free(fp, sizeof(*fp));
}
/*
* Create an initial filedesc structure.
*/
filedesc_t *
fd_init(filedesc_t *fdp)
{
#ifdef DIAGNOSTIC
unsigned fd;
#endif
if (__predict_true(fdp == NULL)) {
fdp = pool_cache_get(filedesc_cache, PR_WAITOK);
} else {
KASSERT(fdp == &filedesc0);
filedesc_ctor(NULL, fdp, PR_WAITOK);
}
#ifdef DIAGNOSTIC
KASSERT(fdp->fd_lastfile == -1);
KASSERT(fdp->fd_lastkqfile == -1);
KASSERT(fdp->fd_knhash == NULL);
KASSERT(fdp->fd_freefile == 0);
KASSERT(fdp->fd_exclose == false);
KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
for (fd = 0; fd < NDFDFILE; fd++) {
KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] ==
(fdfile_t *)fdp->fd_dfdfile[fd]);
}
for (fd = NDFDFILE; fd < NDFILE; fd++) {
KASSERT(fdp->fd_dtbuiltin.dt_ff[fd] == NULL);
}
KASSERT(fdp->fd_himap == fdp->fd_dhimap);
KASSERT(fdp->fd_lomap == fdp->fd_dlomap);
#endif /* DIAGNOSTIC */
fdp->fd_refcnt = 1;
fd_checkmaps(fdp);
return fdp;
}
/*
* Initialize a file descriptor table.
*/
static int
filedesc_ctor(void *arg, void *obj, int flag)
{
filedesc_t *fdp = obj;
fdfile_t **ffp;
int i;
memset(fdp, 0, sizeof(*fdp));
mutex_init(&fdp->fd_lock, MUTEX_DEFAULT, IPL_NONE);
fdp->fd_lastfile = -1;
fdp->fd_lastkqfile = -1;
fdp->fd_dt = &fdp->fd_dtbuiltin;
fdp->fd_dtbuiltin.dt_nfiles = NDFILE;
fdp->fd_himap = fdp->fd_dhimap;
fdp->fd_lomap = fdp->fd_dlomap;
CTASSERT(sizeof(fdp->fd_dfdfile[0]) >= sizeof(fdfile_t));
for (i = 0, ffp = fdp->fd_dt->dt_ff; i < NDFDFILE; i++, ffp++) {
fdfile_ctor(*ffp = (fdfile_t *)fdp->fd_dfdfile[i]);
}
return 0;
}
static void
filedesc_dtor(void *arg, void *obj)
{
filedesc_t *fdp = obj;
int i;
for (i = 0; i < NDFDFILE; i++) {
fdfile_dtor((fdfile_t *)fdp->fd_dfdfile[i]);
}
mutex_destroy(&fdp->fd_lock);
}
/*
* Make p share curproc's filedesc structure.
*/
void
fd_share(struct proc *p)
{
filedesc_t *fdp;
fdp = curlwp->l_fd;
p->p_fd = fdp;
atomic_inc_uint(&fdp->fd_refcnt);
}
/*
* Acquire a hold on a filedesc structure.
*/
void
fd_hold(lwp_t *l)
{
filedesc_t *fdp = l->l_fd;
atomic_inc_uint(&fdp->fd_refcnt);
}
/*
* Copy a filedesc structure.
*/
filedesc_t *
fd_copy(void)
{
filedesc_t *newfdp, *fdp;
fdfile_t *ff, **ffp, **nffp, *ff2;
int i, j, numfiles, lastfile, newlast;
file_t *fp;
fdtab_t *newdt;
fdp = curproc->p_fd;
newfdp = pool_cache_get(filedesc_cache, PR_WAITOK);
newfdp->fd_refcnt = 1;
#ifdef DIAGNOSTIC
	KASSERT(newfdp->fd_lastfile == -1);
	KASSERT(newfdp->fd_lastkqfile == -1);
	KASSERT(newfdp->fd_knhash == NULL);
	KASSERT(newfdp->fd_freefile == 0);
	KASSERT(newfdp->fd_exclose == false);
	KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
	KASSERT(newfdp->fd_dtbuiltin.dt_nfiles == NDFILE);
for (i = 0; i < NDFDFILE; i++) {
KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] ==
(fdfile_t *)&newfdp->fd_dfdfile[i]);
}
for (i = NDFDFILE; i < NDFILE; i++) {
KASSERT(newfdp->fd_dtbuiltin.dt_ff[i] == NULL);
}
#endif /* DIAGNOSTIC */
mutex_enter(&fdp->fd_lock);
fd_checkmaps(fdp);
numfiles = fdp->fd_dt->dt_nfiles;
lastfile = fdp->fd_lastfile;
/*
* If the number of open files fits in the internal arrays
* of the open file structure, use them, otherwise allocate
* additional memory for the number of descriptors currently
* in use.
*/
if (lastfile < NDFILE) {
i = NDFILE;
newdt = newfdp->fd_dt;
KASSERT(newfdp->fd_dt == &newfdp->fd_dtbuiltin);
} else {
/*
* Compute the smallest multiple of NDEXTENT needed
* for the file descriptors currently in use,
* allowing the table to shrink.
*/
i = numfiles;
while (i >= 2 * NDEXTENT && i > lastfile * 2) {
i /= 2;
}
KASSERT(i > NDFILE);
newdt = fd_dtab_alloc(i);
newfdp->fd_dt = newdt;
memcpy(newdt->dt_ff, newfdp->fd_dtbuiltin.dt_ff,
NDFDFILE * sizeof(fdfile_t **));
memset(newdt->dt_ff + NDFDFILE, 0,
(i - NDFDFILE) * sizeof(fdfile_t **));
}
if (NDHISLOTS(i) <= NDHISLOTS(NDFILE)) {
newfdp->fd_himap = newfdp->fd_dhimap;
newfdp->fd_lomap = newfdp->fd_dlomap;
} else {
fd_map_alloc(i, &newfdp->fd_lomap, &newfdp->fd_himap);
KASSERT(i >= NDENTRIES * NDENTRIES);
memset(newfdp->fd_himap, 0, NDHISLOTS(i)*sizeof(uint32_t));
memset(newfdp->fd_lomap, 0, NDLOSLOTS(i)*sizeof(uint32_t));
}
newfdp->fd_freefile = fdp->fd_freefile;
newfdp->fd_exclose = fdp->fd_exclose;
ffp = fdp->fd_dt->dt_ff;
nffp = newdt->dt_ff;
newlast = -1;
	for (i = 0; i <= lastfile; i++, ffp++, nffp++) {
		KASSERT(i >= NDFDFILE ||
		    *nffp == (fdfile_t *)newfdp->fd_dfdfile[i]);
ff = *ffp;
if (ff == NULL || (fp = atomic_load_consume(&ff->ff_file)) == NULL) {
/* Descriptor unused, or descriptor half open. */
KASSERT(!fd_isused(newfdp, i));
continue;
}
if (__predict_false(fp->f_type == DTYPE_KQUEUE)) {
/* kqueue descriptors cannot be copied. */
			if (i < newfdp->fd_freefile) {
				newfdp->fd_freefile = i;
}
continue;
}
/* It's active: add a reference to the file. */
mutex_enter(&fp->f_lock);
fp->f_count++;
mutex_exit(&fp->f_lock);
/* Allocate an fdfile_t to represent it. */
if (i >= NDFDFILE) {
ff2 = kmem_alloc(sizeof(*ff2), KM_SLEEP);
fdfile_ctor(ff2);
*nffp = ff2;
} else {
ff2 = newdt->dt_ff[i];
}
ff2->ff_file = fp;
ff2->ff_exclose = ff->ff_exclose;
ff2->ff_allocated = true;
/* Fix up bitmaps. */
j = i >> NDENTRYSHIFT;
KASSERT((newfdp->fd_lomap[j] & (1U << (i & NDENTRYMASK))) == 0);
newfdp->fd_lomap[j] |= 1U << (i & NDENTRYMASK);
		if (__predict_false(newfdp->fd_lomap[j] == ~0)) {
			KASSERT((newfdp->fd_himap[j >> NDENTRYSHIFT] &
			    (1U << (j & NDENTRYMASK))) == 0);
newfdp->fd_himap[j >> NDENTRYSHIFT] |=
1U << (j & NDENTRYMASK);
}
newlast = i;
}
KASSERT(newdt->dt_ff[0] == (fdfile_t *)newfdp->fd_dfdfile[0]);
newfdp->fd_lastfile = newlast;
fd_checkmaps(newfdp);
mutex_exit(&fdp->fd_lock);
return newfdp;
}
/*
* Release a filedesc structure.
*/
void
fd_free(void)
{
fdfile_t *ff;
file_t *fp;
int fd, nf;
fdtab_t *dt;
lwp_t * const l = curlwp;
filedesc_t * const fdp = l->l_fd;
const bool noadvlock = (l->l_proc->p_flag & PK_ADVLOCK) == 0;
KASSERT(atomic_load_consume(&fdp->fd_dt)->dt_ff[0] ==
(fdfile_t *)fdp->fd_dfdfile[0]);
	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
	KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
membar_release();
if (atomic_dec_uint_nv(&fdp->fd_refcnt) > 0)
return;
membar_acquire();
/*
* Close any files that the process holds open.
*/
dt = fdp->fd_dt;
fd_checkmaps(fdp);
#ifdef DEBUG
fdp->fd_refcnt = -1; /* see fd_checkmaps */
#endif
for (fd = 0, nf = dt->dt_nfiles; fd < nf; fd++) {
ff = dt->dt_ff[fd];
KASSERT(fd >= NDFDFILE ||
ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (ff == NULL)
continue;
if ((fp = atomic_load_consume(&ff->ff_file)) != NULL) {
/*
* Must use fd_close() here if there is
* a reference from kqueue or we might have posix
* advisory locks.
*/
			if (__predict_true(ff->ff_refcnt == 0) &&
			    (noadvlock || fp->f_type != DTYPE_VNODE)) {
ff->ff_file = NULL;
ff->ff_exclose = false;
ff->ff_allocated = false;
closef(fp);
} else {
ff->ff_refcnt++;
fd_close(fd);
}
}
		KASSERT(ff->ff_refcnt == 0);
		KASSERT(ff->ff_file == NULL);
		KASSERT(!ff->ff_exclose);
		KASSERT(!ff->ff_allocated);
		if (fd >= NDFDFILE) {
			cv_destroy(&ff->ff_closing);
kmem_free(ff, sizeof(*ff));
dt->dt_ff[fd] = NULL;
}
}
/*
* Clean out the descriptor table for the next user and return
* to the cache.
*/
	if (__predict_false(dt != &fdp->fd_dtbuiltin)) {
		fd_dtab_free(fdp->fd_dt);
/* Otherwise, done above. */
memset(&fdp->fd_dtbuiltin.dt_ff[NDFDFILE], 0,
(NDFILE - NDFDFILE) * sizeof(fdp->fd_dtbuiltin.dt_ff[0]));
fdp->fd_dt = &fdp->fd_dtbuiltin;
}
	if (__predict_false(NDHISLOTS(nf) > NDHISLOTS(NDFILE))) {
		KASSERT(fdp->fd_himap != fdp->fd_dhimap);
		KASSERT(fdp->fd_lomap != fdp->fd_dlomap);
		fd_map_free(nf, fdp->fd_lomap, fdp->fd_himap);
}
if (__predict_false(fdp->fd_knhash != NULL)) {
hashdone(fdp->fd_knhash, HASH_LIST, fdp->fd_knhashmask);
fdp->fd_knhash = NULL;
fdp->fd_knhashmask = 0;
} else {
KASSERT(fdp->fd_knhashmask == 0);
}
fdp->fd_dt = &fdp->fd_dtbuiltin;
fdp->fd_lastkqfile = -1;
fdp->fd_lastfile = -1;
fdp->fd_freefile = 0;
fdp->fd_exclose = false;
memset(&fdp->fd_startzero, 0, sizeof(*fdp) -
offsetof(filedesc_t, fd_startzero));
fdp->fd_himap = fdp->fd_dhimap;
fdp->fd_lomap = fdp->fd_dlomap;
	KASSERT(fdp->fd_dtbuiltin.dt_nfiles == NDFILE);
	KASSERT(fdp->fd_dtbuiltin.dt_link == NULL);
	KASSERT(fdp->fd_dt == &fdp->fd_dtbuiltin);
#ifdef DEBUG
fdp->fd_refcnt = 0; /* see fd_checkmaps */
#endif
fd_checkmaps(fdp);
pool_cache_put(filedesc_cache, fdp);
}
/*
* File Descriptor pseudo-device driver (/dev/fd/).
*
* Opening minor device N dup()s the file (if any) connected to file
* descriptor N belonging to the calling process. Note that this driver
* consists of only the ``open()'' routine, because all subsequent
* references to this file will be direct to the other driver.
*/
static int
filedescopen(dev_t dev, int mode, int type, lwp_t *l)
{
/*
	 * XXX Kludge: set dupfd to contain the value of the file
	 * descriptor being sought for duplication. The error
* return ensures that the vnode for this device will be released
* by vn_open. Open will detect this special error and take the
* actions in fd_dupopen below. Other callers of vn_open or VOP_OPEN
* will simply report the error.
*/
l->l_dupfd = minor(dev); /* XXX */
return EDUPFD;
}
/*
* Duplicate the specified descriptor to a free descriptor.
*
* old is the original fd.
* moveit is true if we should move rather than duplicate.
* flags are the open flags (converted from O_* to F*).
* newp returns the new fd on success.
*
* These two cases are produced by the EDUPFD and EMOVEFD magic
* errnos, but in the interest of removing that regrettable interface,
* vn_open has been changed to intercept them. Now vn_open returns
* either a vnode or a filehandle, and the filehandle is accompanied
* by a boolean that says whether we should dup (moveit == false) or
* move (moveit == true) the fd.
*
* The dup case is used by /dev/stderr, /proc/self/fd, and such. The
* move case is used by cloner devices that allocate a fd of their
* own (a layering violation that should go away eventually) that
* then needs to be put in the place open() expects it.
*/
int
fd_dupopen(int old, bool moveit, int flags, int *newp)
{
filedesc_t *fdp;
fdfile_t *ff;
file_t *fp;
fdtab_t *dt;
int error;
if ((fp = fd_getfile(old)) == NULL) {
return EBADF;
}
fdp = curlwp->l_fd;
dt = atomic_load_consume(&fdp->fd_dt);
ff = dt->dt_ff[old];
/*
* There are two cases of interest here.
*
* 1. moveit == false (used to be the EDUPFD magic errno):
* simply dup (old) to file descriptor (new) and return.
*
* 2. moveit == true (used to be the EMOVEFD magic errno):
* steal away the file structure from (old) and store it in
* (new). (old) is effectively closed by this operation.
*/
if (moveit == false) {
/*
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
*/
if (((flags & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
error = EACCES;
goto out;
}
/* Copy it. */
error = fd_dup(fp, 0, newp, ff->ff_exclose);
} else {
/* Copy it. */
error = fd_dup(fp, 0, newp, ff->ff_exclose);
if (error != 0) {
goto out;
}
/* Steal away the file pointer from 'old'. */
(void)fd_close(old);
return 0;
}
out:
fd_putfile(old);
return error;
}
/*
* Close open files on exec.
*/
void
fd_closeexec(void)
{
proc_t *p;
filedesc_t *fdp;
fdfile_t *ff;
lwp_t *l;
fdtab_t *dt;
int fd;
l = curlwp;
p = l->l_proc;
fdp = p->p_fd;
if (fdp->fd_refcnt > 1) {
fdp = fd_copy();
fd_free();
p->p_fd = fdp;
l->l_fd = fdp;
}
if (!fdp->fd_exclose) {
return;
}
fdp->fd_exclose = false;
dt = atomic_load_consume(&fdp->fd_dt);
for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
if ((ff = dt->dt_ff[fd]) == NULL) {
KASSERT(fd >= NDFDFILE);
continue;
}
KASSERT(fd >= NDFDFILE ||
ff == (fdfile_t *)fdp->fd_dfdfile[fd]);
if (ff->ff_file == NULL)
continue;
if (ff->ff_exclose) {
/*
* We need a reference to close the file.
* No other threads can see the fdfile_t at
* this point, so don't bother locking.
*/
KASSERT((ff->ff_refcnt & FR_CLOSING) == 0);
ff->ff_refcnt++;
fd_close(fd);
}
}
}
/*
 * Set the descriptor owner. If the owner is a process, 'pgid' is set
 * to a positive value, the process ID. If the owner is a process
 * group, 'pgid' is set to -pg_id.
*/
int
fsetown(pid_t *pgid, u_long cmd, const void *data)
{
pid_t id = *(const pid_t *)data;
int error;
if (id == INT_MIN)
return EINVAL;
switch (cmd) {
case TIOCSPGRP:
if (id < 0)
return EINVAL;
id = -id;
break;
default:
break;
}
if (id > 0) {
mutex_enter(&proc_lock);
error = proc_find(id) ? 0 : ESRCH;
mutex_exit(&proc_lock);
} else if (id < 0) {
error = pgid_in_session(curproc, -id);
} else {
error = 0;
}
if (!error) {
*pgid = id;
}
return error;
}
void
fd_set_exclose(struct lwp *l, int fd, bool exclose)
{
filedesc_t *fdp = l->l_fd;
fdfile_t *ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
ff->ff_exclose = exclose;
	if (exclose)
		fdp->fd_exclose = true;
}
/*
 * Return descriptor owner information. If the value is positive, it is
 * a process ID. If it is negative, it is a process group ID and the
 * sign must be removed before use.
*/
int
fgetown(pid_t pgid, u_long cmd, void *data)
{
switch (cmd) {
case TIOCGPGRP:
*(int *)data = -pgid;
break;
default:
*(int *)data = pgid;
break;
}
return 0;
}
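/*
 * Illustrative sketch only (hypothetical): how a driver ioctl handler
 * typically plumbs FIOSETOWN/FIOGETOWN through fsetown()/fgetown(),
 * keeping the signed pgid encoding described above opaque to the
 * driver.  'struct example_softc' and its sc_pgid field are assumed
 * for the example.
 */
#if 0
static int
example_ioctl_own(struct example_softc *sc, u_long cmd, void *data)
{
	switch (cmd) {
	case FIOSETOWN:
		return fsetown(&sc->sc_pgid, cmd, data);
	case FIOGETOWN:
		return fgetown(sc->sc_pgid, cmd, data);
	default:
		return EPASSTHROUGH;
	}
}
#endif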
/*
* Send signal to descriptor owner, either process or process group.
*/
void
fownsignal(pid_t pgid, int signo, int code, int band, void *fdescdata)
{
ksiginfo_t ksi;
	KASSERT(!cpu_intr_p());

	if (pgid == 0) {
return;
}
KSI_INIT(&ksi);
ksi.ksi_signo = signo;
ksi.ksi_code = code;
ksi.ksi_band = band;
mutex_enter(&proc_lock);
if (pgid > 0) {
struct proc *p1;
p1 = proc_find(pgid);
		if (p1 != NULL) {
			kpsignal(p1, &ksi, fdescdata);
}
} else {
struct pgrp *pgrp;
KASSERT(pgid < 0);
pgrp = pgrp_find(-pgid);
		if (pgrp != NULL) {
			kpgsignal(pgrp, &ksi, fdescdata, 0);
}
}
mutex_exit(&proc_lock);
}
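/*
 * Illustrative sketch only (hypothetical): delivering SIGIO to the
 * registered owner when a driver has data ready.  sc->sc_pgid is the
 * value previously recorded via fsetown(), as in the sketch above;
 * fownsignal() itself handles the "no owner" (pgid == 0) case.
 */
#if 0
static void
example_notify_owner(struct example_softc *sc)
{
	fownsignal(sc->sc_pgid, SIGIO, 0, 0, NULL);
}
#endif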
int
fd_clone(file_t *fp, unsigned fd, int flag, const struct fileops *fops,
void *data)
{
fdfile_t *ff;
filedesc_t *fdp;
fp->f_flag = flag & FMASK;
fdp = curproc->p_fd;
	ff = atomic_load_consume(&fdp->fd_dt)->dt_ff[fd];
	KASSERT(ff != NULL);
ff->ff_exclose = (flag & O_CLOEXEC) != 0;
fp->f_type = DTYPE_MISC;
fp->f_ops = fops;
fp->f_data = data;
curlwp->l_dupfd = fd;
fd_affix(curproc, fp, fd);
return EMOVEFD;
}
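/*
 * Illustrative sketch only (hypothetical): a cloning device open
 * routine that allocates a private file with fd_allocfile() and hands
 * it back through fd_clone(), which returns the EMOVEFD magic value
 * interpreted by the open path.  'example_fileops' and
 * 'struct example_state' are assumed for the example.
 */
#if 0
static int
example_clone_open(dev_t dev, int flag, int mode, lwp_t *l)
{
	struct example_state *st;
	file_t *fp;
	int fd, error;

	if ((error = fd_allocfile(&fp, &fd)) != 0)
		return error;
	st = kmem_zalloc(sizeof(*st), KM_SLEEP);
	return fd_clone(fp, fd, flag, &example_fileops, st);
}
#endif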
int
fnullop_fcntl(file_t *fp, u_int cmd, void *data)
{
if (cmd == F_SETFL)
return 0;
return EOPNOTSUPP;
}
int
fnullop_poll(file_t *fp, int which)
{
return 0;
}
int
fnullop_kqfilter(file_t *fp, struct knote *kn)
{
return EOPNOTSUPP;
}
void
fnullop_restart(file_t *fp)
{
}
int
fbadop_read(file_t *fp, off_t *offset, struct uio *uio,
kauth_cred_t cred, int flags)
{
return EOPNOTSUPP;
}
int
fbadop_write(file_t *fp, off_t *offset, struct uio *uio,
kauth_cred_t cred, int flags)
{
return EOPNOTSUPP;
}
int
fbadop_ioctl(file_t *fp, u_long com, void *data)
{
return EOPNOTSUPP;
}
int
fbadop_stat(file_t *fp, struct stat *sb)
{
return EOPNOTSUPP;
}
int
fbadop_close(file_t *fp)
{
return EOPNOTSUPP;
}
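/*
 * Illustrative sketch only (hypothetical): a fileops table for a simple
 * pseudo-file that plugs the fnullop_*()/fbadop_*() stubs above into
 * the slots it does not implement.  example_ioctl() and example_close()
 * are assumed helpers, not part of this file.
 */
#if 0
static const struct fileops example_fileops = {
	.fo_read = fbadop_read,
	.fo_write = fbadop_write,
	.fo_ioctl = example_ioctl,
	.fo_fcntl = fnullop_fcntl,
	.fo_poll = fnullop_poll,
	.fo_stat = fbadop_stat,
	.fo_close = example_close,
	.fo_kqfilter = fnullop_kqfilter,
	.fo_restart = fnullop_restart,
};
#endif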
/*
* sysctl routines pertaining to file descriptors
*/
/* Initialized in sysctl_init() for now... */
extern kmutex_t sysctl_file_marker_lock;
static u_int sysctl_file_marker = 1;
/*
* Expects to be called with proc_lock and sysctl_file_marker_lock locked.
*/
static void
sysctl_file_marker_reset(void)
{
struct proc *p;
PROCLIST_FOREACH(p, &allproc) {
struct filedesc *fd = p->p_fd;
fdtab_t *dt;
u_int i;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
struct file *fp;
fdfile_t *ff;
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
continue;
}
fp->f_marker = 0;
}
mutex_exit(&fd->fd_lock);
}
}
/*
* sysctl helper routine for kern.file pseudo-subtree.
*/
static int
sysctl_kern_file(SYSCTLFN_ARGS)
{
const bool allowaddr = get_expose_address(curproc);
struct filelist flist;
int error;
size_t buflen;
struct file *fp, fbuf;
char *start, *where;
struct proc *p;
start = where = oldp;
buflen = *oldlenp;
if (where == NULL) {
/*
* overestimate by 10 files
*/
*oldlenp = sizeof(filehead) + (nfiles + 10) *
sizeof(struct file);
return 0;
}
/*
* first sysctl_copyout filehead
*/
if (buflen < sizeof(filehead)) {
*oldlenp = 0;
return 0;
}
sysctl_unlock();
if (allowaddr) {
memcpy(&flist, &filehead, sizeof(flist));
} else {
memset(&flist, 0, sizeof(flist));
}
error = sysctl_copyout(l, &flist, where, sizeof(flist));
if (error) {
sysctl_relock();
return error;
}
buflen -= sizeof(flist);
where += sizeof(flist);
/*
* followed by an array of file structures
*/
mutex_enter(&sysctl_file_marker_lock);
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
struct filedesc *fd;
fdtab_t *dt;
u_int i;
if (p->p_stat == SIDL) {
/* skip embryonic processes */
continue;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
NULL, NULL);
mutex_exit(p->p_lock);
if (error != 0) {
/*
* Don't leak kauth retval if we're silently
* skipping this entry.
*/
error = 0;
continue;
}
/*
* Grab a hold on the process.
*/
if (!rw_tryenter(&p->p_reflock, RW_READER)) {
continue;
}
mutex_exit(&proc_lock);
fd = p->p_fd;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
fdfile_t *ff;
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) == NULL) {
continue;
}
mutex_enter(&fp->f_lock);
if ((fp->f_count == 0) ||
(fp->f_marker == sysctl_file_marker)) {
mutex_exit(&fp->f_lock);
continue;
}
/* Check that we have enough space. */
if (buflen < sizeof(struct file)) {
*oldlenp = where - start;
mutex_exit(&fp->f_lock);
error = ENOMEM;
break;
}
fill_file(&fbuf, fp);
mutex_exit(&fp->f_lock);
error = sysctl_copyout(l, &fbuf, where, sizeof(fbuf));
if (error) {
break;
}
buflen -= sizeof(struct file);
where += sizeof(struct file);
fp->f_marker = sysctl_file_marker;
}
mutex_exit(&fd->fd_lock);
/*
* Release reference to process.
*/
mutex_enter(&proc_lock);
rw_exit(&p->p_reflock);
if (error)
break;
}
sysctl_file_marker++;
/* Reset all markers if wrapped. */
if (sysctl_file_marker == 0) {
sysctl_file_marker_reset();
sysctl_file_marker++;
}
mutex_exit(&proc_lock);
mutex_exit(&sysctl_file_marker_lock);
*oldlenp = where - start;
sysctl_relock();
return error;
}
/*
* sysctl helper function for kern.file2
*/
static int
sysctl_kern_file2(SYSCTLFN_ARGS)
{
struct proc *p;
struct file *fp;
struct filedesc *fd;
struct kinfo_file kf;
char *dp;
u_int i, op;
size_t len, needed, elem_size, out_size;
int error, arg, elem_count;
fdfile_t *ff;
fdtab_t *dt;
if (namelen == 1 && name[0] == CTL_QUERY)
return sysctl_query(SYSCTLFN_CALL(rnode));
if (namelen != 4)
return EINVAL;
error = 0;
dp = oldp;
len = (oldp != NULL) ? *oldlenp : 0;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
out_size = MIN(sizeof(kf), elem_size);
needed = 0;
if (elem_size < 1 || elem_count < 0)
return EINVAL;
switch (op) {
case KERN_FILE_BYFILE:
case KERN_FILE_BYPID:
/*
* We're traversing the process list in both cases; the BYFILE
* case does additional work of keeping track of files already
* looked at.
*/
/* doesn't use arg so it must be zero */
if ((op == KERN_FILE_BYFILE) && (arg != 0))
return EINVAL;
if ((op == KERN_FILE_BYPID) && (arg < -1))
/* -1 means all processes */
return EINVAL;
sysctl_unlock();
if (op == KERN_FILE_BYFILE)
mutex_enter(&sysctl_file_marker_lock);
mutex_enter(&proc_lock);
PROCLIST_FOREACH(p, &allproc) {
if (p->p_stat == SIDL) {
/* skip embryonic processes */
continue;
}
if (arg > 0 && p->p_pid != arg) {
/* pick only the one we want */
/* XXX want 0 to mean "kernel files" */
continue;
}
mutex_enter(p->p_lock);
error = kauth_authorize_process(l->l_cred,
KAUTH_PROCESS_CANSEE, p,
KAUTH_ARG(KAUTH_REQ_PROCESS_CANSEE_OPENFILES),
NULL, NULL);
mutex_exit(p->p_lock);
if (error != 0) {
/*
* Don't leak kauth retval if we're silently
* skipping this entry.
*/
error = 0;
continue;
}
/*
* Grab a hold on the process.
*/
if (!rw_tryenter(&p->p_reflock, RW_READER)) {
continue;
}
mutex_exit(&proc_lock);
fd = p->p_fd;
mutex_enter(&fd->fd_lock);
dt = fd->fd_dt;
for (i = 0; i < dt->dt_nfiles; i++) {
if ((ff = dt->dt_ff[i]) == NULL) {
continue;
}
if ((fp = atomic_load_consume(&ff->ff_file)) ==
NULL) {
continue;
}
if ((op == KERN_FILE_BYFILE) &&
(fp->f_marker == sysctl_file_marker)) {
continue;
}
if (len >= elem_size && elem_count > 0) {
mutex_enter(&fp->f_lock);
fill_file2(&kf, fp, ff, i, p->p_pid);
mutex_exit(&fp->f_lock);
mutex_exit(&fd->fd_lock);
error = sysctl_copyout(l,
&kf, dp, out_size);
mutex_enter(&fd->fd_lock);
if (error)
break;
dp += elem_size;
len -= elem_size;
}
if (op == KERN_FILE_BYFILE)
fp->f_marker = sysctl_file_marker;
needed += elem_size;
if (elem_count > 0 && elem_count != INT_MAX)
elem_count--;
}
mutex_exit(&fd->fd_lock);
/*
* Release reference to process.
*/
mutex_enter(&proc_lock);
rw_exit(&p->p_reflock);
}
if (op == KERN_FILE_BYFILE) {
sysctl_file_marker++;
/* Reset all markers if wrapped. */
if (sysctl_file_marker == 0) {
sysctl_file_marker_reset();
sysctl_file_marker++;
}
}
mutex_exit(&proc_lock);
if (op == KERN_FILE_BYFILE)
mutex_exit(&sysctl_file_marker_lock);
sysctl_relock();
break;
default:
return EINVAL;
}
if (oldp == NULL)
needed += KERN_FILESLOP * elem_size;
*oldlenp = needed;
return error;
}
static void
fill_file(struct file *fp, const struct file *fpsrc)
{
const bool allowaddr = get_expose_address(curproc);
memset(fp, 0, sizeof(*fp));
fp->f_offset = fpsrc->f_offset;
COND_SET_PTR(fp->f_cred, fpsrc->f_cred, allowaddr);
COND_SET_CPTR(fp->f_ops, fpsrc->f_ops, allowaddr);
COND_SET_STRUCT(fp->f_undata, fpsrc->f_undata, allowaddr);
COND_SET_STRUCT(fp->f_list, fpsrc->f_list, allowaddr);
fp->f_flag = fpsrc->f_flag;
fp->f_marker = fpsrc->f_marker;
fp->f_type = fpsrc->f_type;
fp->f_advice = fpsrc->f_advice;
fp->f_count = fpsrc->f_count;
fp->f_msgcount = fpsrc->f_msgcount;
fp->f_unpcount = fpsrc->f_unpcount;
COND_SET_STRUCT(fp->f_unplist, fpsrc->f_unplist, allowaddr);
}
static void
fill_file2(struct kinfo_file *kp, const file_t *fp, const fdfile_t *ff,
int i, pid_t pid)
{
const bool allowaddr = get_expose_address(curproc);
memset(kp, 0, sizeof(*kp));
COND_SET_VALUE(kp->ki_fileaddr, PTRTOUINT64(fp), allowaddr);
kp->ki_flag = fp->f_flag;
kp->ki_iflags = 0;
kp->ki_ftype = fp->f_type;
kp->ki_count = fp->f_count;
kp->ki_msgcount = fp->f_msgcount;
COND_SET_VALUE(kp->ki_fucred, PTRTOUINT64(fp->f_cred), allowaddr);
kp->ki_fuid = kauth_cred_geteuid(fp->f_cred);
kp->ki_fgid = kauth_cred_getegid(fp->f_cred);
COND_SET_VALUE(kp->ki_fops, PTRTOUINT64(fp->f_ops), allowaddr);
kp->ki_foffset = fp->f_offset;
COND_SET_VALUE(kp->ki_fdata, PTRTOUINT64(fp->f_data), allowaddr);
/* vnode information to glue this file to something */
if (fp->f_type == DTYPE_VNODE) {
struct vnode *vp = fp->f_vnode;
COND_SET_VALUE(kp->ki_vun, PTRTOUINT64(vp->v_un.vu_socket),
allowaddr);
kp->ki_vsize = vp->v_size;
kp->ki_vtype = vp->v_type;
kp->ki_vtag = vp->v_tag;
COND_SET_VALUE(kp->ki_vdata, PTRTOUINT64(vp->v_data),
allowaddr);
}
/* process information when retrieved via KERN_FILE_BYPID */
if (ff != NULL) {
kp->ki_pid = pid;
kp->ki_fd = i;
kp->ki_ofileflags = ff->ff_exclose;
kp->ki_usecount = ff->ff_refcnt;
}
}
/* $NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $ */
/*-
* Copyright (c) 1997-2011, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center, by Charles M. Hannum, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.13 (Berkeley) 4/18/94
*/
/*
* The vnode cache subsystem.
*
* Life-cycle
*
* Normally, there are two points where new vnodes are created:
* VOP_CREATE(9) and VOP_LOOKUP(9). The life-cycle of a vnode
* starts in one of the following ways:
*
* - Allocation, via vcache_get(9) or vcache_new(9).
* - Reclamation of inactive vnode, via vcache_vget(9).
*
* Recycle from a free list, via getnewvnode(9) -> getcleanvnode(9)
* was another, traditional way. Currently, only the draining thread
* recycles the vnodes. This behaviour might be revisited.
*
* The life-cycle ends when the last reference is dropped, usually
 * in VOP_REMOVE(9). In such a case, VOP_INACTIVE(9) is called to inform
 * the file system that the vnode is inactive. Via this call, the file
 * system indicates whether the vnode can be recycled (usually, it checks
 * its own references, e.g. the link count, or whether the file was removed).
*
* Depending on indication, vnode can be put into a free list (cache),
* or cleaned via vcache_reclaim, which calls VOP_RECLAIM(9) to
* disassociate underlying file system from the vnode, and finally
* destroyed.
*
* Vnode state
*
* Vnode is always in one of six states:
* - MARKER This is a marker vnode to help list traversal. It
* will never change its state.
* - LOADING Vnode is associating underlying file system and not
* yet ready to use.
* - LOADED Vnode has associated underlying file system and is
* ready to use.
* - BLOCKED Vnode is active but cannot get new references.
* - RECLAIMING Vnode is disassociating from the underlying file
* system.
* - RECLAIMED Vnode has disassociated from underlying file system
* and is dead.
*
* Valid state changes are:
* LOADING -> LOADED
* Vnode has been initialised in vcache_get() or
* vcache_new() and is ready to use.
* BLOCKED -> RECLAIMING
* Vnode starts disassociation from underlying file
* system in vcache_reclaim().
* RECLAIMING -> RECLAIMED
* Vnode finished disassociation from underlying file
* system in vcache_reclaim().
* LOADED -> BLOCKED
* Either vcache_rekey*() is changing the vnode key or
* vrelel() is about to call VOP_INACTIVE().
* BLOCKED -> LOADED
* The block condition is over.
* LOADING -> RECLAIMED
* Either vcache_get() or vcache_new() failed to
* associate the underlying file system or vcache_rekey*()
* drops a vnode used as placeholder.
*
 * Of these states, LOADING, BLOCKED and RECLAIMING are intermediate
 * and it is possible to wait for a state change.
*
* State is protected with v_interlock with one exception:
* to change from LOADING both v_interlock and vcache_lock must be held
* so it is possible to check "state == LOADING" without holding
* v_interlock. See vcache_get() for details.
*
* Reference counting
*
 * A vnode is considered active if its reference count
 * (vnode_t::v_usecount) is non-zero. The count is maintained with the
 * vref(9), vrele(9) and vput(9) routines. Common holders of references
 * are, for example, open files, current working directories and mount
 * points.
*
* v_usecount is adjusted with atomic operations, however to change
* from a non-zero value to zero the interlock must also be held.
*/
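/*
 * Illustrative sketch only (not part of the implementation): a typical
 * consumer of this subsystem follows the reference discipline described
 * above by obtaining a referenced vnode, locking it for VOP calls and
 * dropping the reference when done.  The key and the "..." body are
 * placeholders:
 *
 *	struct vnode *vp;
 *	int error;
 *
 *	error = vcache_get(mp, &key, sizeof(key), &vp);
 *	if (error == 0) {
 *		vn_lock(vp, LK_SHARED | LK_RETRY);
 *		...				use the vnode
 *		vput(vp);			unlock and drop the reference
 *	}
 */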
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_vnode.c,v 1.153 2023/11/27 16:13:59 hannken Exp $");
#ifdef _KERNEL_OPT
#include "opt_pax.h"
#endif
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/hash.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pax.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/threadpool.h>
#include <sys/vnode_impl.h>
#include <sys/wapbl.h>
#include <sys/fstrans.h>
#include <miscfs/deadfs/deadfs.h>
#include <miscfs/specfs/specdev.h>
#include <uvm/uvm.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_stat.h>
/* Flags to vrelel. */
#define VRELEL_ASYNC 0x0001 /* Always defer to vrele thread. */
#define LRU_VRELE 0
#define LRU_FREE 1
#define LRU_HOLD 2
#define LRU_COUNT 3
/*
* There are three lru lists: one holds vnodes waiting for async release,
* one is for vnodes which have no buffer/page references and one for those
* which do (i.e. v_holdcnt is non-zero). We put the lists into a single,
* private cache line as vnodes migrate between them while under the same
* lock (vdrain_lock).
*/
typedef struct {
vnode_impl_t *li_marker;
} lru_iter_t;
u_int numvnodes __cacheline_aligned;
static vnodelst_t lru_list[LRU_COUNT] __cacheline_aligned;
static struct threadpool *threadpool;
static struct threadpool_job vdrain_job;
static struct threadpool_job vrele_job;
static kmutex_t vdrain_lock __cacheline_aligned;
SLIST_HEAD(hashhead, vnode_impl);
static kmutex_t vcache_lock __cacheline_aligned;
static kcondvar_t vcache_cv;
static u_int vcache_hashsize;
static u_long vcache_hashmask;
static struct hashhead *vcache_hashtab;
static pool_cache_t vcache_pool;
static void lru_requeue(vnode_t *, vnodelst_t *);
static vnodelst_t * lru_which(vnode_t *);
static vnode_impl_t * lru_iter_first(int, lru_iter_t *);
static vnode_impl_t * lru_iter_next(lru_iter_t *);
static void lru_iter_release(lru_iter_t *);
static vnode_impl_t * vcache_alloc(void);
static void vcache_dealloc(vnode_impl_t *);
static void vcache_free(vnode_impl_t *);
static void vcache_init(void);
static void vcache_reinit(void);
static void vcache_reclaim(vnode_t *);
static void vrele_deferred(vnode_impl_t *);
static void vrelel(vnode_t *, int, int);
static void vnpanic(vnode_t *, const char *, ...)
__printflike(2, 3);
static bool vdrain_one(u_int);
static void vdrain_task(struct threadpool_job *);
static void vrele_task(struct threadpool_job *);
/* Routines having to do with the management of the vnode table. */
/*
* The high bit of v_usecount is a gate for vcache_tryvget(). It's set
* only when the vnode state is LOADED.
* The next bit of v_usecount is a flag for vrelel(). It's set
* from vcache_vget() and vcache_tryvget() whenever the operation succeeds.
*/
#define VUSECOUNT_MASK 0x3fffffff
#define VUSECOUNT_GATE 0x80000000
#define VUSECOUNT_VGET 0x40000000
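/*
 * Illustrative decode of a raw v_usecount value "u" (names are local to
 * this sketch, not used elsewhere):
 *
 *	gate_open = (u & VUSECOUNT_GATE) != 0;	state is LOADED, tryvget may succeed
 *	vget_seen = (u & VUSECOUNT_VGET) != 0;	a vcache_(try)vget() succeeded
 *	refs      = u & VUSECOUNT_MASK;		current reference count
 */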
/*
* Return the current usecount of a vnode.
*/
inline int
vrefcnt(struct vnode *vp)
{
return atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_MASK;
}
/* Vnode state operations and diagnostics. */
#if defined(DIAGNOSTIC)
#define VSTATE_VALID(state) \
((state) != VS_ACTIVE && (state) != VS_MARKER)
#define VSTATE_GET(vp) \
vstate_assert_get((vp), __func__, __LINE__)
#define VSTATE_CHANGE(vp, from, to) \
vstate_assert_change((vp), (from), (to), __func__, __LINE__)
#define VSTATE_WAIT_STABLE(vp) \
vstate_assert_wait_stable((vp), __func__, __LINE__)
void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
bool has_lock)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
int refcnt = vrefcnt(vp);
if (!has_lock) {
enum vnode_state vstate = atomic_load_relaxed(&vip->vi_state);
if (state == VS_ACTIVE && refcnt > 0 &&
(vstate == VS_LOADED || vstate == VS_BLOCKED))
return;
if (vstate == state)
return;
mutex_enter((vp)->v_interlock);
}
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if ((state == VS_ACTIVE && refcnt > 0 &&
(vip->vi_state == VS_LOADED || vip->vi_state == VS_BLOCKED)) ||
vip->vi_state == state) {
if (!has_lock)
mutex_exit((vp)->v_interlock);
return;
}
vnpanic(vp, "state is %s, usecount %d, expected %s at %s:%d",
vstate_name(vip->vi_state), refcnt,
vstate_name(state), func, line);
}
static enum vnode_state
vstate_assert_get(vnode_t *vp, const char *func, int line)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
return vip->vi_state;
}
static void
vstate_assert_wait_stable(vnode_t *vp, const char *func, int line)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
cv_wait(&vp->v_cv, vp->v_interlock);
if (! VSTATE_VALID(vip->vi_state))
vnpanic(vp, "state is %s at %s:%d",
vstate_name(vip->vi_state), func, line);
}
static void
vstate_assert_change(vnode_t *vp, enum vnode_state from, enum vnode_state to,
const char *func, int line)
{
bool gated = (atomic_load_relaxed(&vp->v_usecount) & VUSECOUNT_GATE);
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
KASSERTMSG(mutex_owned(vp->v_interlock), "at %s:%d", func, line); if (from == VS_LOADING) KASSERTMSG(mutex_owned(&vcache_lock), "at %s:%d", func, line);
if (! VSTATE_VALID(from))
vnpanic(vp, "from is %s at %s:%d",
vstate_name(from), func, line);
if (! VSTATE_VALID(to))
vnpanic(vp, "to is %s at %s:%d",
vstate_name(to), func, line);
if (vip->vi_state != from)
vnpanic(vp, "from is %s, expected %s at %s:%d\n",
vstate_name(vip->vi_state), vstate_name(from), func, line);
if ((from == VS_LOADED) != gated)
vnpanic(vp, "state is %s, gate %d does not match at %s:%d\n",
vstate_name(vip->vi_state), gated, func, line);
/* Open/close the gate for vcache_tryvget(). */
if (to == VS_LOADED) {
membar_release();
atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
} else {
atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
}
atomic_store_relaxed(&vip->vi_state, to);
if (from == VS_LOADING)
cv_broadcast(&vcache_cv);
if (to == VS_LOADED || to == VS_RECLAIMED)
cv_broadcast(&vp->v_cv);
}
#else /* defined(DIAGNOSTIC) */
#define VSTATE_GET(vp) \
(VNODE_TO_VIMPL((vp))->vi_state)
#define VSTATE_CHANGE(vp, from, to) \
vstate_change((vp), (from), (to))
#define VSTATE_WAIT_STABLE(vp) \
vstate_wait_stable((vp))
void
_vstate_assert(vnode_t *vp, enum vnode_state state, const char *func, int line,
bool has_lock)
{
}
static void
vstate_wait_stable(vnode_t *vp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
while (vip->vi_state != VS_LOADED && vip->vi_state != VS_RECLAIMED)
cv_wait(&vp->v_cv, vp->v_interlock);
}
static void
vstate_change(vnode_t *vp, enum vnode_state from, enum vnode_state to)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
/* Open/close the gate for vcache_tryvget(). */
if (to == VS_LOADED) {
membar_release();
atomic_or_uint(&vp->v_usecount, VUSECOUNT_GATE);
} else {
atomic_and_uint(&vp->v_usecount, ~VUSECOUNT_GATE);
}
atomic_store_relaxed(&vip->vi_state, to);
if (from == VS_LOADING)
cv_broadcast(&vcache_cv);
if (to == VS_LOADED || to == VS_RECLAIMED)
cv_broadcast(&vp->v_cv);
}
#endif /* defined(DIAGNOSTIC) */
void
vfs_vnode_sysinit(void)
{
int error __diagused, i;
dead_rootmount = vfs_mountalloc(&dead_vfsops, NULL);
KASSERT(dead_rootmount != NULL);
dead_rootmount->mnt_iflag |= IMNT_MPSAFE;
mutex_init(&vdrain_lock, MUTEX_DEFAULT, IPL_NONE);
for (i = 0; i < LRU_COUNT; i++) {
TAILQ_INIT(&lru_list[i]);
}
vcache_init();
error = threadpool_get(&threadpool, PRI_NONE);
KASSERTMSG((error == 0), "threadpool_get failed: %d", error);
threadpool_job_init(&vdrain_job, vdrain_task, &vdrain_lock, "vdrain");
threadpool_job_init(&vrele_job, vrele_task, &vdrain_lock, "vrele");
}
/*
* Allocate a new marker vnode.
*/
vnode_t *
vnalloc_marker(struct mount *mp)
{
vnode_impl_t *vip;
vnode_t *vp;
vip = pool_cache_get(vcache_pool, PR_WAITOK);
memset(vip, 0, sizeof(*vip));
vp = VIMPL_TO_VNODE(vip);
uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
vp->v_mount = mp;
vp->v_type = VBAD;
vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
klist_init(&vip->vi_klist.vk_klist);
vp->v_klist = &vip->vi_klist;
vip->vi_state = VS_MARKER;
return vp;
}
/*
* Free a marker vnode.
*/
void
vnfree_marker(vnode_t *vp)
{
vnode_impl_t *vip;
vip = VNODE_TO_VIMPL(vp);
KASSERT(vip->vi_state == VS_MARKER);
mutex_obj_free(vp->v_interlock);
uvm_obj_destroy(&vp->v_uobj, true);
klist_fini(&vip->vi_klist.vk_klist);
pool_cache_put(vcache_pool, vip);
}
/*
* Test a vnode for being a marker vnode.
*/
bool
vnis_marker(vnode_t *vp)
{
return (VNODE_TO_VIMPL(vp)->vi_state == VS_MARKER);
}
/*
* Return the lru list this node should be on.
*/
static vnodelst_t *
lru_which(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt > 0)
return &lru_list[LRU_HOLD];
else
return &lru_list[LRU_FREE];
}
/*
* Put vnode to end of given list.
* Both the current and the new list may be NULL, used on vnode alloc/free.
* Adjust numvnodes and signal vdrain thread if there is work.
*/
static void
lru_requeue(vnode_t *vp, vnodelst_t *listhd)
{
vnode_impl_t *vip;
int d;
/*
* If the vnode is on the correct list, and was put there recently,
* then leave it be, thus avoiding huge cache and lock contention.
*/
vip = VNODE_TO_VIMPL(vp);
if (listhd == vip->vi_lrulisthd &&
(getticks() - vip->vi_lrulisttm) < hz) {
return;
}
mutex_enter(&vdrain_lock);
d = 0;
if (vip->vi_lrulisthd != NULL)
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
else
d++;
vip->vi_lrulisthd = listhd;
vip->vi_lrulisttm = getticks();
if (vip->vi_lrulisthd != NULL)
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
else
d--;
if (d != 0) {
/*
* Looks strange? This is not a bug. Don't store
* numvnodes unless there is a change - avoid false
* sharing on MP.
*/
numvnodes += d;
}
if (listhd == &lru_list[LRU_VRELE])
threadpool_schedule_job(threadpool, &vrele_job);
if (d > 0 && numvnodes > desiredvnodes)
threadpool_schedule_job(threadpool, &vdrain_job);
if (d > 0 && numvnodes > desiredvnodes + desiredvnodes / 16)
kpause("vnfull", false, MAX(1, mstohz(10)), &vdrain_lock);
mutex_exit(&vdrain_lock);
}
/*
* LRU list iterator.
* Caller holds vdrain_lock.
*/
static vnode_impl_t *
lru_iter_first(int idx, lru_iter_t *iterp)
{
vnode_impl_t *marker;
KASSERT(mutex_owned(&vdrain_lock));
mutex_exit(&vdrain_lock);
marker = VNODE_TO_VIMPL(vnalloc_marker(NULL));
mutex_enter(&vdrain_lock);
marker->vi_lrulisthd = &lru_list[idx];
iterp->li_marker = marker;
TAILQ_INSERT_HEAD(marker->vi_lrulisthd, marker, vi_lrulist);
return lru_iter_next(iterp);
}
static vnode_impl_t *
lru_iter_next(lru_iter_t *iter)
{
vnode_impl_t *vip, *marker;
vnodelst_t *listhd;
KASSERT(mutex_owned(&vdrain_lock));
marker = iter->li_marker;
listhd = marker->vi_lrulisthd;
while ((vip = TAILQ_NEXT(marker, vi_lrulist))) {
TAILQ_REMOVE(listhd, marker, vi_lrulist);
TAILQ_INSERT_AFTER(listhd, vip, marker, vi_lrulist);
if (!vnis_marker(VIMPL_TO_VNODE(vip)))
break;
}
return vip;
}
static void
lru_iter_release(lru_iter_t *iter)
{
vnode_impl_t *marker;
KASSERT(mutex_owned(&vdrain_lock));
marker = iter->li_marker;
TAILQ_REMOVE(marker->vi_lrulisthd, marker, vi_lrulist);
mutex_exit(&vdrain_lock);
vnfree_marker(VIMPL_TO_VNODE(marker));
mutex_enter(&vdrain_lock);
}
/*
* Release deferred vrele vnodes for this mount.
* Called with file system suspended.
*/
void
vrele_flush(struct mount *mp)
{
lru_iter_t iter;
vnode_impl_t *vip;
KASSERT(fstrans_is_owner(mp));
mutex_enter(&vdrain_lock);
for (vip = lru_iter_first(LRU_VRELE, &iter); vip != NULL;
vip = lru_iter_next(&iter)) {
if (VIMPL_TO_VNODE(vip)->v_mount != mp)
continue;
vrele_deferred(vip);
}
lru_iter_release(&iter);
mutex_exit(&vdrain_lock);
}
/*
* One pass through the LRU lists to keep the number of allocated
* vnodes below target. Returns true if target met.
*/
static bool
vdrain_one(u_int target)
{
int ix, lists[] = { LRU_FREE, LRU_HOLD };
lru_iter_t iter;
vnode_impl_t *vip;
vnode_t *vp;
struct mount *mp;
KASSERT(mutex_owned(&vdrain_lock));
for (ix = 0; ix < __arraycount(lists); ix++) {
for (vip = lru_iter_first(lists[ix], &iter); vip != NULL;
vip = lru_iter_next(&iter)) {
if (numvnodes < target) {
lru_iter_release(&iter);
return true;
}
vp = VIMPL_TO_VNODE(vip);
/* Probe usecount (unlocked). */
if (vrefcnt(vp) > 0)
continue;
/* Try v_interlock -- we lock the wrong direction! */
if (!mutex_tryenter(vp->v_interlock))
continue;
/* Probe usecount and state. */
if (vrefcnt(vp) > 0 || VSTATE_GET(vp) != VS_LOADED) {
mutex_exit(vp->v_interlock);
continue;
}
mutex_exit(&vdrain_lock);
mp = vp->v_mount;
if (fstrans_start_nowait(mp) != 0) {
mutex_exit(vp->v_interlock);
mutex_enter(&vdrain_lock);
continue;
}
if (vcache_vget(vp) == 0) {
if (!vrecycle(vp)) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_EXCLUSIVE);
}
}
fstrans_done(mp);
mutex_enter(&vdrain_lock);
}
lru_iter_release(&iter);
}
return false;
}
/*
* threadpool task to keep the number of vnodes below desiredvnodes.
*/
static void
vdrain_task(struct threadpool_job *job)
{
u_int target;
target = desiredvnodes - desiredvnodes / 16;
mutex_enter(&vdrain_lock);
while (!vdrain_one(target))
kpause("vdrain", false, 1, &vdrain_lock);
threadpool_job_done(job);
mutex_exit(&vdrain_lock);
}
/*
* threadpool task to process asynchronous vrele.
*/
static void
vrele_task(struct threadpool_job *job)
{
int skipped;
lru_iter_t iter;
vnode_impl_t *vip;
struct mount *mp;
mutex_enter(&vdrain_lock);
while ((vip = lru_iter_first(LRU_VRELE, &iter)) != NULL) {
for (skipped = 0; vip != NULL; vip = lru_iter_next(&iter)) {
mp = VIMPL_TO_VNODE(vip)->v_mount;
if (fstrans_start_nowait(mp) == 0) {
vrele_deferred(vip);
fstrans_done(mp);
} else {
skipped++;
}
}
lru_iter_release(&iter);
if (skipped)
kpause("vrele", false, MAX(1, mstohz(10)), &vdrain_lock);
}
threadpool_job_done(job);
lru_iter_release(&iter);
mutex_exit(&vdrain_lock);
}
/*
* Try to drop reference on a vnode. Abort if we are releasing the
* last reference. Note: this _must_ succeed if not the last reference.
*/
static bool
vtryrele(vnode_t *vp)
{
u_int use, next;
membar_release();
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_MASK) == 1)) {
return false;
}
KASSERT((use & VUSECOUNT_MASK) > 1);
next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
if (__predict_true(next == use)) {
return true;
}
}
}
/*
* vput: unlock and release the reference.
*/
void
vput(vnode_t *vp)
{
int lktype;
/*
* Do an unlocked check of the usecount. If it looks like we're not
* about to drop the last reference, then unlock the vnode and try
* to drop the reference. If it ends up being the last reference
* after all, vrelel() can fix it all up. Most of the time this
* will all go to plan.
*/
if (vrefcnt(vp) > 1) {
VOP_UNLOCK(vp);
if (vtryrele(vp)) {
return;
}
lktype = LK_NONE;
} else {
lktype = VOP_ISLOCKED(vp);
KASSERT(lktype != LK_NONE);
}
mutex_enter(vp->v_interlock);
vrelel(vp, 0, lktype);
}
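/*
 * Illustrative sketch only: the usual caller-side pattern for vput() is
 * to hold both the vnode lock and a reference and let vput() drop both,
 * e.g.
 *
 *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 *	...				operate on the locked vnode
 *	vput(vp);			unlock and release the reference
 *
 * instead of a separate VOP_UNLOCK(vp) followed by vrele(vp).
 */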
/*
* Release a vnode from the deferred list.
*/
static void
vrele_deferred(vnode_impl_t *vip)
{
vnode_t *vp;
KASSERT(mutex_owned(&vdrain_lock));
KASSERT(vip->vi_lrulisthd == &lru_list[LRU_VRELE]);
vp = VIMPL_TO_VNODE(vip);
/*
* First remove the vnode from the vrele list.
* Put it on the last lru list, the last vrele()
* will put it back onto the right list before
* its usecount reaches zero.
*/
TAILQ_REMOVE(vip->vi_lrulisthd, vip, vi_lrulist);
vip->vi_lrulisthd = &lru_list[LRU_HOLD];
vip->vi_lrulisttm = getticks();
TAILQ_INSERT_TAIL(vip->vi_lrulisthd, vip, vi_lrulist);
mutex_exit(&vdrain_lock);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_EXCLUSIVE);
mutex_enter(&vdrain_lock);
}
/*
* Vnode release. If reference count drops to zero, call inactive
* routine and either return to freelist or free to the pool.
*/
static void
vrelel(vnode_t *vp, int flags, int lktype)
{
const bool async = ((flags & VRELEL_ASYNC) != 0);
bool recycle, defer, objlock_held;
u_int use, next;
int error;
objlock_held = false;
retry:
KASSERT(mutex_owned(vp->v_interlock));
if (__predict_false(vp->v_op == dead_vnodeop_p &&
VSTATE_GET(vp) != VS_RECLAIMED)) {
vnpanic(vp, "dead but not clean");
}
/*
* If not the last reference, just unlock and drop the reference count.
*
* Otherwise make sure we pass a point in time where we hold the
* last reference with VGET flag unset.
*/
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_MASK) > 1)) {
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
lktype = LK_NONE;
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
if (vtryrele(vp)) {
mutex_exit(vp->v_interlock);
return;
}
next = atomic_load_relaxed(&vp->v_usecount);
continue;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
next = use & ~VUSECOUNT_VGET;
if (next != use) {
next = atomic_cas_uint(&vp->v_usecount, use, next);
}
if (__predict_true(next == use)) {
break;
}
}
membar_acquire();
if (vrefcnt(vp) <= 0 || vp->v_writecount != 0) {
vnpanic(vp, "%s: bad ref count", __func__);
}
#ifdef DIAGNOSTIC
if ((vp->v_type == VBLK || vp->v_type == VCHR) &&
vp->v_specnode != NULL && vp->v_specnode->sn_opencnt != 0) {
vprint("vrelel: missing VOP_CLOSE()", vp);
}
#endif
/*
* If already clean there is no need to lock, defer or
* deactivate this node.
*/
if (VSTATE_GET(vp) == VS_RECLAIMED) {
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
lktype = LK_NONE;
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
goto out;
}
/*
* First try to get the vnode locked for VOP_INACTIVE().
* Defer vnode release to vrele task if caller requests
* it explicitly, is the pagedaemon or the lock failed.
*/
defer = false;
if ((curlwp == uvm.pagedaemon_lwp) || async) {
defer = true;
} else if (lktype == LK_SHARED) {
/* Excellent chance of getting, if the last ref. */
error = vn_lock(vp, LK_UPGRADE | LK_RETRY | LK_NOWAIT);
if (error != 0) {
defer = true;
} else {
lktype = LK_EXCLUSIVE;
}
} else if (lktype == LK_NONE) {
/* Excellent chance of getting, if the last ref. */
error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
if (error != 0) {
defer = true;
} else {
lktype = LK_EXCLUSIVE;
}
}
KASSERT(mutex_owned(vp->v_interlock));
if (defer) {
/*
* Defer reclaim to the vrele task; it's not safe to
* clean it here. We donate it our last reference.
*/
if (lktype != LK_NONE) {
mutex_exit(vp->v_interlock);
VOP_UNLOCK(vp);
mutex_enter(vp->v_interlock);
}
lru_requeue(vp, &lru_list[LRU_VRELE]);
mutex_exit(vp->v_interlock);
return;
}
KASSERT(lktype == LK_EXCLUSIVE);
/* If the node gained another reference, retry. */
use = atomic_load_relaxed(&vp->v_usecount);
if ((use & VUSECOUNT_VGET) != 0) {
goto retry;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
if ((vp->v_iflag & (VI_TEXT|VI_EXECMAP|VI_WRMAP)) != 0 ||
(vp->v_vflag & VV_MAPPED) != 0) {
/* Take care of space accounting. */
if (!objlock_held) {
objlock_held = true;
if (!rw_tryenter(vp->v_uobj.vmobjlock, RW_WRITER)) {
mutex_exit(vp->v_interlock);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
mutex_enter(vp->v_interlock);
goto retry;
}
}
if ((vp->v_iflag & VI_EXECMAP) != 0) {
cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
}
vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP|VI_WRMAP);
vp->v_vflag &= ~VV_MAPPED;
}
if (objlock_held) {
objlock_held = false;
rw_exit(vp->v_uobj.vmobjlock);
}
/*
* Deactivate the vnode, but preserve our reference across
* the call to VOP_INACTIVE().
*
* If VOP_INACTIVE() indicates that the file has been
* deleted, then recycle the vnode.
*
* Note that VOP_INACTIVE() will not drop the vnode lock.
*/
mutex_exit(vp->v_interlock);
recycle = false;
VOP_INACTIVE(vp, &recycle);
if (!recycle) {
lktype = LK_NONE;
VOP_UNLOCK(vp);
}
mutex_enter(vp->v_interlock);
/*
* Block new references then check again to see if a
* new reference was acquired in the meantime. If
* it was, restore the vnode state and try again.
*/
if (recycle) {
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
use = atomic_load_relaxed(&vp->v_usecount);
if ((use & VUSECOUNT_VGET) != 0) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
goto retry;
}
KASSERT((use & VUSECOUNT_MASK) == 1);
}
/*
* Recycle the vnode if the file is now unused (unlinked).
*/
if (recycle) {
VSTATE_ASSERT(vp, VS_BLOCKED);
KASSERT(lktype == LK_EXCLUSIVE);
/* vcache_reclaim drops the lock. */
lktype = LK_NONE;
vcache_reclaim(vp);
}
KASSERT(vrefcnt(vp) > 0);
KASSERT(lktype == LK_NONE);
out:
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_VGET) != 0 &&
(use & VUSECOUNT_MASK) == 1)) {
/* Gained and released another reference, retry. */
goto retry;
}
next = atomic_cas_uint(&vp->v_usecount, use, use - 1);
if (__predict_true(next == use)) {
if (__predict_false((use & VUSECOUNT_MASK) != 1)) {
/* Gained another reference. */
mutex_exit(vp->v_interlock);
return;
}
break;
}
}
membar_acquire();
if (VSTATE_GET(vp) == VS_RECLAIMED && vp->v_holdcnt == 0) {
/*
* It's clean so destroy it. It isn't referenced
* anywhere since it has been reclaimed.
*/
vcache_free(VNODE_TO_VIMPL(vp));
} else {
/*
* Otherwise, put it back onto the freelist. It
* can't be destroyed while still associated with
* a file system.
*/
lru_requeue(vp, lru_which(vp));
mutex_exit(vp->v_interlock);
}
}
void
vrele(vnode_t *vp)
{
if (vtryrele(vp)) {
return;
}
mutex_enter(vp->v_interlock);
vrelel(vp, 0, LK_NONE);
}
/*
* Asynchronous vnode release, vnode is released in different context.
*/
void
vrele_async(vnode_t *vp)
{
if (vtryrele(vp)) {
return;
}
mutex_enter(vp->v_interlock);
vrelel(vp, VRELEL_ASYNC, LK_NONE);
}
/*
* Vnode reference, where a reference is already held by some other
* object (for example, a file structure).
*
* NB: lockless code sequences may rely on this not blocking.
*/
void
vref(vnode_t *vp)
{
KASSERT(vrefcnt(vp) > 0);
atomic_inc_uint(&vp->v_usecount);
}
/*
* Page or buffer structure gets a reference.
* Called with v_interlock held.
*/
void
vholdl(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt++ == 0 && vrefcnt(vp) == 0)
lru_requeue(vp, lru_which(vp));
}
/*
* Page or buffer structure gets a reference.
*/
void
vhold(vnode_t *vp)
{
mutex_enter(vp->v_interlock);
vholdl(vp);
mutex_exit(vp->v_interlock);
}
/*
* Page or buffer structure frees a reference.
* Called with v_interlock held.
*/
void
holdrelel(vnode_t *vp)
{
KASSERT(mutex_owned(vp->v_interlock));
if (vp->v_holdcnt <= 0) {
vnpanic(vp, "%s: holdcnt vp %p", __func__, vp);
}
vp->v_holdcnt--;
if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
lru_requeue(vp, lru_which(vp));
}
/*
* Page or buffer structure frees a reference.
*/
void
holdrele(vnode_t *vp)
{
mutex_enter(vp->v_interlock);
holdrelel(vp);
mutex_exit(vp->v_interlock);
}
/*
* Recycle an unused vnode if caller holds the last reference.
*/
bool
vrecycle(vnode_t *vp)
{
int error __diagused;
mutex_enter(vp->v_interlock);
/* If the vnode is already clean we're done. */
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) != VS_LOADED) {
VSTATE_ASSERT(vp, VS_RECLAIMED);
vrelel(vp, 0, LK_NONE);
return true;
}
/* Prevent further references until the vnode is locked. */
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
/* Make sure we hold the last reference. */
if (vrefcnt(vp) != 1) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
mutex_exit(vp->v_interlock);
return false;
}
mutex_exit(vp->v_interlock);
/*
* On a leaf file system this lock will always succeed as we hold
* the last reference and prevent further references.
* On layered file systems waiting for the lock would open a can of
* deadlocks as the lower vnodes may have other active references.
*/
error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY | LK_NOWAIT);
mutex_enter(vp->v_interlock);
if (error) {
VSTATE_CHANGE(vp, VS_BLOCKED, VS_LOADED);
mutex_exit(vp->v_interlock);
return false;
}
KASSERT(vrefcnt(vp) == 1);
vcache_reclaim(vp);
vrelel(vp, 0, LK_NONE);
return true;
}
/*
* Helper for vrevoke() to propagate suspension from lastmp
* to thismp. Both args may be NULL.
* Returns the currently suspended file system or NULL.
*/
static struct mount *
vrevoke_suspend_next(struct mount *lastmp, struct mount *thismp)
{
int error;
if (lastmp == thismp)
return thismp;
if (lastmp != NULL)
vfs_resume(lastmp);
if (thismp == NULL)
return NULL;
do {
error = vfs_suspend(thismp, 0);
} while (error == EINTR || error == ERESTART);
if (error == 0)
return thismp;
KASSERT(error == EOPNOTSUPP || error == ENOENT);
return NULL;
}
/*
* Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
void
vrevoke(vnode_t *vp)
{
struct mount *mp;
vnode_t *vq;
enum vtype type;
dev_t dev;
KASSERT(vrefcnt(vp) > 0);
mp = vrevoke_suspend_next(NULL, vp->v_mount);
mutex_enter(vp->v_interlock);
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_RECLAIMED) {
mutex_exit(vp->v_interlock);
} else if (vp->v_type != VBLK && vp->v_type != VCHR) {
atomic_inc_uint(&vp->v_usecount);
mutex_exit(vp->v_interlock);
vgone(vp);
} else {
dev = vp->v_rdev;
type = vp->v_type;
mutex_exit(vp->v_interlock);
while (spec_node_lookup_by_dev(type, dev, VDEAD_NOWAIT, &vq)
== 0) {
mp = vrevoke_suspend_next(mp, vq->v_mount);
vgone(vq);
}
}
vrevoke_suspend_next(mp, NULL);
}
/*
* Eliminate all activity associated with a vnode in preparation for
* reuse. Drops a reference from the vnode.
*/
void
vgone(vnode_t *vp)
{
int lktype;
KASSERT(vp->v_mount == dead_rootmount || fstrans_is_owner(vp->v_mount));
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
lktype = LK_EXCLUSIVE;
mutex_enter(vp->v_interlock);
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_LOADED) {
VSTATE_CHANGE(vp, VS_LOADED, VS_BLOCKED);
vcache_reclaim(vp);
lktype = LK_NONE;
}
VSTATE_ASSERT(vp, VS_RECLAIMED);
vrelel(vp, 0, lktype);
}
static inline uint32_t
vcache_hash(const struct vcache_key *key)
{
uint32_t hash = HASH32_BUF_INIT;
KASSERT(key->vk_key_len > 0);
hash = hash32_buf(&key->vk_mount, sizeof(struct mount *), hash);
hash = hash32_buf(key->vk_key, key->vk_key_len, hash);
return hash;
}
static int
vcache_stats(struct hashstat_sysctl *hs, bool fill)
{
vnode_impl_t *vip;
uint64_t chain;
strlcpy(hs->hash_name, "vcache", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "vnode cache hash", sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = vcache_hashmask + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
mutex_enter(&vcache_lock);
SLIST_FOREACH(vip, &vcache_hashtab[i], vi_hash) {
chain++;
}
mutex_exit(&vcache_lock);
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
preempt_point();
}
return 0;
}
static void
vcache_init(void)
{
vcache_pool = pool_cache_init(sizeof(vnode_impl_t), coherency_unit,
0, 0, "vcachepl", NULL, IPL_NONE, NULL, NULL, NULL);
KASSERT(vcache_pool != NULL);
mutex_init(&vcache_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&vcache_cv, "vcache");
vcache_hashsize = desiredvnodes;
vcache_hashtab = hashinit(desiredvnodes, HASH_SLIST, true,
&vcache_hashmask);
hashstat_register("vcache", vcache_stats);
}
static void
vcache_reinit(void)
{
int i;
uint32_t hash;
u_long oldmask, newmask;
struct hashhead *oldtab, *newtab;
vnode_impl_t *vip;
newtab = hashinit(desiredvnodes, HASH_SLIST, true, &newmask);
mutex_enter(&vcache_lock);
oldtab = vcache_hashtab;
oldmask = vcache_hashmask;
vcache_hashsize = desiredvnodes;
vcache_hashtab = newtab;
vcache_hashmask = newmask;
for (i = 0; i <= oldmask; i++) {
while ((vip = SLIST_FIRST(&oldtab[i])) != NULL) {
SLIST_REMOVE(&oldtab[i], vip, vnode_impl, vi_hash);
hash = vcache_hash(&vip->vi_key);
SLIST_INSERT_HEAD(&newtab[hash & vcache_hashmask],
vip, vi_hash);
}
}
mutex_exit(&vcache_lock);
hashdone(oldtab, HASH_SLIST, oldmask);
}
static inline vnode_impl_t *
vcache_hash_lookup(const struct vcache_key *key, uint32_t hash)
{
struct hashhead *hashp;
vnode_impl_t *vip;
KASSERT(mutex_owned(&vcache_lock));
hashp = &vcache_hashtab[hash & vcache_hashmask];
SLIST_FOREACH(vip, hashp, vi_hash) {
if (key->vk_mount != vip->vi_key.vk_mount)
continue;
if (key->vk_key_len != vip->vi_key.vk_key_len)
continue;
if (memcmp(key->vk_key, vip->vi_key.vk_key, key->vk_key_len))
continue;
return vip;
}
return NULL;
}
/*
* Allocate a new, uninitialized vcache node.
*/
static vnode_impl_t *
vcache_alloc(void)
{
vnode_impl_t *vip;
vnode_t *vp;
vip = pool_cache_get(vcache_pool, PR_WAITOK);
vp = VIMPL_TO_VNODE(vip);
memset(vip, 0, sizeof(*vip));
rw_init(&vip->vi_lock);
vp->v_interlock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_NONE);
uvm_obj_init(&vp->v_uobj, &uvm_vnodeops, true, 1);
klist_init(&vip->vi_klist.vk_klist);
vp->v_klist = &vip->vi_klist;
cv_init(&vp->v_cv, "vnode");
cache_vnode_init(vp);
vp->v_usecount = 1;
vp->v_type = VNON;
vp->v_size = vp->v_writesize = VSIZENOTSET;
vip->vi_state = VS_LOADING;
lru_requeue(vp, &lru_list[LRU_FREE]);
return vip;
}
/*
* Deallocate a vcache node in state VS_LOADING.
*
* vcache_lock held on entry and released on return.
*/
static void
vcache_dealloc(vnode_impl_t *vip)
{
vnode_t *vp;
KASSERT(mutex_owned(&vcache_lock));
vp = VIMPL_TO_VNODE(vip);
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
mutex_enter(vp->v_interlock);
vp->v_op = dead_vnodeop_p;
VSTATE_CHANGE(vp, VS_LOADING, VS_RECLAIMED);
mutex_exit(&vcache_lock);
vrelel(vp, 0, LK_NONE);
}
/*
* Free an unused, unreferenced vcache node.
* v_interlock locked on entry.
*/
static void
vcache_free(vnode_impl_t *vip)
{
vnode_t *vp;
vp = VIMPL_TO_VNODE(vip);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(vrefcnt(vp) == 0);
KASSERT(vp->v_holdcnt == 0);
KASSERT(vp->v_writecount == 0);
lru_requeue(vp, NULL);
mutex_exit(vp->v_interlock);
vfs_insmntque(vp, NULL);
if (vp->v_type == VBLK || vp->v_type == VCHR)
spec_node_destroy(vp);
mutex_obj_free(vp->v_interlock);
rw_destroy(&vip->vi_lock);
uvm_obj_destroy(&vp->v_uobj, true);
KASSERT(vp->v_klist == &vip->vi_klist);
klist_fini(&vip->vi_klist.vk_klist);
cv_destroy(&vp->v_cv);
cache_vnode_fini(vp);
pool_cache_put(vcache_pool, vip);
}
/*
* Try to get an initial reference on this cached vnode.
* Returns zero on success or EBUSY if the vnode state is not LOADED.
*
* NB: lockless code sequences may rely on this not blocking.
*/
int
vcache_tryvget(vnode_t *vp)
{
u_int use, next;
for (use = atomic_load_relaxed(&vp->v_usecount);; use = next) {
if (__predict_false((use & VUSECOUNT_GATE) == 0)) {
return EBUSY;
}
next = atomic_cas_uint(&vp->v_usecount,
use, (use + 1) | VUSECOUNT_VGET);
if (__predict_true(next == use)) {
membar_acquire();
return 0;
}
}
}
/*
* Try to get an initial reference on this cached vnode.
* Returns zero on success and ENOENT if the vnode has been reclaimed.
* Will wait for the vnode state to be stable.
*
* v_interlock locked on entry and unlocked on exit.
*/
int
vcache_vget(vnode_t *vp)
{
int error;
KASSERT(mutex_owned(vp->v_interlock));
/* Increment hold count to prevent vnode from disappearing. */
vp->v_holdcnt++;
VSTATE_WAIT_STABLE(vp);
vp->v_holdcnt--;
/* If this was the last reference to a reclaimed vnode free it now. */
if (__predict_false(VSTATE_GET(vp) == VS_RECLAIMED)) {
if (vp->v_holdcnt == 0 && vrefcnt(vp) == 0)
vcache_free(VNODE_TO_VIMPL(vp));
else
mutex_exit(vp->v_interlock);
return ENOENT;
}
VSTATE_ASSERT(vp, VS_LOADED);
error = vcache_tryvget(vp);
KASSERT(error == 0);
mutex_exit(vp->v_interlock);
return 0;
}
/*
* Get a vnode / fs node pair by key and return it referenced through vpp.
*/
int
vcache_get(struct mount *mp, const void *key, size_t key_len,
struct vnode **vpp)
{
int error;
uint32_t hash;
const void *new_key;
struct vnode *vp;
struct vcache_key vcache_key;
vnode_impl_t *vip, *new_vip;
new_key = NULL;
*vpp = NULL;
vcache_key.vk_mount = mp;
vcache_key.vk_key = key;
vcache_key.vk_key_len = key_len;
hash = vcache_hash(&vcache_key);
again:
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&vcache_key, hash);
/* If found, take a reference or retry. */
if (__predict_true(vip != NULL)) {
/*
* If the vnode is loading we cannot take the v_interlock
* here as it might change during load (see uvm_obj_setlock()).
* As changing state from VS_LOADING requires both vcache_lock
* and v_interlock it is safe to test with vcache_lock held.
*
* Wait for vnodes changing state from VS_LOADING and retry.
*/
if (__predict_false(vip->vi_state == VS_LOADING)) {
cv_wait(&vcache_cv, &vcache_lock);
mutex_exit(&vcache_lock);
goto again;
}
vp = VIMPL_TO_VNODE(vip);
mutex_enter(vp->v_interlock);
mutex_exit(&vcache_lock);
error = vcache_vget(vp);
if (error == ENOENT)
goto again;
if (error == 0)
*vpp = vp;
KASSERT((error != 0) == (*vpp == NULL));
return error;
}
mutex_exit(&vcache_lock);
/* Allocate and initialize a new vcache / vnode pair. */
error = vfs_busy(mp);
if (error)
return error;
new_vip = vcache_alloc();
new_vip->vi_key = vcache_key;
vp = VIMPL_TO_VNODE(new_vip);
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&vcache_key, hash);
if (vip == NULL) {
SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
new_vip, vi_hash);
vip = new_vip;
}
/* If another thread beat us inserting this node, retry. */
if (vip != new_vip) {
vcache_dealloc(new_vip);
vfs_unbusy(mp);
goto again;
}
mutex_exit(&vcache_lock);
/* Load the fs node. Exclusive as new_node is VS_LOADING. */
error = VFS_LOADVNODE(mp, vp, key, key_len, &new_key);
if (error) {
mutex_enter(&vcache_lock);
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
new_vip, vnode_impl, vi_hash);
vcache_dealloc(new_vip);
vfs_unbusy(mp);
KASSERT(*vpp == NULL);
return error;
}
KASSERT(new_key != NULL);
KASSERT(memcmp(key, new_key, key_len) == 0);
KASSERT(vp->v_op != NULL);
vfs_insmntque(vp, mp);
if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
vp->v_vflag |= VV_MPSAFE;
vfs_ref(mp);
vfs_unbusy(mp);
/* Finished loading, finalize node. */
mutex_enter(&vcache_lock);
new_vip->vi_key.vk_key = new_key;
mutex_enter(vp->v_interlock);
VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
mutex_exit(vp->v_interlock);
mutex_exit(&vcache_lock);
*vpp = vp;
return 0;
}
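/*
 * Illustrative sketch only: a file system typically builds its vget/lookup
 * path on top of vcache_get() by using the inode number as the cache key.
 * The function name "example_vget" and its exact shape are hypothetical:
 *
 *	static int
 *	example_vget(struct mount *mp, ino_t ino, struct vnode **vpp)
 *	{
 *		int error;
 *
 *		error = vcache_get(mp, &ino, sizeof(ino), vpp);
 *		if (error)
 *			return error;
 *		error = vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
 *		if (error) {
 *			vrele(*vpp);
 *			*vpp = NULL;
 *		}
 *		return error;
 *	}
 */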
/*
* Create a new vnode / fs node pair and return it referenced through vpp.
*/
int
vcache_new(struct mount *mp, struct vnode *dvp, struct vattr *vap,
kauth_cred_t cred, void *extra, struct vnode **vpp)
{
int error;
uint32_t hash;
struct vnode *vp, *ovp;
vnode_impl_t *vip, *ovip;
*vpp = NULL;
/* Allocate and initialize a new vcache / vnode pair. */
error = vfs_busy(mp);
if (error)
return error;
vip = vcache_alloc();
vip->vi_key.vk_mount = mp;
vp = VIMPL_TO_VNODE(vip);
/* Create and load the fs node. */
error = VFS_NEWVNODE(mp, dvp, vp, vap, cred, extra,
&vip->vi_key.vk_key_len, &vip->vi_key.vk_key);
if (error) {
mutex_enter(&vcache_lock);
vcache_dealloc(vip);
vfs_unbusy(mp);
KASSERT(*vpp == NULL);
return error;
}
KASSERT(vp->v_op != NULL);
KASSERT((vip->vi_key.vk_key_len == 0) == (mp == dead_rootmount));
if (vip->vi_key.vk_key_len > 0) {
KASSERT(vip->vi_key.vk_key != NULL);
hash = vcache_hash(&vip->vi_key);
/*
* Wait for previous instance to be reclaimed,
* then insert new node.
*/
mutex_enter(&vcache_lock);
while ((ovip = vcache_hash_lookup(&vip->vi_key, hash))) {
ovp = VIMPL_TO_VNODE(ovip);
mutex_enter(ovp->v_interlock);
mutex_exit(&vcache_lock);
error = vcache_vget(ovp);
KASSERT(error == ENOENT);
mutex_enter(&vcache_lock);
}
SLIST_INSERT_HEAD(&vcache_hashtab[hash & vcache_hashmask],
vip, vi_hash);
mutex_exit(&vcache_lock);
}
vfs_insmntque(vp, mp);
if ((mp->mnt_iflag & IMNT_MPSAFE) != 0)
vp->v_vflag |= VV_MPSAFE;
vfs_ref(mp);
vfs_unbusy(mp);
/* Finished loading, finalize node. */
mutex_enter(&vcache_lock);
mutex_enter(vp->v_interlock);
VSTATE_CHANGE(vp, VS_LOADING, VS_LOADED);
mutex_exit(&vcache_lock);
mutex_exit(vp->v_interlock);
*vpp = vp;
return 0;
}
/*
 * Prepare key change: update the old cache node's key and lock the new
 * cache node.
* Return an error if the new node already exists.
*/
int
vcache_rekey_enter(struct mount *mp, struct vnode *vp,
const void *old_key, size_t old_key_len,
const void *new_key, size_t new_key_len)
{
uint32_t old_hash, new_hash;
struct vcache_key old_vcache_key, new_vcache_key;
vnode_impl_t *vip, *new_vip;
old_vcache_key.vk_mount = mp;
old_vcache_key.vk_key = old_key;
old_vcache_key.vk_key_len = old_key_len;
old_hash = vcache_hash(&old_vcache_key);
new_vcache_key.vk_mount = mp;
new_vcache_key.vk_key = new_key;
new_vcache_key.vk_key_len = new_key_len;
new_hash = vcache_hash(&new_vcache_key);
new_vip = vcache_alloc();
new_vip->vi_key = new_vcache_key;
/* Insert locked new node used as placeholder. */
mutex_enter(&vcache_lock);
vip = vcache_hash_lookup(&new_vcache_key, new_hash);
if (vip != NULL) {
vcache_dealloc(new_vip);
return EEXIST;
}
SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
new_vip, vi_hash);
/* Replace the old node's key with the temporary copy. */
vip = vcache_hash_lookup(&old_vcache_key, old_hash);
KASSERT(vip != NULL);
KASSERT(VIMPL_TO_VNODE(vip) == vp);
KASSERT(vip->vi_key.vk_key != old_vcache_key.vk_key);
vip->vi_key = old_vcache_key;
mutex_exit(&vcache_lock);
return 0;
}
/*
* Key change complete: update old node and remove placeholder.
*/
void
vcache_rekey_exit(struct mount *mp, struct vnode *vp,
const void *old_key, size_t old_key_len,
const void *new_key, size_t new_key_len)
{
uint32_t old_hash, new_hash;
struct vcache_key old_vcache_key, new_vcache_key;
vnode_impl_t *vip, *new_vip;
struct vnode *new_vp;
old_vcache_key.vk_mount = mp;
old_vcache_key.vk_key = old_key;
old_vcache_key.vk_key_len = old_key_len;
old_hash = vcache_hash(&old_vcache_key);
new_vcache_key.vk_mount = mp;
new_vcache_key.vk_key = new_key;
new_vcache_key.vk_key_len = new_key_len;
new_hash = vcache_hash(&new_vcache_key);
mutex_enter(&vcache_lock);
/* Lookup old and new node. */
vip = vcache_hash_lookup(&old_vcache_key, old_hash);
KASSERT(vip != NULL);
KASSERT(VIMPL_TO_VNODE(vip) == vp);
new_vip = vcache_hash_lookup(&new_vcache_key, new_hash);
KASSERT(new_vip != NULL);
KASSERT(new_vip->vi_key.vk_key_len == new_key_len);
new_vp = VIMPL_TO_VNODE(new_vip);
mutex_enter(new_vp->v_interlock);
VSTATE_ASSERT(VIMPL_TO_VNODE(new_vip), VS_LOADING);
mutex_exit(new_vp->v_interlock);
/* Rekey old node and put it onto its new hashlist. */
vip->vi_key = new_vcache_key;
if (old_hash != new_hash) {
SLIST_REMOVE(&vcache_hashtab[old_hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
SLIST_INSERT_HEAD(&vcache_hashtab[new_hash & vcache_hashmask],
vip, vi_hash);
}
/* Remove new node used as placeholder. */
SLIST_REMOVE(&vcache_hashtab[new_hash & vcache_hashmask],
new_vip, vnode_impl, vi_hash);
vcache_dealloc(new_vip);
}
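/*
 * Illustrative sketch only: a file system whose cache key changes (for
 * example on rename) brackets the key update with the two functions above.
 * Error handling is reduced and the key variables are hypothetical:
 *
 *	error = vcache_rekey_enter(mp, vp, &oldkey, sizeof(oldkey),
 *	    &newkey, sizeof(newkey));
 *	if (error)				EEXIST: target already cached
 *		return error;
 *	...					update the file system's own key
 *	vcache_rekey_exit(mp, vp, &oldkey, sizeof(oldkey),
 *	    &newkey, sizeof(newkey));
 */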
/*
* Disassociate the underlying file system from a vnode.
*
* Must be called with vnode locked and will return unlocked.
* Must be called with the interlock held, and will return with it held.
*/
static void
vcache_reclaim(vnode_t *vp)
{
lwp_t *l = curlwp;
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
struct mount *mp = vp->v_mount;
uint32_t hash;
uint8_t temp_buf[64], *temp_key;
size_t temp_key_len;
bool recycle;
int error;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(mutex_owned(vp->v_interlock));
KASSERT(vrefcnt(vp) != 0);
temp_key_len = vip->vi_key.vk_key_len;
/*
* Prevent the vnode from being recycled or brought into use
* while we clean it out.
*/
VSTATE_CHANGE(vp, VS_BLOCKED, VS_RECLAIMING);
/*
* Send NOTE_REVOKE now, before we call VOP_RECLAIM(),
* because VOP_RECLAIM() could cause vp->v_klist to
* become invalid. Don't check for interest in NOTE_REVOKE
* here; it's always posted because it sets EV_EOF.
*
* Once it's been posted, reset vp->v_klist to point to
* our own local storage, in case we were sharing with
* someone else.
*/
KNOTE(&vp->v_klist->vk_klist, NOTE_REVOKE);
vp->v_klist = &vip->vi_klist;
mutex_exit(vp->v_interlock);
rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
mutex_enter(vp->v_interlock);
if ((vp->v_iflag & VI_EXECMAP) != 0) {
cpu_count(CPU_COUNT_EXECPAGES, -vp->v_uobj.uo_npages);
}
vp->v_iflag &= ~(VI_TEXT|VI_EXECMAP);
vp->v_iflag |= VI_DEADCHECK; /* for genfs_getpages() */
mutex_exit(vp->v_interlock);
rw_exit(vp->v_uobj.vmobjlock);
/*
* With vnode state set to reclaiming, purge name cache immediately
* to prevent new handles on vnode, and wait for existing threads
* trying to get a handle to notice VS_RECLAIMED status and abort.
*/
cache_purge(vp);
/* Replace the vnode key with a temporary copy. */
if (vip->vi_key.vk_key_len > sizeof(temp_buf)) {
temp_key = kmem_alloc(temp_key_len, KM_SLEEP);
} else {
temp_key = temp_buf;
}
if (vip->vi_key.vk_key_len > 0) {
mutex_enter(&vcache_lock);
memcpy(temp_key, vip->vi_key.vk_key, temp_key_len);
vip->vi_key.vk_key = temp_key;
mutex_exit(&vcache_lock);
}
fstrans_start(mp);
/*
* Clean out any cached data associated with the vnode.
*/
error = vinvalbuf(vp, V_SAVE, NOCRED, l, 0, 0);
if (error != 0) {
if (wapbl_vphaswapbl(vp))
WAPBL_DISCARD(wapbl_vptomp(vp));
error = vinvalbuf(vp, 0, NOCRED, l, 0, 0);
}
KASSERTMSG((error == 0), "vinvalbuf failed: %d", error); KASSERT((vp->v_iflag & VI_ONWORKLST) == 0); if (vp->v_type == VBLK || vp->v_type == VCHR) { spec_node_revoke(vp);
}
/*
* Disassociate the underlying file system from the vnode.
* VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
* the vnode, and may destroy the vnode so that VOP_UNLOCK
* would no longer function.
*/
VOP_INACTIVE(vp, &recycle);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
if (VOP_RECLAIM(vp)) {
vnpanic(vp, "%s: cannot reclaim", __func__);
}
KASSERT(vp->v_data == NULL);
KASSERT((vp->v_iflag & VI_PAGES) == 0);
if (vp->v_type == VREG && vp->v_ractx != NULL) {
uvm_ra_freectx(vp->v_ractx);
vp->v_ractx = NULL;
}
if (vip->vi_key.vk_key_len > 0) {
/* Remove from vnode cache. */
hash = vcache_hash(&vip->vi_key);
mutex_enter(&vcache_lock);
KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
mutex_exit(&vcache_lock);
}
if (temp_key != temp_buf)
kmem_free(temp_key, temp_key_len);
/* Done with purge, notify sleepers of the grim news. */
mutex_enter(vp->v_interlock);
vp->v_op = dead_vnodeop_p;
VSTATE_CHANGE(vp, VS_RECLAIMING, VS_RECLAIMED);
vp->v_tag = VT_NON;
mutex_exit(vp->v_interlock);
/*
* Move to dead mount. Must be after changing the operations
* vector as vnode operations enter the mount before using the
* operations vector. See sys/kern/vnode_if.c.
*/
vp->v_vflag &= ~VV_ROOT;
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
#ifdef PAX_SEGVGUARD
pax_segvguard_cleanup(vp);
#endif /* PAX_SEGVGUARD */
mutex_enter(vp->v_interlock);
fstrans_done(mp);
KASSERT((vp->v_iflag & VI_ONWORKLST) == 0);
}
/*
* Disassociate the underlying file system from an open device vnode
* and make it anonymous.
*
* Vnode unlocked on entry, drops a reference to the vnode.
*/
void
vcache_make_anon(vnode_t *vp)
{
vnode_impl_t *vip = VNODE_TO_VIMPL(vp);
uint32_t hash;
bool recycle;
KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
KASSERT(vp->v_mount == dead_rootmount ||
fstrans_is_owner(vp->v_mount));
VSTATE_ASSERT_UNLOCKED(vp, VS_ACTIVE);
/* Remove from vnode cache. */
hash = vcache_hash(&vip->vi_key);
mutex_enter(&vcache_lock);
KASSERT(vip == vcache_hash_lookup(&vip->vi_key, hash));
SLIST_REMOVE(&vcache_hashtab[hash & vcache_hashmask],
vip, vnode_impl, vi_hash);
vip->vi_key.vk_mount = dead_rootmount;
vip->vi_key.vk_key_len = 0;
vip->vi_key.vk_key = NULL;
mutex_exit(&vcache_lock);
/*
* Disassociate the underlying file system from the vnode.
* VOP_INACTIVE leaves the vnode locked; VOP_RECLAIM unlocks
* the vnode, and may destroy the vnode so that VOP_UNLOCK
* would no longer function.
*/
if (vn_lock(vp, LK_EXCLUSIVE)) {
vnpanic(vp, "%s: cannot lock", __func__);
}
VOP_INACTIVE(vp, &recycle);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
if (VOP_RECLAIM(vp)) {
vnpanic(vp, "%s: cannot reclaim", __func__);
}
/* Purge name cache. */
cache_purge(vp);
/* Done with purge, change operations vector. */
mutex_enter(vp->v_interlock);
vp->v_op = spec_vnodeop_p;
vp->v_vflag |= VV_MPSAFE;
mutex_exit(vp->v_interlock);
/*
* Move to dead mount. Must be after changing the operations
* vector as vnode operations enter the mount before using the
* operations vector. See sys/kern/vnode_if.c.
*/
vfs_ref(dead_rootmount);
vfs_insmntque(vp, dead_rootmount);
vrele(vp);
}
/*
* Update outstanding I/O count and do wakeup if requested.
*/
void
vwakeup(struct buf *bp)
{
vnode_t *vp;
if ((vp = bp->b_vp) == NULL)
return;
KASSERT(bp->b_objlock == vp->v_interlock);
KASSERT(mutex_owned(bp->b_objlock));
if (--vp->v_numoutput < 0)
vnpanic(vp, "%s: neg numoutput, vp %p", __func__, vp); if (vp->v_numoutput == 0) cv_broadcast(&vp->v_cv);
}
/*
* Test a vnode for being or becoming dead. Returns one of:
* EBUSY: vnode is becoming dead, with "flags == VDEAD_NOWAIT" only.
* ENOENT: vnode is dead.
* 0: otherwise.
*
* Whenever this function returns a non-zero value all future
* calls will also return a non-zero value.
*/
int
vdead_check(struct vnode *vp, int flags)
{
KASSERT(mutex_owned(vp->v_interlock));
if (! ISSET(flags, VDEAD_NOWAIT))
VSTATE_WAIT_STABLE(vp);
if (VSTATE_GET(vp) == VS_RECLAIMING) {
KASSERT(ISSET(flags, VDEAD_NOWAIT));
return EBUSY;
} else if (VSTATE_GET(vp) == VS_RECLAIMED) {
return ENOENT;
}
return 0;
}
int
vfs_drainvnodes(void)
{
mutex_enter(&vdrain_lock);
if (!vdrain_one(desiredvnodes)) {
mutex_exit(&vdrain_lock);
return EBUSY;
}
mutex_exit(&vdrain_lock);
if (vcache_hashsize != desiredvnodes)
vcache_reinit();
return 0;
}
void
vnpanic(vnode_t *vp, const char *fmt, ...)
{
va_list ap;
#ifdef DIAGNOSTIC
vprint(NULL, vp);
#endif
va_start(ap, fmt);
vpanic(fmt, ap);
va_end(ap);
}
void
vshareilock(vnode_t *tvp, vnode_t *fvp)
{
kmutex_t *oldlock;
oldlock = tvp->v_interlock;
mutex_obj_hold(fvp->v_interlock);
tvp->v_interlock = fvp->v_interlock;
mutex_obj_free(oldlock);
}
void
vshareklist(vnode_t *tvp, vnode_t *fvp)
{
/*
* If two vnodes share klist state, they must also share
* an interlock.
*/
KASSERT(tvp->v_interlock == fvp->v_interlock);
/*
* We make the following assumptions:
*
* ==> Some other synchronization is happening outside of
* our view to make this safe.
*
* ==> That the "to" vnode will have the necessary references
* on the "from" vnode so that the storage for the klist
* won't be yanked out from beneath us (the vnode_impl).
*
* ==> If "from" is also sharing, we then assume that "from"
* has the necessary references, and so on.
*/
tvp->v_klist = fvp->v_klist;
}
/* $NetBSD: wapbl.h,v 1.21 2018/12/10 21:19:33 jdolecek Exp $ */
/*-
* Copyright (c) 2003,2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _SYS_WAPBL_H
#define _SYS_WAPBL_H
#include <sys/mutex.h>
#if defined(_KERNEL) || defined(_KMEMUSER)
#include <miscfs/specfs/specdev.h>
#endif
/*
 * This header file describes the API and data structures for
 * write-ahead physical block logging (WAPBL) support.
 */
#if defined(_KERNEL_OPT)
#include "opt_wapbl.h"
#endif
#ifdef WAPBL_DEBUG
#ifndef WAPBL_DEBUG_PRINT
#define WAPBL_DEBUG_PRINT (WAPBL_PRINT_REPLAY | WAPBL_PRINT_OPEN)
#endif
#if 0
#define WAPBL_DEBUG_BUFBYTES
#endif
#endif
#ifdef WAPBL_DEBUG_PRINT
enum {
WAPBL_PRINT_OPEN = 0x1,
WAPBL_PRINT_FLUSH = 0x2,
WAPBL_PRINT_TRUNCATE = 0x4,
WAPBL_PRINT_TRANSACTION = 0x8,
WAPBL_PRINT_BUFFER = 0x10,
WAPBL_PRINT_BUFFER2 = 0x20,
WAPBL_PRINT_ALLOC = 0x40,
WAPBL_PRINT_INODE = 0x80,
WAPBL_PRINT_WRITE = 0x100,
WAPBL_PRINT_IO = 0x200,
WAPBL_PRINT_REPLAY = 0x400,
WAPBL_PRINT_ERROR = 0x800,
WAPBL_PRINT_DISCARD = 0x1000,
WAPBL_PRINT_BIODONE = 0x2000,
};
#define WAPBL_PRINTF(mask, a) if (wapbl_debug_print & (mask)) printf a
extern int wapbl_debug_print;
#else
#define WAPBL_PRINTF(mask, a)
#endif
/****************************************************************/
#include <sys/queue.h>
#include <sys/vnode.h>
#include <sys/buf.h>
#ifdef _KERNEL
struct wapbl_entry;
struct wapbl_replay;
struct wapbl;
struct wapbl_dealloc {
TAILQ_ENTRY(wapbl_dealloc) wd_entries;
daddr_t wd_blkno; /* address of block */
int wd_len; /* size of block */
};
typedef void (*wapbl_flush_fn_t)(struct mount *, struct wapbl_dealloc *);
/*
* This structure holds per transaction log information
*/
struct wapbl_entry {
struct wapbl *we_wapbl;
SIMPLEQ_ENTRY(wapbl_entry) we_entries;
size_t we_bufcount; /* Count of unsynced buffers */
size_t we_reclaimable_bytes; /* Number of on-disk bytes for this
transaction */
int we_error;
#ifdef WAPBL_DEBUG_BUFBYTES
size_t we_unsynced_bufbytes; /* Byte count of unsynced buffers */
#endif
};
/* Start using a log */
int wapbl_start(struct wapbl **, struct mount *, struct vnode *, daddr_t,
size_t, size_t, struct wapbl_replay *,
wapbl_flush_fn_t, wapbl_flush_fn_t);
/* Discard the current transaction, potentially dangerous */
void wapbl_discard(struct wapbl *);
/* stop using a log */
int wapbl_stop(struct wapbl *, int);
/*
* Begin a new transaction or increment transaction recursion
* level if called while a transaction is already in progress
* by the current process.
*/
int wapbl_begin(struct wapbl *, const char *, int);
/* End a transaction or decrement the transaction recursion level */
void wapbl_end(struct wapbl *);
/*
 * Add a new buffer to the current transaction. The buffer's
 * data will be copied to the current transaction log and the
* buffer will be marked B_LOCKED so that it will not be
* flushed to disk by the syncer or reallocated.
*/
void wapbl_add_buf(struct wapbl *, struct buf *);
/* Remove a buffer from the current transaction. */
void wapbl_remove_buf(struct wapbl *, struct buf *);
void wapbl_resize_buf(struct wapbl *, struct buf *, long, long);
/*
* This will flush all completed transactions to disk and
* start asynchronous writes on the associated buffers
*/
int wapbl_flush(struct wapbl *, int);
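/*
 * Illustrative sketch only: a metadata update is usually wrapped in a
 * begin/end pair, with the modified buffer handed to the log in between.
 * "wl" and "bp" are placeholders and error handling is omitted:
 *
 *	error = wapbl_begin(wl, __func__, __LINE__);
 *	...				modify the metadata buffer bp
 *	wapbl_add_buf(wl, bp);		record it in the transaction
 *	wapbl_end(wl);			the data reaches disk later, via wapbl_flush()
 */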
/*
* Inodes that are allocated but have zero link count
* must be registered with the current transaction
* so they may be recorded in the log and cleaned up later.
 * Registration/unregistration of inode numbers that are already
 * registered is OK.
*/
void wapbl_register_inode(struct wapbl *, ino_t, mode_t);
void wapbl_unregister_inode(struct wapbl *, ino_t, mode_t);
/*
* Metadata block deallocations must be registered so
 * that revocation records can be written and to prevent
* the corresponding blocks from being reused as data
* blocks until the log is on disk.
*/
int wapbl_register_deallocation(struct wapbl *, daddr_t, int, bool,
void **);
void wapbl_unregister_deallocation(struct wapbl *, void *);
void wapbl_jlock_assert(struct wapbl *wl);
void wapbl_junlock_assert(struct wapbl *wl);
void wapbl_print(struct wapbl *wl, int full, void (*pr)(const char *, ...)
__printflike(1, 2));
#if defined(WAPBL_DEBUG) || defined(DDB)
void wapbl_dump(struct wapbl *);
#endif
void wapbl_biodone(struct buf *);
extern const struct wapbl_ops wapbl_ops;
static __inline struct mount *
wapbl_vptomp(struct vnode *vp)
{
struct mount *mp;
mp = NULL;
if (vp != NULL) {
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
}
return mp;
}
static __inline bool
wapbl_vphaswapbl(struct vnode *vp)
{
struct mount *mp;
if (vp == NULL)
return false;
mp = wapbl_vptomp(vp);
return mp && mp->mnt_wapbl;
}
#endif /* _KERNEL */
/****************************************************************/
/* Replay support */
#ifdef WAPBL_INTERNAL
LIST_HEAD(wapbl_blk_head, wapbl_blk);
struct wapbl_replay {
struct vnode *wr_logvp;
struct vnode *wr_devvp;
daddr_t wr_logpbn;
int wr_log_dev_bshift;
int wr_fs_dev_bshift;
int64_t wr_circ_off;
int64_t wr_circ_size;
uint32_t wr_generation;
void *wr_scratch;
struct wapbl_blk_head *wr_blkhash;
u_long wr_blkhashmask;
int wr_blkhashcnt;
off_t wr_inodeshead;
off_t wr_inodestail;
int wr_inodescnt;
struct {
uint32_t wr_inumber;
uint32_t wr_imode;
} *wr_inodes;
};
#define wapbl_replay_isopen(wr) ((wr)->wr_scratch != 0)
/* Supply this to provide i/o support */
int wapbl_write(void *, size_t, struct vnode *, daddr_t);
int wapbl_read(void *, size_t, struct vnode *, daddr_t);
/****************************************************************/
#else
struct wapbl_replay;
#endif /* WAPBL_INTERNAL */
/****************************************************************/
int wapbl_replay_start(struct wapbl_replay **, struct vnode *,
daddr_t, size_t, size_t);
void wapbl_replay_stop(struct wapbl_replay *);
void wapbl_replay_free(struct wapbl_replay *);
int wapbl_replay_write(struct wapbl_replay *, struct vnode *);
int wapbl_replay_can_read(struct wapbl_replay *, daddr_t, long);
int wapbl_replay_read(struct wapbl_replay *, void *, daddr_t, long);
/****************************************************************/
#endif /* !_SYS_WAPBL_H */
/* $NetBSD: kern_timeout.c,v 1.79 2023/10/08 13:23:05 ad Exp $ */
/*-
* Copyright (c) 2003, 2006, 2007, 2008, 2009, 2019, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Thomas Nordin <nordin@openbsd.org>
* Copyright (c) 2000-2001 Artur Grabowski <art@openbsd.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
* AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
* THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_timeout.c,v 1.79 2023/10/08 13:23:05 ad Exp $");
/*
* Timeouts are kept in a hierarchical timing wheel. The c_time is the
* value of c_cpu->cc_ticks when the timeout should be called. There are
* four levels with 256 buckets each. See 'Scheme 7' in "Hashed and
* Hierarchical Timing Wheels: Efficient Data Structures for Implementing
* a Timer Facility" by George Varghese and Tony Lauck.
*
* Some of the "math" in here is a bit tricky. We have to beware of
* wrapping ints.
*
* We use the fact that any element added to the queue must be added with
* a positive time. That means that any element `to' on the queue cannot
* be scheduled to timeout further in time than INT_MAX, but c->c_time can
* be positive or negative so comparing it with anything is dangerous.
* The only way we can use the c->c_time value in any predictable way is
* when we calculate how far in the future `to' will timeout - "c->c_time
* - c->c_cpu->cc_ticks". The result will always be positive for future
* timeouts and 0 or negative for due timeouts.
*/
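/*
 * For example (sketch of the due check performed in callout_softclock()
 * below), the test reduces to a signed subtraction, which stays correct
 * across wraparound of cc_ticks:
 *
 *	int delta = (int)((unsigned)c->c_time - (unsigned)cc->cc_ticks);
 *	if (delta <= 0)
 *		... the callout is due (late if delta < 0) ...
 *	else
 *		... it fires in 'delta' more ticks ...
 */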
#define _CALLOUT_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/callout.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#ifdef DDB
#include <machine/db_machdep.h>
#include <ddb/db_interface.h>
#include <ddb/db_access.h>
#include <ddb/db_cpu.h>
#include <ddb/db_sym.h>
#include <ddb/db_output.h>
#endif
#define BUCKETS 1024
#define WHEELSIZE 256
#define WHEELMASK 255
#define WHEELBITS 8
#define MASKWHEEL(wheel, time) (((time) >> ((wheel)*WHEELBITS)) & WHEELMASK)
#define BUCKET(cc, rel, abs) \
(((rel) <= (1 << (2*WHEELBITS))) \
? ((rel) <= (1 << WHEELBITS)) \
? &(cc)->cc_wheel[MASKWHEEL(0, (abs))] \
: &(cc)->cc_wheel[MASKWHEEL(1, (abs)) + WHEELSIZE] \
: ((rel) <= (1 << (3*WHEELBITS))) \
? &(cc)->cc_wheel[MASKWHEEL(2, (abs)) + 2*WHEELSIZE] \
: &(cc)->cc_wheel[MASKWHEEL(3, (abs)) + 3*WHEELSIZE])
#define MOVEBUCKET(cc, wheel, time) \
CIRCQ_APPEND(&(cc)->cc_todo, \
&(cc)->cc_wheel[MASKWHEEL((wheel), (time)) + (wheel)*WHEELSIZE])
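/*
 * For example, with WHEELBITS == 8, BUCKET() selects the wheel level
 * from the relative time and indexes it with the absolute expiry time:
 *
 *	rel <= 2^8	-> cc_wheel[  abs         & 255]
 *	rel <= 2^16	-> cc_wheel[((abs >>  8)  & 255) + 256]
 *	rel <= 2^24	-> cc_wheel[((abs >> 16)  & 255) + 512]
 *	otherwise	-> cc_wheel[((abs >> 24)  & 255) + 768]
 */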
/*
* Circular queue definitions.
*/
#define CIRCQ_INIT(list) \
do { \
(list)->cq_next_l = (list); \
(list)->cq_prev_l = (list); \
} while (/*CONSTCOND*/0)
#define CIRCQ_INSERT(elem, list) \
do { \
(elem)->cq_prev_e = (list)->cq_prev_e; \
(elem)->cq_next_l = (list); \
(list)->cq_prev_l->cq_next_l = (elem); \
(list)->cq_prev_l = (elem); \
} while (/*CONSTCOND*/0)
#define CIRCQ_APPEND(fst, snd) \
do { \
if (!CIRCQ_EMPTY(snd)) { \
(fst)->cq_prev_l->cq_next_l = (snd)->cq_next_l; \
(snd)->cq_next_l->cq_prev_l = (fst)->cq_prev_l; \
(snd)->cq_prev_l->cq_next_l = (fst); \
(fst)->cq_prev_l = (snd)->cq_prev_l; \
CIRCQ_INIT(snd); \
} \
} while (/*CONSTCOND*/0)
#define CIRCQ_REMOVE(elem) \
do { \
(elem)->cq_next_l->cq_prev_e = (elem)->cq_prev_e; \
(elem)->cq_prev_l->cq_next_e = (elem)->cq_next_e; \
} while (/*CONSTCOND*/0)
#define CIRCQ_FIRST(list) ((list)->cq_next_e)
#define CIRCQ_NEXT(elem) ((elem)->cq_next_e)
#define CIRCQ_LAST(elem,list) ((elem)->cq_next_l == (list))
#define CIRCQ_EMPTY(list) ((list)->cq_next_l == (list))
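/*
 * Illustrative usage sketch (mirrors the drain loop in
 * callout_softclock() below):
 *
 *	CIRCQ_INSERT(&c->c_list, &cc->cc_todo);		enqueue at the tail
 *	while (!CIRCQ_EMPTY(&cc->cc_todo)) {
 *		c = CIRCQ_FIRST(&cc->cc_todo);		oldest entry first
 *		CIRCQ_REMOVE(&c->c_list);		unlink the element
 *	}
 */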
struct callout_cpu {
kmutex_t *cc_lock;
sleepq_t cc_sleepq;
u_int cc_nwait;
u_int cc_ticks;
lwp_t *cc_lwp;
callout_impl_t *cc_active;
struct evcnt cc_ev_late;
struct evcnt cc_ev_block;
struct callout_circq cc_todo; /* Worklist */
struct callout_circq cc_wheel[BUCKETS]; /* Queues of timeouts */
char cc_name1[12];
char cc_name2[12];
struct cpu_info *cc_cpu;
};
#ifdef DDB
static struct callout_cpu ccb;
#endif
#ifndef CRASH /* _KERNEL */
static void callout_softclock(void *);
static void callout_wait(callout_impl_t *, void *, kmutex_t *);
static struct callout_cpu callout_cpu0 __cacheline_aligned;
static void *callout_sih __read_mostly;
SDT_PROBE_DEFINE2(sdt, kernel, callout, init,
"struct callout *"/*ch*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE1(sdt, kernel, callout, destroy,
"struct callout *"/*ch*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, setfunc,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, schedule,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"int"/*ticks*/);
SDT_PROBE_DEFINE6(sdt, kernel, callout, migrate,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"struct cpu_info *"/*ocpu*/,
"struct cpu_info *"/*ncpu*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, entry,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, return,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, stop,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"bool"/*expired*/);
SDT_PROBE_DEFINE4(sdt, kernel, callout, halt,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE5(sdt, kernel, callout, halt__done,
"struct callout *"/*ch*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/,
"bool"/*expired*/);
syncobj_t callout_syncobj = {
.sobj_name = "callout",
.sobj_flag = SOBJ_SLEEPQ_SORTED,
.sobj_boostpri = PRI_KERNEL,
.sobj_unsleep = sleepq_unsleep,
.sobj_changepri = sleepq_changepri,
.sobj_lendpri = sleepq_lendpri,
.sobj_owner = syncobj_noowner,
};
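/*
 * callout_lock:
 *
 *	Lock the callout's home CPU. The callout can migrate between
 *	CPUs while we wait for the lock, so retry until the lock we
 *	acquired still matches c->c_cpu.
 */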
static inline kmutex_t *
callout_lock(callout_impl_t *c)
{
struct callout_cpu *cc;
kmutex_t *lock;
for (;;) {
cc = c->c_cpu;
lock = cc->cc_lock;
mutex_spin_enter(lock);
if (__predict_true(cc == c->c_cpu))
return lock;
mutex_spin_exit(lock);
}
}
/*
* Check if the callout is currently running on an LWP that isn't curlwp.
*/
static inline bool
callout_running_somewhere_else(callout_impl_t *c, struct callout_cpu *cc)
{
	KASSERT(c->c_cpu == cc);

	return cc->cc_active == c && cc->cc_lwp != curlwp;
}
/*
* callout_startup:
*
* Initialize the callout facility, called at system startup time.
* Do just enough to allow callouts to be safely registered.
*/
void
callout_startup(void)
{
struct callout_cpu *cc;
int b;
KASSERT(curcpu()->ci_data.cpu_callout == NULL);
cc = &callout_cpu0;
cc->cc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
CIRCQ_INIT(&cc->cc_todo);
for (b = 0; b < BUCKETS; b++)
CIRCQ_INIT(&cc->cc_wheel[b]);
curcpu()->ci_data.cpu_callout = cc;
}
/*
* callout_init_cpu:
*
* Per-CPU initialization.
*/
CTASSERT(sizeof(callout_impl_t) <= sizeof(callout_t));
void
callout_init_cpu(struct cpu_info *ci)
{
struct callout_cpu *cc;
int b;
if ((cc = ci->ci_data.cpu_callout) == NULL) {
cc = kmem_zalloc(sizeof(*cc), KM_SLEEP);
cc->cc_lock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
CIRCQ_INIT(&cc->cc_todo);
for (b = 0; b < BUCKETS; b++)
CIRCQ_INIT(&cc->cc_wheel[b]);
} else {
/* Boot CPU, one time only. */
callout_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
callout_softclock, NULL);
if (callout_sih == NULL)
panic("callout_init_cpu (2)");
}
sleepq_init(&cc->cc_sleepq);
snprintf(cc->cc_name1, sizeof(cc->cc_name1), "late/%u",
cpu_index(ci));
evcnt_attach_dynamic(&cc->cc_ev_late, EVCNT_TYPE_MISC,
NULL, "callout", cc->cc_name1);
snprintf(cc->cc_name2, sizeof(cc->cc_name2), "wait/%u",
cpu_index(ci));
evcnt_attach_dynamic(&cc->cc_ev_block, EVCNT_TYPE_MISC,
NULL, "callout", cc->cc_name2);
cc->cc_cpu = ci;
ci->ci_data.cpu_callout = cc;
}
/*
* callout_init:
*
* Initialize a callout structure. This must be quick, so we fill
* only the minimum number of fields.
*/
void
callout_init(callout_t *cs, u_int flags)
{
callout_impl_t *c = (callout_impl_t *)cs;
struct callout_cpu *cc;
	KASSERT((flags & ~CALLOUT_FLAGMASK) == 0);

	SDT_PROBE2(sdt, kernel, callout, init, cs, flags);
cc = curcpu()->ci_data.cpu_callout;
c->c_func = NULL;
c->c_magic = CALLOUT_MAGIC;
if (__predict_true((flags & CALLOUT_MPSAFE) != 0 && cc != NULL)) {
c->c_flags = flags;
c->c_cpu = cc;
return;
}
c->c_flags = flags | CALLOUT_BOUND;
c->c_cpu = &callout_cpu0;
}
/*
* callout_destroy:
*
* Destroy a callout structure. The callout must be stopped.
*/
void
callout_destroy(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
	SDT_PROBE1(sdt, kernel, callout, destroy, cs);

	KASSERTMSG(c->c_magic == CALLOUT_MAGIC,
"callout %p: c_magic (%#x) != CALLOUT_MAGIC (%#x)",
c, c->c_magic, CALLOUT_MAGIC);
/*
* It's not necessary to lock in order to see the correct value
* of c->c_flags. If the callout could potentially have been
* running, the current thread should have stopped it.
*/
KASSERTMSG((c->c_flags & CALLOUT_PENDING) == 0,
"pending callout %p: c_func (%p) c_flags (%#x) destroyed from %p",
c, c->c_func, c->c_flags, __builtin_return_address(0));
KASSERTMSG(!callout_running_somewhere_else(c, c->c_cpu),
"running callout %p: c_func (%p) c_flags (%#x) destroyed from %p",
c, c->c_func, c->c_flags, __builtin_return_address(0));
c->c_magic = 0;
}
/*
* callout_schedule_locked:
*
* Schedule a callout to run. The function and argument must
* already be set in the callout structure. Must be called with
* callout_lock.
*/
static void
callout_schedule_locked(callout_impl_t *c, kmutex_t *lock, int to_ticks)
{
struct callout_cpu *cc, *occ;
int old_time;
SDT_PROBE5(sdt, kernel, callout, schedule,
c, c->c_func, c->c_arg, c->c_flags, to_ticks);
	KASSERT(to_ticks >= 0);
	KASSERT(c->c_func != NULL);
/* Initialize the time here, it won't change. */
occ = c->c_cpu;
c->c_flags &= ~(CALLOUT_FIRED | CALLOUT_INVOKING);
/*
* If this timeout is already scheduled and now is moved
* earlier, reschedule it now. Otherwise leave it in place
* and let it be rescheduled later.
*/
if ((c->c_flags & CALLOUT_PENDING) != 0) {
/* Leave on existing CPU. */
old_time = c->c_time;
c->c_time = to_ticks + occ->cc_ticks;
		if (c->c_time - old_time < 0) {
			CIRCQ_REMOVE(&c->c_list);
CIRCQ_INSERT(&c->c_list, &occ->cc_todo);
}
mutex_spin_exit(lock);
return;
}
cc = curcpu()->ci_data.cpu_callout;
if ((c->c_flags & CALLOUT_BOUND) != 0 || cc == occ ||
!mutex_tryenter(cc->cc_lock)) {
/* Leave on existing CPU. */
c->c_time = to_ticks + occ->cc_ticks;
c->c_flags |= CALLOUT_PENDING;
CIRCQ_INSERT(&c->c_list, &occ->cc_todo);
} else {
/* Move to this CPU. */
c->c_cpu = cc;
c->c_time = to_ticks + cc->cc_ticks;
c->c_flags |= CALLOUT_PENDING;
CIRCQ_INSERT(&c->c_list, &cc->cc_todo);
mutex_spin_exit(cc->cc_lock);
SDT_PROBE6(sdt, kernel, callout, migrate,
c, c->c_func, c->c_arg, c->c_flags,
occ->cc_cpu, cc->cc_cpu);
}
mutex_spin_exit(lock);
}
/*
* callout_reset:
*
* Reset a callout structure with a new function and argument, and
* schedule it to run.
*/
void
callout_reset(callout_t *cs, int to_ticks, void (*func)(void *), void *arg)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(func != NULL);

	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, setfunc, cs, func, arg, c->c_flags);
c->c_func = func;
c->c_arg = arg;
callout_schedule_locked(c, lock, to_ticks);
}
/*
* callout_schedule:
*
* Schedule a callout to run. The function and argument must
* already be set in the callout structure.
*/
void
callout_schedule(callout_t *cs, int to_ticks)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
callout_schedule_locked(c, lock, to_ticks);
}
/*
* callout_stop:
*
* Try to cancel a pending callout. It may be too late: the callout
* could be running on another CPU. If called from interrupt context,
* the callout could already be in progress at a lower priority.
*/
bool
callout_stop(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool expired;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);

	if ((c->c_flags & CALLOUT_PENDING) != 0)
		CIRCQ_REMOVE(&c->c_list);
expired = ((c->c_flags & CALLOUT_FIRED) != 0);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
SDT_PROBE5(sdt, kernel, callout, stop,
c, c->c_func, c->c_arg, c->c_flags, expired);
mutex_spin_exit(lock);
return expired;
}
/*
* callout_halt:
*
* Cancel a pending callout. If in-flight, block until it completes.
* May not be called from a hard interrupt handler. If the callout
* can take locks, the caller of callout_halt() must not hold any of
* those locks, otherwise the two could deadlock. If 'interlock' is
* non-NULL and we must wait for the callout to complete, it will be
* released and re-acquired before returning.
*/
bool
callout_halt(callout_t *cs, void *interlock)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(!cpu_intr_p());
	KASSERT(interlock == NULL || mutex_owned(interlock));
/* Fast path. */
	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, halt,
c, c->c_func, c->c_arg, c->c_flags);
	if ((c->c_flags & CALLOUT_PENDING) != 0)
		CIRCQ_REMOVE(&c->c_list);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
	if (__predict_false(callout_running_somewhere_else(c, c->c_cpu))) {
		callout_wait(c, interlock, lock);
return true;
}
SDT_PROBE5(sdt, kernel, callout, halt__done,
c, c->c_func, c->c_arg, c->c_flags, /*expired*/false);
mutex_spin_exit(lock);
return false;
}
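/*
 * Illustrative consumer pattern ('sc', 'sc_tick_ch', and foo_tick()
 * are hypothetical names, used here only as a sketch):
 *
 *	callout_init(&sc->sc_tick_ch, CALLOUT_MPSAFE);
 *	callout_setfunc(&sc->sc_tick_ch, foo_tick, sc);
 *	callout_schedule(&sc->sc_tick_ch, hz);	rearm roughly 1s from now
 *	...
 *	callout_halt(&sc->sc_tick_ch, NULL);	cancel; wait if running
 *	callout_destroy(&sc->sc_tick_ch);
 */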
/*
* callout_wait:
*
* Slow path for callout_halt(). Deliberately marked __noinline to
* prevent unneeded overhead in the caller.
*/
static void __noinline
callout_wait(callout_impl_t *c, void *interlock, kmutex_t *lock)
{
struct callout_cpu *cc;
struct lwp *l;
kmutex_t *relock;
int nlocks;
l = curlwp;
relock = NULL;
for (;;) {
/*
* At this point we know the callout is not pending, but it
* could be running on a CPU somewhere. That can be curcpu
* in a few cases:
*
* - curlwp is a higher priority soft interrupt
* - the callout blocked on a lock and is currently asleep
* - the callout itself has called callout_halt() (nice!)
*/
cc = c->c_cpu;
if (__predict_true(!callout_running_somewhere_else(c, cc)))
break;
/* It's running - need to wait for it to complete. */
if (interlock != NULL) {
/*
* Avoid potential scheduler lock order problems by
* dropping the interlock without the callout lock
* held; then retry.
*/
mutex_spin_exit(lock);
mutex_exit(interlock);
relock = interlock;
interlock = NULL;
} else {
/* XXX Better to do priority inheritance. */
KASSERT(l->l_wchan == NULL);
cc->cc_nwait++;
cc->cc_ev_block.ev_count++;
nlocks = sleepq_enter(&cc->cc_sleepq, l, cc->cc_lock);
sleepq_enqueue(&cc->cc_sleepq, cc, "callout",
&callout_syncobj, false);
sleepq_block(0, false, &callout_syncobj, nlocks);
}
/*
* Re-lock the callout and check the state of play again.
* It's a common design pattern for callouts to re-schedule
		 * themselves, so put a stop to it again if needed.
*/
lock = callout_lock(c);
if ((c->c_flags & CALLOUT_PENDING) != 0)
CIRCQ_REMOVE(&c->c_list);
c->c_flags &= ~(CALLOUT_PENDING|CALLOUT_FIRED);
}
SDT_PROBE5(sdt, kernel, callout, halt__done,
c, c->c_func, c->c_arg, c->c_flags, /*expired*/true);
mutex_spin_exit(lock);
if (__predict_false(relock != NULL))
mutex_enter(relock);
}
#ifdef notyet
/*
* callout_bind:
*
* Bind a callout so that it will only execute on one CPU.
* The callout must be stopped, and must be MPSAFE.
*
* XXX Disabled for now until it is decided how to handle
* offlined CPUs. We may want weak+strong binding.
*/
void
callout_bind(callout_t *cs, struct cpu_info *ci)
{
callout_impl_t *c = (callout_impl_t *)cs;
struct callout_cpu *cc;
kmutex_t *lock;
KASSERT((c->c_flags & CALLOUT_PENDING) == 0);
KASSERT(c->c_cpu->cc_active != c);
KASSERT(c->c_magic == CALLOUT_MAGIC);
KASSERT((c->c_flags & CALLOUT_MPSAFE) != 0);
lock = callout_lock(c);
cc = ci->ci_data.cpu_callout;
c->c_flags |= CALLOUT_BOUND;
if (c->c_cpu != cc) {
/*
* Assigning c_cpu effectively unlocks the callout
* structure, as we don't hold the new CPU's lock.
* Issue memory barrier to prevent accesses being
* reordered.
*/
membar_exit();
c->c_cpu = cc;
}
mutex_spin_exit(lock);
}
#endif
void
callout_setfunc(callout_t *cs, void (*func)(void *), void *arg)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
	KASSERT(c->c_magic == CALLOUT_MAGIC);
	KASSERT(func != NULL);

	lock = callout_lock(c);
	SDT_PROBE4(sdt, kernel, callout, setfunc, cs, func, arg, c->c_flags);
c->c_func = func;
c->c_arg = arg;
mutex_spin_exit(lock);
}
bool
callout_expired(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
KASSERT(c->c_magic == CALLOUT_MAGIC);
lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_FIRED) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_active(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
rv = ((c->c_flags & (CALLOUT_PENDING|CALLOUT_FIRED)) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_pending(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_PENDING) != 0);
mutex_spin_exit(lock);
return rv;
}
bool
callout_invoking(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
bool rv;
	KASSERT(c->c_magic == CALLOUT_MAGIC);

	lock = callout_lock(c);
rv = ((c->c_flags & CALLOUT_INVOKING) != 0);
mutex_spin_exit(lock);
return rv;
}
void
callout_ack(callout_t *cs)
{
callout_impl_t *c = (callout_impl_t *)cs;
kmutex_t *lock;
KASSERT(c->c_magic == CALLOUT_MAGIC);
lock = callout_lock(c);
c->c_flags &= ~CALLOUT_INVOKING;
mutex_spin_exit(lock);
}
/*
* callout_hardclock:
*
* Called from hardclock() once every tick. We schedule a soft
* interrupt if there is work to be done.
*/
void
callout_hardclock(void)
{
struct callout_cpu *cc;
int needsoftclock, ticks;
cc = curcpu()->ci_data.cpu_callout;
mutex_spin_enter(cc->cc_lock);
ticks = ++cc->cc_ticks;
MOVEBUCKET(cc, 0, ticks);
if (MASKWHEEL(0, ticks) == 0) {
MOVEBUCKET(cc, 1, ticks);
if (MASKWHEEL(1, ticks) == 0) {
MOVEBUCKET(cc, 2, ticks);
if (MASKWHEEL(2, ticks) == 0)
MOVEBUCKET(cc, 3, ticks);
}
}
needsoftclock = !CIRCQ_EMPTY(&cc->cc_todo);
mutex_spin_exit(cc->cc_lock);
if (needsoftclock)
softint_schedule(callout_sih);
}
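/*
 * For example, the cascade above only touches higher wheels when the
 * lower ones roll over. When cc_ticks increments to 0x000100:
 *
 *	MASKWHEEL(0, 0x000100) == 0	-> cascade wheel 1, bucket 1
 *	MASKWHEEL(1, 0x000100) == 1	-> stop; wheels 2 and 3 untouched
 */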
/*
* callout_softclock:
*
* Soft interrupt handler, scheduled above if there is work to
* be done. Callouts are made in soft interrupt context.
*/
static void
callout_softclock(void *v)
{
callout_impl_t *c;
struct callout_cpu *cc;
void (*func)(void *);
void *arg;
int mpsafe, count, ticks, delta;
u_int flags __unused;
lwp_t *l;
l = curlwp;
KASSERT(l->l_cpu == curcpu());
cc = l->l_cpu->ci_data.cpu_callout;
mutex_spin_enter(cc->cc_lock);
cc->cc_lwp = l;
while (!CIRCQ_EMPTY(&cc->cc_todo)) {
c = CIRCQ_FIRST(&cc->cc_todo);
KASSERT(c->c_magic == CALLOUT_MAGIC);
KASSERT(c->c_func != NULL);
KASSERT(c->c_cpu == cc);
KASSERT((c->c_flags & CALLOUT_PENDING) != 0);
KASSERT((c->c_flags & CALLOUT_FIRED) == 0);
CIRCQ_REMOVE(&c->c_list);
		/* If due, run it; otherwise insert it into the right bucket. */
ticks = cc->cc_ticks;
delta = (int)((unsigned)c->c_time - (unsigned)ticks);
if (delta > 0) {
CIRCQ_INSERT(&c->c_list, BUCKET(cc, delta, c->c_time));
continue;
}
if (delta < 0)
cc->cc_ev_late.ev_count++;
c->c_flags = (c->c_flags & ~CALLOUT_PENDING) |
(CALLOUT_FIRED | CALLOUT_INVOKING);
mpsafe = (c->c_flags & CALLOUT_MPSAFE);
func = c->c_func;
arg = c->c_arg;
cc->cc_active = c;
flags = c->c_flags;
mutex_spin_exit(cc->cc_lock);
KASSERT(func != NULL);
SDT_PROBE4(sdt, kernel, callout, entry, c, func, arg, flags);
if (__predict_false(!mpsafe)) {
KERNEL_LOCK(1, NULL);
(*func)(arg);
KERNEL_UNLOCK_ONE(NULL);
} else
(*func)(arg);
SDT_PROBE4(sdt, kernel, callout, return, c, func, arg, flags);
KASSERTMSG(l->l_blcnt == 0,
"callout %p func %p leaked %d biglocks",
c, func, l->l_blcnt);
mutex_spin_enter(cc->cc_lock);
/*
* We can't touch 'c' here because it might be
		 * freed already. If any LWPs are waiting for the
		 * callout to complete, awaken them.
*/
cc->cc_active = NULL;
if ((count = cc->cc_nwait) != 0) {
cc->cc_nwait = 0;
/* sleepq_wake() drops the lock. */
sleepq_wake(&cc->cc_sleepq, cc, count, cc->cc_lock);
mutex_spin_enter(cc->cc_lock);
}
}
cc->cc_lwp = NULL;
mutex_spin_exit(cc->cc_lock);
}
#endif /* !CRASH */
#ifdef DDB
static void
db_show_callout_bucket(struct callout_cpu *cc, struct callout_circq *kbucket,
struct callout_circq *bucket)
{
callout_impl_t *c, ci;
db_expr_t offset;
const char *name;
static char question[] = "?";
int b;
if (CIRCQ_LAST(bucket, kbucket))
return;
for (c = CIRCQ_FIRST(bucket); /*nothing*/; c = CIRCQ_NEXT(&c->c_list)) {
db_read_bytes((db_addr_t)c, sizeof(ci), (char *)&ci);
c = &ci;
db_find_sym_and_offset((db_addr_t)(intptr_t)c->c_func, &name,
&offset);
name = name ? name : question;
b = (bucket - cc->cc_wheel);
if (b < 0)
b = -WHEELSIZE;
db_printf("%9d %2d/%-4d %16lx %s\n",
c->c_time - cc->cc_ticks, b / WHEELSIZE, b,
(u_long)c->c_arg, name);
if (CIRCQ_LAST(&c->c_list, kbucket))
break;
}
}
void
db_show_callout(db_expr_t addr, bool haddr, db_expr_t count, const char *modif)
{
struct callout_cpu *cc;
struct cpu_info *ci;
int b;
#ifndef CRASH
db_printf("hardclock_ticks now: %d\n", getticks());
#endif
db_printf(" ticks wheel arg func\n");
/*
* Don't lock the callwheel; all the other CPUs are paused
* anyhow, and we might be called in a circumstance where
* some other CPU was paused while holding the lock.
*/
for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
db_read_bytes((db_addr_t)ci +
offsetof(struct cpu_info, ci_data.cpu_callout),
sizeof(cc), (char *)&cc);
db_read_bytes((db_addr_t)cc, sizeof(ccb), (char *)&ccb);
db_show_callout_bucket(&ccb, &cc->cc_todo, &ccb.cc_todo);
}
for (b = 0; b < BUCKETS; b++) {
for (ci = db_cpu_first(); ci != NULL; ci = db_cpu_next(ci)) {
db_read_bytes((db_addr_t)ci +
offsetof(struct cpu_info, ci_data.cpu_callout),
sizeof(cc), (char *)&cc);
db_read_bytes((db_addr_t)cc, sizeof(ccb), (char *)&ccb);
db_show_callout_bucket(&ccb, &cc->cc_wheel[b],
&ccb.cc_wheel[b]);
}
}
}
#endif /* DDB */
/* $NetBSD: sysv_shm_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $ */
/*-
* Copyright (c) 1999 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sysv_shm_50.c,v 1.5 2019/12/15 16:48:26 tsutsui Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/proc.h>
#include <sys/shm.h>
#ifndef SYSVSHM
#define SYSVSHM
#endif
#include <sys/syscallargs.h>
#include <compat/sys/shm.h>
int
compat_50_sys___shmctl13(struct lwp *l, const struct compat_50_sys___shmctl13_args *uap, register_t *retval)
{
/* {
syscallarg(int) shmid;
syscallarg(int) cmd;
syscallarg(struct shmid_ds13 *) buf;
} */
struct shmid_ds shmbuf;
struct shmid_ds13 oshmbuf;
int cmd, error;
cmd = SCARG(uap, cmd);
if (cmd == IPC_SET) {
error = copyin(SCARG(uap, buf), &oshmbuf, sizeof(oshmbuf));
if (error)
return (error);
__shmid_ds13_to_native(&oshmbuf, &shmbuf);
}
error = shmctl1(l, SCARG(uap, shmid), cmd,
(cmd == IPC_SET || cmd == IPC_STAT) ? &shmbuf : NULL);
	if (error == 0 && cmd == IPC_STAT) {
		__native_to_shmid_ds13(&shmbuf, &oshmbuf);
error = copyout(&oshmbuf, SCARG(uap, buf), sizeof(oshmbuf));
}
return (error);
}
/* $NetBSD: coda_vfsops.c,v 1.90 2022/03/28 12:37:46 riastradh Exp $ */
/*
*
* Coda: an Experimental Distributed File System
* Release 3.1
*
* Copyright (c) 1987-1998 Carnegie Mellon University
* All Rights Reserved
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation, and
* that credit is given to Carnegie Mellon University in all documents
* and publicity pertaining to direct or indirect use of this code or its
* derivatives.
*
* CODA IS AN EXPERIMENTAL SOFTWARE SYSTEM AND IS KNOWN TO HAVE BUGS,
* SOME OF WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON ALLOWS
* FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION. CARNEGIE MELLON
* DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES WHATSOEVER
* RESULTING DIRECTLY OR INDIRECTLY FROM THE USE OF THIS SOFTWARE OR OF
* ANY DERIVATIVE WORK.
*
* Carnegie Mellon encourages users of this software to return any
* improvements or extensions that they make, and to grant Carnegie
* Mellon the rights to redistribute these changes without encumbrance.
*
* @(#) cfs/coda_vfsops.c,v 1.1.1.1 1998/08/29 21:26:45 rvb Exp $
*/
/*
* Mach Operating System
* Copyright (c) 1989 Carnegie-Mellon University
* All rights reserved. The CMU software License Agreement specifies
* the terms and conditions for use and redistribution.
*/
/*
* This code was written for the Coda file system at Carnegie Mellon
 * University. Contributors include David Steere, James Kistler, and
* M. Satyanarayanan.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: coda_vfsops.c,v 1.90 2022/03/28 12:37:46 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/conf.h>
#include <sys/namei.h>
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/select.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <coda/coda.h>
#include <coda/cnode.h>
#include <coda/coda_vfsops.h>
#include <coda/coda_venus.h>
#include <coda/coda_subr.h>
#include <coda/coda_opstats.h>
/* for VN_RDEV */
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
MODULE(MODULE_CLASS_VFS, coda, "vcoda");
#define ENTRY if(coda_vfsop_print_entry) myprintf(("Entered %s\n",__func__))
extern struct vnode *coda_ctlvp;
extern struct coda_mntinfo coda_mnttbl[NVCODA]; /* indexed by minor device number */
/* structure to keep statistics of internally generated/satisfied calls */
struct coda_op_stats coda_vfsopstats[CODA_VFSOPS_SIZE];
#define MARK_ENTRY(op) (coda_vfsopstats[op].entries++)
#define MARK_INT_SAT(op) (coda_vfsopstats[op].sat_intrn++)
#define MARK_INT_FAIL(op) (coda_vfsopstats[op].unsat_intrn++)
#define MARK_INT_GEN(op) (coda_vfsopstats[op].gen_intrn++)
extern const struct cdevsw vcoda_cdevsw;
extern const struct vnodeopv_desc coda_vnodeop_opv_desc;
const struct vnodeopv_desc * const coda_vnodeopv_descs[] = {
&coda_vnodeop_opv_desc,
NULL,
};
struct vfsops coda_vfsops = {
.vfs_name = MOUNT_CODA,
.vfs_min_mount_data = 256,
/* This is the pathname, unlike every other fs */
.vfs_mount = coda_mount,
.vfs_start = coda_start,
.vfs_unmount = coda_unmount,
.vfs_root = coda_root,
.vfs_quotactl = (void *)eopnotsupp,
.vfs_statvfs = coda_nb_statvfs,
.vfs_sync = coda_sync,
.vfs_vget = coda_vget,
.vfs_loadvnode = coda_loadvnode,
.vfs_fhtovp = (void *)eopnotsupp,
.vfs_vptofh = (void *)eopnotsupp,
.vfs_init = coda_init,
.vfs_done = coda_done,
.vfs_mountroot = (void *)eopnotsupp,
.vfs_snapshot = (void *)eopnotsupp,
.vfs_extattrctl = vfs_stdextattrctl,
.vfs_suspendctl = genfs_suspendctl,
.vfs_renamelock_enter = genfs_renamelock_enter,
.vfs_renamelock_exit = genfs_renamelock_exit,
.vfs_fsync = (void *)eopnotsupp,
.vfs_opv_descs = coda_vnodeopv_descs
};
static int
coda_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return vfs_attach(&coda_vfsops);
case MODULE_CMD_FINI:
return vfs_detach(&coda_vfsops);
default:
return ENOTTY;
}
}
int
coda_vfsopstats_init(void)
{
int i;
for (i=0;i<CODA_VFSOPS_SIZE;i++) {
coda_vfsopstats[i].opcode = i;
coda_vfsopstats[i].entries = 0;
coda_vfsopstats[i].sat_intrn = 0;
coda_vfsopstats[i].unsat_intrn = 0;
coda_vfsopstats[i].gen_intrn = 0;
}
return 0;
}
/*
* cfs mount vfsop
* Set up mount info record and attach it to vfs struct.
*/
/*ARGSUSED*/
int
coda_mount(struct mount *vfsp, /* Allocated and initialized by mount(2) */
const char *path, /* path covered: ignored by the fs-layer */
void *data, /* Need to define a data type for this in netbsd? */
size_t *data_len)
{
struct lwp *l = curlwp;
struct vnode *dvp;
struct cnode *cp;
dev_t dev;
struct coda_mntinfo *mi;
struct vnode *rtvp;
const struct cdevsw *cdev;
CodaFid rootfid = INVAL_FID;
CodaFid ctlfid = CTL_FID;
int error;
if (data == NULL)
return EINVAL;
if (vfsp->mnt_flag & MNT_GETARGS)
return EINVAL;
ENTRY;
coda_vfsopstats_init();
coda_vnodeopstats_init();
MARK_ENTRY(CODA_MOUNT_STATS);
	if (CODA_MOUNTED(vfsp)) {
		MARK_INT_FAIL(CODA_MOUNT_STATS);
return(EBUSY);
}
/* Validate mount device. Similar to getmdev(). */
/*
 * XXX: coda passes the mount device as the entire mount args.
 * All other filesystems pass a structure containing a pointer.
* In order to get sys_mount() to do the copyin() we've set a
* fixed default size for the filename buffer.
*/
/* Ensure that namei() doesn't run off the filename buffer */
if (*data_len < 1 || *data_len > PATH_MAX ||
strnlen(data, *data_len) >= *data_len) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return EINVAL;
}
error = namei_simple_kernel((char *)data, NSM_FOLLOW_NOEMULROOT,
&dvp);
if (error) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return (error);
}
if (dvp->v_type != VCHR) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
vrele(dvp);
return(ENXIO);
}
dev = dvp->v_rdev;
vrele(dvp);
cdev = cdevsw_lookup(dev);
if (cdev == NULL) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
/*
* See if the device table matches our expectations.
*/
if (cdev != &vcoda_cdevsw)
{
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
if (minor(dev) >= NVCODA) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENXIO);
}
/*
* Initialize the mount record and link it to the vfs struct
*/
mi = &coda_mnttbl[minor(dev)];
if (!VC_OPEN(&mi->mi_vcomm)) {
MARK_INT_FAIL(CODA_MOUNT_STATS);
return(ENODEV);
}
/* No initialization (here) of mi_vcomm! */
vfsp->mnt_data = mi;
vfsp->mnt_stat.f_fsidx.__fsid_val[0] = 0;
vfsp->mnt_stat.f_fsidx.__fsid_val[1] = makefstype(MOUNT_CODA);
vfsp->mnt_stat.f_fsid = vfsp->mnt_stat.f_fsidx.__fsid_val[0];
vfsp->mnt_stat.f_namemax = CODA_MAXNAMLEN;
mi->mi_vfsp = vfsp;
/*
* Make a root vnode to placate the Vnode interface, but don't
* actually make the CODA_ROOT call to venus until the first call
* to coda_root in case a server is down while venus is starting.
*/
cp = make_coda_node(&rootfid, vfsp, VDIR);
rtvp = CTOV(cp);
rtvp->v_vflag |= VV_ROOT;
cp = make_coda_node(&ctlfid, vfsp, VCHR);
coda_ctlvp = CTOV(cp);
/* Add vfs and rootvp to chain of vfs hanging off mntinfo */
mi->mi_vfsp = vfsp;
mi->mi_rootvp = rtvp;
/* set filesystem block size */
vfsp->mnt_stat.f_bsize = 8192; /* XXX -JJK */
vfsp->mnt_stat.f_frsize = 8192; /* XXX -JJK */
/* error is currently guaranteed to be zero, but in case some
code changes... */
CODADEBUG(1,
myprintf(("coda_mount returned %d\n",error)););
if (error)
MARK_INT_FAIL(CODA_MOUNT_STATS);
else
MARK_INT_SAT(CODA_MOUNT_STATS);
return set_statvfs_info("/coda", UIO_SYSSPACE, "CODA", UIO_SYSSPACE,
vfsp->mnt_op->vfs_name, vfsp, l);
}
int
coda_start(struct mount *vfsp, int flags)
{
ENTRY;
vftomi(vfsp)->mi_started = 1;
return (0);
}
int
coda_unmount(struct mount *vfsp, int mntflags)
{
struct coda_mntinfo *mi = vftomi(vfsp);
int active, error = 0;
ENTRY;
MARK_ENTRY(CODA_UMOUNT_STATS);
if (!CODA_MOUNTED(vfsp)) {
MARK_INT_FAIL(CODA_UMOUNT_STATS);
return(EINVAL);
}
if (mi->mi_vfsp == vfsp) { /* We found the victim */
if (!IS_UNMOUNTING(VTOC(mi->mi_rootvp)))
return (EBUSY); /* Venus is still running */
#ifdef DEBUG
printf("coda_unmount: ROOT: vp %p, cp %p\n", mi->mi_rootvp, VTOC(mi->mi_rootvp));
#endif
mi->mi_started = 0;
vrele(mi->mi_rootvp);
vrele(coda_ctlvp);
active = coda_kill(vfsp, NOT_DOWNCALL);
mi->mi_rootvp->v_vflag &= ~VV_ROOT;
error = vflush(mi->mi_vfsp, NULLVP, FORCECLOSE);
printf("coda_unmount: active = %d, vflush active %d\n", active, error);
error = 0;
/* I'm going to take this out to allow lookups to go through. I'm
* not sure it's important anyway. -- DCS 2/2/94
*/
/* vfsp->VFS_DATA = NULL; */
/* No more vfsp's to hold onto */
mi->mi_vfsp = NULL;
mi->mi_rootvp = NULL;
if (error)
MARK_INT_FAIL(CODA_UMOUNT_STATS);
else
MARK_INT_SAT(CODA_UMOUNT_STATS);
return(error);
}
return (EINVAL);
}
/*
* find root of cfs
*/
int
coda_root(struct mount *vfsp, int lktype, struct vnode **vpp)
{
struct coda_mntinfo *mi = vftomi(vfsp);
int error;
struct lwp *l = curlwp; /* XXX - bnoble */
CodaFid VFid;
static const CodaFid invalfid = INVAL_FID;
ENTRY;
MARK_ENTRY(CODA_ROOT_STATS);
if (vfsp == mi->mi_vfsp) {
if (memcmp(&VTOC(mi->mi_rootvp)->c_fid, &invalfid, sizeof(CodaFid)))
{ /* Found valid root. */
*vpp = mi->mi_rootvp;
/* On Mach, this is vref. On NetBSD, VOP_LOCK */
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_SAT(CODA_ROOT_STATS);
return(0);
}
}
error = venus_root(vftomi(vfsp), l->l_cred, l->l_proc, &VFid);
if (!error) {
struct cnode *cp = VTOC(mi->mi_rootvp);
/*
* Save the new rootfid in the cnode, and rekey the cnode
* with the new fid key.
*/
error = vcache_rekey_enter(vfsp, mi->mi_rootvp,
&invalfid, sizeof(CodaFid), &VFid, sizeof(CodaFid));
if (error)
goto exit;
cp->c_fid = VFid;
vcache_rekey_exit(vfsp, mi->mi_rootvp,
&invalfid, sizeof(CodaFid), &cp->c_fid, sizeof(CodaFid));
*vpp = mi->mi_rootvp;
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_SAT(CODA_ROOT_STATS);
goto exit;
} else if (error == ENODEV || error == EINTR) {
/* Gross hack here! */
/*
* If Venus fails to respond to the CODA_ROOT call, coda_call returns
* ENODEV. Return the uninitialized root vnode to allow vfs
* operations such as unmount to continue. Without this hack,
* there is no way to do an unmount if Venus dies before a
* successful CODA_ROOT call is done. All vnode operations
* will fail.
*/
*vpp = mi->mi_rootvp;
vref(*vpp);
vn_lock(*vpp, lktype);
MARK_INT_FAIL(CODA_ROOT_STATS);
error = 0;
goto exit;
} else {
CODADEBUG( CODA_ROOT, myprintf(("error %d in CODA_ROOT\n", error)); );
MARK_INT_FAIL(CODA_ROOT_STATS);
goto exit;
}
exit:
return(error);
}
/*
* Get file system statistics.
*/
int
coda_nb_statvfs(struct mount *vfsp, struct statvfs *sbp)
{
struct lwp *l = curlwp;
struct coda_statfs fsstat;
int error;
ENTRY;
MARK_ENTRY(CODA_STATFS_STATS);
if (!CODA_MOUNTED(vfsp)) {
/* MARK_INT_FAIL(CODA_STATFS_STATS); */
return(EINVAL);
}
/* XXX - what to do about f_flags, others? --bnoble */
	/* Below is what AFS does:
#define NB_SFS_SIZ 0x895440
*/
/* Note: Normal fs's have a bsize of 0x400 == 1024 */
error = venus_statfs(vftomi(vfsp), l->l_cred, l, &fsstat);
if (!error) {
sbp->f_bsize = 8192; /* XXX */
sbp->f_frsize = 8192; /* XXX */
sbp->f_iosize = 8192; /* XXX */
sbp->f_blocks = fsstat.f_blocks;
sbp->f_bfree = fsstat.f_bfree;
sbp->f_bavail = fsstat.f_bavail;
sbp->f_bresvd = 0;
sbp->f_files = fsstat.f_files;
sbp->f_ffree = fsstat.f_ffree;
sbp->f_favail = fsstat.f_ffree;
sbp->f_fresvd = 0;
copy_statvfs_info(sbp, vfsp);
}
MARK_INT_SAT(CODA_STATFS_STATS);
return(error);
}
/*
* Flush any pending I/O.
*/
int
coda_sync(struct mount *vfsp, int waitfor,
kauth_cred_t cred)
{
ENTRY;
MARK_ENTRY(CODA_SYNC_STATS);
MARK_INT_SAT(CODA_SYNC_STATS);
return(0);
}
int
coda_vget(struct mount *vfsp, ino_t ino, int lktype,
struct vnode **vpp)
{
ENTRY;
return (EOPNOTSUPP);
}
int
coda_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
CodaFid fid;
struct cnode *cp;
extern int (**coda_vnodeop_p)(void *);
KASSERT(key_len == sizeof(CodaFid));
memcpy(&fid, key, key_len);
cp = kmem_zalloc(sizeof(*cp), KM_SLEEP);
mutex_init(&cp->c_lock, MUTEX_DEFAULT, IPL_NONE);
cp->c_fid = fid;
cp->c_vnode = vp;
vp->v_op = coda_vnodeop_p;
vp->v_tag = VT_CODA;
vp->v_type = VNON;
vp->v_data = cp;
*new_key = &cp->c_fid;
return 0;
}
/*
* fhtovp is now what vget used to be in 4.3-derived systems. For
* some silly reason, vget is now keyed by a 32 bit ino_t, rather than
* a type-specific fid.
*/
int
coda_fhtovp(struct mount *vfsp, struct fid *fhp, struct mbuf *nam,
struct vnode **vpp, int *exflagsp,
kauth_cred_t *creadanonp, int lktype)
{
struct cfid *cfid = (struct cfid *)fhp;
struct cnode *cp = 0;
int error;
struct lwp *l = curlwp; /* XXX -mach */
CodaFid VFid;
int vtype;
ENTRY;
MARK_ENTRY(CODA_VGET_STATS);
/* Check for vget of control object. */
if (IS_CTL_FID(&cfid->cfid_fid)) {
*vpp = coda_ctlvp;
vref(coda_ctlvp);
MARK_INT_SAT(CODA_VGET_STATS);
return(0);
}
error = venus_fhtovp(vftomi(vfsp), &cfid->cfid_fid, l->l_cred, l->l_proc, &VFid, &vtype);
if (error) {
CODADEBUG(CODA_VGET, myprintf(("vget error %d\n",error));)
*vpp = (struct vnode *)0;
} else {
CODADEBUG(CODA_VGET,
myprintf(("vget: %s type %d result %d\n",
coda_f2s(&VFid), vtype, error)); )
cp = make_coda_node(&VFid, vfsp, vtype);
*vpp = CTOV(cp);
}
return(error);
}
int
coda_vptofh(struct vnode *vnp, struct fid *fidp)
{
ENTRY;
return (EOPNOTSUPP);
}
void
coda_init(void)
{
ENTRY;
}
void
coda_done(void)
{
ENTRY;
}
SYSCTL_SETUP(sysctl_vfs_coda_setup, "sysctl vfs.coda subtree setup")
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "coda",
SYSCTL_DESCR("code vfs options"),
NULL, 0, NULL, 0,
CTL_VFS, 18, CTL_EOL);
/*
* XXX the "18" above could be dynamic, thereby eliminating
* one more instance of the "number to vfs" mapping problem,
* but "18" is the order as taken from sys/mount.h
*/
/*
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "clusterread",
SYSCTL_DESCR( anyone? ),
NULL, 0, &doclusterread, 0,
CTL_VFS, 18, FFS_CLUSTERREAD, CTL_EOL);
*/
}
/*
* To allow for greater ease of use, some vnodes may be orphaned when
* Venus dies. Certain operations should still be allowed to go
* through, but without propagating orphan-ness. So this function will
* get a new vnode for the file from the current run of Venus.
*/
int
getNewVnode(struct vnode **vpp)
{
struct cfid cfid;
struct coda_mntinfo *mi = vftomi((*vpp)->v_mount);
ENTRY;
cfid.cfid_len = (short)sizeof(CodaFid);
cfid.cfid_fid = VTOC(*vpp)->c_fid; /* Structure assignment. */
/* XXX ? */
/* We're guessing that if set, the 1st element on the list is a
* valid vnode to use. If not, return ENODEV as venus is dead.
*/
if (mi->mi_vfsp == NULL)
return ENODEV;
return coda_fhtovp(mi->mi_vfsp, (struct fid*)&cfid, NULL, vpp,
NULL, NULL, LK_EXCLUSIVE);
}
/* Get the mount structure corresponding to a given device.
* Return NULL if no device is found or the device is not mounted.
*/
struct mount *devtomp(dev_t dev)
{
struct mount *mp;
struct vnode *vp;
if (spec_node_lookup_by_dev(VBLK, dev, VDEAD_NOWAIT, &vp) == 0) {
mp = spec_node_getmountedfs(vp);
vrele(vp);
} else {
mp = NULL;
}
return mp;
}
/* $NetBSD: entpool.c,v 1.1 2020/04/30 03:28:19 riastradh Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Entropy pool (`reseedable pseudorandom number generator') based on a
* sponge duplex, following the design described and analyzed in
*
* Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van
* Assche, `Sponge-Based Pseudo-Random Number Generators', in
* Stefan Mangard and François-Xavier Standaert, eds.,
* Cryptographic Hardware and Embedded Systems—CHES 2010, Springer
* LNCS 6225, pp. 33–47.
* https://link.springer.com/chapter/10.1007/978-3-642-15031-9_3
* https://keccak.team/files/SpongePRNG.pdf
*
* Guido Bertoni, Joan Daemen, Michaël Peeters, and Gilles Van
* Assche, `Duplexing the Sponge: Single-Pass Authenticated
* Encryption and Other Applications', in Ali Miri and Serge
* Vaudenay, eds., Selected Areas in Cryptography—SAC 2011,
* Springer LNCS 7118, pp. 320–337.
* https://link.springer.com/chapter/10.1007/978-3-642-28496-0_19
* https://keccak.team/files/SpongeDuplex.pdf
*
* We make the following tweaks that don't affect security:
*
 * - Samples are length-delimited with a 7-bit variable-length code.
* The encoding is still injective, so the security theorems
* continue to apply.
*
* - Output is not buffered -- callers should draw 32 bytes and
* expand with a stream cipher. In effect, every output draws
* the full rate, and we just discard whatever the caller didn't
* ask for; the impact is only on performance, not security.
*
* On top of the underlying sponge state, an entropy pool maintains an
* integer i in [0, RATE-1] indicating where to write the next byte in
* the input buffer. Zeroing an entropy pool initializes it.
*/
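/*
 * For example, the 7-bit variable-length code used for sample lengths
 * emits little-endian groups of 7 bits, with the top bit set on every
 * byte but the last:
 *
 *	len = 5		-> 0x05
 *	len = 200	-> 0xc8 0x01	(low 7 bits 0x48 + continuation
 *					 bit, then 200 >> 7 == 1)
 */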
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: entpool.c,v 1.1 2020/04/30 03:28:19 riastradh Exp $");
#endif
#include "entpool.h"
#include ENTPOOL_HEADER
#if defined(_KERNEL) || defined(_STANDALONE)
#include <sys/types.h>
#include <lib/libkern/libkern.h>
#define ASSERT KASSERT
#else
#include <sys/cdefs.h>
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#define ASSERT assert
#define CTASSERT __CTASSERT
#endif
#define secret /* must not use in variable-time operations; should zero */
#define arraycount(A) (sizeof(A)/sizeof((A)[0]))
#define MIN(X,Y) ((X) < (Y) ? (X) : (Y))
#define RATE ENTPOOL_RATE
/*
* stir(P)
*
* Internal subroutine to apply the sponge permutation to the
* state in P. Resets P->i to 0 to indicate that the input buffer
* is empty.
*/
static void
stir(struct entpool *P)
{
size_t i;
/*
* Switch to the permutation's byte order, if necessary, apply
 * permutation, and then switch back. This way we can feed data in
* and out byte by byte, but get the same answers out of test
* vectors.
*/
for (i = 0; i < arraycount(P->s.w); i++)
P->s.w[i] = ENTPOOL_WTOH(P->s.w[i]);
ENTPOOL_PERMUTE(P->s.w);
for (i = 0; i < arraycount(P->s.w); i++)
P->s.w[i] = ENTPOOL_HTOW(P->s.w[i]);
/* Reset the input buffer. */
P->i = 0;
}
/*
* entpool_enter(P, buf, len)
*
* Enter len bytes from buf into the entropy pool P, stirring as
* needed. Corresponds to P.feed in the paper.
*/
void
entpool_enter(struct entpool *P, const void *buf, size_t len)
{
const uint8_t *p = buf;
size_t n = len, n1 = n;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* Encode the length, stirring as needed. */
while (n1) {
if (P->i == RATE-1)
			stir(P);
		ASSERT(P->i < RATE-1);
P->s.u8[P->i++] ^= (n1 >= 0x80 ? 0x80 : 0) | (n1 & 0x7f);
n1 >>= 7;
}
/* Enter the sample, stirring as needed. */
while (n --> 0) {
if (P->i == RATE-1)
			stir(P);
		ASSERT(P->i < RATE-1);
P->s.u8[P->i++] ^= *p++;
}
/* If we filled the input buffer exactly, stir once more. */
if (P->i == RATE-1)
		stir(P);
	ASSERT(P->i < RATE-1);
}
/*
* entpool_enter_nostir(P, buf, len)
*
* Enter as many bytes as possible, up to len, from buf into the
* entropy pool P. Roughly corresponds to P.feed in the paper,
* but we stop if we would have run the permutation.
*
 * Return true if the sample was consumed in its entirety, or false
 * if the sample was truncated, in which case the caller should
 * arrange to call entpool_stir when it is next convenient to do so.
*
* This function is cheap -- it only xors the input into the
* state, and never calls the underlying permutation, but it may
* truncate samples.
*/
bool
entpool_enter_nostir(struct entpool *P, const void *buf, size_t len)
{
const uint8_t *p = buf;
size_t n0, n;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If the input buffer is full, fail. */
if (P->i == RATE-1)
return false;
ASSERT(P->i < RATE-1);
/*
* Truncate the sample and enter it with 1-byte length encoding
* -- don't bother with variable-length encoding, not worth the
* trouble.
*/
n = n0 = MIN(127, MIN(len, RATE-1 - P->i - 1));
P->s.u8[P->i++] ^= n;
while (n --> 0)
P->s.u8[P->i++] ^= *p++;
/* Can't guarantee anything better than 0 <= i <= RATE-1. */
ASSERT(P->i <= RATE-1);
/* Return true if all done, false if truncated and in need of stir. */
return (n0 == len);
}
/*
* entpool_stir(P)
*
* Stir the entropy pool after entpool_enter_nostir fails. If it
 * has already been stirred, this has no effect.
*/
void
entpool_stir(struct entpool *P)
{
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If the input buffer is full, stir. */
if (P->i == RATE-1)
stir(P);
ASSERT(P->i < RATE-1);
}
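/*
 * Illustrative usage sketch only ('sample' and 'pending_stir' are
 * hypothetical caller-side names): a sampler on a hot path can defer
 * the permutation and catch up later:
 *
 *	if (!entpool_enter_nostir(P, &sample, sizeof sample))
 *		pending_stir = true;		sample was truncated
 *	...
 *	if (pending_stir) {
 *		entpool_stir(P);		no-op if already stirred
 *		pending_stir = false;
 *	}
 */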
/*
* entpool_extract(P, buf, len)
*
* Extract len bytes from the entropy pool P into buf.
* Corresponds to iterating P.fetch/P.forget in the paper.
* (Feeding the output back in -- as P.forget does -- is the same
* as zeroing what we just read out.)
*/
void
entpool_extract(struct entpool *P, secret void *buf, size_t len)
{
uint8_t *p = buf;
size_t n = len;
/* Sanity-check P->i. */
ASSERT(P->i <= RATE-1);
/* If input buffer is not empty, stir. */
	if (P->i != 0)
		stir(P);
ASSERT(P->i == 0);
/*
* Copy out and zero (RATE-1)-sized chunks at a time, stirring
* with a bit set to distinguish this from inputs.
*/
while (n >= RATE-1) {
memcpy(p, P->s.u8, RATE-1);
memset(P->s.u8, 0, RATE-1);
P->s.u8[RATE-1] ^= 0x80;
stir(P);
p += RATE-1;
n -= RATE-1;
}
/*
* If there's anything left, copy out a partial rate's worth
* and zero the entire rate's worth, stirring with a bit set to
* distinguish this from inputs.
*/
if (n) {
ASSERT(n < RATE-1);
memcpy(p, P->s.u8, n); /* Copy part of it. */
memset(P->s.u8, 0, RATE-1); /* Zero all of it. */
P->s.u8[RATE-1] ^= 0x80;
stir(P);
}
}
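/*
 * Illustrative usage sketch only ('sample' and 'seed' are hypothetical
 * buffers): per the note at the top of this file, callers draw 32
 * bytes and expand them with a stream cipher rather than reading bulk
 * output:
 *
 *	uint8_t seed[32];
 *
 *	entpool_enter(P, sample, sizeof sample);
 *	entpool_extract(P, seed, sizeof seed);
 *	... key a stream cipher with 'seed', then zero it ...
 */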
/*
* Known-answer tests
*/
#if ENTPOOL_SMALL
#define KATLEN 15
/* Gimli */
static const uint8_t known_answers[][KATLEN] = {
[0] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[1] = {
0x74,0x15,0x16,0x49,0x31,0x07,0x77,0xa1,
0x3b,0x4d,0x78,0xc6,0x5d,0xef,0x87,
},
[2] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[3] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[4] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[5] = {
0xa9,0x3c,0x3c,0xac,0x5f,0x6d,0x80,0xdc,
0x33,0x0c,0xb2,0xe3,0xdd,0x55,0x31,
},
[6] = {
0x2e,0x69,0x1a,0x2a,0x2d,0x09,0xd4,0x5e,
0x49,0xcc,0x8c,0xb2,0x0b,0xcc,0x42,
},
[7] = {
0xae,0xfd,0x7d,0xc4,0x3b,0xce,0x09,0x25,
0xbf,0x60,0x21,0x6e,0x3c,0x3a,0x84,
},
[8] = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,
},
[9] = {
0x69,0xb8,0x49,0x0d,0x39,0xfb,0x42,0x61,
0xf7,0x66,0xdf,0x04,0xb6,0xed,0x11,
},
[10] = {
0x2e,0x69,0x1a,0x2a,0x2d,0x09,0xd4,0x5e,
0x49,0xcc,0x8c,0xb2,0x0b,0xcc,0x42,
},
[11] = {
0x6f,0xfd,0xd2,0x29,0x78,0x46,0xc0,0x7d,
0xc7,0xf2,0x0a,0x2b,0x72,0xd6,0xc6,
},
[12] = {
0x86,0xf0,0xc1,0xf9,0x95,0x0f,0xc9,0x12,
0xde,0x38,0x39,0x10,0x1f,0x8c,0xc4,
},
};
#else /* !ENTPOOL_SMALL */
#define KATLEN 16
/* Keccak-p[1600, 24] */
static const uint8_t known_answers[][KATLEN] = {
[0] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[1] = {
0x57,0x49,0x6e,0x28,0x7f,0xaa,0xee,0x6c,
0xa8,0xb0,0xf5,0x0b,0x87,0xae,0xd6,0xd6,
},
[2] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[3] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[4] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[5] = {
0x95,0x23,0x77,0xe4,0x84,0xeb,0xaa,0x2e,
0x6a,0x99,0xc2,0x52,0x06,0x6d,0xdf,0xea,
},
[6] = {
0x8c,0xdd,0x1b,0xaf,0x0e,0xf6,0xe9,0x1d,
0x51,0x33,0x68,0x38,0x8d,0xad,0x55,0x84,
},
[7] = {
0x51,0x72,0x0f,0x59,0x54,0xe1,0xaf,0xa8,
0x16,0x67,0xfa,0x3f,0x8a,0x19,0x52,0x50,
},
[8] = {
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
},
[9] = {
0x3b,0x20,0xf0,0xe9,0xce,0x94,0x48,0x07,
0x97,0xb6,0x16,0xb5,0xb5,0x05,0x1a,0xce,
},
[10] = {
0x8c,0xdd,0x1b,0xaf,0x0e,0xf6,0xe9,0x1d,
0x51,0x33,0x68,0x38,0x8d,0xad,0x55,0x84,
},
[11] = {
0xf6,0xc1,0x14,0xbb,0x13,0x0a,0xaf,0xed,
0xca,0x0b,0x35,0x2c,0xf1,0x2b,0x1a,0x85,
},
[12] = {
0xf9,0x4b,0x05,0xd1,0x8b,0xcd,0xb3,0xd0,
0x77,0x27,0xfe,0x46,0xf9,0x33,0xb2,0xa2,
},
};
#endif
#define KAT_BEGIN(P, n) memset(P, 0, sizeof(*(P)))
#define KAT_ERROR() return -1
#define KAT_END(P, n) do \
{ \
uint8_t KAT_ACTUAL[KATLEN]; \
entpool_extract(P, KAT_ACTUAL, KATLEN); \
if (memcmp(KAT_ACTUAL, known_answers[n], KATLEN)) \
return -1; \
} while (0)
int
entpool_selftest(void)
{
struct entpool pool, *P = &pool;
uint8_t sample[1] = {0xff};
uint8_t scratch[RATE];
const uint8_t zero[RATE] = {0};
/* Test entpool_enter with empty buffer. */
KAT_BEGIN(P, 0);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 0);
/* Test entpool_enter with partial buffer. */
KAT_BEGIN(P, 1);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
entpool_enter(P, zero, RATE-3);
#else
entpool_enter(P, zero, RATE-4);
#endif
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 1);
/* Test entpool_enter with full buffer. */
KAT_BEGIN(P, 2);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 2);
/* Test entpool_enter with full buffer after stir. */
KAT_BEGIN(P, 3);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_stir(P);
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 3);
/* Test entpool_enter_nostir with empty buffer. */
KAT_BEGIN(P, 4);
entpool_stir(P); /* noop */
if (!entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P); /* noop */
KAT_END(P, 4);
/* Test entpool_enter_nostir with partial buffer. */
KAT_BEGIN(P, 5);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
entpool_enter(P, zero, RATE-3);
#else
entpool_enter(P, zero, RATE-4);
#endif
entpool_stir(P); /* noop */
if (entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P);
KAT_END(P, 5);
/* Test entpool_enter_nostir with full buffer. */
KAT_BEGIN(P, 6);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
if (entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P);
KAT_END(P, 6);
/* Test entpool_enter_nostir with full buffer after stir. */
KAT_BEGIN(P, 7);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
entpool_stir(P);
if (!entpool_enter_nostir(P, sample, 1))
KAT_ERROR();
entpool_stir(P); /* noop */
KAT_END(P, 7);
/* Test entpool_extract with empty input buffer. */
KAT_BEGIN(P, 8);
entpool_stir(P); /* noop */
KAT_END(P, 8);
/* Test entpool_extract with nonempty input buffer. */
KAT_BEGIN(P, 9);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 9);
/* Test entpool_extract with full input buffer. */
KAT_BEGIN(P, 10);
entpool_stir(P); /* noop */
#if ENTPOOL_SMALL
if (!entpool_enter_nostir(P, zero, RATE-2))
KAT_ERROR();
#else
CTASSERT(127 <= RATE-2);
if (!entpool_enter_nostir(P, zero, 127))
KAT_ERROR();
if (!entpool_enter_nostir(P, zero, RATE-2 - 127 - 1))
KAT_ERROR();
#endif
KAT_END(P, 10);
/* Test entpool_extract with iterated output. */
KAT_BEGIN(P, 11);
entpool_stir(P); /* noop */
entpool_extract(P, scratch, RATE-1 + 1);
entpool_stir(P); /* noop */
KAT_END(P, 11);
/* Test extract, enter, extract. */
KAT_BEGIN(P, 12);
entpool_stir(P); /* noop */
entpool_extract(P, scratch, 1);
entpool_stir(P); /* noop */
entpool_enter(P, sample, 1);
entpool_stir(P); /* noop */
KAT_END(P, 12);
return 0;
}
#if ENTPOOL_TEST
int
main(void)
{
return entpool_selftest();
}
#endif
/*
* Known-answer test generation
*
* This generates the known-answer test vectors from explicitly
* specified duplex inputs that correspond to what entpool_enter
* &c. induce, to confirm the encoding of inputs works as
* intended.
*/
#if ENTPOOL_GENKAT
#include <stdio.h>
struct event {
enum { IN, OUT, STOP } t;
uint8_t b[RATE-1];
};
/* Cases correspond to entpool_selftest above. */
static const struct event *const cases[] = {
[0] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[1] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-3, [RATE-2] = 1}},
#else
{IN, {0x80|((RATE-4)&0x7f), (RATE-4)>>7, [RATE-2] = 1}},
#endif
{IN, {0xff}},
{STOP, {0}},
},
[2] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[3] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[4] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[5] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-3, [RATE-2] = 0 /* truncated length */}},
#else
{IN, {0x80|((RATE-4)&0x7f), (RATE-4)>>7,
[RATE-2] = 0 /* truncated length */}},
#endif
{STOP, {0}},
},
[6] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{STOP, {0}},
},
[7] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{IN, {1, 0xff}},
{STOP, {0}},
},
[8] = (const struct event[]) {
{STOP, {0}},
},
[9] = (const struct event[]) {
{IN, {1, 0xff}},
{STOP, {0}},
},
[10] = (const struct event[]) {
#if ENTPOOL_SMALL
{IN, {RATE-2}},
#else
{IN, {127, [128] = RATE-2 - 127 - 1}},
#endif
{STOP, {0}},
},
[11] = (const struct event[]) {
{OUT, {0}},
{OUT, {0}},
{STOP, {0}},
},
[12] = (const struct event[]) {
{OUT, {0}},
{IN, {1, 0xff}},
{STOP, {0}},
},
};
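/*
* How to read the vectors above (an explanatory note; the encoding is
* inferred from the vectors and from compute() below): the leading
* byte(s) of each IN event appear to be the length prefix that
* entpool_enter writes -- a single byte for lengths below 128 (and
* always when ENTPOOL_SMALL), or the two-byte form 0x80|(len & 0x7f),
* len >> 7 for longer inputs -- followed by the input bytes.  An OUT
* event stands for an extraction, which compute() models by zeroing
* the rate bytes and toggling the 0x80 padding bit at index RATE-1.
*/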
static void
compute(uint8_t output[KATLEN], const struct event *events)
{
union {
uint8_t b[ENTPOOL_SIZE];
ENTPOOL_WORD w[ENTPOOL_SIZE/sizeof(ENTPOOL_WORD)];
} u;
unsigned i, j, k;
memset(&u.b, 0, sizeof u.b);
for (i = 0;; i++) {
if (events[i].t == STOP)
break;
for (j = 0; j < sizeof(events[i].b); j++)
u.b[j] ^= events[i].b[j];
if (events[i].t == OUT) {
memset(u.b, 0, RATE-1);
u.b[RATE-1] ^= 0x80;
}
for (k = 0; k < arraycount(u.w); k++)
u.w[k] = ENTPOOL_WTOH(u.w[k]);
ENTPOOL_PERMUTE(u.w);
for (k = 0; k < arraycount(u.w); k++)
u.w[k] = ENTPOOL_HTOW(u.w[k]);
}
for (j = 0; j < KATLEN; j++)
output[j] = u.b[j];
}
int
main(void)
{
uint8_t output[KATLEN];
unsigned i, j;
printf("static const uint8_t known_answers[][KATLEN] = {\n");
for (i = 0; i < arraycount(cases); i++) {
printf("\t[%u] = {\n", i);
compute(output, cases[i]);
for (j = 0; j < KATLEN; j++) {
if (j % 8 == 0)
printf("\t\t");
printf("0x%02hhx,", output[j]);
if (j % 8 == 7)
printf("\n");
}
if ((KATLEN % 8) != 0)
printf("\n");
printf("\t},\n");
}
printf("};\n");
fflush(stdout);
return ferror(stdout);
}
#endif
/* $NetBSD: ufs_rename.c,v 1.14 2021/10/20 03:08:19 thorpej Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* UFS Rename
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ufs_rename.c,v 1.14 2021/10/20 03:08:19 thorpej Exp $");
#include <sys/param.h>
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <sys/vnode.h>
#include <sys/vnode_if.h>
#include <sys/wapbl.h>
#include <miscfs/genfs/genfs.h>
#include <ufs/ufs/dir.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_bswap.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufs_wapbl.h>
#include <ufs/ufs/ufsmount.h>
/*
* Forward declarations
*/
static int ufs_sane_rename(struct vnode *, struct componentname *,
struct vnode *, struct componentname *,
kauth_cred_t, bool);
static bool ufs_rename_ulr_overlap_p(const struct ufs_lookup_results *,
const struct ufs_lookup_results *);
static int ufs_rename_recalculate_fulr(struct vnode *,
struct ufs_lookup_results *, const struct ufs_lookup_results *,
const struct componentname *);
static int ufs_direct_namlen(const struct direct *, const struct vnode *);
static int ufs_read_dotdot(struct vnode *, kauth_cred_t, ino_t *);
static int ufs_dirbuf_dotdot_namlen(const struct dirtemplate *,
const struct vnode *);
static const struct genfs_rename_ops ufs_genfs_rename_ops;
/*
* ufs_sane_rename: The hairiest vop, with the saner API.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tcnp (to component name),
* . cred (credentials structure), and
* . posixly_correct (flag for behaviour if target & source link same file).
*
* fdvp and tdvp may be the same, and must be referenced and unlocked.
*/
static int
ufs_sane_rename(
struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct)
{
struct ufs_lookup_results fulr, tulr;
return genfs_sane_rename(&ufs_genfs_rename_ops,
fdvp, fcnp, &fulr, tdvp, tcnp, &tulr,
cred, posixly_correct);
}
/*
* ufs_rename: The hairiest vop, with the insanest API. Defer to
* genfs_insane_rename immediately.
*/
int
ufs_rename(void *v)
{
return genfs_insane_rename(v, &ufs_sane_rename);
}
/*
* ufs_gro_directory_empty_p: Return true if the directory vp is
* empty. dvp is its parent.
*
* vp and dvp must be locked and referenced.
*/
bool
ufs_gro_directory_empty_p(struct mount *mp, kauth_cred_t cred,
struct vnode *vp, struct vnode *dvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != dvp);
KASSERT(vp->v_mount == mp);
KASSERT(dvp->v_mount == mp);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
return ufs_dirempty(VTOI(vp), VTOI(dvp)->i_number, cred);
}
/*
* ufs_gro_rename_check_possible: Check whether a rename is possible
* independent of credentials.
*/
int
ufs_gro_rename_check_possible(struct mount *mp,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
return genfs_ufslike_rename_check_possible(
VTOI(fdvp)->i_flags, VTOI(fvp)->i_flags,
VTOI(tdvp)->i_flags, (tvp? VTOI(tvp)->i_flags : 0),
(tvp != NULL),
IMMUTABLE, APPEND);
}
/*
* ufs_gro_rename_check_permitted: Check whether a rename is permitted
* given our credentials.
*/
int
ufs_gro_rename_check_permitted(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
return genfs_ufslike_rename_check_permitted(cred,
fdvp, VTOI(fdvp)->i_mode, VTOI(fdvp)->i_uid,
fvp, VTOI(fvp)->i_uid,
tdvp, VTOI(tdvp)->i_mode, VTOI(tdvp)->i_uid,
tvp, (tvp? VTOI(tvp)->i_uid : 0));
}
/*
* ufs_gro_remove_check_possible: Check whether a remove is possible
* independent of credentials.
*/
int
ufs_gro_remove_check_possible(struct mount *mp,
struct vnode *dvp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
return genfs_ufslike_remove_check_possible(
VTOI(dvp)->i_flags, VTOI(vp)->i_flags,
IMMUTABLE, APPEND);
}
/*
* ufs_gro_remove_check_permitted: Check whether a remove is permitted
* given our credentials.
*/
int
ufs_gro_remove_check_permitted(struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
return genfs_ufslike_remove_check_permitted(cred,
dvp, VTOI(dvp)->i_mode, VTOI(dvp)->i_uid, vp, VTOI(vp)->i_uid);
}
/*
* A virgin directory (no blushing please).
*
* XXX Copypasta from ufs_vnops.c. Kill!
*/
static const struct dirtemplate mastertemplate = {
0, 12, DT_DIR, 1, ".",
0, UFS_DIRBLKSIZ - 12, DT_DIR, 2, ".."
};
/*
* ufs_gro_rename: Actually perform the rename operation.
*/
int
ufs_gro_rename(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde, struct vnode *fvp,
struct vnode *tdvp, struct componentname *tcnp,
void *tde, struct vnode *tvp, nlink_t *tvp_nlinkp)
{
struct ufs_lookup_results *fulr = fde;
struct ufs_lookup_results *tulr = tde;
bool directory_p, reparent_p;
struct direct *newdir;
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fulr != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_mount == mp);
KASSERT(fvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT((tvp == NULL) || (tvp->v_mount == mp));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* We shall need to temporarily bump the link count, so make
* sure there is room to do so.
*/
if ((nlink_t)VTOI(fvp)->i_nlink >= LINK_MAX)
return EMLINK;
directory_p = (fvp->v_type == VDIR);
KASSERT(directory_p == ((VTOI(fvp)->i_mode & IFMT) == IFDIR));
KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR)));
KASSERT((tvp == NULL) || (directory_p ==
    ((VTOI(tvp)->i_mode & IFMT) == IFDIR)));
reparent_p = (fdvp != tdvp);
KASSERT(reparent_p == (VTOI(fdvp)->i_number != VTOI(tdvp)->i_number));
/*
* Commence hacking of the data on disk.
*/
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto ihateyou;
/*
* 1) Bump link count while we're moving stuff
* around. If we crash somewhere before
* completing our work, the link count
* may be wrong, but correctable.
*/
KASSERT((nlink_t)VTOI(fvp)->i_nlink < LINK_MAX);
VTOI(fvp)->i_nlink++;
DIP_ASSIGN(VTOI(fvp), nlink, VTOI(fvp)->i_nlink);
VTOI(fvp)->i_flag |= IN_CHANGE;
error = UFS_UPDATE(fvp, NULL, NULL, UPDATE_DIROP);
if (error)
goto whymustithurtsomuch;
/*
* 2) If target doesn't exist, link the target
* to the source and unlink the source.
* Otherwise, rewrite the target directory
* entry to reference the source inode and
* expunge the original entry's existence.
*/
if (tvp == NULL) {
/*
* Account for ".." in new directory.
* When source and destination have the same
* parent we don't fool with the link count.
*/
if (directory_p && reparent_p) {
if ((nlink_t)VTOI(tdvp)->i_nlink >= LINK_MAX) {
error = EMLINK;
goto whymustithurtsomuch;
}
KASSERT((nlink_t)VTOI(tdvp)->i_nlink < LINK_MAX);
VTOI(tdvp)->i_nlink++;
DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
error = UFS_UPDATE(tdvp, NULL, NULL, UPDATE_DIROP);
if (error) {
/*
* Link count update didn't take --
* back out the in-memory link count.
*/
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink,
VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
goto whymustithurtsomuch;
}
}
newdir = pool_cache_get(ufs_direct_cache, PR_WAITOK);
ufs_makedirentry(VTOI(fvp), tcnp, newdir);
error = ufs_direnter(tdvp, tulr, NULL, newdir, tcnp, NULL);
pool_cache_put(ufs_direct_cache, newdir);
if (error) {
if (directory_p && reparent_p) {
/*
* Directory update didn't take, but
* the link count update did -- back
* out the in-memory link count and the
* on-disk link count.
*/
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink,
VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
(void)UFS_UPDATE(tdvp, NULL, NULL,
UPDATE_WAIT | UPDATE_DIROP);
}
goto whymustithurtsomuch;
}
} else {
if (directory_p)
/* XXX WTF? Why purge here? Why not purge others? */
cache_purge(tdvp);
/*
* Make the target directory's entry for tcnp point at
* the source node.
*
* XXX ufs_dirrewrite decrements tvp's link count, but
* doesn't touch the link count of the new inode. Go
* figure.
*/
error = ufs_dirrewrite(VTOI(tdvp), tulr->ulr_offset,
VTOI(tvp), VTOI(fvp)->i_number, IFTODT(VTOI(fvp)->i_mode),
((directory_p && reparent_p) ? reparent_p : directory_p),
IN_CHANGE | IN_UPDATE);
if (error)
goto whymustithurtsomuch;
/*
* If the source and target are directories, and the
* target is in the same directory as the source,
* decrement the link count of the common parent
* directory, since we are removing the target from
* that directory.
*/
if (directory_p && !reparent_p) {
KASSERT(fdvp == tdvp);
/* XXX check, don't kassert */
KASSERT(0 < VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_nlink--;
DIP_ASSIGN(VTOI(tdvp), nlink, VTOI(tdvp)->i_nlink);
VTOI(tdvp)->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(tdvp, NULL, NULL, 0);
}
if (directory_p) {
/*
* XXX I don't understand the following comment
* from ufs_rename -- in particular, the part
* about `there may be other hard links'.
*
* Truncate inode. The only stuff left in the directory
* is "." and "..". The "." reference is inconsequential
* since we are quashing it. We have removed the "."
* reference and the reference in the parent directory,
* but there may be other hard links.
*
* XXX The ufs_dirempty call earlier does
* not guarantee anything about nlink.
*/
if (VTOI(tvp)->i_nlink != 1)
ufs_dirbad(VTOI(tvp), (doff_t)0,
    "hard-linked directory");
VTOI(tvp)->i_nlink = 0;
DIP_ASSIGN(VTOI(tvp), nlink, 0);
(void) UFS_TRUNCATE(tvp, (off_t)0, IO_SYNC, cred);
}
}
/*
* If the source is a directory with a new parent, the link
* count of the old parent directory must be decremented and
* ".." set to point to the new parent.
*
* XXX ufs_dirrewrite updates the link count of fdvp, but not
* the link count of fvp or the link count of tdvp. Go figure.
*/
if (directory_p && reparent_p) {
error = ufs_dirrewrite(VTOI(fvp), mastertemplate.dot_reclen,
VTOI(fdvp), VTOI(tdvp)->i_number, DT_DIR, 0, IN_CHANGE);
#if 0 /* XXX This branch was not in ufs_rename! */
if (error)
goto whymustithurtsomuch;
#endif
/* XXX WTF? Why purge here? Why not purge others? */
cache_purge(fdvp);
}
/*
* 3) Unlink the source.
*/
/*
* ufs_direnter may compact the directory in the process of
* inserting a new entry. That may invalidate fulr, which we
* need in order to remove the old entry. In that case, we
* need to recalculate what fulr should be.
*/
if (!reparent_p && (tvp == NULL) &&
ufs_rename_ulr_overlap_p(fulr, tulr)) {
error = ufs_rename_recalculate_fulr(fdvp, fulr, tulr, fcnp);
#if 0 /* XXX */
if (error) /* XXX Try to back out changes? */
goto whymustithurtsomuch;
#endif
}
/*
* XXX 0 means !isrmdir. But can't this be an rmdir?
* XXX Well, turns out that argument to ufs_dirremove is ignored...
* XXX And it turns out ufs_dirremove updates the link count of fvp.
* XXX But it doesn't update the link count of fdvp. Go figure.
* XXX fdvp's link count is updated in ufs_dirrewrite instead.
* XXX Actually, sometimes it doesn't update fvp's link count.
* XXX I hate the world.
*/
error = ufs_dirremove(fdvp, fulr, VTOI(fvp), fcnp->cn_flags, 0);
if (error)
#if 0 /* XXX */
goto whymustithurtsomuch;
#endif
goto arghmybrainhurts;
if (tvp != NULL) {
*tvp_nlinkp = VTOI(tvp)->i_nlink;
}
#if 0 /* XXX */
genfs_rename_cache_purge(fdvp, fvp, tdvp, tvp);
#endif
goto arghmybrainhurts;
whymustithurtsomuch:
KASSERT(0 < VTOI(fvp)->i_nlink);
VTOI(fvp)->i_nlink--;
DIP_ASSIGN(VTOI(fvp), nlink, VTOI(fvp)->i_nlink);
VTOI(fvp)->i_flag |= IN_CHANGE;
UFS_WAPBL_UPDATE(fvp, NULL, NULL, 0);
arghmybrainhurts:
UFS_WAPBL_END(mp);
ihateyou:
return error;
}
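/*
* A note on the exit labels above: `whymustithurtsomuch' backs out
* the temporary link-count bump on fvp from step (1) and falls
* through to `arghmybrainhurts', which ends the WAPBL transaction
* (the success path jumps there too); `ihateyou' just returns, for
* the case where the transaction never began.
*/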
/*
* ufs_rename_ulr_overlap_p: True iff tulr overlaps with fulr so that
* entering a directory entry at tulr may move fulr.
*/
static bool
ufs_rename_ulr_overlap_p(const struct ufs_lookup_results *fulr,
const struct ufs_lookup_results *tulr)
{
doff_t from_prev_start, from_prev_end, to_start, to_end;
KASSERT(fulr != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
/*
* fulr is from a DELETE lookup, so fulr->ulr_count is the size
* of the preceding entry (d_reclen).
*/
from_prev_end = fulr->ulr_offset;
KASSERT(fulr->ulr_count <= from_prev_end);
from_prev_start = (from_prev_end - fulr->ulr_count);
/*
* tulr is from a RENAME lookup, so tulr->ulr_count is the size
* of the free space for an entry that we are about to fill.
*/
to_start = tulr->ulr_offset;
KASSERT(tulr->ulr_count < (UFS_MAXDIRSIZE - to_start));
to_end = (to_start + tulr->ulr_count);
return
(((to_start <= from_prev_start) && (from_prev_start < to_end)) ||
    ((to_start <= from_prev_end) && (from_prev_end < to_end)));
}
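/*
* Illustrative example with hypothetical numbers: if the entry
* preceding the one to be removed occupies bytes [48, 64) -- that is,
* fulr->ulr_offset = 64 and fulr->ulr_count = 16 -- and the free slot
* about to be filled is [32, 72) -- tulr->ulr_offset = 32,
* tulr->ulr_count = 40 -- then both 48 and 64 fall inside [32, 72),
* so compaction at tulr may move the entry described by fulr and we
* return true.
*/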
/*
* ufs_rename_recalculate_fulr: If we have just entered a directory into
* dvp at tulr, and we were about to remove one at fulr for an entry
* named fcnp, fulr may be invalid. So, if necessary, recalculate it.
*/
static int
ufs_rename_recalculate_fulr(struct vnode *dvp,
struct ufs_lookup_results *fulr, const struct ufs_lookup_results *tulr,
const struct componentname *fcnp)
{
struct mount *mp;
struct ufsmount *ump;
int needswap;
/* XXX int is a silly type for this; blame ufsmount::um_dirblksiz. */
int dirblksiz;
doff_t search_start, search_end;
doff_t offset; /* Offset of entry we're examining. */
struct buf *bp; /* I/O block we're examining. */
char *dirbuf; /* Pointer into directory at search_start. */
struct direct *ep; /* Pointer to the entry we're examining. */
/* XXX direct::d_reclen is 16-bit;
* ufs_lookup_results::ulr_reclen is 32-bit. Blah. */
uint32_t reclen; /* Length of the entry we're examining. */
uint32_t prev_reclen; /* Length of the preceding entry. */
int error;
KASSERT(dvp != NULL);
KASSERT(dvp->v_mount != NULL);
KASSERT(VTOI(dvp) != NULL);
KASSERT(fulr != NULL);
KASSERT(tulr != NULL);
KASSERT(fulr != tulr);
KASSERT(ufs_rename_ulr_overlap_p(fulr, tulr));
mp = dvp->v_mount;
ump = VFSTOUFS(mp);
KASSERT(ump != NULL);
KASSERT(ump == VTOI(dvp)->i_ump);
needswap = UFS_MPNEEDSWAP(ump);
dirblksiz = ump->um_dirblksiz;
KASSERT(0 < dirblksiz);
KASSERT((dirblksiz & (dirblksiz - 1)) == 0);
/* A directory block may not span across multiple I/O blocks. */
KASSERT(dirblksiz <= mp->mnt_stat.f_iosize);
/* Find the bounds of the search. */
search_start = tulr->ulr_offset;
KASSERT(fulr->ulr_reclen < (UFS_MAXDIRSIZE - fulr->ulr_offset));
search_end = (fulr->ulr_offset + fulr->ulr_reclen);
/* Compaction must happen only within a directory block. (*) */
KASSERT(search_start <= search_end);
KASSERT((search_end - (search_start &~ (dirblksiz - 1))) <= dirblksiz);
dirbuf = NULL;
bp = NULL;
error = ufs_blkatoff(dvp, (off_t)search_start, &dirbuf, &bp, false);
if (error)
return error;
KASSERT(dirbuf != NULL);
KASSERT(bp != NULL);
/*
* Guarantee we sha'n't go past the end of the buffer we got.
* dirbuf is bp->b_data + (search_start & (iosize - 1)), and
* the valid range is [bp->b_data, bp->b_data + bp->b_bcount).
*/
KASSERT((search_end - search_start) <=
(bp->b_bcount - (search_start & (mp->mnt_stat.f_iosize - 1))));
prev_reclen = fulr->ulr_count;
offset = search_start;
/*
* Search from search_start to search_end for the entry matching
* fcnp, which must be there because we found it before and it
* should only at most have moved earlier.
*/
for (;;) {
KASSERT(search_start <= offset);
KASSERT(offset < search_end);
/*
* Examine the directory entry at offset.
*/
ep = (struct direct *)(dirbuf + (offset - search_start));
reclen = ufs_rw16(ep->d_reclen, needswap);
if (ep->d_ino == 0)
goto next; /* Entry is unused. */
if (ufs_rw32(ep->d_ino, needswap) == UFS_WINO)
goto next; /* Entry is whiteout. */
if (fcnp->cn_namelen != ufs_direct_namlen(ep, dvp))
goto next; /* Wrong name length. */
if (memcmp(ep->d_name, fcnp->cn_nameptr, fcnp->cn_namelen))
goto next; /* Wrong name. */
/* Got it! */
break;
next:
if (! ((reclen < search_end) &&
(offset < (search_end - reclen)))) {
brelse(bp, 0);
return EIO; /* XXX Panic? What? */
}
/* We may not move past the search end. */
KASSERT(reclen < search_end);
KASSERT(offset < (search_end - reclen));
/*
* We may not move across a directory block boundary;
* see (*) above.
*/
KASSERT((offset &~ (dirblksiz - 1)) ==
((offset + reclen) &~ (dirblksiz - 1)));
prev_reclen = reclen;
offset += reclen;
}
/*
* Found the entry. Record where.
*/
fulr->ulr_offset = offset;
fulr->ulr_reclen = reclen;
/*
* Record the preceding record length, but not if we're at the
* start of a directory block.
*/
fulr->ulr_count = ((offset & (dirblksiz - 1))? prev_reclen : 0);
brelse(bp, 0);
return 0;
}
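/*
* Note that the recomputed ulr_count mirrors what a DELETE lookup
* would have produced: the d_reclen of the preceding entry, or 0 when
* the entry we found sits at the start of a directory block and thus
* has no preceding entry within that block.
*/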
/*
* ufs_direct_namlen: Return the namlen of the directory entry ep from
* the directory vp.
*/
static int /* XXX int? uint8_t? */
ufs_direct_namlen(const struct direct *ep, const struct vnode *vp)
{
bool swap;
KASSERT(ep != NULL);
KASSERT(vp != NULL);
KASSERT(VTOI(vp) != NULL);
KASSERT(VTOI(vp)->i_ump != NULL);
#if (BYTE_ORDER == LITTLE_ENDIAN)
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) == 0);
#else
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) != 0);
#endif
return ((FSFMT(vp) && swap)? ep->d_type : ep->d_namlen);
}
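/*
* Background (a presumed rationale, not stated in this file): the old
* pre-d_type directory format stored the name length as a 16-bit
* field.  On a little-endian filesystem its significant byte occupies
* the position that the new format assigns to d_type, which appears
* to be why, for old-format filesystems in that byte order (the
* FSFMT && swap test above), the length is read from d_type rather
* than d_namlen.
*/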
/*
* ufs_gro_remove: Rename an object over another link to itself,
* effectively removing just the original link.
*/
int
ufs_gro_remove(struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp,
nlink_t *tvp_nlinkp)
{
struct ufs_lookup_results *ulr = de;
int error;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(ulr != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(cnp->cn_nameiop == DELETE);
error = UFS_WAPBL_BEGIN(mp);
if (error)
goto out;
/* XXX ufs_dirremove decrements vp's link count for us. */
error = ufs_dirremove(dvp, ulr, VTOI(vp), cnp->cn_flags, 0);
UFS_WAPBL_END(mp);
*tvp_nlinkp = VTOI(vp)->i_nlink;
out:
return error;
}
/*
* ufs_gro_lookup: Look up and save the lookup results.
*/
int
ufs_gro_lookup(struct mount *mp, struct vnode *dvp,
struct componentname *cnp, void *de_ret, struct vnode **vp_ret)
{
struct ufs_lookup_results *ulr_ret = de_ret;
struct vnode *vp = NULL;
int error;
(void)mp;
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(ulr_ret != NULL);
KASSERT(vp_ret != NULL);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
/* Kludge cargo-culted from dholland's ufs_rename. */
cnp->cn_flags &=~ MODMASK;
cnp->cn_flags |= (LOCKPARENT | LOCKLEAF);
error = relookup(dvp, &vp, cnp, 0 /* dummy */);
if ((error == 0) && (vp == NULL)) {
error = ENOENT;
goto out;
} else if (error) {
return error;
}
/*
* Thanks to VFS insanity, relookup locks vp, which screws us
* in various ways.
*/
KASSERT(vp != NULL);
VOP_UNLOCK(vp);
out: *ulr_ret = VTOI(dvp)->i_crap;
*vp_ret = vp;
return error;
}
/*
* ufs_rmdired_p: Check whether the directory vp has been rmdired.
*
* vp must be locked and referenced.
*/
static bool
ufs_rmdired_p(struct vnode *vp)
{
KASSERT(vp != NULL);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(vp->v_type == VDIR);
/* XXX Is this correct? */
return (VTOI(vp)->i_size == 0);
}
/*
* ufs_read_dotdot: Store in *ino_ret the inode number of the parent
* of the directory vp.
*/
static int
ufs_read_dotdot(struct vnode *vp, kauth_cred_t cred, ino_t *ino_ret)
{
struct dirtemplate dirbuf;
int error;
KASSERT(vp != NULL);
KASSERT(ino_ret != NULL);
KASSERT(vp->v_type == VDIR);
error = ufs_bufio(UIO_READ, vp, &dirbuf, sizeof dirbuf, (off_t)0,
IO_NODELOCKED, cred, NULL, NULL);
if (error)
return error;
if (ufs_dirbuf_dotdot_namlen(&dirbuf, vp) != 2 ||
dirbuf.dotdot_name[0] != '.' ||
dirbuf.dotdot_name[1] != '.')
/* XXX Panic? Print warning? */
return ENOTDIR;
*ino_ret = ufs_rw32(dirbuf.dotdot_ino,
UFS_MPNEEDSWAP(VTOI(vp)->i_ump));
return 0;
}
/*
* ufs_dirbuf_dotdot_namlen: Return the namlen of the directory buffer
* dirbuf that came from the directory vp. Swap byte order if
* necessary.
*/
static int /* XXX int? uint8_t? */
ufs_dirbuf_dotdot_namlen(const struct dirtemplate *dirbuf,
const struct vnode *vp)
{
bool swap;
KASSERT(dirbuf != NULL);
KASSERT(vp != NULL);
KASSERT(VTOI(vp) != NULL);
KASSERT(VTOI(vp)->i_ump != NULL);
#if (BYTE_ORDER == LITTLE_ENDIAN)
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) == 0);
#else
swap = (UFS_MPNEEDSWAP(VTOI(vp)->i_ump) != 0);
#endif
return ((FSFMT(vp) && swap)?
dirbuf->dotdot_type : dirbuf->dotdot_namlen);
}
/*
* ufs_gro_genealogy: Analyze the genealogy of the source and target
* directories.
*/
int
ufs_gro_genealogy(struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct vnode *tdvp,
struct vnode **intermediate_node_ret)
{
struct vnode *vp, *dvp;
ino_t dotdot_ino = 0; /* XXX: gcc */
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != tdvp);
KASSERT(intermediate_node_ret != NULL);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
/*
* We need to provisionally lock tdvp to keep rmdir from
* deleting it -- or any ancestor -- at an inopportune moment.
*/
error = ufs_gro_lock_directory(mp, tdvp);
if (error)
return error;
vp = tdvp;
vref(vp);
for (;;) {
KASSERT(vp != NULL);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERT(vp->v_mount == mp);
KASSERT(vp->v_type == VDIR);
KASSERT(!ufs_rmdired_p(vp));
/* Did we hit the root without finding fdvp? */
if (VTOI(vp)->i_number == UFS_ROOTINO) {
vput(vp);
*intermediate_node_ret = NULL;
return 0;
}
error = ufs_read_dotdot(vp, cred, &dotdot_ino);
if (error) {
vput(vp);
return error;
}
/* Did we find that fdvp is an ancestor of tdvp? */
if (VTOI(fdvp)->i_number == dotdot_ino) {
/* Unlock vp, but keep it referenced. */
VOP_UNLOCK(vp);
*intermediate_node_ret = vp;
return 0;
}
/* Neither -- keep ascending the family tree. */
error = vcache_get(mp, &dotdot_ino, sizeof(dotdot_ino), &dvp);
vput(vp);
if (error)
return error;
error = vn_lock(dvp, LK_EXCLUSIVE);
if (error) {
vrele(dvp);
return error;
}
KASSERT(dvp != NULL);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
vp = dvp;
if (vp->v_type != VDIR) {
/*
* XXX Panic? Print a warning? Can this
* happen if we lose the race I suspect to
* exist above, and the `..' inode number has
* been recycled?
*/
vput(vp);
return ENOTDIR;
}
if (ufs_rmdired_p(vp)) {
vput(vp);
return ENOENT;
}
}
}
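/*
* In other words, the loop walks `..' links upward from tdvp.
* Reaching the root means fdvp is not an ancestor of tdvp, so NULL is
* returned; otherwise the vnode returned in *intermediate_node_ret is
* the ancestor of tdvp (possibly tdvp itself) whose parent is fdvp,
* which genfs_rename_enter_separate compares against fvp to reject
* renaming a directory into its own subtree.
*/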
/*
* ufs_gro_lock_directory: Lock the directory vp, but fail if it has
* been rmdir'd.
*/
int
ufs_gro_lock_directory(struct mount *mp, struct vnode *vp)
{
(void)mp;
KASSERT(mp != NULL);
KASSERT(vp != NULL);
KASSERT(vp->v_mount == mp);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (ufs_rmdired_p(vp)) {
VOP_UNLOCK(vp);
return ENOENT;
}
return 0;
}
static const struct genfs_rename_ops ufs_genfs_rename_ops = {
.gro_directory_empty_p = ufs_gro_directory_empty_p,
.gro_rename_check_possible = ufs_gro_rename_check_possible,
.gro_rename_check_permitted = ufs_gro_rename_check_permitted,
.gro_remove_check_possible = ufs_gro_remove_check_possible,
.gro_remove_check_permitted = ufs_gro_remove_check_permitted,
.gro_rename = ufs_gro_rename,
.gro_remove = ufs_gro_remove,
.gro_lookup = ufs_gro_lookup,
.gro_genealogy = ufs_gro_genealogy,
.gro_lock_directory = ufs_gro_lock_directory,
};
/* $NetBSD: nd.c,v 1.5 2022/11/19 08:00:51 yamt Exp $ */
/*
* Copyright (c) 2020 The NetBSD Foundation, Inc.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Roy Marples.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: nd.c,v 1.5 2022/11/19 08:00:51 yamt Exp $");
#include <sys/callout.h>
#include <sys/mbuf.h>
#include <sys/socketvar.h> /* for softnet_lock */
#include <net/if_llatbl.h>
#include <net/nd.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
static struct nd_domain *nd_domains[AF_MAX];
static int nd_gctimer = (60 * 60 * 24); /* 1 day: garbage collection timer */
static void nd_set_timertick(struct llentry *, time_t);
static struct nd_domain *nd_find_domain(int);
static void
nd_timer(void *arg)
{
struct llentry *ln = arg;
struct nd_domain *nd;
struct ifnet *ifp = NULL;
struct psref psref;
struct mbuf *m = NULL;
bool send_ns = false;
int16_t missed = ND_LLINFO_NOSTATE;
union l3addr taddr, *daddrp = NULL;
SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE();
LLE_WLOCK(ln);
if (!(ln->la_flags & LLE_LINKED))
goto out;
if (ln->ln_ntick > 0) {
nd_set_timer(ln, ND_TIMER_TICK);
goto out;
}
nd = nd_find_domain(ln->lle_tbl->llt_af);
ifp = ln->lle_tbl->llt_ifp;
KASSERT(ifp != NULL);
if_acquire(ifp, &psref);
memcpy(&taddr, &ln->r_l3addr, sizeof(taddr));
switch (ln->ln_state) {
case ND_LLINFO_WAITDELETE:
LLE_REMREF(ln);
nd->nd_free(ln, 0);
ln = NULL;
break;
case ND_LLINFO_INCOMPLETE:
send_ns = true;
if (ln->ln_asked++ < nd->nd_mmaxtries)
break;
if (ln->ln_hold) {
struct mbuf *m0, *mnxt;
/*
* Assuming every packet in ln_hold
* has the same IP header.
*/
m = ln->ln_hold;
for (m0 = m->m_nextpkt; m0 != NULL; m0 = mnxt) {
mnxt = m0->m_nextpkt;
m0->m_nextpkt = NULL;
m_freem(m0);
}
m->m_nextpkt = NULL;
ln->ln_hold = NULL;
}
missed = ND_LLINFO_INCOMPLETE;
ln->ln_state = ND_LLINFO_WAITDELETE;
break;
case ND_LLINFO_REACHABLE:
if (!ND_IS_LLINFO_PERMANENT(ln)) {
ln->ln_state = ND_LLINFO_STALE;
nd_set_timer(ln, ND_TIMER_GC);
}
break;
case ND_LLINFO_PURGE: /* FALLTHROUGH */
case ND_LLINFO_STALE:
if (!ND_IS_LLINFO_PERMANENT(ln)) {
LLE_REMREF(ln);
nd->nd_free(ln, 1);
ln = NULL;
}
break;
case ND_LLINFO_DELAY:
if (nd->nd_nud_enabled(ifp)) {
ln->ln_asked = 1;
ln->ln_state = ND_LLINFO_PROBE;
send_ns = true;
daddrp = &taddr;
} else {
ln->ln_state = ND_LLINFO_STALE;
nd_set_timer(ln, ND_TIMER_GC);
}
break;
case ND_LLINFO_PROBE:
send_ns = true;
if (ln->ln_asked++ < nd->nd_umaxtries) {
daddrp = &taddr;
} else {
ln->ln_state = ND_LLINFO_UNREACHABLE;
ln->ln_asked = 1;
missed = ND_LLINFO_PROBE;
/* nd_missed() consumers can use missed to know if
* they need to send ICMP UNREACHABLE or not. */
}
break;
case ND_LLINFO_UNREACHABLE:
/*
* RFC 7048 Section 3 says in the UNREACHABLE state
* packets continue to be sent to the link-layer address and
* then back off exponentially.
* We adjust this slightly and move to the INCOMPLETE state
* after nd_mmaxtries probes and then start backing off.
*
* This results in simpler code whilst providing a more robust
* model which doubles the time to failure over what we did
* before. We don't want to be back to the old ARP model where
* no unreachability errors are returned because very
* few applications would look at the unreachability hints provided,
* such as ND_LLINFO_UNREACHABLE or RTM_MISS.
*/
send_ns = true;
if (ln->ln_asked++ < nd->nd_mmaxtries)
break;
missed = ND_LLINFO_UNREACHABLE;
ln->ln_state = ND_LLINFO_WAITDELETE;
ln->la_flags &= ~LLE_VALID;
break;
}
if (send_ns) {
uint8_t lladdr[255], *lladdrp;
union l3addr src, *psrc;
if (ln->ln_state == ND_LLINFO_WAITDELETE)
nd_set_timer(ln, ND_TIMER_RETRANS_BACKOFF);
else
nd_set_timer(ln, ND_TIMER_RETRANS);
if (ln->ln_state > ND_LLINFO_INCOMPLETE &&
ln->la_flags & LLE_VALID)
{
KASSERT(sizeof(lladdr) >= ifp->if_addrlen);
memcpy(lladdr, &ln->ll_addr, ifp->if_addrlen);
lladdrp = lladdr;
} else
lladdrp = NULL;
psrc = nd->nd_holdsrc(ln, &src);
LLE_FREE_LOCKED(ln);
ln = NULL;
nd->nd_output(ifp, daddrp, &taddr, lladdrp, psrc);
}
out:
if (ln != NULL)
LLE_FREE_LOCKED(ln);
SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE();
if (missed != ND_LLINFO_NOSTATE)
nd->nd_missed(ifp, &taddr, missed, m);
if (ifp != NULL)
if_release(ifp, &psref);
}
static void
nd_set_timertick(struct llentry *ln, time_t xtick)
{
CTASSERT(sizeof(time_t) > sizeof(int));
KASSERT(xtick >= 0);
/*
* We have to take care of a reference leak which occurs if
* callout_reset overwrites a pending callout schedule.  Unfortunately
* there is no way to be told about the overwrite, so we have to
* detect it ourselves using callout_stop.  We call callout_pending
* first to exclude the case where the callout has never been
* scheduled at all.
*/
if (callout_pending(&ln->la_timer)) {
bool expired;
expired = callout_stop(&ln->la_timer);
if (!expired)
LLE_REMREF(ln);
}
ln->ln_expire = time_uptime + xtick / hz;
LLE_ADDREF(ln);
if (xtick > INT_MAX) {
ln->ln_ntick = xtick - INT_MAX;
xtick = INT_MAX;
} else {
ln->ln_ntick = 0;
}
callout_reset(&ln->ln_timer_ch, xtick, nd_timer, ln);
}
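/*
* Illustrative example with hypothetical numbers: for
* xtick = (time_t)INT_MAX + 500, the callout is armed for INT_MAX
* ticks and ln_ntick is left at 500; when nd_timer eventually fires
* it sees ln_ntick > 0 and re-arms itself via
* nd_set_timer(ln, ND_TIMER_TICK) for the remaining 500 ticks.
*/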
void
nd_set_timer(struct llentry *ln, int type)
{
time_t xtick;
struct ifnet *ifp;
struct nd_domain *nd;
LLE_WLOCK_ASSERT(ln);
ifp = ln->lle_tbl->llt_ifp;
nd = nd_find_domain(ln->lle_tbl->llt_af);
switch (type) {
case ND_TIMER_IMMEDIATE:
xtick = 0;
break;
case ND_TIMER_TICK:
xtick = ln->ln_ntick;
break;
case ND_TIMER_RETRANS:
xtick = nd->nd_retrans(ifp) * hz / 1000;
break;
case ND_TIMER_RETRANS_BACKOFF:
{
unsigned int retrans = nd->nd_retrans(ifp);
unsigned int attempts = ln->ln_asked - nd->nd_mmaxtries;
xtick = retrans;
while (attempts-- != 0) {
xtick *= nd->nd_retransmultiple;
if (xtick > nd->nd_maxretrans || xtick < retrans) {
xtick = nd->nd_maxretrans;
break;
}
}
xtick = xtick * hz / 1000;
break;
}
case ND_TIMER_REACHABLE:
xtick = nd->nd_reachable(ifp) * hz / 1000;
break;
case ND_TIMER_EXPIRE:
if (ln->ln_expire > time_uptime)
xtick = (ln->ln_expire - time_uptime) * hz;
else
xtick = nd_gctimer * hz;
break;
case ND_TIMER_DELAY:
xtick = nd->nd_delay * hz;
break;
case ND_TIMER_GC:
xtick = nd_gctimer * hz;
break;
default:
panic("%s: invalid timer type\n", __func__);
}
nd_set_timertick(ln, xtick);
}
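/*
* Illustrative example of ND_TIMER_RETRANS_BACKOFF with hypothetical
* numbers: for nd_retrans(ifp) = 1000 ms, nd_retransmultiple = 2 and
* ln_asked - nd_mmaxtries = 3, xtick grows 1000 -> 2000 -> 4000 ->
* 8000 ms, is clamped to nd_maxretrans if it exceeds it (or wraps
* below retrans on overflow), and is finally converted to ticks by
* multiplying by hz and dividing by 1000.
*/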
int
nd_resolve(struct llentry *ln, const struct rtentry *rt, struct mbuf *m,
uint8_t *lldst, size_t dstsize)
{
struct ifnet *ifp;
struct nd_domain *nd;
int error;
LLE_WLOCK_ASSERT(ln);
ifp = ln->lle_tbl->llt_ifp;
nd = nd_find_domain(ln->lle_tbl->llt_af);
/* We don't have to do link-layer address resolution on a p2p link. */
if (ifp->if_flags & IFF_POINTOPOINT &&
ln->ln_state < ND_LLINFO_REACHABLE)
{
ln->ln_state = ND_LLINFO_STALE;
nd_set_timer(ln, ND_TIMER_GC);
}
/*
* The first time we send a packet to a neighbor whose entry is
* STALE, we have to change the state to DELAY and set a timer to
* expire in DELAY_FIRST_PROBE_TIME seconds so that neighbor
* unreachability detection is performed when it expires.
* (RFC 2461 7.3.3)
*/
if (ln->ln_state == ND_LLINFO_STALE) {
ln->ln_asked = 0;
ln->ln_state = ND_LLINFO_DELAY;
nd_set_timer(ln, ND_TIMER_DELAY);
}
/*
* If the neighbor cache entry has a state other than INCOMPLETE
* (i.e. its link-layer address is already resolved), just
* send the packet.
*/
if (ln->ln_state > ND_LLINFO_INCOMPLETE) {
KASSERT((ln->la_flags & LLE_VALID) != 0);
memcpy(lldst, &ln->ll_addr, MIN(dstsize, ifp->if_addrlen));
LLE_WUNLOCK(ln);
return 0;
}
/*
* There is a neighbor cache entry, but no link-layer address
* response yet.  Append this latest packet to the end of the
* packet queue held in the entry; if the queue then holds more
* than maxqueuelen packets, the oldest packets are dropped from
* the front of the queue.
*/
if (ln->ln_state == ND_LLINFO_NOSTATE ||
ln->ln_state == ND_LLINFO_WAITDELETE)
ln->ln_state = ND_LLINFO_INCOMPLETE;
#ifdef MBUFTRACE
m_claimm(m, ln->lle_tbl->llt_mowner);
#endif
if (ln->ln_hold != NULL) {
struct mbuf *m_hold;
int i;
i = 0;
for (m_hold = ln->ln_hold; m_hold; m_hold = m_hold->m_nextpkt) {
i++;
if (m_hold->m_nextpkt == NULL) {
m_hold->m_nextpkt = m;
break;
}
}
while (i >= nd->nd_maxqueuelen) {
m_hold = ln->ln_hold;
ln->ln_hold = ln->ln_hold->m_nextpkt;
m_freem(m_hold);
i--;
}
} else
ln->ln_hold = m;
if (ln->ln_asked >= nd->nd_mmaxtries)
error = (rt != NULL && rt->rt_flags & RTF_GATEWAY) ?
    EHOSTUNREACH : EHOSTDOWN;
else
error = EWOULDBLOCK;
/*
* If there has been no NS for the neighbor after entering the
* INCOMPLETE state, send the first solicitation.
*/
if (!ND_IS_LLINFO_PERMANENT(ln) && ln->ln_asked == 0) {
struct psref psref;
union l3addr dst, src, *psrc;
ln->ln_asked++;
nd_set_timer(ln, ND_TIMER_RETRANS);
memcpy(&dst, &ln->r_l3addr, sizeof(dst));
psrc = nd->nd_holdsrc(ln, &src);
if_acquire(ifp, &psref);
LLE_WUNLOCK(ln);
nd->nd_output(ifp, NULL, &dst, NULL, psrc);
if_release(ifp, &psref);
} else
LLE_WUNLOCK(ln);
return error;
}
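/*
* To summarize the contract above: the caller enters with the llentry
* write-locked and nd_resolve always releases that lock.  A return of
* 0 means the link-layer address has been copied into lldst and the
* packet can be transmitted now; otherwise the packet has been queued
* on the entry, and the error is EWOULDBLOCK while solicitations are
* still outstanding, or EHOSTUNREACH/EHOSTDOWN once nd_mmaxtries
* solicitations have gone unanswered.
*/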
void
nd_nud_hint(struct llentry *ln)
{
struct nd_domain *nd;
if (ln == NULL)
return;
LLE_WLOCK_ASSERT(ln);
if (ln->ln_state < ND_LLINFO_REACHABLE)
goto done;
nd = nd_find_domain(ln->lle_tbl->llt_af);
/*
* if we get upper-layer reachability confirmation many times,
* it is possible we have false information.
*/
ln->ln_byhint++;
if (ln->ln_byhint > nd->nd_maxnudhint)
goto done;
ln->ln_state = ND_LLINFO_REACHABLE;
if (!ND_IS_LLINFO_PERMANENT(ln))
nd_set_timer(ln, ND_TIMER_REACHABLE);
done:
LLE_WUNLOCK(ln);
return;
}
static struct nd_domain *
nd_find_domain(int af)
{
KASSERT(af < __arraycount(nd_domains) && nd_domains[af] != NULL);
return nd_domains[af];
}
void
nd_attach_domain(struct nd_domain *nd)
{
KASSERT(nd->nd_family < __arraycount(nd_domains));
nd_domains[nd->nd_family] = nd;
}
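/*
* Illustrative sketch of how a protocol would register itself: fill
* in a struct nd_domain with the address family, tunables and
* callbacks, then hand it to nd_attach_domain.  The `example_' names
* and member values below are hypothetical; only members actually
* referenced in this file are shown, and the callback members are
* elided.
*/
#if 0
static struct nd_domain example_nd_domain = {
.nd_family = AF_INET6,
.nd_delay = 5,			/* seconds spent in the DELAY state */
.nd_mmaxtries = 3,		/* multicast solicitations */
.nd_umaxtries = 3,		/* unicast solicitations */
.nd_retransmultiple = 2,	/* exponential backoff factor */
.nd_maxretrans = 60 * 1000,	/* backoff ceiling, in ms */
.nd_maxnudhint = 0,		/* upper-layer hints accepted */
.nd_maxqueuelen = 1,		/* packets held per entry */
/* .nd_retrans, .nd_reachable, .nd_nud_enabled, .nd_holdsrc,
 * .nd_output, .nd_missed and .nd_free callbacks go here. */
};
static void
example_nd_init(void)
{
nd_attach_domain(&example_nd_domain);
}
#endif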
/* $NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $ */
/*-
* Copyright (c) 2012 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Generic rename abstraction.
*
* Rename is unbelievably hairy. Try to use this if you can --
* otherwise you are practically guaranteed to get it wrong.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: genfs_rename.c,v 1.7 2021/10/20 13:29:06 thorpej Exp $");
#include <sys/param.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/types.h>
#include <miscfs/genfs/genfs.h>
/*
* Sample copypasta for implementing VOP_RENAME via genfs_rename.
* Don't change this template without carefully considering whether
* every other file system that already uses it needs to change too.
* That way, once we have changed all the file systems to use it, we
* can easily replace mumblefs_rename by mumblefs_sane_rename and
* eliminate the insane API altogether.
*/
/* begin sample copypasta */
#if 0
static const struct genfs_rename_ops mumblefs_genfs_rename_ops;
/*
* mumblefs_sane_rename: The hairiest vop, with the saner API.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tcnp (to component name),
* . cred (credentials structure), and
* . posixly_correct (flag for behaviour if target & source link same file).
*
* fdvp and tdvp may be the same, and must be referenced and unlocked.
*/
static int
mumblefs_sane_rename(
struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct)
{
struct mumblefs_lookup_results fulr, tulr;
return genfs_sane_rename(&mumblefs_genfs_rename_ops,
fdvp, fcnp, &fulr, tdvp, tcnp, &tulr,
cred, posixly_correct);
}
/*
* mumblefs_rename: The hairiest vop, with the insanest API. Defer to
* genfs_insane_rename immediately.
*/
int
mumblefs_rename(void *v)
{
return genfs_insane_rename(v, &mumblefs_sane_rename);
}
#endif
/* end sample copypasta */
/*
* Forward declarations
*/
static int genfs_rename_enter(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode **,
struct vnode *, struct componentname *, void *, struct vnode **);
static int genfs_rename_enter_common(const struct genfs_rename_ops *,
struct mount *, kauth_cred_t, struct vnode *,
struct componentname *, void *, struct vnode **,
struct componentname *, void *, struct vnode **);
static int genfs_rename_enter_separate(const struct genfs_rename_ops *,
struct mount *, kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode **,
struct vnode *, struct componentname *, void *, struct vnode **);
static int genfs_rename_lock(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t, int, int, int,
struct vnode *, struct componentname *, bool, void *, struct vnode **,
struct vnode *, struct componentname *, bool, void *, struct vnode **);
static void genfs_rename_exit(const struct genfs_rename_ops *, struct mount *,
struct vnode *, struct vnode *,
struct vnode *, struct vnode *);
static int genfs_rename_remove(const struct genfs_rename_ops *, struct mount *,
kauth_cred_t,
struct vnode *, struct componentname *, void *, struct vnode *, nlink_t *);
/*
* genfs_insane_rename: Generic implementation of the insane API for
* the rename vop.
*
* Arguments:
*
* . fdvp (from directory vnode),
* . fvp (from vnode),
* . fcnp (from component name),
* . tdvp (to directory vnode),
* . tvp (to vnode, or NULL), and
* . tcnp (to component name).
*
* Any pair of vnode parameters may have the same vnode.
*
* On entry,
*
* . fdvp, fvp, tdvp, and tvp are referenced,
* . fdvp and fvp are unlocked, and
* . tdvp and tvp (if nonnull) are locked.
*
* On exit,
*
* . fdvp, fvp, tdvp, and tvp (if nonnull) are unreferenced, and
* . tdvp and tvp (if nonnull) are unlocked.
*/
int
genfs_insane_rename(void *v,
int (*sane_rename)(struct vnode *fdvp, struct componentname *fcnp,
struct vnode *tdvp, struct componentname *tcnp,
kauth_cred_t cred, bool posixly_correct))
{
struct vop_rename_args /* {
struct vnode *a_fdvp;
struct vnode *a_fvp;
struct componentname *a_fcnp;
struct vnode *a_tdvp;
struct vnode *a_tvp;
struct componentname *a_tcnp;
} */ *ap = v;
struct vnode *fdvp = ap->a_fdvp;
struct vnode *fvp = ap->a_fvp;
struct componentname *fcnp = ap->a_fcnp;
struct vnode *tdvp = ap->a_tdvp;
struct vnode *tvp = ap->a_tvp;
struct componentname *tcnp = ap->a_tcnp;
kauth_cred_t cred;
int error;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fcnp->cn_nameptr != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tcnp->cn_nameptr != NULL);
/* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
/* KASSERT(VOP_ISLOCKED(fvp) != LK_EXCLUSIVE); */
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
cred = fcnp->cn_cred;
/*
* XXX Want a better equality test. `tcnp->cn_cred == cred'
* hoses p2k because puffs transmits the creds separately and
* allocates distinct but equivalent structures for them.
*/
KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
/*
* Sanitize our world from the VFS insanity. Unlock the target
* directory and node, which are locked. Release the children,
* which are referenced, since we'll be looking them up again
* later.
*/
VOP_UNLOCK(tdvp);
if ((tvp != NULL) && (tvp != tdvp))
VOP_UNLOCK(tvp);
vrele(fvp);
if (tvp != NULL)
vrele(tvp);
error = (*sane_rename)(fdvp, fcnp, tdvp, tcnp, cred, false);
/*
* All done, whether with success or failure. Release the
* directory nodes now, as the caller expects from the VFS
* protocol.
*/
vrele(fdvp);
vrele(tdvp);
return error;
}
/*
* genfs_sane_rename: Generic implementation of the saner API for the
* rename vop. Handles ancestry checks, locking, and permissions
* checks. Caller is responsible for implementing the genfs rename
* operations.
*
* fdvp and tdvp must be referenced and unlocked.
*/
int
genfs_sane_rename(const struct genfs_rename_ops *ops,
struct vnode *fdvp, struct componentname *fcnp, void *fde,
struct vnode *tdvp, struct componentname *tcnp, void *tde,
kauth_cred_t cred, bool posixly_correct)
{
struct mount *mp;
struct vnode *fvp = NULL, *tvp = NULL;
nlink_t tvp_new_nlink = 0;
int error;
KASSERT(ops != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
/* KASSERT(VOP_ISLOCKED(fdvp) != LK_EXCLUSIVE); */
/* KASSERT(VOP_ISLOCKED(tdvp) != LK_EXCLUSIVE); */
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == tdvp->v_mount);
KASSERT(fcnp != tcnp);
KASSERT(fcnp->cn_nameiop == DELETE);
KASSERT(tcnp->cn_nameiop == RENAME);
/* XXX Want a better equality test. */
KASSERT(kauth_cred_uidmatch(cred, fcnp->cn_cred));
KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
mp = fdvp->v_mount;
KASSERT(mp != NULL);
KASSERT(mp == tdvp->v_mount);
/* XXX How can we be sure this stays true? */
KASSERT((mp->mnt_flag & MNT_RDONLY) == 0);
/* Reject rename("x/..", ...) and rename(..., "x/..") early. */
if ((fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT)
return EINVAL; /* XXX EISDIR? */
error = genfs_rename_enter(ops, mp, cred,
fdvp, fcnp, fde, &fvp,
tdvp, tcnp, tde, &tvp);
if (error)
return error;
/*
* Check that everything is locked and looks right.
*/
KASSERT(fvp != NULL);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* If the source and destination are the same object, we need
* only at most delete the source entry. We are guaranteed at
* this point that the entries are distinct.
*/
if (fvp == tvp) {
KASSERT(tvp != NULL);
if (fvp->v_type == VDIR)
/* XXX This shouldn't be possible. */
error = EINVAL;
else if (posixly_correct)
/* POSIX sez to leave them alone. */
error = 0;
else if ((fdvp == tdvp) && (fcnp->cn_namelen == tcnp->cn_namelen) &&
(memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr,
fcnp->cn_namelen) == 0))
/* Renaming an entry over itself does nothing. */
error = 0;
else {
/* XXX Can't use VOP_REMOVE because of locking. */
error = genfs_rename_remove(ops, mp, cred,
fdvp, fcnp, fde, fvp, &tvp_new_nlink);
VN_KNOTE(fdvp, NOTE_WRITE);
VN_KNOTE(fvp,
    tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK);
}
goto out;
}
KASSERT(fvp != tvp);
KASSERT((fdvp != tdvp) ||
(fcnp->cn_namelen != tcnp->cn_namelen) ||
(memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fcnp->cn_namelen)
!= 0));
/*
* If the target exists, refuse to rename a directory over a
* non-directory or vice versa, or to clobber a non-empty
* directory.
*/
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type == VDIR)
error =
(ops->gro_directory_empty_p(mp, cred, tvp, tdvp)?
0 : ENOTEMPTY);
else if (fvp->v_type == VDIR && tvp->v_type != VDIR)
error = ENOTDIR;
else if (fvp->v_type != VDIR && tvp->v_type == VDIR)
error = EISDIR;
else
error = 0;
if (error)
goto out;
KASSERT((fvp->v_type == VDIR) == (tvp->v_type == VDIR));
}
/*
* Authorize the rename.
*/
error = ops->gro_rename_check_possible(mp, fdvp, fvp, tdvp, tvp);
if (error)
goto out;
error = ops->gro_rename_check_permitted(mp, cred, fdvp, fvp, tdvp, tvp);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, fvp, fdvp,
error);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_RENAME, tvp, tdvp,
error);
if (error)
goto out;
/*
* Everything is hunky-dory. Shuffle the directory entries.
*/
error = ops->gro_rename(mp, cred,
fdvp, fcnp, fde, fvp,
tdvp, tcnp, tde, tvp,
&tvp_new_nlink);
if (error)
goto out;
/* Success! */
genfs_rename_knote(fdvp, fvp, tdvp, tvp, tvp_new_nlink);
out:
genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp);
return error;
}
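/*
* To recap the flow above: (1) lock everything via genfs_rename_enter,
* (2) short-circuit the case where source and target are the same
* node, (3) refuse directory/non-directory mismatches and non-empty
* target directories, (4) run the file-system-independent and
* kauth(9) checks, and only then (5) let gro_rename shuffle the
* directory entries and (6) post the knotes.
*/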
/*
* genfs_rename_knote: Note events about the various vnodes in a
* rename. To be called by gro_rename on success. The only pair of
* vnodes that may be identical is {fdvp, tdvp}. tvp_new_nlink is
* the resulting link count of tvp.
*/
void
genfs_rename_knote(struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp, nlink_t tvp_new_nlink)
{
long fdvp_events, tdvp_events;
bool directory_p, reparent_p, replaced_p;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
directory_p = (fvp->v_type == VDIR);
reparent_p = (fdvp != tdvp);
replaced_p = (tvp != NULL);
KASSERT((tvp == NULL) || (directory_p == (tvp->v_type == VDIR)));
fdvp_events = NOTE_WRITE;
if (directory_p && reparent_p)
fdvp_events |= NOTE_LINK;
VN_KNOTE(fdvp, fdvp_events);
VN_KNOTE(fvp, NOTE_RENAME);
if (reparent_p) {
tdvp_events = NOTE_WRITE;
if (!replaced_p) {
tdvp_events |= NOTE_EXTEND;
if (directory_p)
tdvp_events |= NOTE_LINK;
}
VN_KNOTE(tdvp, tdvp_events);
}
if (replaced_p)
VN_KNOTE(tvp, (tvp_new_nlink == 0 ? NOTE_DELETE : NOTE_LINK));
}
/*
* genfs_rename_cache_purge: Purge the name cache. To be called by
* gro_rename on success. The only pair of vnodes that may be
* identical is {fdvp, tdvp}.
*/
void
genfs_rename_cache_purge(struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
/*
* XXX What actually needs to be purged?
*/
cache_purge(fdvp);
if (fvp->v_type == VDIR)
cache_purge(fvp);
if (tdvp != fdvp)
cache_purge(tdvp);
if ((tvp != NULL) && (tvp->v_type == VDIR))
cache_purge(tvp);
}
/*
* genfs_rename_enter: Look up fcnp in fdvp, and store the lookup
* results in *fde_ret and the associated vnode in *fvp_ret; fail if
* not found. Look up tcnp in tdvp, and store the lookup results in
* *tde_ret and the associated vnode in *tvp_ret; store null instead if
* not found. Fail if anything has been mounted on any of the nodes
* involved.
*
* fdvp and tdvp must be referenced.
*
* On entry, nothing is locked.
*
* On success, everything is locked, and *fvp_ret, and *tvp_ret if
* nonnull, are referenced. The only pairs of vnodes that may be
* identical are {fdvp, tdvp} and {fvp, tvp}.
*
* On failure, everything remains as was.
*
* Locking everything including the source and target nodes is
* necessary to make sure that, e.g., link count updates are OK. The
* locking order is, in general, ancestor-first, matching the order you
* need to use to look up a descendant anyway.
*/
static int
genfs_rename_enter(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct vnode *tdvp, struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
int error;
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(fvp_ret != tvp_ret);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
if (fdvp == tdvp)
error = genfs_rename_enter_common(ops, mp, cred, fdvp,
fcnp, fde_ret, fvp_ret,
tcnp, tde_ret, tvp_ret);
else
error = genfs_rename_enter_separate(ops, mp, cred,
fdvp, fcnp, fde_ret, fvp_ret,
tdvp, tcnp, tde_ret, tvp_ret);
if (error)
return error;
KASSERT(*fvp_ret != NULL);
KASSERT(VOP_ISLOCKED(*fvp_ret) == LK_EXCLUSIVE);
KASSERT((*tvp_ret == NULL) || (VOP_ISLOCKED(*tvp_ret) == LK_EXCLUSIVE));
KASSERT(*fvp_ret != fdvp);
KASSERT(*fvp_ret != tdvp);
KASSERT(*tvp_ret != fdvp);
KASSERT(*tvp_ret != tdvp);
return 0;
}
/*
* genfs_rename_enter_common: Lock and look up with a common
* source/target directory.
*/
static int
genfs_rename_enter_common(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred, struct vnode *dvp,
struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
struct vnode *fvp, *tvp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(dvp->v_type == VDIR);
KASSERT(dvp->v_mount == mp);
error = ops->gro_lock_directory(mp, dvp);
if (error)
goto fail0;
/* Did we lose a race with mount? */
if (dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail1;
}
KASSERT(fcnp->cn_nameiop == DELETE);
error = ops->gro_lookup(mp, dvp, fcnp, fde_ret, &fvp);
if (error)
goto fail1;
KASSERT(fvp != NULL);
/* Refuse to rename `.'. */
if (fvp == dvp) {
error = EINVAL;
goto fail2;
}
KASSERT(fvp != dvp);
KASSERT(tcnp->cn_nameiop == RENAME);
error = ops->gro_lookup(mp, dvp, tcnp, tde_ret, &tvp);
if (error == ENOENT) {
tvp = NULL;
} else if (error) {
goto fail2;
} else {
KASSERT(tvp != NULL);
/* Refuse to rename over `.'. */
if (tvp == dvp) {
error = EISDIR; /* XXX EINVAL? */
goto fail2;
}
}
KASSERT(tvp != dvp);
/*
* We've looked up both nodes. Now lock them and check them.
*/
vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(fvp->v_mount == mp);
/* Refuse to rename a mount point. */
if ((fvp->v_type == VDIR) && (fvp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail3;
}
if ((tvp != NULL) && (tvp != fvp)) {
vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(tvp->v_mount == mp);
/* Refuse to rename over a mount point. */
if ((tvp->v_type == VDIR) && (tvp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail4;
}
}
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
*fvp_ret = fvp;
*tvp_ret = tvp;
return 0;
fail4: if ((tvp != NULL) && (tvp != fvp))
VOP_UNLOCK(tvp);
fail3: VOP_UNLOCK(fvp);
if (tvp != NULL)
vrele(tvp);
fail2: vrele(fvp);
fail1: VOP_UNLOCK(dvp);
fail0: return error;
}
/*
* genfs_rename_enter_separate: Lock and look up with separate source
* and target directories.
*/
static int
genfs_rename_enter_separate(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *fdvp, struct componentname *fcnp,
void *fde_ret, struct vnode **fvp_ret,
struct vnode *tdvp, struct componentname *tcnp,
void *tde_ret, struct vnode **tvp_ret)
{
struct vnode *intermediate_node;
struct vnode *fvp, *tvp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fcnp != NULL);
KASSERT(fvp_ret != NULL);
KASSERT(tdvp != NULL);
KASSERT(tcnp != NULL);
KASSERT(tvp_ret != NULL);
KASSERT(fdvp != tdvp);
KASSERT(fcnp != tcnp);
KASSERT(fcnp->cn_nameiop == DELETE);
KASSERT(tcnp->cn_nameiop == RENAME);
KASSERT(fvp_ret != tvp_ret);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == mp);
KASSERT(tdvp->v_mount == mp);
error = ops->gro_genealogy(mp, cred, fdvp, tdvp, &intermediate_node);
if (error)
return error;
/*
* intermediate_node == NULL means fdvp is not an ancestor of tdvp.
*/
if (intermediate_node == NULL)
error = genfs_rename_lock(ops, mp, cred,
ENOTEMPTY, EISDIR, EINVAL,
tdvp, tcnp, true, tde_ret, &tvp,
fdvp, fcnp, false, fde_ret, &fvp);
else
error = genfs_rename_lock(ops, mp, cred,
EINVAL, EISDIR, EINVAL,
fdvp, fcnp, false, fde_ret, &fvp,
tdvp, tcnp, true, tde_ret, &tvp);
if (error)
goto out;
KASSERT(fvp != NULL);
/*
* Reject rename("foo/bar", "foo/bar/baz/quux/zot").
*/
if (fvp == intermediate_node) {
genfs_rename_exit(ops, mp, fdvp, fvp, tdvp, tvp);
error = EINVAL;
goto out;
}
*fvp_ret = fvp;
*tvp_ret = tvp;
error = 0;
out: if (intermediate_node != NULL)
vrele(intermediate_node);
return error;
}
/*
* genfs_rename_lock: Lookup and lock it all. The lock order is:
*
* a_dvp -> a_vp -> b_dvp -> b_vp,
*
* except if a_vp is a nondirectory in which case the lock order is:
*
* a_dvp -> b_dvp -> b_vp -> a_vp,
*
* which can't violate ancestor->descendant because a_vp has no
* descendants in this case. This edge case is necessary because some
* file systems can only lookup/lock/unlock, and we can't hold a_vp
* locked when we lookup/lock/unlock b_vp if they turn out to be the
* same, and we can't find out that they're the same until after the
* lookup.
*
* b_dvp must not be an ancestor of a_dvp, although a_dvp may be an
* ancestor of b_dvp.
*
* Fail with overlap_error if node a is directory b. Neither
* componentname may be `.' or `..'.
*
* a_dvp and b_dvp must be referenced.
*
* On entry, a_dvp and b_dvp are unlocked.
*
* On success,
* . a_dvp and b_dvp are locked,
* . *a_dirent_ret is filled with a directory entry whose node is
* locked and referenced,
* . *a_vp_ret is filled with the corresponding vnode,
* . *b_dirent_ret is filled either with null or with a directory entry
* whose node is locked and referenced,
* . *b_vp_ret is filled either with null or with the corresponding vnode,
* and
* . the only pair of vnodes that may be identical is a_vp and b_vp.
*
* On failure, a_dvp and b_dvp are left unlocked, and *a_dirent_ret,
* *a_vp_ret, *b_dirent_ret, and *b_vp_ret are left alone.
*/
static int
genfs_rename_lock(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
int overlap_error, int a_dot_error, int b_dot_error,
struct vnode *a_dvp, struct componentname *a_cnp, bool a_missing_ok,
void *a_de_ret, struct vnode **a_vp_ret,
struct vnode *b_dvp, struct componentname *b_cnp, bool b_missing_ok,
void *b_de_ret, struct vnode **b_vp_ret)
{
struct vnode *a_vp, *b_vp;
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(a_dvp != NULL);
KASSERT(a_cnp != NULL);
KASSERT(a_vp_ret != NULL);
KASSERT(b_dvp != NULL);
KASSERT(b_cnp != NULL);
KASSERT(b_vp_ret != NULL);
KASSERT(a_dvp != b_dvp);
KASSERT(a_vp_ret != b_vp_ret);
KASSERT(a_dvp->v_type == VDIR);
KASSERT(b_dvp->v_type == VDIR);
KASSERT(a_dvp->v_mount == mp);
KASSERT(b_dvp->v_mount == mp);
KASSERT(a_missing_ok != b_missing_ok);
/*
* 1. Lock a_dvp.
*/
error = ops->gro_lock_directory(mp, a_dvp);
if (error)
goto fail0;
/* Did we lose a race with mount? */
if (a_dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail1;
}
/*
* 2. Lookup a_vp. May lock/unlock a_vp.
*/
error = ops->gro_lookup(mp, a_dvp, a_cnp, a_de_ret, &a_vp);
if (error) {
if (a_missing_ok && (error == ENOENT))
a_vp = NULL;
else
goto fail1;
} else {
KASSERT(a_vp != NULL);
/* Refuse to rename (over) `.'. */
if (a_vp == a_dvp) {
error = a_dot_error;
goto fail2;
}
/* Reject rename("x", "x/y") or rename("x/y", "x"). */
if (a_vp == b_dvp) {
error = overlap_error;
goto fail2;
}
}
KASSERT(a_vp != a_dvp);
KASSERT(a_vp != b_dvp);
/*
* 3. Lock a_vp, if it is a directory.
*
* We already ruled out a_vp == a_dvp (i.e., a_cnp is `.'), so
* this is not locking against self, and we already ruled out
* a_vp == b_dvp, so this won't cause subsequent locking of
* b_dvp to lock against self.
*
* If a_vp is a nondirectory, we can't hold it when we lookup
* b_vp in case (a) the file system can only lookup/lock/unlock
* and (b) b_vp turns out to be the same file as a_vp due to
* hard links -- and we can't even detect that case until after
* we've looked up b_vp. Fortunately, if a_vp is a
* nondirectory, then it is a leaf, so we can safely lock it
* last.
*/
if (a_vp != NULL && a_vp->v_type == VDIR) {
vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(a_vp->v_mount == mp);
/* Refuse to rename (over) a mount point. */
if (a_vp->v_mountedhere != NULL) {
error = EBUSY;
goto fail3;
}
}
/*
* 4. Lock b_dvp.
*/
error = ops->gro_lock_directory(mp, b_dvp);
if (error)
goto fail3;
/* Did we lose a race with mount? */
if (b_dvp->v_mountedhere != NULL) {
error = EBUSY;
goto fail4;
}
/*
* 5. Lookup b_vp. May lock/unlock b_vp.
*/
error = ops->gro_lookup(mp, b_dvp, b_cnp, b_de_ret, &b_vp);
if (error) {
if (b_missing_ok && (error == ENOENT))
b_vp = NULL;
else
goto fail4;
} else {
KASSERT(b_vp != NULL);
/* Refuse to rename (over) `.'. */
if (b_vp == b_dvp) {
error = b_dot_error;
goto fail5;
}
/*
* b_dvp must not be an ancestor of a_dvp, so if we
* find b_dvp/b_vp=a_dvp/a_vp something is wrong.
*/
if (b_vp == a_dvp) {
/*
* We have a directory hard link before us.
* XXX What error should this return? EDEADLK?
* Panic?
*/
error = EIO;
goto fail5;
}
}
KASSERT(b_vp != b_dvp);
KASSERT(b_vp != a_dvp);
/*
* 6. Lock a_vp, if it is a nondirectory.
*
* In this case a_vp is a leaf, so it is either equal to or
* incommensurate with b_vp, and so we can safely lock it at
* any point now.
*/
if (a_vp != NULL && a_vp->v_type != VDIR) {
vn_lock(a_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(a_vp->v_mount == mp);
/* (not a directory so can't have anything mounted here) */
}
/*
* 7. Lock b_vp, if it is not a_vp.
*
* b_vp and a_vp may be the same inode if they are hard links to
* one another.
*/
if ((b_vp != NULL) && (b_vp != a_vp)) {
vn_lock(b_vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(b_vp->v_mount == mp);
/* Refuse to rename (over) a mount point. */
if ((b_vp->v_type == VDIR) && (b_vp->v_mountedhere != NULL)) {
error = EBUSY;
goto fail6;
}
}
KASSERT(VOP_ISLOCKED(a_dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(b_dvp) == LK_EXCLUSIVE);
KASSERT(a_missing_ok || (a_vp != NULL));
KASSERT(b_missing_ok || (b_vp != NULL));
KASSERT((a_vp == NULL) || (VOP_ISLOCKED(a_vp) == LK_EXCLUSIVE));
KASSERT((b_vp == NULL) || (VOP_ISLOCKED(b_vp) == LK_EXCLUSIVE));
*a_vp_ret = a_vp;
*b_vp_ret = b_vp;
return 0;
fail6: if ((b_vp != NULL) && (b_vp != a_vp))
VOP_UNLOCK(b_vp);
if (a_vp != NULL && a_vp->v_type != VDIR)
VOP_UNLOCK(a_vp);
fail5: if (b_vp != NULL)
vrele(b_vp);
fail4: VOP_UNLOCK(b_dvp);
fail3: if (a_vp != NULL && a_vp->v_type == VDIR)
VOP_UNLOCK(a_vp);
fail2: if (a_vp != NULL)
vrele(a_vp);
fail1: VOP_UNLOCK(a_dvp);
fail0: return error;
}
/*
* genfs_rename_exit: Unlock everything we locked for rename.
*
* fdvp and tdvp must be referenced.
*
* On entry, everything is locked, and fvp and tvp referenced.
*
* On exit, everything is unlocked, and fvp and tvp are released.
*/
static void
genfs_rename_exit(const struct genfs_rename_ops *ops,
struct mount *mp,
struct vnode *fdvp, struct vnode *fvp,
struct vnode *tdvp, struct vnode *tvp)
{
(void)ops;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
if ((tvp != NULL) && (tvp != fvp))
VOP_UNLOCK(tvp);
VOP_UNLOCK(fvp);
if (tvp != NULL)
vrele(tvp);
if (tdvp != fdvp)
VOP_UNLOCK(tdvp);
vrele(fvp);
VOP_UNLOCK(fdvp);
}
/*
* genfs_rename_remove: Remove the entry for the non-directory vp with
* componentname cnp from the directory dvp, using the lookup results
* de. It is the responsibility of gro_remove to purge the name cache.
*
* Everything must be locked and referenced.
*/
static int
genfs_rename_remove(const struct genfs_rename_ops *ops,
struct mount *mp, kauth_cred_t cred,
struct vnode *dvp, struct componentname *cnp, void *de, struct vnode *vp,
nlink_t *tvp_nlinkp)
{
int error;
KASSERT(ops != NULL);
KASSERT(mp != NULL);
KASSERT(dvp != NULL);
KASSERT(cnp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == mp);
KASSERT(vp->v_mount == mp);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
error = ops->gro_remove_check_possible(mp, dvp, vp);
if (error)
return error;
error = ops->gro_remove_check_permitted(mp, cred, dvp, vp);
error = kauth_authorize_vnode(cred, KAUTH_VNODE_DELETE, vp, dvp,
error);
if (error)
return error;
error = ops->gro_remove(mp, cred, dvp, cnp, de, vp, tvp_nlinkp);
if (error)
return error;
return 0;
}
static int
genfs_ufslike_check_sticky(kauth_cred_t, mode_t, uid_t, struct vnode *, uid_t);
/*
* genfs_ufslike_rename_check_possible: Check whether a rename is
* possible independent of credentials, assuming UFS-like inode flag
* semantics. clobber_p is true iff the target node already exists.
*/
int
genfs_ufslike_rename_check_possible(
unsigned long fdflags, unsigned long fflags,
unsigned long tdflags, unsigned long tflags, bool clobber_p,
unsigned long immutable, unsigned long append)
{
if ((fdflags | fflags) & (immutable | append))
return EPERM;
if (tdflags & (immutable | (clobber_p? append : 0)))
return EPERM;
if (clobber_p && (tflags & (immutable | append)))
return EPERM;
return 0;
}
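/*
 * Illustrative only: a UFS-like file system would typically pass its
 * inode flag words and flag masks here.  The field and macro names
 * below are the caller's and are assumptions for this sketch:
 *
 *	error = genfs_ufslike_rename_check_possible(
 *	    fdp->i_flags, fp->i_flags,
 *	    tdp->i_flags, (tp != NULL ? tp->i_flags : 0),
 *	    (tp != NULL), IMMUTABLE, APPEND);
 */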
/*
* genfs_ufslike_rename_check_permitted: Check whether a rename is
* permitted given our credentials, assuming UFS-like permission and
* ownership semantics.
*
* The only pair of vnodes that may be identical is {fdvp, tdvp}.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_rename_check_permitted(kauth_cred_t cred,
struct vnode *fdvp, mode_t fdmode, uid_t fduid,
struct vnode *fvp, uid_t fuid,
struct vnode *tdvp, mode_t tdmode, uid_t tduid,
struct vnode *tvp, uid_t tuid)
{
int error;
KASSERT(fdvp != NULL);
KASSERT(fvp != NULL);
KASSERT(tdvp != NULL);
KASSERT(fdvp != fvp);
KASSERT(fdvp != tvp);
KASSERT(tdvp != fvp);
KASSERT(tdvp != tvp);
KASSERT(fvp != tvp);
KASSERT(fdvp->v_type == VDIR);
KASSERT(tdvp->v_type == VDIR);
KASSERT(fdvp->v_mount == fvp->v_mount);
KASSERT(fdvp->v_mount == tdvp->v_mount);
KASSERT((tvp == NULL) || (fdvp->v_mount == tvp->v_mount));
KASSERT(VOP_ISLOCKED(fdvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(fvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
KASSERT((tvp == NULL) || (VOP_ISLOCKED(tvp) == LK_EXCLUSIVE));
/*
* We need to remove or change an entry in the source directory.
*/
error = VOP_ACCESS(fdvp, VWRITE, cred);
if (error)
return error;
/*
* If we are changing directories, then we need to write to the
* target directory to add or change an entry. Also, if fvp is
* a directory, we need to write to it to change its `..'
* entry.
*/
if (fdvp != tdvp) {
error = VOP_ACCESS(tdvp, VWRITE, cred);
if (error)
return error;
if (fvp->v_type == VDIR) {
error = VOP_ACCESS(fvp, VWRITE, cred);
if (error)
return error;
}
}
error = genfs_ufslike_check_sticky(cred, fdmode, fduid, fvp, fuid);
if (error)
return error;
error = genfs_ufslike_check_sticky(cred, tdmode, tduid, tvp, tuid);
if (error)
return error;
return 0;
}
/*
* genfs_ufslike_remove_check_possible: Check whether a remove is
* possible independent of credentials, assuming UFS-like inode flag
* semantics.
*/
int
genfs_ufslike_remove_check_possible(unsigned long dflags, unsigned long flags,
unsigned long immutable, unsigned long append)
{
/*
* We want to delete the entry. If the directory is immutable,
* we can't write to it to delete the entry. If the directory
* is append-only, the only change we can make is to add
* entries, so we can't delete entries. If the node is
* immutable, we can't change the links to it, so we can't
* delete the entry. If the node is append-only...well, this
* is what UFS does.
*/
if ((dflags | flags) & (immutable | append))
return EPERM;
return 0;
}
/*
* genfs_ufslike_remove_check_permitted: Check whether a remove is
* permitted given our credentials, assuming UFS-like permission and
* ownership semantics.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_remove_check_permitted(kauth_cred_t cred,
struct vnode *dvp, mode_t dmode, uid_t duid,
struct vnode *vp, uid_t uid)
{
int error;
KASSERT(dvp != NULL);
KASSERT(vp != NULL);
KASSERT(dvp != vp);
KASSERT(dvp->v_type == VDIR);
KASSERT(vp->v_type != VDIR);
KASSERT(dvp->v_mount == vp->v_mount);
KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
/*
* We need to write to the directory to remove from it.
*/
error = VOP_ACCESS(dvp, VWRITE, cred);
if (error)
return error;
error = genfs_ufslike_check_sticky(cred, dmode, duid, vp, uid);
if (error)
return error;
return 0;
}
/*
* genfs_ufslike_check_sticky: Check whether a party with credentials
* cred may change an entry in a sticky directory, assuming UFS-like
* permission, ownership, and stickiness semantics: If the directory is
* sticky and the entry exists, the user must own either the directory
* or the entry's node in order to change the entry.
*
* Everything must be locked and referenced.
*/
int
genfs_ufslike_check_sticky(kauth_cred_t cred, mode_t dmode, uid_t duid,
struct vnode *vp, uid_t uid)
{
if ((dmode & S_ISTXT) && (vp != NULL))
return genfs_can_sticky(vp, cred, duid, uid);
return 0;
}
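/*
 * For example, in a mode-1777 directory such as /tmp, dmode has S_ISTXT
 * set, so an unprivileged caller may remove or rename an entry only if
 * it owns the directory (duid) or the entry's node (uid); without the
 * sticky bit, write permission on the directory is enough and this
 * check is a no-op.
 */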
/* $NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $ */
/*
* Copyright (c) 2006, 2010, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* uvm_object.c: operate with memory objects
*
* TODO:
* 1. Support PG_RELEASED-using objects
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_object.c,v 1.25 2020/08/15 07:24:09 chs Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/rwlock.h>
#include <sys/queue.h>
#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_page_array.h>
/* Page count to fetch per single step. */
#define FETCH_PAGECOUNT 16
/*
* uvm_obj_init: initialize UVM memory object.
*/
void
uvm_obj_init(struct uvm_object *uo, const struct uvm_pagerops *ops,
bool alock, u_int refs)
{
#if 0 /* notyet */
KASSERT(ops);
#endif
if (alock) {
/* Allocate and assign a lock. */
uo->vmobjlock = rw_obj_alloc();
} else {
/* The lock will need to be set via uvm_obj_setlock(). */
uo->vmobjlock = NULL;
}
uo->pgops = ops;
LIST_INIT(&uo->uo_ubc);
uo->uo_npages = 0;
uo->uo_refs = refs;
radix_tree_init_tree(&uo->uo_pages);
}
/*
* uvm_obj_destroy: destroy UVM memory object.
*/
void
uvm_obj_destroy(struct uvm_object *uo, bool dlock)
{
KASSERT(radix_tree_empty_tree_p(&uo->uo_pages));
/* Purge any UBC entries associated with this object. */
ubc_purge(uo);
/* Destroy the lock, if requested. */
if (dlock) {
rw_obj_free(uo->vmobjlock);
}
radix_tree_fini_tree(&uo->uo_pages);
}
/*
* uvm_obj_setlock: assign a vmobjlock to the UVM object.
*
* => Caller is responsible for ensuring that the UVM object is not in use.
* => Only a dynamic lock may have been previously set; we drop its reference then.
*/
void
uvm_obj_setlock(struct uvm_object *uo, krwlock_t *lockptr)
{
krwlock_t *olockptr = uo->vmobjlock;
if (olockptr) {
/* Drop the reference on the old lock. */
rw_obj_free(olockptr);
}
if (lockptr == NULL) {
/* If new lock is not passed - allocate default one. */
lockptr = rw_obj_alloc();
}
uo->vmobjlock = lockptr;
}
/*
* uvm_obj_wirepages: wire the pages of an entire UVM object.
*
* => NOTE: this function should only be used for types of objects
* where PG_RELEASED flag is never set (aobj objects)
* => caller must pass page-aligned start and end values
*/
int
uvm_obj_wirepages(struct uvm_object *uobj, off_t start, off_t end,
struct pglist *list)
{
int i, npages, error;
struct vm_page *pgs[FETCH_PAGECOUNT], *pg = NULL;
off_t offset = start, left;
left = (end - start) >> PAGE_SHIFT;
rw_enter(uobj->vmobjlock, RW_WRITER);
while (left) {
npages = MIN(FETCH_PAGECOUNT, left);
/* Get the pages */
memset(pgs, 0, sizeof(pgs));
error = (*uobj->pgops->pgo_get)(uobj, offset, pgs, &npages, 0,
VM_PROT_READ | VM_PROT_WRITE, UVM_ADV_SEQUENTIAL,
PGO_SYNCIO);
if (error)
goto error;
rw_enter(uobj->vmobjlock, RW_WRITER);
for (i = 0; i < npages; i++) {
KASSERT(pgs[i] != NULL);
KASSERT(!(pgs[i]->flags & PG_RELEASED));
/*
* Loan break
*/
if (pgs[i]->loan_count) {
while (pgs[i]->loan_count) {
pg = uvm_loanbreak(pgs[i]);
if (!pg) {
rw_exit(uobj->vmobjlock);
uvm_wait("uobjwirepg");
rw_enter(uobj->vmobjlock, RW_WRITER);
continue;
}
}
pgs[i] = pg;
}
if (pgs[i]->flags & PG_AOBJ) {
uvm_pagemarkdirty(pgs[i],
UVM_PAGE_STATUS_DIRTY);
uao_dropswap(uobj, i);
}
}
/* Wire the pages */
for (i = 0; i < npages; i++) {
uvm_pagelock(pgs[i]);
uvm_pagewire(pgs[i]);
uvm_pageunlock(pgs[i]);
if (list != NULL)
TAILQ_INSERT_TAIL(list, pgs[i], pageq.queue);
}
/* Unbusy the pages */
uvm_page_unbusy(pgs, npages);
left -= npages;
offset += npages << PAGE_SHIFT;
}
rw_exit(uobj->vmobjlock);
return 0;
error:
/* Unwire the pages which have been wired */
uvm_obj_unwirepages(uobj, start, offset);
return error;
}
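/*
 * Example (a sketch; the object and size names here are hypothetical):
 * wiring and later unwiring the whole range backed by an anonymous
 * UVM object.
 *
 *	error = uvm_obj_wirepages(uobj, 0, round_page(size), NULL);
 *	if (error)
 *		return error;
 *	...
 *	uvm_obj_unwirepages(uobj, 0, round_page(size));
 */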
/*
* uvm_obj_unwirepages: unwire the pages of an entire UVM object.
*
* => NOTE: this function should only be used for types of objects
* where PG_RELEASED flag is never set
* => caller must pass page-aligned start and end values
*/
void
uvm_obj_unwirepages(struct uvm_object *uobj, off_t start, off_t end)
{
struct vm_page *pg;
off_t offset;
rw_enter(uobj->vmobjlock, RW_WRITER);
for (offset = start; offset < end; offset += PAGE_SIZE) {
pg = uvm_pagelookup(uobj, offset);
KASSERT(pg != NULL);
KASSERT(!(pg->flags & PG_RELEASED));
uvm_pagelock(pg);
uvm_pageunwire(pg);
uvm_pageunlock(pg);
}
rw_exit(uobj->vmobjlock);
}
static inline bool
uvm_obj_notag_p(struct uvm_object *uobj, int tag)
{
KASSERT(rw_lock_held(uobj->vmobjlock));
return radix_tree_empty_tagged_tree_p(&uobj->uo_pages, tag);
}
bool
uvm_obj_clean_p(struct uvm_object *uobj)
{
return uvm_obj_notag_p(uobj, UVM_PAGE_DIRTY_TAG);
}
bool
uvm_obj_nowriteback_p(struct uvm_object *uobj)
{
return uvm_obj_notag_p(uobj, UVM_PAGE_WRITEBACK_TAG);
}
static inline bool
uvm_obj_page_tag_p(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_lock_held(uobj->vmobjlock));
return radix_tree_get_tag(&uobj->uo_pages, pgidx, tag) != 0;
}
static inline void
uvm_obj_page_set_tag(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
radix_tree_set_tag(&uobj->uo_pages, pgidx, tag);
}
static inline void
uvm_obj_page_clear_tag(struct vm_page *pg, int tag)
{
struct uvm_object *uobj = pg->uobject;
uint64_t pgidx = pg->offset >> PAGE_SHIFT;
KASSERT(uobj != NULL);
KASSERT(rw_write_held(uobj->vmobjlock));
radix_tree_clear_tag(&uobj->uo_pages, pgidx, tag);
}
bool
uvm_obj_page_dirty_p(struct vm_page *pg)
{
return uvm_obj_page_tag_p(pg, UVM_PAGE_DIRTY_TAG);
}
void
uvm_obj_page_set_dirty(struct vm_page *pg)
{
uvm_obj_page_set_tag(pg, UVM_PAGE_DIRTY_TAG);
}
void
uvm_obj_page_clear_dirty(struct vm_page *pg)
{
uvm_obj_page_clear_tag(pg, UVM_PAGE_DIRTY_TAG);
}
bool
uvm_obj_page_writeback_p(struct vm_page *pg)
{
return uvm_obj_page_tag_p(pg, UVM_PAGE_WRITEBACK_TAG);
}
void
uvm_obj_page_set_writeback(struct vm_page *pg)
{
uvm_obj_page_set_tag(pg, UVM_PAGE_WRITEBACK_TAG);
}
void
uvm_obj_page_clear_writeback(struct vm_page *pg)
{
uvm_obj_page_clear_tag(pg, UVM_PAGE_WRITEBACK_TAG);
}
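/*
 * The wrappers above require the owning object's vmobjlock: at least
 * read-held for the *_p predicates and write-held for the set/clear
 * operations, as enforced by the KASSERTs in the tag helpers.  A
 * minimal sketch, for a page pg that belongs to uobj:
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	if (!uvm_obj_page_dirty_p(pg))
 *		uvm_obj_page_set_dirty(pg);
 *	rw_exit(uobj->vmobjlock);
 */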
#if defined(DDB) || defined(DEBUGPRINT)
/*
* uvm_object_printit: actually prints the object
*/
void
uvm_object_printit(struct uvm_object *uobj, bool full,
void (*pr)(const char *, ...))
{
struct uvm_page_array a;
struct vm_page *pg;
int cnt = 0;
voff_t off;
(*pr)("OBJECT %p: locked=%d, pgops=%p, npages=%d, ",
uobj, rw_write_held(uobj->vmobjlock), uobj->pgops, uobj->uo_npages);
if (UVM_OBJ_IS_KERN_OBJECT(uobj))
(*pr)("refs=<SYSTEM>\n");
else
(*pr)("refs=%d\n", uobj->uo_refs);
if (!full) {
return;
}
(*pr)(" PAGES <pg,offset>:\n ");
uvm_page_array_init(&a, uobj, 0);
off = 0;
while ((pg = uvm_page_array_fill_and_peek(&a, off, 0)) != NULL) {
cnt++;
(*pr)("<%p,0x%llx> ", pg, (long long)pg->offset);
if ((cnt % 3) == 0) {
(*pr)("\n ");
}
off = pg->offset + PAGE_SIZE;
uvm_page_array_advance(&a);
}
if ((cnt % 3) != 0) {
(*pr)("\n");
}
uvm_page_array_fini(&a);
}
#endif /* DDB || DEBUGPRINT */
/* $NetBSD: ufs_bswap.h,v 1.23 2018/04/19 21:50:10 christos Exp $ */
/*
* Copyright (c) 1998 Manuel Bouyer.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*/
#ifndef _UFS_UFS_BSWAP_H_
#define _UFS_UFS_BSWAP_H_
#if defined(_KERNEL_OPT)
#include "opt_ffs.h"
#endif
#include <sys/bswap.h>
/* Macros to access UFS flags */
#ifdef FFS_EI
#define UFS_MPNEEDSWAP(ump) ((ump)->um_flags & UFS_NEEDSWAP)
#define UFS_FSNEEDSWAP(fs) ((fs)->fs_flags & FS_SWAPPED)
#define UFS_IPNEEDSWAP(ip) UFS_MPNEEDSWAP((ip)->i_ump)
#else
#define UFS_MPNEEDSWAP(ump) ((void)(ump), 0)
#define UFS_FSNEEDSWAP(fs) ((void)(fs), 0)
#define UFS_IPNEEDSWAP(ip) ((void)(ip), 0)
#endif
#if (!defined(_KERNEL) && !defined(NO_FFS_EI)) || defined(FFS_EI)
/* inlines for access to swapped data */
static __inline u_int16_t
ufs_rw16(uint16_t a, int ns)
{
return ((ns) ? bswap16(a) : (a));
}
static __inline u_int32_t
ufs_rw32(uint32_t a, int ns)
{
return ((ns) ? bswap32(a) : (a));
}
static __inline u_int64_t
ufs_rw64(uint64_t a, int ns)
{
return ((ns) ? bswap64(a) : (a));
}
#else
static __inline u_int16_t
ufs_rw16(uint16_t a, int ns)
{
return a;
}
static __inline u_int32_t
ufs_rw32(uint32_t a, int ns)
{
return a;
}
static __inline u_int64_t
ufs_rw64(uint64_t a, int ns)
{
return a;
}
#endif
#define ufs_add16(a, b, ns) \
(a) = ufs_rw16(ufs_rw16((a), (ns)) + (b), (ns))
#define ufs_add32(a, b, ns) \
(a) = ufs_rw32(ufs_rw32((a), (ns)) + (b), (ns))
#define ufs_add64(a, b, ns) \
(a) = ufs_rw64(ufs_rw64((a), (ns)) + (b), (ns))
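/*
 * Example usage (a sketch; the fs and dp fields named here belong to
 * the caller): with ns = UFS_FSNEEDSWAP(fs), reading an on-disk 32-bit
 * field and bumping an on-disk 16-bit link count look like
 *
 *	nblocks = ufs_rw32(fs->fs_size, ns);
 *	ufs_add16(dp->di_nlink, 1, ns);
 *
 * where ufs_add16 expands to
 *
 *	dp->di_nlink = ufs_rw16(ufs_rw16(dp->di_nlink, ns) + 1, ns);
 *
 * i.e. swap to host byte order, add, and swap back to on-disk order.
 */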
#endif /* !_UFS_UFS_BSWAP_H_ */
/* $NetBSD: kern_entropy.c,v 1.66 2023/10/04 20:28:06 ad Exp $ */
/*-
* Copyright (c) 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Taylor R. Campbell.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Entropy subsystem
*
* * Each CPU maintains a per-CPU entropy pool so that gathering
* entropy requires no interprocessor synchronization, except
* early at boot when we may be scrambling to gather entropy as
* soon as possible.
*
* - entropy_enter gathers entropy and never drops it on the
* floor, at the cost of sometimes having to do cryptography.
*
* - entropy_enter_intr gathers entropy or drops it on the
* floor, with low latency. Work to stir the pool or kick the
* housekeeping thread is scheduled in soft interrupts.
*
* * entropy_enter immediately enters into the global pool if it
* can transition to full entropy in one swell foop. Otherwise,
* it defers to a housekeeping thread that consolidates entropy,
* but only when the CPUs collectively have full entropy, in
* order to mitigate iterative-guessing attacks.
*
* * The entropy housekeeping thread continues to consolidate
* entropy even after we think we have full entropy, in case we
* are wrong, but is limited to one discretionary consolidation
* per minute, and only when new entropy is actually coming in,
* to limit performance impact.
*
* * The entropy epoch is the number that changes when we
* transition from partial entropy to full entropy, so that
* users can easily determine when to reseed. This also
* facilitates an operator explicitly causing everything to
* reseed by sysctl -w kern.entropy.consolidate=1.
*
* * Entropy depletion is available for testing (or if you're into
* that sort of thing), with sysctl -w kern.entropy.depletion=1;
* the logic to support it is small, to minimize chance of bugs.
*
* * While cold, a single global entropy pool is available for
* entering and extracting, serialized through splhigh/splx.
* The per-CPU entropy pool data structures are initialized in
* entropy_init and entropy_init_late (separated mainly for
* hysterical raisins at this point), but are not used until the
* system is warm, at which point access to the global entropy
* pool is limited to thread and softint context and serialized
* by E->lock.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_entropy.c,v 1.66 2023/10/04 20:28:06 ad Exp $");
#include <sys/param.h>
#include <sys/types.h>
#include <sys/atomic.h>
#include <sys/compat_stub.h>
#include <sys/condvar.h>
#include <sys/cpu.h>
#include <sys/entropy.h>
#include <sys/errno.h>
#include <sys/evcnt.h>
#include <sys/event.h>
#include <sys/file.h>
#include <sys/intr.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/lwp.h>
#include <sys/module_hook.h>
#include <sys/mutex.h>
#include <sys/percpu.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/reboot.h>
#include <sys/rnd.h> /* legacy kernel API */
#include <sys/rndio.h> /* userland ioctl interface */
#include <sys/rndsource.h> /* kernel rndsource driver API */
#include <sys/select.h>
#include <sys/selinfo.h>
#include <sys/sha1.h> /* for boot seed checksum */
#include <sys/stdint.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/xcall.h>
#include <lib/libkern/entpool.h>
#include <machine/limits.h>
#ifdef __HAVE_CPU_COUNTER
#include <machine/cpu_counter.h>
#endif
#define MINENTROPYBYTES ENTROPY_CAPACITY
#define MINENTROPYBITS (MINENTROPYBYTES*NBBY)
#define MINSAMPLES (2*MINENTROPYBITS)
/*
* struct entropy_cpu
*
* Per-CPU entropy state. The pool is allocated separately
* because percpu(9) sometimes moves per-CPU objects around
* without zeroing them, which would lead to unwanted copies of
* sensitive secrets. The evcnt is allocated separately because
* evcnt(9) assumes it stays put in memory.
*/
struct entropy_cpu {
struct entropy_cpu_evcnt {
struct evcnt softint;
struct evcnt intrdrop;
struct evcnt intrtrunc;
} *ec_evcnt;
struct entpool *ec_pool;
unsigned ec_bitspending;
unsigned ec_samplespending;
bool ec_locked;
};
/*
* struct entropy_cpu_lock
*
* State for locking the per-CPU entropy state.
*/
struct entropy_cpu_lock {
int ecl_s;
long ecl_pctr;
};
/*
* struct rndsource_cpu
*
* Per-CPU rndsource state.
*/
struct rndsource_cpu {
unsigned rc_entropybits;
unsigned rc_timesamples;
unsigned rc_datasamples;
rnd_delta_t rc_timedelta;
};
/*
* entropy_global (a.k.a. E for short in this file)
*
* Global entropy state. Writes protected by the global lock.
* Some fields, marked (A), can be read outside the lock, and are
* maintained with atomic_load/store_relaxed.
*/
struct {
kmutex_t lock; /* covers all global state */
struct entpool pool; /* global pool for extraction */
unsigned bitsneeded; /* (A) needed globally */
unsigned bitspending; /* pending in per-CPU pools */
unsigned samplesneeded; /* (A) needed globally */
unsigned samplespending; /* pending in per-CPU pools */
unsigned timestamp; /* (A) time of last consolidation */
unsigned epoch; /* (A) changes when needed -> 0 */
kcondvar_t cv; /* notifies state changes */
struct selinfo selq; /* notifies needed -> 0 */
struct lwp *sourcelock; /* lock on list of sources */
kcondvar_t sourcelock_cv; /* notifies sourcelock release */
LIST_HEAD(,krndsource) sources; /* list of entropy sources */
bool consolidate; /* kick thread to consolidate */
bool seed_rndsource; /* true if seed source is attached */
bool seeded; /* true if seed file already loaded */
} entropy_global __cacheline_aligned = {
/* Fields that must be initialized when the kernel is loaded. */
.bitsneeded = MINENTROPYBITS,
.samplesneeded = MINSAMPLES,
.epoch = (unsigned)-1, /* -1 means entropy never consolidated */
.sources = LIST_HEAD_INITIALIZER(entropy_global.sources),
};
#define E (&entropy_global) /* declutter */
/* Read-mostly globals */
static struct percpu *entropy_percpu __read_mostly; /* struct entropy_cpu */
static void *entropy_sih __read_mostly; /* softint handler */
static struct lwp *entropy_lwp __read_mostly; /* housekeeping thread */
static struct krndsource seed_rndsource __read_mostly;
/*
* Event counters
*
* Must be careful with adding these because they can serve as
* side channels.
*/
static struct evcnt entropy_discretionary_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "discretionary");
EVCNT_ATTACH_STATIC(entropy_discretionary_evcnt);
static struct evcnt entropy_immediate_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "immediate");
EVCNT_ATTACH_STATIC(entropy_immediate_evcnt);
static struct evcnt entropy_partial_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "partial");
EVCNT_ATTACH_STATIC(entropy_partial_evcnt);
static struct evcnt entropy_consolidate_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "consolidate");
EVCNT_ATTACH_STATIC(entropy_consolidate_evcnt);
static struct evcnt entropy_extract_fail_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "extract fail");
EVCNT_ATTACH_STATIC(entropy_extract_fail_evcnt);
static struct evcnt entropy_request_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "request");
EVCNT_ATTACH_STATIC(entropy_request_evcnt);
static struct evcnt entropy_deplete_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "deplete");
EVCNT_ATTACH_STATIC(entropy_deplete_evcnt);
static struct evcnt entropy_notify_evcnt =
EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "entropy", "notify");
EVCNT_ATTACH_STATIC(entropy_notify_evcnt);
/* Sysctl knobs */
static bool entropy_collection = 1;
static bool entropy_depletion = 0; /* Silly! */
static const struct sysctlnode *entropy_sysctlroot;
static struct sysctllog *entropy_sysctllog;
/* Forward declarations */
static void entropy_init_cpu(void *, void *, struct cpu_info *);
static void entropy_fini_cpu(void *, void *, struct cpu_info *);
static void entropy_account_cpu(struct entropy_cpu *);
static void entropy_enter(const void *, size_t, unsigned, bool);
static bool entropy_enter_intr(const void *, size_t, unsigned, bool);
static void entropy_softintr(void *);
static void entropy_thread(void *);
static bool entropy_pending(void);
static void entropy_pending_cpu(void *, void *, struct cpu_info *);
static void entropy_do_consolidate(void);
static void entropy_consolidate_xc(void *, void *);
static void entropy_notify(void);
static int sysctl_entropy_consolidate(SYSCTLFN_ARGS);
static int sysctl_entropy_gather(SYSCTLFN_ARGS);
static void filt_entropy_read_detach(struct knote *);
static int filt_entropy_read_event(struct knote *, long);
static int entropy_request(size_t, int);
static void rnd_add_data_internal(struct krndsource *, const void *,
uint32_t, uint32_t, bool);
static void rnd_add_data_1(struct krndsource *, const void *, uint32_t,
uint32_t, bool, uint32_t, bool);
static unsigned rndsource_entropybits(struct krndsource *);
static void rndsource_entropybits_cpu(void *, void *, struct cpu_info *);
static void rndsource_to_user(struct krndsource *, rndsource_t *);
static void rndsource_to_user_est(struct krndsource *, rndsource_est_t *);
static void rndsource_to_user_est_cpu(void *, void *, struct cpu_info *);
/*
* entropy_timer()
*
* Cycle counter, time counter, or anything that changes a wee bit
* unpredictably.
*/
static inline uint32_t
entropy_timer(void)
{
struct bintime bt;
uint32_t v;
/* If we have a CPU cycle counter, use the low 32 bits. */
#ifdef __HAVE_CPU_COUNTER
if (__predict_true(cpu_hascounter()))
return cpu_counter32();
#endif /* __HAVE_CPU_COUNTER */
/* If we're cold, tough. Can't binuptime while cold. */
if (__predict_false(cold))
return 0;
/* Fold the 128 bits of binuptime into 32 bits. */
binuptime(&bt);
v = bt.frac;
v ^= bt.frac >> 32;
v ^= bt.sec;
v ^= bt.sec >> 32;
return v;
}
static void
attach_seed_rndsource(void)
{
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* First called no later than entropy_init, while we are still
* single-threaded, so no need for RUN_ONCE.
*/
if (E->seed_rndsource)
return;
rnd_attach_source(&seed_rndsource, "seed", RND_TYPE_UNKNOWN,
RND_FLAG_COLLECT_VALUE);
E->seed_rndsource = true;
}
/*
* entropy_init()
*
* Initialize the entropy subsystem. Panic on failure.
*
* Requires percpu(9) and sysctl(9) to be initialized. Must run
* while cold.
*/
static void
entropy_init(void)
{
uint32_t extra[2];
struct krndsource *rs;
unsigned i = 0;
KASSERT(cold);
/* Grab some cycle counts early at boot. */
extra[i++] = entropy_timer();
/* Run the entropy pool cryptography self-test. */
if (entpool_selftest() == -1)
panic("entropy pool crypto self-test failed");
/* Create the sysctl directory. */
sysctl_createv(&entropy_sysctllog, 0, NULL, &entropy_sysctlroot,
CTLFLAG_PERMANENT, CTLTYPE_NODE, "entropy",
SYSCTL_DESCR("Entropy (random number sources) options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
/* Create the sysctl knobs. */
/* XXX These shouldn't be writable at securelevel>0. */
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "collection",
SYSCTL_DESCR("Automatically collect entropy from hardware"),
NULL, 0, &entropy_collection, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_BOOL, "depletion",
SYSCTL_DESCR("`Deplete' entropy pool when observed"),
NULL, 0, &entropy_depletion, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "consolidate",
SYSCTL_DESCR("Trigger entropy consolidation now"),
sysctl_entropy_consolidate, 0, NULL, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT, "gather",
SYSCTL_DESCR("Trigger entropy gathering from sources now"),
sysctl_entropy_gather, 0, NULL, 0, CTL_CREATE, CTL_EOL);
/* XXX These should maybe not be readable at securelevel>0. */
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"needed",
SYSCTL_DESCR("Systemwide entropy deficit (bits of entropy)"),
NULL, 0, &E->bitsneeded, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"pending",
SYSCTL_DESCR("Number of bits of entropy pending on CPUs"),
NULL, 0, &E->bitspending, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"samplesneeded",
SYSCTL_DESCR("Systemwide entropy deficit (samples)"),
NULL, 0, &E->samplesneeded, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"samplespending",
SYSCTL_DESCR("Number of samples pending on CPUs"),
NULL, 0, &E->samplespending, 0, CTL_CREATE, CTL_EOL);
sysctl_createv(&entropy_sysctllog, 0, &entropy_sysctlroot, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY|CTLFLAG_PRIVATE, CTLTYPE_INT,
"epoch", SYSCTL_DESCR("Entropy epoch"),
NULL, 0, &E->epoch, 0, CTL_CREATE, CTL_EOL);
/* Initialize the global state for multithreaded operation. */
mutex_init(&E->lock, MUTEX_DEFAULT, IPL_SOFTSERIAL);
cv_init(&E->cv, "entropy");
selinit(&E->selq);
cv_init(&E->sourcelock_cv, "entsrclock");
/* Make sure the seed source is attached. */
attach_seed_rndsource();
/* Note if the bootloader didn't provide a seed. */
if (!E->seeded)
aprint_debug("entropy: no seed from bootloader\n");
/* Allocate the per-CPU records for all early entropy sources. */
LIST_FOREACH(rs, &E->sources, list)
rs->state = percpu_alloc(sizeof(struct rndsource_cpu));
/* Allocate and initialize the per-CPU state. */
entropy_percpu = percpu_create(sizeof(struct entropy_cpu),
entropy_init_cpu, entropy_fini_cpu, NULL);
/* Enter the boot cycle count to get started. */
extra[i++] = entropy_timer();
KASSERT(i == __arraycount(extra));
entropy_enter(extra, sizeof extra, /*nbits*/0, /*count*/false);
explicit_memset(extra, 0, sizeof extra);
}
/*
* entropy_init_late()
*
* Late initialization. Panic on failure.
*
* Requires CPUs to have been detected and LWPs to have started.
* Must run while cold.
*/
static void
entropy_init_late(void)
{
int error;
KASSERT(cold);
/*
* Establish the softint at the highest softint priority level.
* Must happen after CPU detection.
*/
entropy_sih = softint_establish(SOFTINT_SERIAL|SOFTINT_MPSAFE,
&entropy_softintr, NULL);
if (entropy_sih == NULL)
panic("unable to establish entropy softint");
/*
* Create the entropy housekeeping thread. Must happen after
* lwpinit.
*/
error = kthread_create(PRI_NONE, KTHREAD_MPSAFE|KTHREAD_TS, NULL,
entropy_thread, NULL, &entropy_lwp, "entbutler");
if (error)
panic("unable to create entropy housekeeping thread: %d",
error);
}
/*
* entropy_init_cpu(ptr, cookie, ci)
*
* percpu(9) constructor for per-CPU entropy pool.
*/
static void
entropy_init_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
const char *cpuname;
ec->ec_evcnt = kmem_alloc(sizeof(*ec->ec_evcnt), KM_SLEEP);
ec->ec_pool = kmem_zalloc(sizeof(*ec->ec_pool), KM_SLEEP);
ec->ec_bitspending = 0;
ec->ec_samplespending = 0;
ec->ec_locked = false;
/* XXX ci_cpuname may not be initialized early enough. */
cpuname = ci->ci_cpuname[0] == '\0' ? "cpu0" : ci->ci_cpuname;
evcnt_attach_dynamic(&ec->ec_evcnt->softint, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy softint");
evcnt_attach_dynamic(&ec->ec_evcnt->intrdrop, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy intrdrop");
evcnt_attach_dynamic(&ec->ec_evcnt->intrtrunc, EVCNT_TYPE_MISC, NULL,
cpuname, "entropy intrtrunc");
}
/*
* entropy_fini_cpu(ptr, cookie, ci)
*
* percpu(9) destructor for per-CPU entropy pool.
*/
static void
entropy_fini_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
/*
* Zero any lingering data. Disclosure of the per-CPU pool
* shouldn't retroactively affect the security of any keys
* generated, because entpool(9) erases whatever we have just
* drawn out of any pool, but better safe than sorry.
*/
explicit_memset(ec->ec_pool, 0, sizeof(*ec->ec_pool));
evcnt_detach(&ec->ec_evcnt->intrtrunc);
evcnt_detach(&ec->ec_evcnt->intrdrop);
evcnt_detach(&ec->ec_evcnt->softint);
kmem_free(ec->ec_pool, sizeof(*ec->ec_pool));
kmem_free(ec->ec_evcnt, sizeof(*ec->ec_evcnt));
}
/*
* ec = entropy_cpu_get(&lock)
* entropy_cpu_put(&lock, ec)
*
* Lock and unlock the per-CPU entropy state. This only prevents
* access on the same CPU -- by hard interrupts, by soft
* interrupts, or by other threads.
*
* Blocks soft interrupts and preemption altogether; doesn't block
* hard interrupts, but causes samples in hard interrupts to be
* dropped.
*/
static struct entropy_cpu *
entropy_cpu_get(struct entropy_cpu_lock *lock)
{
struct entropy_cpu *ec;
ec = percpu_getref(entropy_percpu);
lock->ecl_s = splsoftserial();
KASSERT(!ec->ec_locked);
ec->ec_locked = true;
lock->ecl_pctr = lwp_pctr();
__insn_barrier();
return ec;
}
static void
entropy_cpu_put(struct entropy_cpu_lock *lock, struct entropy_cpu *ec)
{
KASSERT(ec == percpu_getptr_remote(entropy_percpu, curcpu()));
KASSERT(ec->ec_locked);
__insn_barrier();
KASSERT(lock->ecl_pctr == lwp_pctr());
ec->ec_locked = false;
splx(lock->ecl_s);
percpu_putref(entropy_percpu);
}
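/*
 * Typical usage pattern, as in entropy_enter and entropy_softintr
 * below:
 *
 *	struct entropy_cpu_lock lock;
 *	struct entropy_cpu *ec;
 *
 *	ec = entropy_cpu_get(&lock);
 *	... use ec->ec_pool, ec->ec_bitspending, etc. ...
 *	entropy_cpu_put(&lock, ec);
 */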
/*
* entropy_seed(seed)
*
* Seed the entropy pool with seed. Meant to be called as early
* as possible by the bootloader; may be called before or after
* entropy_init. Must be called before system reaches userland.
* Must be called in thread or soft interrupt context, not in hard
* interrupt context. Must be called at most once.
*
* Overwrites the seed in place. Caller may then free the memory.
*/
static void
entropy_seed(rndsave_t *seed)
{
SHA1_CTX ctx;
uint8_t digest[SHA1_DIGEST_LENGTH];
bool seeded;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* Verify the checksum. If the checksum fails, take the data
* but ignore the entropy estimate -- the file may have been
* incompletely written with garbage, which is harmless to add
* but may not be as unpredictable as alleged.
*/
SHA1Init(&ctx);
SHA1Update(&ctx, (const void *)&seed->entropy, sizeof(seed->entropy));
SHA1Update(&ctx, seed->data, sizeof(seed->data));
SHA1Final(digest, &ctx);
CTASSERT(sizeof(seed->digest) == sizeof(digest));
if (!consttime_memequal(digest, seed->digest, sizeof(digest))) {
printf("entropy: invalid seed checksum\n");
seed->entropy = 0;
}
explicit_memset(&ctx, 0, sizeof ctx);
explicit_memset(digest, 0, sizeof digest);
/*
* If the entropy is insensibly large, try byte-swapping.
* Otherwise assume the file is corrupted and act as though it
* has zero entropy.
*/
if (howmany(seed->entropy, NBBY) > sizeof(seed->data)) {
seed->entropy = bswap32(seed->entropy);
if (howmany(seed->entropy, NBBY) > sizeof(seed->data))
seed->entropy = 0;
}
/* Make sure the seed source is attached. */
attach_seed_rndsource();
/* Test and set E->seeded. */
seeded = E->seeded;
E->seeded = (seed->entropy > 0);
/*
* If we've been seeded, may be re-entering the same seed
* (e.g., bootloader vs module init, or something). No harm in
* entering it twice, but it contributes no additional entropy.
*/
if (seeded) {
printf("entropy: double-seeded by bootloader\n");
seed->entropy = 0;
} else {
printf("entropy: entering seed from bootloader"
" with %u bits of entropy\n", (unsigned)seed->entropy);
}
/* Enter it into the pool and promptly zero it. */
rnd_add_data(&seed_rndsource, seed->data, sizeof(seed->data),
seed->entropy);
explicit_memset(seed, 0, sizeof(*seed));
}
/*
* entropy_bootrequest()
*
* Request entropy from all sources at boot, once config is
* complete and interrupts are running but we are still cold.
*/
void
entropy_bootrequest(void)
{
int error;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(cold);
/*
* Request enough to satisfy the maximum entropy shortage.
* This is harmless overkill if the bootloader provided a seed.
*/
error = entropy_request(MINENTROPYBYTES, ENTROPY_WAIT);
KASSERTMSG(error == 0, "error=%d", error);
}
/*
* entropy_epoch()
*
* Returns the current entropy epoch. If this changes, you should
* reseed. A value of -1 means system entropy has not yet reached full
* entropy or been explicitly consolidated; the epoch never reverts back
* to -1. It is never zero, so you can always use zero as an uninitialized
* sentinel value meaning `reseed ASAP'.
*
* Usage model:
*
* struct foo {
* struct crypto_prng prng;
* unsigned epoch;
* } *foo;
*
* unsigned epoch = entropy_epoch();
* if (__predict_false(epoch != foo->epoch)) {
* uint8_t seed[32];
* if (entropy_extract(seed, sizeof seed, 0) != 0)
* warn("no entropy");
* crypto_prng_reseed(&foo->prng, seed, sizeof seed);
* foo->epoch = epoch;
* }
*/
unsigned
entropy_epoch(void)
{
/*
* Unsigned int, so no need for seqlock for an atomic read, but
* make sure we read it afresh each time.
*/
return atomic_load_relaxed(&E->epoch);
}
/*
* entropy_ready()
*
* True if the entropy pool has full entropy.
*/
bool
entropy_ready(void)
{
return atomic_load_relaxed(&E->bitsneeded) == 0;
}
/*
* entropy_account_cpu(ec)
*
* Consider whether to consolidate entropy into the global pool
* after we just added some into the current CPU's pending pool.
*
* - If this CPU can provide enough entropy now, do so.
*
* - If this and whatever else is available on other CPUs can
* provide enough entropy, kick the consolidation thread.
*
* - Otherwise, do as little as possible, except maybe consolidate
* entropy at most once a minute.
*
* Caller must be bound to a CPU and therefore have exclusive
* access to ec. Will acquire and release the global lock.
*/
static void
entropy_account_cpu(struct entropy_cpu *ec)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec0;
unsigned bitsdiff, samplesdiff;
KASSERT(!cpu_intr_p());
KASSERT(!cold);
KASSERT(curlwp->l_pflag & LP_BOUND);
/*
* If there's no entropy needed, and entropy has been
* consolidated in the last minute, do nothing.
*/
if (__predict_true(atomic_load_relaxed(&E->bitsneeded) == 0) &&
__predict_true(!atomic_load_relaxed(&entropy_depletion)) &&
__predict_true((time_uptime - E->timestamp) <= 60))
return;
/*
* Consider consolidation, under the global lock and with the
* per-CPU state locked.
*/
mutex_enter(&E->lock);
ec0 = entropy_cpu_get(&lock);
KASSERT(ec0 == ec);
if (ec->ec_bitspending == 0 && ec->ec_samplespending == 0) {
/* Raced with consolidation xcall. Nothing to do. */
} else if (E->bitsneeded != 0 && E->bitsneeded <= ec->ec_bitspending) {
/*
* If we have not yet attained full entropy but we can
* now, do so. This way we disseminate entropy
* promptly when it becomes available early at boot;
* otherwise we leave it to the entropy consolidation
* thread, which is rate-limited to mitigate side
* channels and abuse.
*/
uint8_t buf[ENTPOOL_CAPACITY];
/* Transfer from the local pool to the global pool. */
entpool_extract(ec->ec_pool, buf, sizeof buf);
entpool_enter(&E->pool, buf, sizeof buf);
atomic_store_relaxed(&ec->ec_bitspending, 0);
atomic_store_relaxed(&ec->ec_samplespending, 0);
atomic_store_relaxed(&E->bitsneeded, 0);
atomic_store_relaxed(&E->samplesneeded, 0);
/* Notify waiters that we now have full entropy. */
entropy_notify();
entropy_immediate_evcnt.ev_count++;
} else {
/* Determine how much we can add to the global pool. */
KASSERTMSG(E->bitspending <= MINENTROPYBITS,
"E->bitspending=%u", E->bitspending);
bitsdiff = MIN(ec->ec_bitspending,
MINENTROPYBITS - E->bitspending);
KASSERTMSG(E->samplespending <= MINSAMPLES,
"E->samplespending=%u", E->samplespending);
samplesdiff = MIN(ec->ec_samplespending,
MINSAMPLES - E->samplespending);
/*
* This should make a difference unless we are already
* saturated.
*/
KASSERTMSG((bitsdiff || samplesdiff ||
E->bitspending == MINENTROPYBITS ||
E->samplespending == MINSAMPLES),
"bitsdiff=%u E->bitspending=%u ec->ec_bitspending=%u"
"samplesdiff=%u E->samplespending=%u"
" ec->ec_samplespending=%u"
" minentropybits=%u minsamples=%u",
bitsdiff, E->bitspending, ec->ec_bitspending,
samplesdiff, E->samplespending, ec->ec_samplespending,
(unsigned)MINENTROPYBITS, (unsigned)MINSAMPLES);
/* Add to the global, subtract from the local. */
E->bitspending += bitsdiff;
KASSERTMSG(E->bitspending <= MINENTROPYBITS,
"E->bitspending=%u", E->bitspending);
atomic_store_relaxed(&ec->ec_bitspending,
ec->ec_bitspending - bitsdiff);
E->samplespending += samplesdiff;
KASSERTMSG(E->samplespending <= MINSAMPLES,
"E->samplespending=%u", E->samplespending);
atomic_store_relaxed(&ec->ec_samplespending,
ec->ec_samplespending - samplesdiff);
/* One or the other must have gone up from zero. */
KASSERT(E->bitspending || E->samplespending);
if (E->bitsneeded <= E->bitspending ||
E->samplesneeded <= E->samplespending) {
/*
* Enough bits or at least samples between all
* the per-CPU pools. Leave a note for the
* housekeeping thread to consolidate entropy
* next time it wakes up -- and wake it up if
* this is the first time, to speed things up.
*
* If we don't need any entropy, this doesn't
* mean much, but it is the only time we ever
* gather additional entropy in case the
* accounting has been overly optimistic. This
* happens at most once a minute, so there's
* negligible performance cost.
*/
E->consolidate = true;
if (E->epoch == (unsigned)-1)
cv_broadcast(&E->cv);
if (E->bitsneeded == 0)
entropy_discretionary_evcnt.ev_count++;
} else {
/* Can't get full entropy. Keep gathering. */
entropy_partial_evcnt.ev_count++;
}
}
entropy_cpu_put(&lock, ec);
mutex_exit(&E->lock);
}
/*
* entropy_enter_early(buf, len, nbits)
*
* Do entropy bookkeeping globally, before we have established
* per-CPU pools. Enter directly into the global pool in the hope
* that we enter enough before the first entropy_extract to thwart
* iterative-guessing attacks; entropy_extract will warn if not.
*/
static void
entropy_enter_early(const void *buf, size_t len, unsigned nbits)
{
bool notify = false;
int s;
KASSERT(cold);
/*
* We're early at boot before multithreading and multi-CPU
* operation, and we don't have softints yet to defer
* processing from interrupt context, so we have to enter the
* samples directly into the global pool. But interrupts may
* be enabled, and we enter this path from interrupt context,
* so block interrupts until we're done.
*/
s = splhigh();
/* Enter it into the pool. */
entpool_enter(&E->pool, buf, len);
/*
* Decide whether to notify reseed -- we will do so if either:
* (a) we transition from partial entropy to full entropy, or
* (b) we get a batch of full entropy all at once.
* We don't count timing samples because we assume, while cold,
* there's not likely to be much jitter yet.
*/
notify |= (E->bitsneeded && E->bitsneeded <= nbits);
notify |= (nbits >= MINENTROPYBITS);
/*
* Subtract from the needed count and notify if appropriate.
* We don't count samples here because entropy_timer might
* still be returning zero at this point if there's no CPU
* cycle counter.
*/
E->bitsneeded -= MIN(E->bitsneeded, nbits);
if (notify) {
entropy_notify();
entropy_immediate_evcnt.ev_count++;
}
splx(s);
}
/*
* entropy_enter(buf, len, nbits, count)
*
* Enter len bytes of data from buf into the system's entropy
* pool, stirring as necessary when the internal buffer fills up.
* nbits is a lower bound on the number of bits of entropy in the
* process that led to this sample.
*/
static void
entropy_enter(const void *buf, size_t len, unsigned nbits, bool count)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
unsigned bitspending, samplespending;
int bound;
KASSERTMSG(!cpu_intr_p(),
"use entropy_enter_intr from interrupt context");
KASSERTMSG(howmany(nbits, NBBY) <= len,
"impossible entropy rate: %u bits in %zu-byte string", nbits, len);
/*
* If we're still cold, just use entropy_enter_early to put
* samples directly into the global pool.
*/
if (__predict_false(cold)) {
entropy_enter_early(buf, len, nbits);
return;
}
/*
* Bind ourselves to the current CPU so we don't switch CPUs
* between entering data into the current CPU's pool (and
* updating the pending count) and transferring it to the
* global pool in entropy_account_cpu.
*/
bound = curlwp_bind();
/*
* With the per-CPU state locked, enter into the per-CPU pool
* and count up what we can add.
*
* We don't count samples while cold because entropy_timer
* might still be returning zero if there's no CPU cycle
* counter.
*/
ec = entropy_cpu_get(&lock);
entpool_enter(ec->ec_pool, buf, len);
bitspending = ec->ec_bitspending;
bitspending += MIN(MINENTROPYBITS - bitspending, nbits);
atomic_store_relaxed(&ec->ec_bitspending, bitspending);
samplespending = ec->ec_samplespending;
if (__predict_true(count)) {
samplespending += MIN(MINSAMPLES - samplespending, 1);
atomic_store_relaxed(&ec->ec_samplespending, samplespending);
}
entropy_cpu_put(&lock, ec);
/* Consolidate globally if appropriate based on what we added. */
if (bitspending > 0 || samplespending >= MINSAMPLES)
entropy_account_cpu(ec);
curlwp_bindx(bound);
}
/*
* entropy_enter_intr(buf, len, nbits, count)
*
* Enter up to len bytes of data from buf into the system's
* entropy pool without stirring. nbits is a lower bound on the
* number of bits of entropy in the process that led to this
* sample. If the sample could be entered completely, assume
* nbits of entropy pending; otherwise assume none, since we don't
* know whether some parts of the sample are constant, for
* instance. Schedule a softint to stir the entropy pool if
* needed. Return true if used fully, false if truncated at all.
*
* Using this in thread or softint context with no spin locks held
* will work, but you might as well use entropy_enter in that
* case.
*/
static bool
entropy_enter_intr(const void *buf, size_t len, unsigned nbits, bool count)
{
struct entropy_cpu *ec;
bool fullyused = false;
uint32_t bitspending, samplespending;
int s;
KASSERTMSG(howmany(nbits, NBBY) <= len,
"impossible entropy rate: %u bits in %zu-byte string", nbits, len);
/*
* If we're still cold, just use entropy_enter_early to put
* samples directly into the global pool.
*/
if (__predict_false(cold)) {
entropy_enter_early(buf, len, nbits);
return true;
}
/*
* In case we were called in thread or interrupt context with
* interrupts unblocked, block soft interrupts up to
* IPL_SOFTSERIAL. This way logic that is safe in interrupt
* context or under a spin lock is also safe in less
* restrictive contexts.
*/
s = splsoftserial();
/*
* Acquire the per-CPU state. If someone is in the middle of
* using it, drop the sample. Otherwise, take the lock so that
* higher-priority interrupts will drop their samples.
*/
ec = percpu_getref(entropy_percpu);
if (ec->ec_locked) {
ec->ec_evcnt->intrdrop.ev_count++;
goto out0;
}
ec->ec_locked = true;
__insn_barrier();
/*
* Enter as much as we can into the per-CPU pool. If it was
* truncated, schedule a softint to stir the pool and stop.
*/
if (!entpool_enter_nostir(ec->ec_pool, buf, len)) {
		if (__predict_true(!cold))
			softint_schedule(entropy_sih);
ec->ec_evcnt->intrtrunc.ev_count++;
goto out1;
}
fullyused = true;
/*
* Count up what we can contribute.
*
* We don't count samples while cold because entropy_timer
* might still be returning zero if there's no CPU cycle
* counter.
*/
bitspending = ec->ec_bitspending;
bitspending += MIN(MINENTROPYBITS - bitspending, nbits);
	atomic_store_relaxed(&ec->ec_bitspending, bitspending);
	if (__predict_true(count)) {
samplespending = ec->ec_samplespending;
samplespending += MIN(MINSAMPLES - samplespending, 1);
atomic_store_relaxed(&ec->ec_samplespending, samplespending);
}
/* Schedule a softint if we added anything and it matters. */
	if (__predict_false(atomic_load_relaxed(&E->bitsneeded) ||
		atomic_load_relaxed(&entropy_depletion)) &&
	    (nbits != 0 || count) &&
__predict_true(!cold))
softint_schedule(entropy_sih);
out1: /* Release the per-CPU state. */
KASSERT(ec->ec_locked);
__insn_barrier();
ec->ec_locked = false;
out0: percpu_putref(entropy_percpu);
splx(s);
return fullyused;
}
/*
* entropy_softintr(cookie)
*
* Soft interrupt handler for entering entropy. Takes care of
* stirring the local CPU's entropy pool if it filled up during
* hard interrupts, and promptly crediting entropy from the local
* CPU's entropy pool to the global entropy pool if needed.
*/
static void
entropy_softintr(void *cookie)
{
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
unsigned bitspending, samplespending;
/*
* With the per-CPU state locked, stir the pool if necessary
* and determine if there's any pending entropy on this CPU to
* account globally.
*/
ec = entropy_cpu_get(&lock);
ec->ec_evcnt->softint.ev_count++;
entpool_stir(ec->ec_pool);
bitspending = ec->ec_bitspending;
samplespending = ec->ec_samplespending;
entropy_cpu_put(&lock, ec);
/* Consolidate globally if appropriate based on what we added. */
if (bitspending > 0 || samplespending >= MINSAMPLES)
entropy_account_cpu(ec);
}
/*
* entropy_thread(cookie)
*
* Handle any asynchronous entropy housekeeping.
*/
static void
entropy_thread(void *cookie)
{
bool consolidate;
#ifndef _RUMPKERNEL /* XXX rump starts threads before cold */
KASSERT(!cold);
#endif
for (;;) {
/*
* Wait until there's full entropy somewhere among the
* CPUs, as confirmed at most once per minute, or
* someone wants to consolidate.
*/
if (entropy_pending()) {
consolidate = true;
} else {
mutex_enter(&E->lock);
if (!E->consolidate)
cv_timedwait(&E->cv, &E->lock, 60*hz);
consolidate = E->consolidate;
E->consolidate = false;
mutex_exit(&E->lock);
}
if (consolidate) {
/* Do it. */
entropy_do_consolidate();
/* Mitigate abuse. */
kpause("entropy", false, hz, NULL);
}
}
}
struct entropy_pending_count {
uint32_t bitspending;
uint32_t samplespending;
};
/*
* entropy_pending()
*
* True if enough bits or samples are pending on other CPUs to
* warrant consolidation.
*/
static bool
entropy_pending(void)
{
struct entropy_pending_count count = { 0, 0 }, *C = &count;
percpu_foreach(entropy_percpu, &entropy_pending_cpu, C);
return C->bitspending >= MINENTROPYBITS ||
C->samplespending >= MINSAMPLES;
}
static void
entropy_pending_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct entropy_cpu *ec = ptr;
struct entropy_pending_count *C = cookie;
uint32_t cpu_bitspending;
uint32_t cpu_samplespending;
cpu_bitspending = atomic_load_relaxed(&ec->ec_bitspending);
cpu_samplespending = atomic_load_relaxed(&ec->ec_samplespending);
C->bitspending += MIN(MINENTROPYBITS - C->bitspending,
cpu_bitspending);
C->samplespending += MIN(MINSAMPLES - C->samplespending,
cpu_samplespending);
}
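/*
 * Worked example of the clamped accumulation above (numbers are
 * illustrative only; MINENTROPYBITS is taken to be 256 here): if
 * C->bitspending is already 200 and a CPU reports 100 pending bits,
 * MIN(256 - 200, 100) = 56 is added, so the running total saturates
 * at 256 instead of overshooting the threshold test in
 * entropy_pending().
 */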
/*
* entropy_do_consolidate()
*
* Issue a cross-call to gather entropy on all CPUs and advance
* the entropy epoch.
*/
static void
entropy_do_consolidate(void)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
struct entpool pool;
uint8_t buf[ENTPOOL_CAPACITY];
unsigned bitsdiff, samplesdiff;
uint64_t ticket;
KASSERT(!cold);
ASSERT_SLEEPABLE();
/* Gather entropy on all CPUs into a temporary pool. */
memset(&pool, 0, sizeof pool);
ticket = xc_broadcast(0, &entropy_consolidate_xc, &pool, NULL);
xc_wait(ticket);
/* Acquire the lock to notify waiters. */
mutex_enter(&E->lock);
/* Count another consolidation. */
entropy_consolidate_evcnt.ev_count++;
/* Note when we last consolidated, i.e. now. */
E->timestamp = time_uptime;
/* Mix what we gathered into the global pool. */
entpool_extract(&pool, buf, sizeof buf);
entpool_enter(&E->pool, buf, sizeof buf);
explicit_memset(&pool, 0, sizeof pool);
/* Count the entropy that was gathered. */
bitsdiff = MIN(E->bitsneeded, E->bitspending);
atomic_store_relaxed(&E->bitsneeded, E->bitsneeded - bitsdiff);
E->bitspending -= bitsdiff;
if (__predict_false(E->bitsneeded > 0) && bitsdiff != 0) {
if ((boothowto & AB_DEBUG) != 0 &&
ratecheck(&lasttime, &interval)) {
printf("WARNING:"
" consolidating less than full entropy\n");
}
}
samplesdiff = MIN(E->samplesneeded, E->samplespending);
atomic_store_relaxed(&E->samplesneeded,
E->samplesneeded - samplesdiff);
E->samplespending -= samplesdiff;
/* Advance the epoch and notify waiters. */
entropy_notify();
/* Release the lock. */
mutex_exit(&E->lock);
}
/*
* entropy_consolidate_xc(vpool, arg2)
*
* Extract output from the local CPU's input pool and enter it
* into a temporary pool passed as vpool.
*/
static void
entropy_consolidate_xc(void *vpool, void *arg2 __unused)
{
struct entpool *pool = vpool;
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
uint8_t buf[ENTPOOL_CAPACITY];
uint32_t extra[7];
unsigned i = 0;
/* Grab CPU number and cycle counter to mix extra into the pool. */
extra[i++] = cpu_number();
extra[i++] = entropy_timer();
/*
* With the per-CPU state locked, extract from the per-CPU pool
* and count it as no longer pending.
*/
ec = entropy_cpu_get(&lock);
extra[i++] = entropy_timer();
entpool_extract(ec->ec_pool, buf, sizeof buf);
atomic_store_relaxed(&ec->ec_bitspending, 0);
atomic_store_relaxed(&ec->ec_samplespending, 0);
extra[i++] = entropy_timer();
entropy_cpu_put(&lock, ec);
extra[i++] = entropy_timer();
/*
* Copy over statistics, and enter the per-CPU extract and the
* extra timing into the temporary pool, under the global lock.
*/
mutex_enter(&E->lock);
extra[i++] = entropy_timer();
entpool_enter(pool, buf, sizeof buf);
explicit_memset(buf, 0, sizeof buf);
extra[i++] = entropy_timer();
KASSERT(i == __arraycount(extra));
entpool_enter(pool, extra, sizeof extra);
explicit_memset(extra, 0, sizeof extra);
mutex_exit(&E->lock);
}
/*
* entropy_notify()
*
* Caller just contributed entropy to the global pool. Advance
* the entropy epoch and notify waiters.
*
* Caller must hold the global entropy lock.
*/
static void
entropy_notify(void)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
static bool ready = false, besteffort = false;
unsigned epoch;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
/*
* If this is the first time, print a message to the console
* that we're ready so operators can compare it to the timing
* of other events.
*
* If we didn't get full entropy from reliable sources, report
* instead that we are running on fumes with best effort. (If
* we ever do get full entropy after that, print the ready
* message once.)
*/
if (__predict_false(!ready)) {
if (E->bitsneeded == 0) {
printf("entropy: ready\n");
ready = true;
} else if (E->samplesneeded == 0 && !besteffort) {
printf("entropy: best effort\n");
besteffort = true;
}
}
/* Set the epoch; roll over from UINTMAX-1 to 1. */
if (__predict_true(!atomic_load_relaxed(&entropy_depletion)) ||
ratecheck(&lasttime, &interval)) {
epoch = E->epoch + 1;
if (epoch == 0 || epoch == (unsigned)-1)
epoch = 1;
atomic_store_relaxed(&E->epoch, epoch);
}
KASSERT(E->epoch != (unsigned)-1);
/* Notify waiters. */
if (__predict_true(!cold)) {
cv_broadcast(&E->cv);
selnotify(&E->selq, POLLIN|POLLRDNORM, NOTE_SUBMIT);
}
/* Count another notification. */
entropy_notify_evcnt.ev_count++;
}
/*
* entropy_consolidate()
*
* Trigger entropy consolidation and wait for it to complete.
*
* This should be used sparingly, not periodically -- requiring
* conscious intervention by the operator or a clear policy
* decision. Otherwise, the kernel will automatically consolidate
* when enough entropy has been gathered into per-CPU pools to
* transition to full entropy.
*/
void
entropy_consolidate(void)
{
uint64_t ticket;
int error;
KASSERT(!cold);
ASSERT_SLEEPABLE();
mutex_enter(&E->lock);
ticket = entropy_consolidate_evcnt.ev_count;
E->consolidate = true;
cv_broadcast(&E->cv);
while (ticket == entropy_consolidate_evcnt.ev_count) {
error = cv_wait_sig(&E->cv, &E->lock);
if (error)
break;
}
mutex_exit(&E->lock);
}
/*
* sysctl -w kern.entropy.consolidate=1
*
* Trigger entropy consolidation and wait for it to complete.
* Writable only by superuser. This, writing to /dev/random, and
* ioctl(RNDADDDATA) are the only ways for the system to
* consolidate entropy if the operator knows something the kernel
* doesn't about how unpredictable the pending entropy pools are.
*/
static int
sysctl_entropy_consolidate(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg)
entropy_consolidate();
return error;
}
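/*
 * Illustrative userland sketch, not part of this file: triggering the
 * same consolidation programmatically rather than with sysctl(8).
 * Only the standard sysctlbyname(3) interface is assumed.
 *
 *	#include <sys/sysctl.h>
 *	#include <err.h>
 *
 *	static void
 *	force_consolidation(void)
 *	{
 *		int one = 1;
 *
 *		if (sysctlbyname("kern.entropy.consolidate", NULL, NULL,
 *		    &one, sizeof one) == -1)
 *			err(1, "sysctlbyname");
 *	}
 */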
/*
* sysctl -w kern.entropy.gather=1
*
* Trigger gathering entropy from all on-demand sources, and wait
* for synchronous sources (but not asynchronous sources) to
* complete. Writable only by superuser.
*/
static int
sysctl_entropy_gather(SYSCTLFN_ARGS)
{
struct sysctlnode node = *rnode;
int arg = 0;
int error;
node.sysctl_data = &arg;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
if (arg) {
mutex_enter(&E->lock);
error = entropy_request(ENTROPY_CAPACITY,
ENTROPY_WAIT|ENTROPY_SIG);
mutex_exit(&E->lock);
}
return 0;
}
/*
* entropy_extract(buf, len, flags)
*
* Extract len bytes from the global entropy pool into buf.
*
* Caller MUST NOT expose these bytes directly -- must use them
* ONLY to seed a cryptographic pseudorandom number generator
* (`CPRNG'), a.k.a. deterministic random bit generator (`DRBG'),
* and then erase them. entropy_extract does not, on its own,
* provide backtracking resistance -- it must be combined with a
* PRNG/DRBG that does.
*
* This may be used very early at boot, before even entropy_init
* has been called.
*
* You generally shouldn't use this directly -- use cprng(9)
* instead.
*
* Flags may have:
*
* ENTROPY_WAIT Wait for entropy if not available yet.
* ENTROPY_SIG Allow interruption by a signal during wait.
* ENTROPY_HARDFAIL Either fill the buffer with full entropy,
* or fail without filling it at all.
*
* Return zero on success, or error on failure:
*
* EWOULDBLOCK No entropy and ENTROPY_WAIT not set.
* EINTR/ERESTART No entropy, ENTROPY_SIG set, and interrupted.
*
* If ENTROPY_WAIT is set, allowed only in thread context. If
* ENTROPY_WAIT is not set, allowed also in softint context -- may
* sleep on an adaptive lock up to IPL_SOFTSERIAL. Forbidden in
* hard interrupt context.
*/
int
entropy_extract(void *buf, size_t len, int flags)
{
static const struct timeval interval = {.tv_sec = 60, .tv_usec = 0};
static struct timeval lasttime; /* serialized by E->lock */
bool printed = false;
int s = -1/*XXXGCC*/, error;
if (ISSET(flags, ENTROPY_WAIT)) {
ASSERT_SLEEPABLE();
KASSERT(!cold);
}
/* Refuse to operate in interrupt context. */
KASSERT(!cpu_intr_p());
/*
* If we're cold, we are only contending with interrupts on the
* current CPU, so block them. Otherwise, we are _not_
* contending with interrupts on the current CPU, but we are
* contending with other threads, to exclude them with a mutex.
*/
if (__predict_false(cold))
s = splhigh();
else
mutex_enter(&E->lock);
/* Wait until there is enough entropy in the system. */
error = 0;
if (E->bitsneeded > 0 && E->samplesneeded == 0) {
/*
* We don't have full entropy from reliable sources,
* but we gathered a plausible number of samples from
* other sources such as timers. Try asking for more
* from any sources we can, but don't worry if it
* fails -- best effort.
*/
		(void)entropy_request(ENTROPY_CAPACITY, flags);
	} else while (E->bitsneeded > 0 && E->samplesneeded > 0) {
/* Ask for more, synchronously if possible. */
error = entropy_request(len, flags);
if (error)
break;
/* If we got enough, we're done. */
if (E->bitsneeded == 0 || E->samplesneeded == 0) {
KASSERT(error == 0);
break;
}
/* If not waiting, stop here. */
if (!ISSET(flags, ENTROPY_WAIT)) {
error = EWOULDBLOCK;
break;
}
/* Wait for some entropy to come in and try again. */
		KASSERT(!cold);
		if (!printed) {
printf("entropy: pid %d (%s) waiting for entropy(7)\n",
curproc->p_pid, curproc->p_comm);
printed = true;
}
if (ISSET(flags, ENTROPY_SIG)) {
error = cv_timedwait_sig(&E->cv, &E->lock, hz);
if (error && error != EWOULDBLOCK)
break;
} else {
cv_timedwait(&E->cv, &E->lock, hz);
}
}
/*
* Count failure -- but fill the buffer nevertheless, unless
* the caller specified ENTROPY_HARDFAIL.
*/
	if (error) {
		if (ISSET(flags, ENTROPY_HARDFAIL))
goto out;
entropy_extract_fail_evcnt.ev_count++;
}
/*
* Report a warning if we haven't yet reached full entropy.
* This is the only case where we consider entropy to be
* `depleted' without kern.entropy.depletion enabled -- when we
* only have partial entropy, an adversary may be able to
* narrow the state of the pool down to a small number of
* possibilities; the output then enables them to confirm a
* guess, reducing its entropy from the adversary's perspective
* to zero.
*
* This should only happen if the operator has chosen to
* consolidate, either through sysctl kern.entropy.consolidate
* or by writing less than full entropy to /dev/random as root
* (which /dev/random promises will immediately affect
* subsequent output, for better or worse).
*/
	if (E->bitsneeded > 0 && E->samplesneeded > 0) {
		if (__predict_false(E->epoch == (unsigned)-1) &&
ratecheck(&lasttime, &interval)) {
printf("WARNING:"
" system needs entropy for security;"
" see entropy(7)\n");
}
atomic_store_relaxed(&E->bitsneeded, MINENTROPYBITS);
atomic_store_relaxed(&E->samplesneeded, MINSAMPLES);
}
/* Extract data from the pool, and `deplete' if we're doing that. */
entpool_extract(&E->pool, buf, len);
if (__predict_false(atomic_load_relaxed(&entropy_depletion)) &&
error == 0) {
unsigned cost = MIN(len, ENTROPY_CAPACITY)*NBBY;
unsigned bitsneeded = E->bitsneeded;
unsigned samplesneeded = E->samplesneeded;
bitsneeded += MIN(MINENTROPYBITS - bitsneeded, cost);
samplesneeded += MIN(MINSAMPLES - samplesneeded, cost);
atomic_store_relaxed(&E->bitsneeded, bitsneeded);
atomic_store_relaxed(&E->samplesneeded, samplesneeded);
entropy_deplete_evcnt.ev_count++;
}
out: /* Release the global lock and return the error. */
if (__predict_false(cold))
splx(s);
else
mutex_exit(&E->lock);
return error;
}
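/*
 * Minimal sketch of the kind of in-kernel caller described above --
 * the sort of thing cprng(9) does internally.  The drbg and
 * drbg_reseed() names here are hypothetical stand-ins; the point is
 * that the output is used only as a seed and then erased.
 *
 *	uint8_t seed[ENTROPY_CAPACITY];
 *	int error;
 *
 *	error = entropy_extract(seed, sizeof seed,
 *	    ENTROPY_WAIT|ENTROPY_SIG);
 *	if (error)
 *		return error;
 *	drbg_reseed(drbg, seed, sizeof seed);
 *	explicit_memset(seed, 0, sizeof seed);
 */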
/*
* entropy_poll(events)
*
* Return the subset of events ready, and if it is not all of
* events, record curlwp as waiting for entropy.
*/
int
entropy_poll(int events)
{
int revents = 0;
KASSERT(!cold);
/* Always ready for writing. */
revents |= events & (POLLOUT|POLLWRNORM);
/* Narrow it down to reads. */
events &= POLLIN|POLLRDNORM;
if (events == 0)
return revents;
/*
* If we have reached full entropy and we're not depleting
* entropy, we are forever ready.
*/
	if (__predict_true(atomic_load_relaxed(&E->bitsneeded) == 0 ||
		atomic_load_relaxed(&E->samplesneeded) == 0) &&
__predict_true(!atomic_load_relaxed(&entropy_depletion)))
return revents | events;
/*
* Otherwise, check whether we need entropy under the lock. If
* we don't, we're ready; if we do, add ourselves to the queue.
*/
mutex_enter(&E->lock);
if (E->bitsneeded == 0 || E->samplesneeded == 0)
revents |= events;
else
selrecord(curlwp, &E->selq);
mutex_exit(&E->lock);
return revents;
}
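/*
 * Illustrative userland counterpart, not part of this file: waiting
 * for the pool to become readable via poll(2) on /dev/random.
 *
 *	#include <poll.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/random", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	int n = poll(&pfd, 1, INFTIM);
 *
 * When poll returns with POLLIN set, the system has reached full
 * entropy (or the best-effort sample target), so reads will not
 * block.
 */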
/*
* filt_entropy_read_detach(kn)
*
* struct filterops::f_detach callback for entropy read events:
* remove kn from the list of waiters.
*/
static void
filt_entropy_read_detach(struct knote *kn)
{
KASSERT(!cold);
mutex_enter(&E->lock);
selremove_knote(&E->selq, kn);
mutex_exit(&E->lock);
}
/*
* filt_entropy_read_event(kn, hint)
*
* struct filterops::f_event callback for entropy read events:
* poll for entropy. Caller must hold the global entropy lock if
* hint is NOTE_SUBMIT, and must not if hint is not NOTE_SUBMIT.
*/
static int
filt_entropy_read_event(struct knote *kn, long hint)
{
int ret;
KASSERT(!cold);
/* Acquire the lock, if caller is outside entropy subsystem. */
if (hint == NOTE_SUBMIT)
KASSERT(mutex_owned(&E->lock));
else
mutex_enter(&E->lock);
/*
* If we still need entropy, can't read anything; if not, can
* read arbitrarily much.
*/
if (E->bitsneeded != 0 && E->samplesneeded != 0) {
ret = 0;
} else {
if (atomic_load_relaxed(&entropy_depletion))
kn->kn_data = ENTROPY_CAPACITY; /* bytes */
else
kn->kn_data = MIN(INT64_MAX, SSIZE_MAX);
ret = 1;
}
/* Release the lock, if caller is outside entropy subsystem. */
if (hint == NOTE_SUBMIT)
KASSERT(mutex_owned(&E->lock));
else
mutex_exit(&E->lock);
return ret;
}
/* XXX Makes sense only for /dev/u?random. */
static const struct filterops entropy_read_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_entropy_read_detach,
.f_event = filt_entropy_read_event,
};
/*
* entropy_kqfilter(kn)
*
* Register kn to receive entropy event notifications. May be
* EVFILT_READ or EVFILT_WRITE; anything else yields EINVAL.
*/
int
entropy_kqfilter(struct knote *kn)
{
KASSERT(!cold);
switch (kn->kn_filter) {
case EVFILT_READ:
/* Enter into the global select queue. */
mutex_enter(&E->lock);
kn->kn_fop = &entropy_read_filtops;
selrecord_knote(&E->selq, kn);
mutex_exit(&E->lock);
return 0;
case EVFILT_WRITE:
/* Can always dump entropy into the system. */
kn->kn_fop = &seltrue_filtops;
return 0;
default:
return EINVAL;
}
}
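/*
 * Illustrative userland sketch, not part of this file: the kqueue(2)
 * analogue of the poll example above, registering EVFILT_READ on a
 * /dev/random descriptor and waiting for it to fire.
 *
 *	#include <sys/event.h>
 *	#include <fcntl.h>
 *
 *	int fd = open("/dev/random", O_RDONLY);
 *	int kq = kqueue();
 *	struct kevent ev;
 *
 *	EV_SET(&ev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *	kevent(kq, NULL, 0, &ev, 1, NULL);
 */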
/*
* rndsource_setcb(rs, get, getarg)
*
* Set the request callback for the entropy source rs, if it can
* provide entropy on demand. Must precede rnd_attach_source.
*/
void
rndsource_setcb(struct krndsource *rs, void (*get)(size_t, void *),
void *getarg)
{
rs->get = get;
rs->getarg = getarg;
}
/*
* rnd_attach_source(rs, name, type, flags)
*
* Attach the entropy source rs. Must be done after
* rndsource_setcb, if any, and before any calls to rnd_add_data.
*/
void
rnd_attach_source(struct krndsource *rs, const char *name, uint32_t type,
uint32_t flags)
{
uint32_t extra[4];
unsigned i = 0;
KASSERTMSG(name[0] != '\0', "rndsource must have nonempty name");
/* Grab cycle counter to mix extra into the pool. */
extra[i++] = entropy_timer();
/*
* Apply some standard flags:
*
* - We do not bother with network devices by default, for
* hysterical raisins (perhaps: because it is often the case
* that an adversary can influence network packet timings).
*/
switch (type) {
case RND_TYPE_NET:
flags |= RND_FLAG_NO_COLLECT;
break;
}
/* Sanity-check the callback if RND_FLAG_HASCB is set. */
KASSERT(!ISSET(flags, RND_FLAG_HASCB) || rs->get != NULL);
/* Initialize the random source. */
memset(rs->name, 0, sizeof(rs->name)); /* paranoia */
strlcpy(rs->name, name, sizeof(rs->name));
memset(&rs->time_delta, 0, sizeof(rs->time_delta));
memset(&rs->value_delta, 0, sizeof(rs->value_delta));
rs->total = 0;
rs->type = type;
rs->flags = flags;
if (entropy_percpu != NULL)
rs->state = percpu_alloc(sizeof(struct rndsource_cpu));
extra[i++] = entropy_timer();
/* Wire it into the global list of random sources. */
if (__predict_true(!cold))
mutex_enter(&E->lock);
LIST_INSERT_HEAD(&E->sources, rs, list);
if (__predict_true(!cold))
mutex_exit(&E->lock);
extra[i++] = entropy_timer();
/* Request that it provide entropy ASAP, if we can. */
if (ISSET(flags, RND_FLAG_HASCB))
(*rs->get)(ENTROPY_CAPACITY, rs->getarg);
extra[i++] = entropy_timer();
/* Mix the extra into the pool. */
KASSERT(i == __arraycount(extra));
entropy_enter(extra, sizeof extra, 0, /*count*/__predict_true(!cold));
explicit_memset(extra, 0, sizeof extra);
}
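/*
 * Illustrative driver sketch, not part of this file: the xyz(4)
 * device, its softc layout, and xyz_read_rng() are hypothetical.  A
 * hardware RNG driver sets the on-demand callback first and then
 * attaches the source, per the ordering requirement above.
 *
 *	static void
 *	xyz_rng_get(size_t nbytes, void *arg)
 *	{
 *		struct xyz_softc *sc = arg;
 *		uint32_t v;
 *
 *		while (nbytes) {
 *			v = xyz_read_rng(sc);
 *			rnd_add_data_sync(&sc->sc_rndsource, &v, sizeof v,
 *			    sizeof(v)*NBBY);
 *			nbytes -= MIN(nbytes, sizeof v);
 *		}
 *	}
 *
 *	void
 *	xyz_attach_rng(struct xyz_softc *sc)
 *	{
 *		rndsource_setcb(&sc->sc_rndsource, xyz_rng_get, sc);
 *		rnd_attach_source(&sc->sc_rndsource,
 *		    device_xname(sc->sc_dev), RND_TYPE_RNG,
 *		    RND_FLAG_COLLECT_VALUE|RND_FLAG_HASCB);
 *	}
 */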
/*
* rnd_detach_source(rs)
*
* Detach the entropy source rs. May sleep waiting for users to
* drain. Further use is not allowed.
*/
void
rnd_detach_source(struct krndsource *rs)
{
/*
* If we're cold (shouldn't happen, but hey), just remove it
* from the list -- there's nothing allocated.
*/
if (__predict_false(cold) && entropy_percpu == NULL) {
LIST_REMOVE(rs, list);
return;
}
/* We may have to wait for entropy_request. */
ASSERT_SLEEPABLE();
/* Wait until the source list is not in use, and remove it. */
mutex_enter(&E->lock);
while (E->sourcelock)
cv_wait(&E->sourcelock_cv, &E->lock);
LIST_REMOVE(rs, list);
mutex_exit(&E->lock);
/* Free the per-CPU data. */
percpu_free(rs->state, sizeof(struct rndsource_cpu));
}
/*
* rnd_lock_sources(flags)
*
* Lock the list of entropy sources. Caller must hold the global
* entropy lock. If successful, no rndsource will go away until
* rnd_unlock_sources even while the caller releases the global
* entropy lock.
*
* May be called very early at boot, before entropy_init.
*
* If flags & ENTROPY_WAIT, wait for concurrent access to finish.
* If flags & ENTROPY_SIG, allow interruption by signal.
*/
static int __attribute__((warn_unused_result))
rnd_lock_sources(int flags)
{
int error;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
while (E->sourcelock) {
KASSERT(!cold);
if (!ISSET(flags, ENTROPY_WAIT))
return EWOULDBLOCK;
if (ISSET(flags, ENTROPY_SIG)) {
error = cv_wait_sig(&E->sourcelock_cv, &E->lock);
if (error)
return error;
} else {
cv_wait(&E->sourcelock_cv, &E->lock);
}
}
E->sourcelock = curlwp;
return 0;
}
/*
* rnd_unlock_sources()
*
* Unlock the list of sources after rnd_lock_sources. Caller must
* hold the global entropy lock.
*
* May be called very early at boot, before entropy_init.
*/
static void
rnd_unlock_sources(void)
{
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
KASSERTMSG(E->sourcelock == curlwp, "lwp %p releasing lock held by %p",
curlwp, E->sourcelock);
E->sourcelock = NULL;
if (__predict_true(!cold))
cv_signal(&E->sourcelock_cv);
}
/*
* rnd_sources_locked()
*
* True if we hold the list of rndsources locked, for diagnostic
* assertions.
*
* May be called very early at boot, before entropy_init.
*/
static bool __diagused
rnd_sources_locked(void)
{
return E->sourcelock == curlwp;
}
/*
* entropy_request(nbytes, flags)
*
* Request nbytes bytes of entropy from all sources in the system.
* OK if we overdo it. Caller must hold the global entropy lock;
* will release and re-acquire it.
*
* May be called very early at boot, before entropy_init.
*
* If flags & ENTROPY_WAIT, wait for concurrent access to finish.
* If flags & ENTROPY_SIG, allow interruption by signal.
*/
static int
entropy_request(size_t nbytes, int flags)
{
struct krndsource *rs;
int error;
KASSERT(__predict_false(cold) || mutex_owned(&E->lock));
KASSERT(!cpu_intr_p());
if ((flags & ENTROPY_WAIT) != 0 && __predict_false(!cold))
ASSERT_SLEEPABLE();
/*
* Lock the list of entropy sources to block rnd_detach_source
* until we're done, and to serialize calls to the entropy
* callbacks as guaranteed to drivers.
*/
error = rnd_lock_sources(flags);
if (error)
return error;
entropy_request_evcnt.ev_count++;
/* Clamp to the maximum reasonable request. */
nbytes = MIN(nbytes, ENTROPY_CAPACITY);
/* Walk the list of sources. */
LIST_FOREACH(rs, &E->sources, list) {
/* Skip sources without callbacks. */
if (!ISSET(rs->flags, RND_FLAG_HASCB))
continue;
/*
* Skip sources that are disabled altogether -- we
* would just ignore their samples anyway.
*/
if (ISSET(rs->flags, RND_FLAG_NO_COLLECT))
continue;
/* Drop the lock while we call the callback. */
if (__predict_true(!cold))
mutex_exit(&E->lock);
(*rs->get)(nbytes, rs->getarg);
if (__predict_true(!cold))
mutex_enter(&E->lock);
}
/* Request done; unlock the list of entropy sources. */
rnd_unlock_sources();
return 0;
}
static inline uint32_t
rnd_delta_estimate(rnd_delta_t *d, uint32_t v, int32_t delta)
{
int32_t delta2, delta3;
/*
* Calculate the second and third order differentials
*/
delta2 = d->dx - delta;
if (delta2 < 0)
delta2 = -delta2; /* XXX arithmetic overflow */
delta3 = d->d2x - delta2;
if (delta3 < 0)
delta3 = -delta3; /* XXX arithmetic overflow */
d->x = v;
d->dx = delta;
d->d2x = delta2;
/*
* If any delta is 0, we got no entropy. If all are non-zero, we
* might have something.
*/
if (delta == 0 || delta2 == 0 || delta3 == 0)
return 0;
return 1;
}
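/*
 * Worked example (illustrative numbers only): suppose the previous
 * state is d->x = 100, d->dx = 10, d->d2x = 3, and a new sample
 * arrives with v = 112, so delta = 12.  Then delta2 = |10 - 12| = 2
 * and delta3 = |3 - 2| = 1; all three differentials are nonzero, so
 * the sample is counted (return 1).  If instead the timer were
 * perfectly periodic, say v = 110 with delta = 10, then delta2 = 0
 * and the sample earns no credit (return 0).
 */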
static inline uint32_t
rnd_dt_estimate(struct krndsource *rs, uint32_t t)
{
int32_t delta;
uint32_t ret;
rnd_delta_t *d;
struct rndsource_cpu *rc;
rc = percpu_getref(rs->state);
d = &rc->rc_timedelta;
if (t < d->x) {
delta = UINT32_MAX - d->x + t;
} else {
delta = d->x - t;
}
if (delta < 0) {
delta = -delta; /* XXX arithmetic overflow */
}
ret = rnd_delta_estimate(d, t, delta);
KASSERT(d->x == t);
KASSERT(d->dx == delta);
percpu_putref(rs->state);
return ret;
}
/*
* rnd_add_uint32(rs, value)
*
* Enter 32 bits of data from an entropy source into the pool.
*
* May be called from any context or with spin locks held, but may
* drop data.
*
* This is meant for cheaply taking samples from devices that
* aren't designed to be hardware random number generators.
*/
void
rnd_add_uint32(struct krndsource *rs, uint32_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
void
_rnd_add_uint32(struct krndsource *rs, uint32_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
void
_rnd_add_uint64(struct krndsource *rs, uint64_t value)
{
bool intr_p = true;
rnd_add_data_internal(rs, &value, sizeof value, 0, intr_p);
}
/*
* rnd_add_data(rs, buf, len, entropybits)
*
* Enter data from an entropy source into the pool, with a
* driver's estimate of how much entropy the physical source of
* the data has. If RND_FLAG_NO_ESTIMATE, we ignore the driver's
* estimate and treat it as zero.
*
* rs MAY but SHOULD NOT be NULL. If rs is NULL, MUST NOT be
* called from interrupt context or with spin locks held.
*
* If rs is non-NULL, MAY but SHOULD NOT be called from interrupt
* context, in which case act like rnd_add_data_intr -- if the
* sample buffer is full, schedule a softint and drop any
* additional data on the floor. (This may change later once we
* fix drivers that still call this from interrupt context to use
* rnd_add_data_intr instead.) MUST NOT be called with spin locks
* held if not in hard interrupt context -- i.e., MUST NOT be
* called in thread context or softint context with spin locks
* held.
*/
void
rnd_add_data(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
bool intr_p = cpu_intr_p(); /* XXX make this unconditionally false */
/*
* Weird legacy exception that we should rip out and replace by
* creating new rndsources to attribute entropy to the callers:
* If there's no rndsource, just enter the data and time now.
*/
if (rs == NULL) {
uint32_t extra;
		KASSERT(!intr_p);
		KASSERTMSG(howmany(entropybits, NBBY) <= len,
"%s: impossible entropy rate:"
" %"PRIu32" bits in %"PRIu32"-byte string",
rs ? rs->name : "(anonymous)", entropybits, len);
entropy_enter(buf, len, entropybits, /*count*/false);
extra = entropy_timer();
entropy_enter(&extra, sizeof extra, 0, /*count*/false);
explicit_memset(&extra, 0, sizeof extra);
return;
}
rnd_add_data_internal(rs, buf, len, entropybits, intr_p);
}
/*
* rnd_add_data_intr(rs, buf, len, entropybits)
*
* Try to enter data from an entropy source into the pool, with a
* driver's estimate of how much entropy the physical source of
* the data has. If RND_FLAG_NO_ESTIMATE, we ignore the driver's
* estimate and treat it as zero. If the sample buffer is full,
* schedule a softint and drop any additional data on the floor.
*/
void
rnd_add_data_intr(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
bool intr_p = true;
rnd_add_data_internal(rs, buf, len, entropybits, intr_p);
}
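/*
 * Illustrative sketch of the intended interrupt-path usage; the
 * xyz(4) names are hypothetical.  The handler enters a status value
 * with no entropy claim (entropybits = 0) and lets the collection and
 * estimation flags on the rndsource decide what, if anything, gets
 * credited.
 *
 *	static int
 *	xyz_intr(void *arg)
 *	{
 *		struct xyz_softc *sc = arg;
 *		uint32_t status = xyz_read_status(sc);
 *
 *		rnd_add_data_intr(&sc->sc_rndsource, &status,
 *		    sizeof status, 0);
 *		return 1;
 *	}
 */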
/*
* rnd_add_data_internal(rs, buf, len, entropybits, intr_p)
*
* Internal subroutine to decide whether or not to enter data or
* timing for a particular rndsource, and if so, to enter it.
*
* intr_p is true for callers from interrupt context or spin locks
* held, and false for callers from thread or soft interrupt
* context and no spin locks held.
*/
static void
rnd_add_data_internal(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits, bool intr_p)
{
uint32_t flags;
KASSERTMSG(howmany(entropybits, NBBY) <= len,
"%s: impossible entropy rate:"
" %"PRIu32" bits in %"PRIu32"-byte string",
rs ? rs->name : "(anonymous)", entropybits, len);
/*
* Hold up the reset xcall before it zeroes the entropy counts
* on this CPU or globally. Otherwise, we might leave some
* nonzero entropy attributed to an untrusted source in the
* event of a race with a change to flags.
*/
kpreempt_disable();
/* Load a snapshot of the flags. Ioctl may change them under us. */
flags = atomic_load_relaxed(&rs->flags);
/*
* Skip if:
* - we're not collecting entropy, or
* - the operator doesn't want to collect entropy from this, or
* - neither data nor timings are being collected from this.
*/
if (!atomic_load_relaxed(&entropy_collection) ||
ISSET(flags, RND_FLAG_NO_COLLECT) ||
!ISSET(flags, RND_FLAG_COLLECT_VALUE|RND_FLAG_COLLECT_TIME))
goto out;
/* If asked, ignore the estimate. */
if (ISSET(flags, RND_FLAG_NO_ESTIMATE))
entropybits = 0;
/* If we are collecting data, enter them. */
if (ISSET(flags, RND_FLAG_COLLECT_VALUE)) {
rnd_add_data_1(rs, buf, len, entropybits, /*count*/false,
RND_FLAG_COLLECT_VALUE, intr_p);
}
/* If we are collecting timings, enter one. */
if (ISSET(flags, RND_FLAG_COLLECT_TIME)) {
uint32_t extra;
bool count;
/* Sample a timer. */
extra = entropy_timer();
/* If asked, do entropy estimation on the time. */
if ((flags & (RND_FLAG_ESTIMATE_TIME|RND_FLAG_NO_ESTIMATE)) ==
		    RND_FLAG_ESTIMATE_TIME && __predict_true(!cold))
			count = rnd_dt_estimate(rs, extra);
else
count = false;
rnd_add_data_1(rs, &extra, sizeof extra, 0, count,
RND_FLAG_COLLECT_TIME, intr_p);
}
out: /* Allow concurrent changes to flags to finish. */
kpreempt_enable();
}
static unsigned
add_sat(unsigned a, unsigned b)
{
unsigned c = a + b;
return (c < a ? UINT_MAX : c);
}
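/*
 * For example, add_sat(UINT_MAX - 1, 5) returns UINT_MAX: the sum
 * wraps around, c < a detects the overflow, and the result clamps at
 * UINT_MAX.
 */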
/*
* rnd_add_data_1(rs, buf, len, entropybits, count, flag)
*
* Internal subroutine to call either entropy_enter_intr, if we're
* in interrupt context, or entropy_enter if not, and to count the
* entropy in an rndsource.
*/
static void
rnd_add_data_1(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits, bool count, uint32_t flag, bool intr_p)
{
bool fullyused;
/*
* For the interrupt-like path, use entropy_enter_intr and take
* note of whether it consumed the full sample; otherwise, use
* entropy_enter, which always consumes the full sample.
*/
if (intr_p) {
fullyused = entropy_enter_intr(buf, len, entropybits, count);
} else {
entropy_enter(buf, len, entropybits, count);
fullyused = true;
}
/*
* If we used the full sample, note how many bits were
* contributed from this source.
*/
if (fullyused) {
if (__predict_false(cold)) {
const int s = splhigh();
rs->total = add_sat(rs->total, entropybits);
switch (flag) {
case RND_FLAG_COLLECT_TIME:
rs->time_delta.insamples =
add_sat(rs->time_delta.insamples, 1);
break;
case RND_FLAG_COLLECT_VALUE:
rs->value_delta.insamples =
add_sat(rs->value_delta.insamples, 1);
break;
}
splx(s);
} else {
struct rndsource_cpu *rc = percpu_getref(rs->state);
atomic_store_relaxed(&rc->rc_entropybits,
add_sat(rc->rc_entropybits, entropybits));
switch (flag) {
case RND_FLAG_COLLECT_TIME:
atomic_store_relaxed(&rc->rc_timesamples,
add_sat(rc->rc_timesamples, 1));
break;
case RND_FLAG_COLLECT_VALUE:
atomic_store_relaxed(&rc->rc_datasamples,
add_sat(rc->rc_datasamples, 1));
break;
}
percpu_putref(rs->state);
}
}
}
/*
* rnd_add_data_sync(rs, buf, len, entropybits)
*
* Same as rnd_add_data. Originally used in rndsource callbacks,
* to break an unnecessary cycle; no longer really needed.
*/
void
rnd_add_data_sync(struct krndsource *rs, const void *buf, uint32_t len,
uint32_t entropybits)
{
rnd_add_data(rs, buf, len, entropybits);
}
/*
* rndsource_entropybits(rs)
*
* Return approximately the number of bits of entropy that have
* been contributed via rs so far. Approximate if other CPUs may
* be calling rnd_add_data concurrently.
*/
static unsigned
rndsource_entropybits(struct krndsource *rs)
{
unsigned nbits = rs->total;
KASSERT(!cold);
KASSERT(rnd_sources_locked());
percpu_foreach(rs->state, rndsource_entropybits_cpu, &nbits);
return nbits;
}
static void
rndsource_entropybits_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct rndsource_cpu *rc = ptr;
unsigned *nbitsp = cookie;
unsigned cpu_nbits;
cpu_nbits = atomic_load_relaxed(&rc->rc_entropybits);
*nbitsp += MIN(UINT_MAX - *nbitsp, cpu_nbits);
}
/*
* rndsource_to_user(rs, urs)
*
* Copy a description of rs out to urs for userland.
*/
static void
rndsource_to_user(struct krndsource *rs, rndsource_t *urs)
{
KASSERT(!cold);
KASSERT(rnd_sources_locked());
/* Avoid kernel memory disclosure. */
memset(urs, 0, sizeof(*urs));
CTASSERT(sizeof(urs->name) == sizeof(rs->name));
strlcpy(urs->name, rs->name, sizeof(urs->name));
urs->total = rndsource_entropybits(rs);
urs->type = rs->type;
urs->flags = atomic_load_relaxed(&rs->flags);
}
/*
* rndsource_to_user_est(rs, urse)
*
* Copy a description of rs and estimation statistics out to urse
* for userland.
*/
static void
rndsource_to_user_est(struct krndsource *rs, rndsource_est_t *urse)
{
KASSERT(!cold);
KASSERT(rnd_sources_locked());
/* Avoid kernel memory disclosure. */
memset(urse, 0, sizeof(*urse));
/* Copy out the rndsource description. */
rndsource_to_user(rs, &urse->rt);
/* Gather the statistics. */
urse->dt_samples = rs->time_delta.insamples;
urse->dt_total = 0;
urse->dv_samples = rs->value_delta.insamples;
urse->dv_total = urse->rt.total;
percpu_foreach(rs->state, rndsource_to_user_est_cpu, urse);
}
static void
rndsource_to_user_est_cpu(void *ptr, void *cookie, struct cpu_info *ci)
{
struct rndsource_cpu *rc = ptr;
rndsource_est_t *urse = cookie;
urse->dt_samples = add_sat(urse->dt_samples,
atomic_load_relaxed(&rc->rc_timesamples));
urse->dv_samples = add_sat(urse->dv_samples,
atomic_load_relaxed(&rc->rc_datasamples));
}
/*
* entropy_reset_xc(arg1, arg2)
*
* Reset the current CPU's pending entropy to zero.
*/
static void
entropy_reset_xc(void *arg1 __unused, void *arg2 __unused)
{
uint32_t extra = entropy_timer();
struct entropy_cpu_lock lock;
struct entropy_cpu *ec;
/*
* With the per-CPU state locked, zero the pending count and
* enter a cycle count for fun.
*/
ec = entropy_cpu_get(&lock);
ec->ec_bitspending = 0;
ec->ec_samplespending = 0;
entpool_enter(ec->ec_pool, &extra, sizeof extra);
entropy_cpu_put(&lock, ec);
}
/*
* entropy_ioctl(cmd, data)
*
* Handle various /dev/random ioctl queries.
*/
int
entropy_ioctl(unsigned long cmd, void *data)
{
struct krndsource *rs;
bool privileged;
int error;
KASSERT(!cold);
/* Verify user's authorization to perform the ioctl. */
switch (cmd) {
case RNDGETENTCNT:
case RNDGETPOOLSTAT:
case RNDGETSRCNUM:
case RNDGETSRCNAME:
case RNDGETESTNUM:
case RNDGETESTNAME:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_GETPRIV, NULL, NULL, NULL, NULL);
break;
case RNDCTL:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_SETPRIV, NULL, NULL, NULL, NULL);
break;
case RNDADDDATA:
error = kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_ADDDATA, NULL, NULL, NULL, NULL);
/* Ascertain whether the user's inputs should be counted. */
if (kauth_authorize_device(kauth_cred_get(),
KAUTH_DEVICE_RND_ADDDATA_ESTIMATE,
NULL, NULL, NULL, NULL) == 0)
privileged = true;
break;
default: {
/*
* XXX Hack to avoid changing module ABI so this can be
* pulled up. Later, we can just remove the argument.
*/
static const struct fileops fops = {
.fo_ioctl = rnd_system_ioctl,
};
struct file f = {
.f_ops = &fops,
};
MODULE_HOOK_CALL(rnd_ioctl_50_hook, (&f, cmd, data),
enosys(), error);
#if defined(_LP64)
if (error == ENOSYS)
MODULE_HOOK_CALL(rnd_ioctl32_50_hook, (&f, cmd, data),
enosys(), error);
#endif
if (error == ENOSYS)
error = ENOTTY;
break;
}
}
/* If anything went wrong with authorization, stop here. */
if (error)
return error;
/* Dispatch on the command. */
switch (cmd) {
case RNDGETENTCNT: { /* Get current entropy count in bits. */
uint32_t *countp = data;
mutex_enter(&E->lock);
*countp = MINENTROPYBITS - E->bitsneeded;
mutex_exit(&E->lock);
break;
}
case RNDGETPOOLSTAT: { /* Get entropy pool statistics. */
rndpoolstat_t *pstat = data;
mutex_enter(&E->lock);
/* parameters */
pstat->poolsize = ENTPOOL_SIZE/sizeof(uint32_t); /* words */
pstat->threshold = MINENTROPYBITS/NBBY; /* bytes */
pstat->maxentropy = ENTROPY_CAPACITY*NBBY; /* bits */
/* state */
pstat->added = 0; /* XXX total entropy_enter count */
pstat->curentropy = MINENTROPYBITS - E->bitsneeded; /* bits */
pstat->removed = 0; /* XXX total entropy_extract count */
pstat->discarded = 0; /* XXX bits of entropy beyond capacity */
/*
* This used to be bits of data fabricated in some
* sense; we'll take it to mean number of samples,
* excluding the bits of entropy from HWRNG or seed.
*/
pstat->generated = MINSAMPLES - E->samplesneeded;
pstat->generated -= MIN(pstat->generated, pstat->curentropy);
mutex_exit(&E->lock);
break;
}
case RNDGETSRCNUM: { /* Get entropy sources by number. */
rndstat_t *stat = data;
uint32_t start = 0, i = 0;
/* Skip if none requested; fail if too many requested. */
if (stat->count == 0)
break;
if (stat->count > RND_MAXSTATCOUNT)
return EINVAL;
/*
* Under the lock, find the first one, copy out as many
* as requested, and report how many we copied out.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (start++ == stat->start)
break;
}
while (i < stat->count && rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user(rs, &stat->source[i++]);
mutex_enter(&E->lock);
rs = LIST_NEXT(rs, list);
}
KASSERT(i <= stat->count);
stat->count = i;
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETESTNUM: { /* Get sources and estimates by number. */
rndstat_est_t *estat = data;
uint32_t start = 0, i = 0;
/* Skip if none requested; fail if too many requested. */
if (estat->count == 0)
break;
if (estat->count > RND_MAXSTATCOUNT)
return EINVAL;
/*
* Under the lock, find the first one, copy out as many
* as requested, and report how many we copied out.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (start++ == estat->start)
break;
}
while (i < estat->count && rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user_est(rs, &estat->source[i++]);
mutex_enter(&E->lock);
rs = LIST_NEXT(rs, list);
}
KASSERT(i <= estat->count);
estat->count = i;
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETSRCNAME: { /* Get entropy sources by name. */
rndstat_name_t *nstat = data;
const size_t n = sizeof(rs->name);
CTASSERT(sizeof(rs->name) == sizeof(nstat->name));
/*
* Under the lock, search by name. If found, copy it
* out; if not found, fail with ENOENT.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (strncmp(rs->name, nstat->name, n) == 0)
break;
}
if (rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user(rs, &nstat->source);
mutex_enter(&E->lock);
} else {
error = ENOENT;
}
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDGETESTNAME: { /* Get sources and estimates by name. */
rndstat_est_name_t *enstat = data;
const size_t n = sizeof(rs->name);
CTASSERT(sizeof(rs->name) == sizeof(enstat->name));
/*
* Under the lock, search by name. If found, copy it
* out; if not found, fail with ENOENT.
*/
mutex_enter(&E->lock);
error = rnd_lock_sources(ENTROPY_WAIT|ENTROPY_SIG);
if (error) {
mutex_exit(&E->lock);
return error;
}
LIST_FOREACH(rs, &E->sources, list) {
if (strncmp(rs->name, enstat->name, n) == 0)
break;
}
if (rs != NULL) {
mutex_exit(&E->lock);
rndsource_to_user_est(rs, &enstat->source);
mutex_enter(&E->lock);
} else {
error = ENOENT;
}
rnd_unlock_sources();
mutex_exit(&E->lock);
break;
}
case RNDCTL: { /* Modify entropy source flags. */
rndctl_t *rndctl = data;
const size_t n = sizeof(rs->name);
uint32_t resetflags = RND_FLAG_NO_ESTIMATE|RND_FLAG_NO_COLLECT;
uint32_t flags;
bool reset = false, request = false;
CTASSERT(sizeof(rs->name) == sizeof(rndctl->name));
		/* Whitelist the flags that the user can change. */
rndctl->mask &= RND_FLAG_NO_ESTIMATE|RND_FLAG_NO_COLLECT;
/*
* For each matching rndsource, either by type if
* specified or by name if not, set the masked flags.
*/
mutex_enter(&E->lock);
LIST_FOREACH(rs, &E->sources, list) {
if (rndctl->type != 0xff) {
if (rs->type != rndctl->type)
continue;
			} else if (rndctl->name[0] != '\0') {
				if (strncmp(rs->name, rndctl->name, n) != 0)
continue;
}
flags = rs->flags & ~rndctl->mask;
flags |= rndctl->flags & rndctl->mask;
if ((rs->flags & resetflags) == 0 &&
(flags & resetflags) != 0)
reset = true;
if ((rs->flags ^ flags) & resetflags)
request = true;
atomic_store_relaxed(&rs->flags, flags);
}
mutex_exit(&E->lock);
/*
* If we disabled estimation or collection, nix all the
* pending entropy and set needed to the maximum.
*/
		if (reset) {
			xc_broadcast(0, &entropy_reset_xc, NULL, NULL);
mutex_enter(&E->lock);
E->bitspending = 0;
E->samplespending = 0;
atomic_store_relaxed(&E->bitsneeded, MINENTROPYBITS);
atomic_store_relaxed(&E->samplesneeded, MINSAMPLES);
E->consolidate = false;
mutex_exit(&E->lock);
}
/*
* If we changed any of the estimation or collection
* flags, request new samples from everyone -- either
* to make up for what we just lost, or to get new
* samples from what we just added.
*
* Failing on signal, while waiting for another process
* to finish requesting entropy, is OK here even though
* we have committed side effects, because this ioctl
* command is idempotent, so repeating it is safe.
*/
		if (request) {
			mutex_enter(&E->lock);
error = entropy_request(ENTROPY_CAPACITY,
ENTROPY_WAIT|ENTROPY_SIG);
mutex_exit(&E->lock);
}
break;
}
case RNDADDDATA: { /* Enter seed into entropy pool. */
rnddata_t *rdata = data;
unsigned entropybits = 0;
if (!atomic_load_relaxed(&entropy_collection))
break; /* thanks but no thanks */
if (rdata->len > MIN(sizeof(rdata->data), UINT32_MAX/NBBY))
return EINVAL;
/*
		 * This ioctl serves as the userland alternative to a
* bootloader-provided seed -- typically furnished by
* /etc/rc.d/random_seed. We accept the user's entropy
* claim only if
*
* (a) the user is privileged, and
* (b) we have not entered a bootloader seed.
*
* under the assumption that the user may use this to
* load a seed from disk that we have already loaded
* from the bootloader, so we don't double-count it.
*/
if (privileged && rdata->entropy && rdata->len) {
mutex_enter(&E->lock);
			if (!E->seeded) {
				entropybits = MIN(rdata->entropy,
MIN(rdata->len, ENTROPY_CAPACITY)*NBBY);
E->seeded = true;
}
mutex_exit(&E->lock);
}
/* Enter the data and consolidate entropy. */
rnd_add_data(&seed_rndsource, rdata->data, rdata->len,
entropybits);
entropy_consolidate();
break;
}
default:
error = ENOTTY;
}
/* Return any error that may have come up. */
return error;
}
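/*
 * Illustrative userland sketch, not part of this file, assuming the
 * usual ioctl definitions from <sys/rndio.h>: querying the current
 * entropy count through /dev/urandom.
 *
 *	#include <sys/ioctl.h>
 *	#include <sys/rndio.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <stdint.h>
 *
 *	int fd = open("/dev/urandom", O_RDONLY);
 *	uint32_t bits;
 *
 *	if (fd != -1 && ioctl(fd, RNDGETENTCNT, &bits) == 0)
 *		printf("%u bits of entropy\n", (unsigned)bits);
 */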
/* Legacy entry points */
void
rnd_seed(void *seed, size_t len)
{
if (len != sizeof(rndsave_t)) {
printf("entropy: invalid seed length: %zu,"
" expected sizeof(rndsave_t) = %zu\n",
len, sizeof(rndsave_t));
return;
}
entropy_seed(seed);
}
void
rnd_init(void)
{
entropy_init();
}
void
rnd_init_softint(void)
{
entropy_init_late();
entropy_bootrequest();
}
int
rnd_system_ioctl(struct file *fp, unsigned long cmd, void *data)
{
return entropy_ioctl(cmd, data);
}
/* $NetBSD: clockctl.c,v 1.39 2022/03/28 12:33:20 riastradh Exp $ */
/*-
* Copyright (c) 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Emmanuel Dreyfus.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: clockctl.c,v 1.39 2022/03/28 12:33:20 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ntp.h"
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/device.h>
#include <sys/time.h>
#include <sys/conf.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/compat_stub.h>
#include <sys/clockctl.h>
#include <compat/sys/clockctl.h>
#include <compat/sys/time_types.h>
kmutex_t clockctl_mtx;
int clockctl_refcnt;
#include "ioconf.h"
dev_type_ioctl(clockctlioctl);
const struct cdevsw clockctl_cdevsw = {
.d_open = clockctlopen,
.d_close = clockctlclose,
.d_read = noread,
.d_write = nowrite,
.d_ioctl = clockctlioctl,
.d_stop = nostop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER,
};
static kauth_listener_t clockctl_listener;
static int
clockctl_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
enum kauth_system_req req;
bool device_context;
result = KAUTH_RESULT_DEFER;
req = (enum kauth_system_req)(uintptr_t)arg0;
if ((action != KAUTH_SYSTEM_TIME) ||
(req != KAUTH_REQ_SYSTEM_TIME_SYSTEM))
return result;
device_context = arg3 != NULL;
/* Device is controlled by permissions, so allow. */
if (device_context)
result = KAUTH_RESULT_ALLOW;
return result;
}
/*ARGSUSED*/
void
clockctlattach(int num)
{
/*
* Don't initialize the listener here - it will get handled as part
* of module initialization.
*/
#if 0
clockctl_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
clockctl_listener_cb, NULL);
#endif
}
/*
* Maintain a refcount for each open/close, so we know when it is
* safe to call devsw_detach()
*/
int
clockctlopen(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_enter(&clockctl_mtx);
clockctl_refcnt++;
mutex_exit(&clockctl_mtx);
return 0;
}
int
clockctlclose(dev_t dev, int flag, int mode, struct lwp *l)
{
mutex_enter(&clockctl_mtx);
clockctl_refcnt--;
mutex_exit(&clockctl_mtx);
return 0;
}
MODULE(MODULE_CLASS_DRIVER, clockctl, NULL);
int
clockctl_modcmd(modcmd_t cmd, void *data)
{
int error;
#ifdef _MODULE
int bmajor, cmajor;
#endif
error = 0;
switch (cmd) {
case MODULE_CMD_INIT:
mutex_init(&clockctl_mtx, MUTEX_DEFAULT, IPL_NONE);
clockctl_listener = kauth_listen_scope(KAUTH_SCOPE_SYSTEM,
clockctl_listener_cb, NULL);
#ifdef _MODULE
bmajor = cmajor = -1;
error = devsw_attach("clockctl", NULL, &bmajor,
&clockctl_cdevsw, &cmajor);
if (error != 0)
kauth_unlisten_scope(clockctl_listener);
#endif
break;
case MODULE_CMD_FINI:
mutex_enter(&clockctl_mtx);
if (clockctl_refcnt != 0) {
mutex_exit(&clockctl_mtx);
return EBUSY;
}
#ifdef _MODULE
devsw_detach(NULL, &clockctl_cdevsw);
#endif
mutex_exit(&clockctl_mtx);
kauth_unlisten_scope(clockctl_listener);
mutex_destroy(&clockctl_mtx);
break;
default:
error = ENOTTY;
break;
}
return error;
}
int
clockctlioctl(
dev_t dev,
u_long cmd,
void *data,
int flags,
struct lwp *l)
{
int error = 0;
switch (cmd) {
case CLOCKCTL_SETTIMEOFDAY: {
struct clockctl_settimeofday *args = data;
error = settimeofday1(args->tv, true, args->tzp, l, false);
break;
}
case CLOCKCTL_ADJTIME: {
struct timeval atv, oldatv;
struct clockctl_adjtime *args = data;
if (args->delta) {
error = copyin(args->delta, &atv, sizeof(atv));
if (error)
return (error);
}
adjtime1(args->delta ? &atv : NULL,
args->olddelta ? &oldatv : NULL, l->l_proc);
		if (args->olddelta)
			error = copyout(&oldatv, args->olddelta,
			    sizeof(oldatv));
break;
}
case CLOCKCTL_CLOCK_SETTIME: {
struct clockctl_clock_settime *args = data;
struct timespec ts;
error = copyin(args->tp, &ts, sizeof ts);
if (error)
return (error);
error = clock_settime1(l->l_proc, args->clock_id, &ts, false);
break;
}
case CLOCKCTL_NTP_ADJTIME: {
struct clockctl_ntp_adjtime *args = data;
struct timex ntv;
if (vec_ntp_timestatus == NULL) {
error = ENOTTY;
break;
}
error = copyin(args->tp, &ntv, sizeof(ntv));
if (error)
return (error);
(*vec_ntp_adjtime1)(&ntv);
error = copyout(&ntv, args->tp, sizeof(ntv));
		if (error == 0)
			args->retval = (*vec_ntp_timestatus)();
break;
}
default:
MODULE_HOOK_CALL(clockctl_ioctl_50_hook,
(dev, cmd, data, flags, l), enosys(), error);
if (error == ENOSYS)
error = ENOTTY;
}
return (error);
}
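/*
 * Illustrative userland sketch, not part of this file, assuming the
 * request structures declared in <sys/clockctl.h>: a process with
 * write access to /dev/clockctl setting the time of day without
 * being root.
 *
 *	#include <sys/clockctl.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/time.h>
 *	#include <fcntl.h>
 *	#include <err.h>
 *
 *	int fd = open("/dev/clockctl", O_WRONLY);
 *	struct timeval tv = { .tv_sec = 1700000000, .tv_usec = 0 };
 *	struct clockctl_settimeofday args = { .tv = &tv, .tzp = NULL };
 *
 *	if (fd == -1 || ioctl(fd, CLOCKCTL_SETTIMEOFDAY, &args) == -1)
 *		err(1, "CLOCKCTL_SETTIMEOFDAY");
 */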
/* $NetBSD: tmpfs_subr.c,v 1.117 2023/04/29 08:15:13 riastradh Exp $ */
/*
* Copyright (c) 2005-2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Julio M. Merino Vidal, developed as part of Google's Summer of Code
* 2005 program, and by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Efficient memory file system: interfaces for inode and directory entry
* construction, destruction and manipulation.
*
* Reference counting
*
* The link count of inode (tmpfs_node_t::tn_links) is used as a
* reference counter. However, it has slightly different semantics.
*
* For directories - link count represents directory entries, which
* refer to the directories. In other words, it represents the count
* of sub-directories. It also takes into account the virtual '.'
* entry (which has no real entry in the list). For files - link count
* represents the hard links. Since only empty directories can be
* removed - link count aligns the reference counting requirements
* enough. Note: to check whether directory is not empty, the inode
* size (tmpfs_node_t::tn_size) can be used.
*
* The inode itself, as an object, gathers its first reference when
* directory entry is attached via tmpfs_dir_attach(9). For instance,
* after regular tmpfs_create(), a file would have a link count of 1,
* while directory after tmpfs_mkdir() would have 2 (due to '.').
*
* Reclamation
*
* It should be noted that tmpfs inodes rely on a combination of vnode
* reference counting and link counting. That is, an inode can only be
* destroyed if its associated vnode is inactive. The destruction is
* done on vnode reclamation i.e. tmpfs_reclaim(). It should be noted
* that tmpfs_node_t::tn_links being 0 is a destruction criterion.
*
* If an inode has references within the file system (tn_links > 0) and
* its inactive vnode gets reclaimed/recycled - then the association is
* broken in tmpfs_reclaim(). In such case, an inode will always pass
* tmpfs_lookup() and thus vcache_get() to associate a new vnode.
*
* Lock order
*
* vnode_t::v_vlock ->
* vnode_t::v_interlock
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_subr.c,v 1.117 2023/04/29 08:15:13 riastradh Exp $");
#include <sys/param.h>
#include <sys/cprng.h>
#include <sys/dirent.h>
#include <sys/event.h>
#include <sys/kmem.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/time.h>
#include <sys/stat.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/kauth.h>
#include <sys/atomic.h>
#include <uvm/uvm_aobj.h>
#include <uvm/uvm_extern.h>
#include <uvm/uvm_object.h>
#include <miscfs/specfs/specdev.h>
#include <miscfs/genfs/genfs.h>
#include <fs/tmpfs/tmpfs.h>
#include <fs/tmpfs/tmpfs_fifoops.h>
#include <fs/tmpfs/tmpfs_specops.h>
#include <fs/tmpfs/tmpfs_vnops.h>
static void tmpfs_dir_putseq(tmpfs_node_t *, tmpfs_dirent_t *);
/*
* Initialize vnode with tmpfs node.
*/
static void
tmpfs_init_vnode(struct vnode *vp, tmpfs_node_t *node)
{
krwlock_t *slock;
KASSERT(node->tn_vnode == NULL);
/* Share the interlock with the node. */
	if (node->tn_type == VREG) {
		slock = node->tn_spec.tn_reg.tn_aobj->vmobjlock;
rw_obj_hold(slock);
uvm_obj_setlock(&vp->v_uobj, slock);
}
vp->v_tag = VT_TMPFS;
vp->v_type = node->tn_type;
/* Type-specific initialization. */
switch (vp->v_type) {
case VBLK:
case VCHR:
vp->v_op = tmpfs_specop_p;
spec_node_init(vp, node->tn_spec.tn_dev.tn_rdev);
break;
case VFIFO:
vp->v_op = tmpfs_fifoop_p;
break;
case VDIR:
		if (node->tn_spec.tn_dir.tn_parent == node)
			vp->v_vflag |= VV_ROOT;
/* FALLTHROUGH */
case VLNK:
case VREG:
case VSOCK:
vp->v_op = tmpfs_vnodeop_p;
break;
default:
panic("bad node type %d", vp->v_type);
break;
}
vp->v_data = node;
node->tn_vnode = vp;
uvm_vnp_setsize(vp, node->tn_size);
KASSERT(node->tn_mode != VNOVAL);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
}
/*
* tmpfs_loadvnode: initialise a vnode for a specified inode.
*/
int
tmpfs_loadvnode(struct mount *mp, struct vnode *vp,
const void *key, size_t key_len, const void **new_key)
{
tmpfs_node_t *node;
KASSERT(key_len == sizeof(node));
memcpy(&node, key, key_len);
if (node->tn_links == 0)
return ENOENT;
tmpfs_init_vnode(vp, node);
*new_key = &vp->v_data;
return 0;
}
/*
* tmpfs_newvnode: allocate a new inode of a specified type and
 * attach the vnode.
*/
int
tmpfs_newvnode(struct mount *mp, struct vnode *dvp, struct vnode *vp,
struct vattr *vap, kauth_cred_t cred, void *extra,
size_t *key_len, const void **new_key)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(mp);
tmpfs_node_t *node, *dnode;
	if (dvp != NULL) {
		KASSERT(VOP_ISLOCKED(dvp));
		dnode = VP_TO_TMPFS_DIR(dvp);
		if (dnode->tn_links == 0)
return ENOENT;
if (vap->va_type == VDIR) {
/* Check for maximum links limit. */
if (dnode->tn_links == LINK_MAX)
return EMLINK;
KASSERT(dnode->tn_links < LINK_MAX);
}
} else
dnode = NULL;
node = tmpfs_node_get(tmp);
if (node == NULL)
return ENOSPC;
/* Initially, no references and no associations. */
node->tn_links = 0;
node->tn_vnode = NULL;
node->tn_holdcount = 0;
node->tn_dirent_hint = NULL;
/*
* XXX Where the pool is backed by a map larger than (4GB *
* sizeof(*node)), this may produce duplicate inode numbers
* for applications that do not understand 64-bit ino_t.
*/
node->tn_id = (ino_t)((uintptr_t)node / sizeof(*node));
/*
* Make sure the generation number is not zero.
* tmpfs_inactive() uses generation zero to mark dead nodes.
*/
do {
node->tn_gen = TMPFS_NODE_GEN_MASK & cprng_fast32();
} while (node->tn_gen == 0);
/* Generic initialization. */
KASSERT((int)vap->va_type != VNOVAL);
node->tn_type = vap->va_type;
node->tn_size = 0;
node->tn_flags = 0;
node->tn_lockf = NULL;
node->tn_tflags = 0;
vfs_timestamp(&node->tn_atime);
node->tn_birthtime = node->tn_atime;
node->tn_ctime = node->tn_atime;
node->tn_mtime = node->tn_atime;
mutex_init(&node->tn_timelock, MUTEX_DEFAULT, IPL_NONE);
if (dvp == NULL) {
KASSERT(vap->va_uid != VNOVAL && vap->va_gid != VNOVAL);
node->tn_uid = vap->va_uid;
node->tn_gid = vap->va_gid;
vp->v_vflag |= VV_ROOT;
} else {
KASSERT(dnode != NULL);
node->tn_uid = kauth_cred_geteuid(cred);
node->tn_gid = dnode->tn_gid;
}
KASSERT(vap->va_mode != VNOVAL);
node->tn_mode = vap->va_mode;
/* Type-specific initialization. */
switch (node->tn_type) {
case VBLK:
case VCHR:
/* Character/block special device. */
KASSERT(vap->va_rdev != VNOVAL);
node->tn_spec.tn_dev.tn_rdev = vap->va_rdev;
break;
case VDIR:
/* Directory. */
TAILQ_INIT(&node->tn_spec.tn_dir.tn_dir);
node->tn_spec.tn_dir.tn_parent = NULL;
node->tn_spec.tn_dir.tn_seq_arena = NULL;
node->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
node->tn_spec.tn_dir.tn_readdir_lastp = NULL;
/* Extra link count for the virtual '.' entry. */
node->tn_links++;
break;
case VFIFO:
case VSOCK:
break;
case VLNK:
node->tn_size = 0;
node->tn_spec.tn_lnk.tn_link = NULL;
break;
case VREG:
/* Regular file. Create an underlying UVM object. */
node->tn_spec.tn_reg.tn_aobj =
uao_create(INT64_MAX - PAGE_SIZE, 0);
node->tn_spec.tn_reg.tn_aobj_pages = 0;
break;
default:
panic("bad node type %d", vp->v_type);
break;
}
tmpfs_init_vnode(vp, node);
mutex_enter(&tmp->tm_lock);
LIST_INSERT_HEAD(&tmp->tm_nodes, node, tn_entries);
mutex_exit(&tmp->tm_lock);
*key_len = sizeof(vp->v_data);
*new_key = &vp->v_data;
return 0;
}
/*
* tmpfs_free_node: remove the inode from a list in the mount point and
* destroy the inode structures.
*/
void
tmpfs_free_node(tmpfs_mount_t *tmp, tmpfs_node_t *node)
{
size_t objsz;
uint32_t hold;
mutex_enter(&tmp->tm_lock);
hold = atomic_or_32_nv(&node->tn_holdcount, TMPFS_NODE_RECLAIMED);
/* Defer destruction to last thread holding this node. */
if (hold != TMPFS_NODE_RECLAIMED) {
mutex_exit(&tmp->tm_lock);
return;
}
LIST_REMOVE(node, tn_entries);
mutex_exit(&tmp->tm_lock);
switch (node->tn_type) {
case VLNK:
if (node->tn_size > 0) {
	tmpfs_strname_free(tmp, node->tn_spec.tn_lnk.tn_link,
	    node->tn_size);
}
break;
case VREG:
/*
* Calculate the size of inode data, decrease the used-memory
 * counter, and destroy the underlying UVM object (if any).
*/
objsz = PAGE_SIZE * node->tn_spec.tn_reg.tn_aobj_pages;
if (objsz != 0) {
	tmpfs_mem_decr(tmp, objsz);
}
if (node->tn_spec.tn_reg.tn_aobj != NULL) {
	uao_detach(node->tn_spec.tn_reg.tn_aobj);
}
break;
case VDIR:
KASSERT(node->tn_size == 0);
KASSERT(node->tn_spec.tn_dir.tn_seq_arena == NULL);
KASSERT(TAILQ_EMPTY(&node->tn_spec.tn_dir.tn_dir));
KASSERT(node->tn_spec.tn_dir.tn_parent == NULL ||
    node == tmp->tm_root);
break;
default:
break;
}
KASSERT(node->tn_vnode == NULL);
KASSERT(node->tn_links == 0);
mutex_destroy(&node->tn_timelock);
tmpfs_node_put(tmp, node);
}
/*
 * tmpfs_construct_node: allocate a new file of the specified type and
 * add it to the parent directory.
*
* => Credentials of the caller are used.
*/
int
tmpfs_construct_node(vnode_t *dvp, vnode_t **vpp, struct vattr *vap,
struct componentname *cnp, char *target)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(dvp->v_mount);
	tmpfs_node_t *dnode = VP_TO_TMPFS_DIR(dvp), *node;
tmpfs_dirent_t *de, *wde;
char *slink = NULL;
int ssize = 0;
int error;
/* Allocate symlink target. */
if (target != NULL) {
	KASSERT(vap->va_type == VLNK);
	ssize = strlen(target);
	KASSERT(ssize < MAXPATHLEN);
	if (ssize > 0) {
slink = tmpfs_strname_alloc(tmp, ssize);
if (slink == NULL)
return ENOSPC;
memcpy(slink, target, ssize);
}
}
/* Allocate a directory entry that points to the new file. */
error = tmpfs_alloc_dirent(tmp, cnp->cn_nameptr, cnp->cn_namelen, &de);
if (error) {
if (slink != NULL)
	tmpfs_strname_free(tmp, slink, ssize);
return error;
}
/* Allocate a vnode that represents the new file. */
error = vcache_new(dvp->v_mount, dvp, vap, cnp->cn_cred, NULL, vpp);
if (error) {
if (slink != NULL)
	tmpfs_strname_free(tmp, slink, ssize);
tmpfs_free_dirent(tmp, de);
return error;
}
error = vn_lock(*vpp, LK_EXCLUSIVE);
if (error) {
vrele(*vpp);
*vpp = NULL;
if (slink != NULL)
	tmpfs_strname_free(tmp, slink, ssize);
tmpfs_free_dirent(tmp, de);
return error;
}
node = VP_TO_TMPFS_NODE(*vpp);
if (slink != NULL) {
	node->tn_spec.tn_lnk.tn_link = slink;
	node->tn_size = ssize;
}
/* Remove whiteout before adding the new entry. */
if (cnp->cn_flags & ISWHITEOUT) {
wde = tmpfs_dir_lookup(dnode, cnp);
KASSERT(wde != NULL && wde->td_node == TMPFS_NODE_WHITEOUT);
tmpfs_dir_detach(dnode, wde);
tmpfs_free_dirent(tmp, wde);
}
/* Associate inode and attach the entry into the directory. */
tmpfs_dir_attach(dnode, de, node);
/* Make node opaque if requested. */
if (cnp->cn_flags & ISWHITEOUT)
	node->tn_flags |= UF_OPAQUE;
/* Update the parent's timestamps. */
tmpfs_update(dvp, TMPFS_UPDATE_MTIME | TMPFS_UPDATE_CTIME);
VOP_UNLOCK(*vpp);
cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen, cnp->cn_flags);
return 0;
}
/*
* tmpfs_alloc_dirent: allocates a new directory entry for the inode.
* The directory entry contains a path name component.
*/
int
tmpfs_alloc_dirent(tmpfs_mount_t *tmp, const char *name, uint16_t len,
tmpfs_dirent_t **de)
{
tmpfs_dirent_t *nde;
nde = tmpfs_dirent_get(tmp);
if (nde == NULL)
return ENOSPC;
nde->td_name = tmpfs_strname_alloc(tmp, len);
if (nde->td_name == NULL) {
tmpfs_dirent_put(tmp, nde);
return ENOSPC;
}
nde->td_namelen = len;
memcpy(nde->td_name, name, len);
nde->td_seq = TMPFS_DIRSEQ_NONE;
nde->td_node = NULL; /* for asserts */
*de = nde;
return 0;
}
/*
* tmpfs_free_dirent: free a directory entry.
*/
void
tmpfs_free_dirent(tmpfs_mount_t *tmp, tmpfs_dirent_t *de)
{
	KASSERT(de->td_node == NULL);
	KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
tmpfs_strname_free(tmp, de->td_name, de->td_namelen);
tmpfs_dirent_put(tmp, de);
}
/*
 * tmpfs_dir_attach: associate the directory entry with a specified inode,
 * and attach the entry to the directory specified by the vnode.
*
* => Increases link count on the associated node.
* => Increases link count on directory node if our node is VDIR.
* => It is caller's responsibility to check for the LINK_MAX limit.
* => Triggers kqueue events here.
*/
void
tmpfs_dir_attach(tmpfs_node_t *dnode, tmpfs_dirent_t *de, tmpfs_node_t *node)
{
vnode_t *dvp = dnode->tn_vnode;
int events = NOTE_WRITE;
KASSERT(dvp != NULL);
KASSERT(VOP_ISLOCKED(dvp));
/* Get a new sequence number. */
KASSERT(de->td_seq == TMPFS_DIRSEQ_NONE);
de->td_seq = tmpfs_dir_getseq(dnode, de);
/* Associate directory entry and the inode. */
de->td_node = node;
if (node != TMPFS_NODE_WHITEOUT) {
KASSERT(node->tn_links < LINK_MAX);
node->tn_links++;
/* Save the hint (might overwrite). */
node->tn_dirent_hint = de;
} else if ((dnode->tn_gen & TMPFS_WHITEOUT_BIT) == 0) {
/* Flag that there are whiteout entries. */
atomic_or_32(&dnode->tn_gen, TMPFS_WHITEOUT_BIT);
}
/* Insert the entry to the directory (parent of inode). */
TAILQ_INSERT_TAIL(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
KASSERT(dnode->tn_size <= __type_max(off_t) - sizeof(tmpfs_dirent_t));
dnode->tn_size += sizeof(tmpfs_dirent_t);
uvm_vnp_setsize(dvp, dnode->tn_size);
if (node != TMPFS_NODE_WHITEOUT && node->tn_type == VDIR) {
/* Set parent. */
KASSERT(node->tn_spec.tn_dir.tn_parent == NULL);
node->tn_spec.tn_dir.tn_parent = dnode;
/* Increase the link count of parent. */
KASSERT(dnode->tn_links < LINK_MAX);
dnode->tn_links++;
events |= NOTE_LINK;
TMPFS_VALIDATE_DIR(node);
}
}
/*
 * tmpfs_dir_detach: disassociate the directory entry and its inode,
 * and detach the entry from the directory specified by the vnode.
*
* => Decreases link count on the associated node.
* => Decreases the link count on directory node, if our node is VDIR.
* => Triggers kqueue events here.
*
* => Note: dvp and vp may be NULL only if called by tmpfs_unmount().
*/
void
tmpfs_dir_detach(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
tmpfs_node_t *node = de->td_node;
vnode_t *dvp = dnode->tn_vnode;
KASSERT(dvp == NULL || VOP_ISLOCKED(dvp));
if (__predict_true(node != TMPFS_NODE_WHITEOUT)) {
/* Deassociate the inode and entry. */
node->tn_dirent_hint = NULL;
KASSERT(node->tn_links > 0);
node->tn_links--;
/* If directory - decrease the link count of parent. */
if (node->tn_type == VDIR) {
	KASSERT(node->tn_spec.tn_dir.tn_parent == dnode);
node->tn_spec.tn_dir.tn_parent = NULL;
KASSERT(dnode->tn_links > 0);
dnode->tn_links--;
}
}
de->td_node = NULL;
/* Remove the entry from the directory. */
if (dnode->tn_spec.tn_dir.tn_readdir_lastp == de) {
	dnode->tn_spec.tn_dir.tn_readdir_lastp = NULL;
}
TAILQ_REMOVE(&dnode->tn_spec.tn_dir.tn_dir, de, td_entries);
KASSERT(dnode->tn_size >= sizeof(tmpfs_dirent_t));
dnode->tn_size -= sizeof(tmpfs_dirent_t);
tmpfs_dir_putseq(dnode, de);
if (dvp) {
	uvm_vnp_setsize(dvp, dnode->tn_size);
}
}
/*
* tmpfs_dir_lookup: find a directory entry in the specified inode.
*
* Note that the . and .. components are not allowed as they do not
* physically exist within directories.
*/
tmpfs_dirent_t *
tmpfs_dir_lookup(tmpfs_node_t *node, struct componentname *cnp)
{
const char *name = cnp->cn_nameptr;
const uint16_t nlen = cnp->cn_namelen;
tmpfs_dirent_t *de;
KASSERT(VOP_ISLOCKED(node->tn_vnode));
KASSERT(nlen != 1 || !(name[0] == '.'));
KASSERT(nlen != 2 || !(name[0] == '.' && name[1] == '.'));
TMPFS_VALIDATE_DIR(node);
TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
	if (de->td_namelen != nlen)
		continue;
if (memcmp(de->td_name, name, nlen) != 0)
continue;
break;
}
return de;
}
/*
* tmpfs_dir_cached: get a cached directory entry if it is valid. Used to
* avoid unnecessary tmpfs_dir_lookup().
*
* => The vnode must be locked.
*/
tmpfs_dirent_t *
tmpfs_dir_cached(tmpfs_node_t *node)
{
tmpfs_dirent_t *de = node->tn_dirent_hint;
KASSERT(VOP_ISLOCKED(node->tn_vnode));
if (de == NULL) {
return NULL;
}
KASSERT(de->td_node == node);
/*
* Directories always have a valid hint. For files, check if there
* are any hard links. If there are - hint might be invalid.
*/
return (node->tn_type != VDIR && node->tn_links > 1) ? NULL : de;
}
/*
* tmpfs_dir_getseq: get a per-directory sequence number for the entry.
*
* => Shall not be larger than 2^31 for linux32 compatibility.
*/
uint32_t
tmpfs_dir_getseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
uint32_t seq = de->td_seq;
vmem_t *seq_arena;
vmem_addr_t off;
int error __diagused;
TMPFS_VALIDATE_DIR(dnode);
if (__predict_true(seq != TMPFS_DIRSEQ_NONE)) {
/* Already set. */
KASSERT(seq >= TMPFS_DIRSEQ_START);
return seq;
}
/*
* The "." and ".." and the end-of-directory have reserved numbers.
 * The other sequence numbers are allocated as follows:
*
* - The first half of the 2^31 is assigned incrementally.
*
* - If that range is exceeded, then the second half of 2^31
* is used, but managed by vmem(9).
*/
seq = dnode->tn_spec.tn_dir.tn_next_seq;
KASSERT(seq >= TMPFS_DIRSEQ_START);
if (__predict_true(seq < TMPFS_DIRSEQ_END)) {
/* First half: just increment and return. */
dnode->tn_spec.tn_dir.tn_next_seq++;
return seq;
}
/*
* First half exceeded, use the second half. May need to create
* vmem(9) arena for the directory first.
*/
if ((seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena) == NULL) {
seq_arena = vmem_create("tmpfscoo", 0,
TMPFS_DIRSEQ_END - 1, 1, NULL, NULL, NULL, 0,
VM_SLEEP, IPL_NONE);
dnode->tn_spec.tn_dir.tn_seq_arena = seq_arena;
KASSERT(seq_arena != NULL);
}
error = vmem_alloc(seq_arena, 1, VM_SLEEP | VM_BESTFIT, &off);
KASSERT(error == 0);
KASSERT(off < TMPFS_DIRSEQ_END);
seq = off | TMPFS_DIRSEQ_END;
return seq;
}
static void
tmpfs_dir_putseq(tmpfs_node_t *dnode, tmpfs_dirent_t *de)
{
vmem_t *seq_arena = dnode->tn_spec.tn_dir.tn_seq_arena;
uint32_t seq = de->td_seq;
TMPFS_VALIDATE_DIR(dnode);
if (seq == TMPFS_DIRSEQ_NONE || seq < TMPFS_DIRSEQ_END) {
/* First half (or no sequence number set yet). */
KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
} else {
/* Second half. */
KASSERT(seq_arena != NULL);
KASSERT(seq >= TMPFS_DIRSEQ_END);
seq &= ~TMPFS_DIRSEQ_END;
vmem_free(seq_arena, seq, 1);
}
de->td_seq = TMPFS_DIRSEQ_NONE;
/* Empty? We can reset. */
if (seq_arena && dnode->tn_size == 0) {
	dnode->tn_spec.tn_dir.tn_seq_arena = NULL;
dnode->tn_spec.tn_dir.tn_next_seq = TMPFS_DIRSEQ_START;
vmem_destroy(seq_arena);
}
}
/*
* tmpfs_dir_lookupbyseq: lookup a directory entry by the sequence number.
*/
tmpfs_dirent_t *
tmpfs_dir_lookupbyseq(tmpfs_node_t *node, off_t seq)
{
tmpfs_dirent_t *de = node->tn_spec.tn_dir.tn_readdir_lastp;
TMPFS_VALIDATE_DIR(node);
/*
 * First, check the cache. If it does not match, perform a lookup.
*/
if (de && de->td_seq == seq) {
	KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
	KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
return de;
}
TAILQ_FOREACH(de, &node->tn_spec.tn_dir.tn_dir, td_entries) {
	KASSERT(de->td_seq >= TMPFS_DIRSEQ_START);
	KASSERT(de->td_seq != TMPFS_DIRSEQ_NONE);
	if (de->td_seq == seq)
		return de;
}
return NULL;
}
/*
* tmpfs_dir_getdotents: helper function for tmpfs_readdir() to get the
 * dot meta entries, that is, "." or "..", and copy them to the UIO space.
*/
static int
tmpfs_dir_getdotents(tmpfs_node_t *node, struct dirent *dp, struct uio *uio)
{
tmpfs_dirent_t *de;
off_t next = 0;
int error;
switch (uio->uio_offset) {
case TMPFS_DIRSEQ_DOT:
dp->d_fileno = node->tn_id;
strlcpy(dp->d_name, ".", sizeof(dp->d_name));
next = TMPFS_DIRSEQ_DOTDOT;
break;
case TMPFS_DIRSEQ_DOTDOT:
dp->d_fileno = node->tn_spec.tn_dir.tn_parent->tn_id;
strlcpy(dp->d_name, "..", sizeof(dp->d_name));
de = TAILQ_FIRST(&node->tn_spec.tn_dir.tn_dir);
next = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
break;
default:
KASSERT(false);
}
dp->d_type = DT_DIR;
dp->d_namlen = strlen(dp->d_name);
dp->d_reclen = _DIRENT_SIZE(dp);
if (dp->d_reclen > uio->uio_resid) {
return EJUSTRETURN;
}
if ((error = uiomove(dp, dp->d_reclen, uio)) != 0) {
return error;
}
uio->uio_offset = next;
return error;
}
/*
* tmpfs_dir_getdents: helper function for tmpfs_readdir.
*
 * => Returns as many directory entries as can fit in the uio space.
* => The read starts at uio->uio_offset.
*/
int
tmpfs_dir_getdents(tmpfs_node_t *node, struct uio *uio, off_t *cntp)
{
tmpfs_dirent_t *de;
struct dirent dent;
int error = 0;
KASSERT(VOP_ISLOCKED(node->tn_vnode));
TMPFS_VALIDATE_DIR(node);
/*
* First check for the "." and ".." cases.
* Note: tmpfs_dir_getdotents() will "seek" for us.
*/
memset(&dent, 0, sizeof(dent));
if (uio->uio_offset == TMPFS_DIRSEQ_DOT) {
	if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
goto done;
}
(*cntp)++;
}
if (uio->uio_offset == TMPFS_DIRSEQ_DOTDOT) {
	if ((error = tmpfs_dir_getdotents(node, &dent, uio)) != 0) {
goto done;
}
(*cntp)++;
}
/* Done if we reached the end. */
if (uio->uio_offset == TMPFS_DIRSEQ_EOF) {
goto done;
}
/* Locate the directory entry given by the given sequence number. */
de = tmpfs_dir_lookupbyseq(node, uio->uio_offset);
if (de == NULL) {
error = EINVAL;
goto done;
}
/*
* Read as many entries as possible; i.e., until we reach the end
* of the directory or we exhaust UIO space.
*/
do {
if (de->td_node == TMPFS_NODE_WHITEOUT) {
dent.d_fileno = 1;
dent.d_type = DT_WHT;
} else {
dent.d_fileno = de->td_node->tn_id;
dent.d_type = vtype2dt(de->td_node->tn_type);
}
dent.d_namlen = de->td_namelen;
KASSERT(de->td_namelen < sizeof(dent.d_name));
memcpy(dent.d_name, de->td_name, de->td_namelen);
dent.d_name[de->td_namelen] = '\0';
dent.d_reclen = _DIRENT_SIZE(&dent);
if (dent.d_reclen > uio->uio_resid) {
/* Exhausted UIO space. */
error = EJUSTRETURN;
break;
}
/* Copy out the directory entry and continue. */
error = uiomove(&dent, dent.d_reclen, uio);
if (error) {
break;
}
(*cntp)++;
de = TAILQ_NEXT(de, td_entries);
} while (uio->uio_resid > 0 && de);
/* Cache the last entry or clear and mark EOF. */
uio->uio_offset = de ? tmpfs_dir_getseq(node, de) : TMPFS_DIRSEQ_EOF;
node->tn_spec.tn_dir.tn_readdir_lastp = de;
done:
tmpfs_update(node->tn_vnode, TMPFS_UPDATE_ATIME);
if (error == EJUSTRETURN) {
/* Exhausted UIO space - just return. */
error = 0;
}
KASSERT(error >= 0);
return error;
}
/*
* tmpfs_reg_resize: resize the underlying UVM object associated with the
* specified regular file.
*/
int
tmpfs_reg_resize(struct vnode *vp, off_t newsize)
{
	tmpfs_mount_t *tmp = VFS_TO_TMPFS(vp->v_mount);
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
struct uvm_object *uobj = node->tn_spec.tn_reg.tn_aobj;
size_t newpages, oldpages;
off_t oldsize;
KASSERT(vp->v_type == VREG);
KASSERT(newsize >= 0);
if (newsize > __type_max(off_t) - PAGE_SIZE + 1)
	return EFBIG;
oldsize = node->tn_size;
oldpages = round_page(oldsize) >> PAGE_SHIFT;
newpages = round_page(newsize) >> PAGE_SHIFT;
KASSERT(oldpages == node->tn_spec.tn_reg.tn_aobj_pages);
if (newsize == oldsize) {
return 0;
}
if (newpages > oldpages) {
/* Increase the used-memory counter if getting extra pages. */
if (!tmpfs_mem_incr(tmp, (newpages - oldpages) << PAGE_SHIFT)) {
return ENOSPC;
}
} else if (newsize < oldsize) {
size_t zerolen;
zerolen = MIN(round_page(newsize), node->tn_size) - newsize;
ubc_zerorange(uobj, newsize, zerolen, UBC_VNODE_FLAGS(vp));
}
node->tn_spec.tn_reg.tn_aobj_pages = newpages;
node->tn_size = newsize;
uvm_vnp_setsize(vp, newsize);
/*
* Free "backing store".
*/
if (newpages < oldpages) {
	rw_enter(uobj->vmobjlock, RW_WRITER);
uao_dropswap_range(uobj, newpages, oldpages);
rw_exit(uobj->vmobjlock);
/* Decrease the used-memory counter. */
tmpfs_mem_decr(tmp, (oldpages - newpages) << PAGE_SHIFT);
}
return 0;
}
/*
* tmpfs_chflags: change flags of the given vnode.
*/
int
tmpfs_chflags(vnode_t *vp, int flags, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
kauth_action_t action = KAUTH_VNODE_WRITE_FLAGS;
int error;
bool changing_sysflags = false;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/*
* If the new flags have non-user flags that are different than
* those on the node, we need special permission to change them.
*/
if ((flags & SF_SETTABLE) != (node->tn_flags & SF_SETTABLE)) {
action |= KAUTH_VNODE_WRITE_SYSFLAGS;
changing_sysflags = true;
}
/*
* Indicate that this node's flags have system attributes in them if
* that's the case.
*/
if (node->tn_flags & (SF_IMMUTABLE | SF_APPEND)) {
action |= KAUTH_VNODE_HAS_SYSFLAGS;
}
error = kauth_authorize_vnode(cred, action, vp, NULL,
genfs_can_chflags(vp, cred, node->tn_uid, changing_sysflags));
if (error)
return error;
/*
* Set the flags. If we're not setting non-user flags, be careful not
* to overwrite them.
*
* XXX: Can't we always assign here? if the system flags are different,
* the code above should catch attempts to change them without
* proper permissions, and if we're here it means it's okay to
* change them...
*/
if (!changing_sysflags) {
/* Clear all user-settable flags and re-set them. */
node->tn_flags &= SF_SETTABLE;
node->tn_flags |= (flags & UF_SETTABLE);
} else {
node->tn_flags = flags;
}
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
return 0;
}
/*
* tmpfs_chmod: change access mode on the given vnode.
*/
int
tmpfs_chmod(vnode_t *vp, mode_t mode, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_SECURITY, vp,
NULL, genfs_can_chmod(vp, cred, node->tn_uid, node->tn_gid, mode));
if (error) {
return error;
}
node->tn_mode = (mode & ALLPERMS);
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
return 0;
}
/*
* tmpfs_chown: change ownership of the given vnode.
*
* => At least one of uid or gid must be different than VNOVAL.
* => Attribute is unchanged for VNOVAL case.
*/
int
tmpfs_chown(vnode_t *vp, uid_t uid, gid_t gid, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Assign default values if they are unknown. */
KASSERT(uid != VNOVAL || gid != VNOVAL);
if (uid == VNOVAL) {
uid = node->tn_uid;
}
if (gid == VNOVAL) {
	gid = node->tn_gid;
}
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_CHANGE_OWNERSHIP, vp,
NULL, genfs_can_chown(vp, cred, node->tn_uid, node->tn_gid, uid,
gid));
if (error) {
return error;
}
node->tn_uid = uid;
node->tn_gid = gid;
tmpfs_update(vp, TMPFS_UPDATE_CTIME);
cache_enter_id(vp, node->tn_mode, node->tn_uid, node->tn_gid, true);
return 0;
}
/*
* tmpfs_chsize: change size of the given vnode.
*/
int
tmpfs_chsize(vnode_t *vp, u_quad_t size, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
const off_t length = size;
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Decide whether this is a valid operation based on the file type. */
switch (vp->v_type) {
case VDIR:
return EISDIR;
case VREG:
if (vp->v_mount->mnt_flag & MNT_RDONLY) {
return EROFS;
}
break;
case VBLK:
case VCHR:
case VFIFO:
/*
 * Allow modifications of special files even if the file
* system is mounted read-only (we are not modifying the
* files themselves, but the objects they represent).
*/
return 0;
default:
return EOPNOTSUPP;
}
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND)) {
return EPERM;
}
if (length < 0) {
return EINVAL;
}
/* Note: tmpfs_reg_resize() will raise NOTE_EXTEND and NOTE_ATTRIB. */
if (node->tn_size != length &&
(error = tmpfs_reg_resize(vp, length)) != 0) {
return error;
}
tmpfs_update(vp, TMPFS_UPDATE_CTIME | TMPFS_UPDATE_MTIME);
return 0;
}
/*
* tmpfs_chtimes: change access and modification times for vnode.
*/
int
tmpfs_chtimes(vnode_t *vp, const struct timespec *atime,
const struct timespec *mtime, const struct timespec *btime,
int vaflags, kauth_cred_t cred, lwp_t *l)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
int error;
KASSERT(VOP_ISLOCKED(vp));
/* Disallow this operation if the file system is mounted read-only. */
if (vp->v_mount->mnt_flag & MNT_RDONLY)
return EROFS;
/* Immutable or append-only files cannot be modified, either. */
if (node->tn_flags & (IMMUTABLE | APPEND))
return EPERM;
error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp, NULL,
genfs_can_chtimes(vp, cred, node->tn_uid, vaflags));
if (error)
return error;
mutex_enter(&node->tn_timelock);
if (atime->tv_sec != VNOVAL) {
	atomic_and_uint(&node->tn_tflags, ~TMPFS_UPDATE_ATIME);
node->tn_atime = *atime;
}
if (mtime->tv_sec != VNOVAL) {
	atomic_and_uint(&node->tn_tflags, ~TMPFS_UPDATE_MTIME);
node->tn_mtime = *mtime;
}
if (btime->tv_sec != VNOVAL) {
	node->tn_birthtime = *btime;
}
mutex_exit(&node->tn_timelock);
return 0;
}
/*
* tmpfs_update_locked: update the timestamps as indicated by the flags.
*/
void
tmpfs_update_locked(vnode_t *vp, unsigned tflags)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
struct timespec nowtm;
KASSERT(mutex_owned(&node->tn_timelock));
if ((tflags |= atomic_swap_uint(&node->tn_tflags, 0)) == 0) {
return;
}
vfs_timestamp(&nowtm);
if (tflags & TMPFS_UPDATE_ATIME) {
	node->tn_atime = nowtm;
}
if (tflags & TMPFS_UPDATE_MTIME) {
	node->tn_mtime = nowtm;
}
if (tflags & TMPFS_UPDATE_CTIME) {
	node->tn_ctime = nowtm;
}
}
/*
* tmpfs_update: update the timestamps as indicated by the flags.
*/
void
tmpfs_update(vnode_t *vp, unsigned tflags)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
	if ((tflags | atomic_load_relaxed(&node->tn_tflags)) == 0) {
return;
}
mutex_enter(&node->tn_timelock);
tmpfs_update_locked(vp, tflags);
mutex_exit(&node->tn_timelock);
}
/*
* tmpfs_update_lazily: schedule a deferred timestamp update.
*/
void
tmpfs_update_lazily(vnode_t *vp, unsigned tflags)
{
	tmpfs_node_t *node = VP_TO_TMPFS_NODE(vp);
unsigned cur;
cur = atomic_load_relaxed(&node->tn_tflags);
if ((cur & tflags) != tflags) {
	atomic_or_uint(&node->tn_tflags, tflags);
return;
}
}
/* $NetBSD: if_arp.h,v 1.43 2021/02/19 14:51:59 christos Exp $ */
/*
* Copyright (c) 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if_arp.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NET_IF_ARP_H_
#define _NET_IF_ARP_H_
/*
* Address Resolution Protocol.
*
* See RFC 826 for protocol description. ARP packets are variable
* in size; the arphdr structure defines the fixed-length portion.
* Protocol type values are the same as those for 10 Mb/s Ethernet.
 * It is followed by the variable-sized fields ar_sha, ar_spa,
 * ar_tha and ar_tpa, in that order, according to the lengths
* specified. Field names used correspond to RFC 826.
*/
struct arphdr {
uint16_t ar_hrd; /* format of hardware address */
#define ARPHRD_ETHER 1 /* ethernet hardware format */
#define ARPHRD_IEEE802 6 /* IEEE 802 hardware format */
#define ARPHRD_ARCNET 7 /* arcnet hardware format */
#define ARPHRD_FRELAY 15 /* frame relay hardware format */
#define ARPHRD_STRIP 23 /* Ricochet Starmode Radio hardware format */
#define ARPHRD_IEEE1394 24 /* IEEE 1394 (FireWire) hardware format */
uint16_t ar_pro; /* format of protocol address */
uint8_t ar_hln; /* length of hardware address */
uint8_t ar_pln; /* length of protocol address */
uint16_t ar_op; /* one of: */
#define ARPOP_REQUEST 1 /* request to resolve address */
#define ARPOP_REPLY 2 /* response to previous request */
#define ARPOP_REVREQUEST 3 /* request protocol address given hardware */
#define ARPOP_REVREPLY 4 /* response giving protocol address */
#define ARPOP_INVREQUEST 8 /* request to identify peer */
#define ARPOP_INVREPLY 9 /* response identifying peer */
/*
* The remaining fields are variable in size,
* according to the sizes above.
*/
#ifdef COMMENT_ONLY
uint8_t ar_sha[]; /* sender hardware address */
uint8_t ar_spa[]; /* sender protocol address */
uint8_t ar_tha[]; /* target hardware address (!IEEE1394) */
uint8_t ar_tpa[]; /* target protocol address */
#endif
};
static __inline uint8_t *
ar_data(struct arphdr *ap)
{
return (uint8_t *)(void *)(ap + 1);
}
static __inline uint8_t *
ar_sha(struct arphdr *ap)
{
return ar_data(ap) + 0;
}
static __inline uint8_t *
ar_spa(struct arphdr *ap)
{
return ar_data(ap) + ap->ar_hln;
}
static __inline uint8_t *
ar_tha(struct arphdr *ap)
{
if (ntohs(ap->ar_hrd) == ARPHRD_IEEE1394) {
return NULL;
} else {
return ar_data(ap) + ap->ar_hln + ap->ar_pln;
}
}
static __inline uint8_t *
ar_tpa(struct arphdr *ap)
{
if (ntohs(ap->ar_hrd) == ARPHRD_IEEE1394) {
return ar_data(ap) + ap->ar_hln + ap->ar_pln;
} else {
return ar_data(ap) + ap->ar_hln + ap->ar_pln + ap->ar_hln;
}
}
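/*
 * Illustrative sketch only (not part of this header): how a consumer
 * might use the accessors above to walk the variable-sized portion of
 * a received ARP packet.  The buffer/length handling and the
 * handle_reply() callback are assumptions made for this example.
 */
#ifdef COMMENT_ONLY
static void
example_arp_input(void *pkt, size_t len,
    void (*handle_reply)(const uint8_t *sha, const uint8_t *spa))
{
	struct arphdr *ah = pkt;

	/* Need the fixed header before trusting ar_hln/ar_pln ... */
	if (len < sizeof(*ah))
		return;
	/* ... and the four variable-sized fields that follow it. */
	if (len < sizeof(*ah) +
	    2 * (size_t)ah->ar_hln + 2 * (size_t)ah->ar_pln)
		return;
	if (ntohs(ah->ar_op) == ARPOP_REPLY)
		(*handle_reply)(ar_sha(ah), ar_spa(ah));
}
#endif /* COMMENT_ONLY */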
/*
* ARP ioctl request
*/
struct arpreq {
struct sockaddr arp_pa; /* protocol address */
struct sockaddr arp_ha; /* hardware address */
int arp_flags; /* flags */
};
/* arp_flags and at_flags field values */
#define ATF_INUSE 0x01 /* entry in use */
#define ATF_COM 0x02 /* completed entry (enaddr valid) */
#define ATF_PERM 0x04 /* permanent entry */
#define ATF_PUBL 0x08 /* publish entry (respond for other host) */
#define ATF_USETRAILERS 0x10 /* has requested trailers */
/*
* Kernel statistics about arp
*/
#define ARP_STAT_SNDTOTAL 0 /* total packets sent */
#define ARP_STAT_SNDREPLY 1 /* replies sent */
#define ARP_STAT_SENDREQUEST 2 /* requests sent */
#define ARP_STAT_RCVTOTAL 3 /* total packets received */
#define ARP_STAT_RCVREQUEST 4 /* valid requests received */
#define ARP_STAT_RCVREPLY 5 /* replies received */
#define ARP_STAT_RCVMCAST 6 /* multicast/broadcast received */
#define ARP_STAT_RCVBADPROTO 7 /* unknown protocol type received */
#define ARP_STAT_RCVBADLEN 8 /* bad (short) length received */
#define ARP_STAT_RCVZEROTPA 9 /* received w/ null target ip */
#define ARP_STAT_RCVZEROSPA 10 /* received w/ null source ip */
#define ARP_STAT_RCVNOINT 11 /* couldn't map to interface */
#define ARP_STAT_RCVLOCALSHA 12 /* received from local hw address */
#define ARP_STAT_RCVBCASTSHA 13 /* received w/ broadcast src */
#define ARP_STAT_RCVLOCALSPA 14 /* received for a local ip [dup!] */
#define ARP_STAT_RCVOVERPERM 15 /* attempts to overwrite static info */
#define ARP_STAT_RCVOVERINT 16 /* attempts to overwrite wrong if */
#define ARP_STAT_RCVOVER 17 /* entries overwritten! */
#define ARP_STAT_RCVLENCHG 18 /* changes in hw address len */
#define ARP_STAT_DFRTOTAL 19 /* deferred pending ARP resolution */
#define ARP_STAT_DFRSENT 20 /* deferred, then sent */
#define ARP_STAT_DFRDROPPED 21 /* deferred, then dropped */
#define ARP_STAT_ALLOCFAIL 22 /* failures to allocate llinfo */
#define ARP_NSTATS 23
void arp_stat_add(int, uint64_t);
#endif /* !_NET_IF_ARP_H_ */
/* $NetBSD: if.h,v 1.305 2023/10/09 11:55:34 riastradh Exp $ */
/*-
* Copyright (c) 1999, 2000, 2001 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by William Studenmund and Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)if.h 8.3 (Berkeley) 2/9/95
*/
#ifndef _NET_IF_H_
#define _NET_IF_H_
#if !defined(_KERNEL) && !defined(_STANDALONE)
#include <stdbool.h>
#endif
#include <sys/featuretest.h>
/*
* Length of interface external name, including terminating '\0'.
* Note: this is the same size as a generic device's external name.
*/
#define IF_NAMESIZE 16
/*
* Length of interface description, including terminating '\0'.
*/
#define IFDESCRSIZE 64
#if defined(_NETBSD_SOURCE)
#include <sys/socket.h>
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/hook.h>
#include <net/dlt.h>
#include <net/pfil.h>
#ifdef _KERNEL
#include <net/pktqueue.h>
#include <sys/pslist.h>
#include <sys/pserialize.h>
#include <sys/psref.h>
#include <sys/module_hook.h>
#endif
/*
* Always include ALTQ glue here -- we use the ALTQ interface queue
* structure even when ALTQ is not configured into the kernel so that
 * the size of struct ifnet does not change based on the option. The
* ALTQ queue structure is API-compatible with the legacy ifqueue.
*/
#include <altq/if_altq.h>
/*
* Structures defining a network interface, providing a packet
* transport mechanism (ala level 0 of the PUP protocols).
*
* Each interface accepts output datagrams of a specified maximum
* length, and provides higher level routines with input datagrams
* received from its medium.
*
* Output occurs when the routine if_output is called, with four parameters:
* (*ifp->if_output)(ifp, m, dst, rt)
* Here m is the mbuf chain to be sent and dst is the destination address.
* The output routine encapsulates the supplied datagram if necessary,
* and then transmits it on its medium.
*
* On input, each interface unwraps the data received by it, and either
 * places it on the input queue of an internetwork datagram routine
* and posts the associated software interrupt, or passes the datagram to a raw
* packet input routine.
*
* Routines exist for locating interfaces by their addresses
 * or for locating an interface on a certain network, as well as more general
 * routing and gateway routines maintaining information used to locate
 * interfaces. These routines live in the files if.c and route.c.
*/
#include <sys/time.h>
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#include "opt_gateway.h"
#endif
struct mbuf;
struct proc;
struct rtentry;
struct socket;
struct ether_header;
struct ifaddr;
struct ifnet;
struct rt_addrinfo;
#define IFNAMSIZ IF_NAMESIZE
/*
* Structure describing a `cloning' interface.
*/
struct if_clone {
LIST_ENTRY(if_clone) ifc_list; /* on list of cloners */
const char *ifc_name; /* name of device, e.g. `gif' */
size_t ifc_namelen; /* length of name */
int (*ifc_create)(struct if_clone *, int);
int (*ifc_destroy)(struct ifnet *);
};
#define IF_CLONE_INITIALIZER(name, create, destroy) \
{ { NULL, NULL }, name, sizeof(name) - 1, create, destroy }
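/*
 * Illustrative sketch (the "example" driver, its handlers and the
 * attach hook are hypothetical): a cloning pseudo-interface typically
 * declares its cloner with IF_CLONE_INITIALIZER and registers it with
 * if_clone_attach() from its attach routine.
 */
#if 0
static int	example_clone_create(struct if_clone *, int);
static int	example_clone_destroy(struct ifnet *);

static struct if_clone example_cloner =
    IF_CLONE_INITIALIZER("example", example_clone_create,
	example_clone_destroy);

void
exampleattach(int count)
{

	if_clone_attach(&example_cloner);
}
#endif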
/*
* Structure used to query names of interface cloners.
*/
struct if_clonereq {
int ifcr_total; /* total cloners (out) */
int ifcr_count; /* room for this many in user buffer */
char *ifcr_buffer; /* buffer for cloner names */
};
/*
* Structure defining statistics and other data kept regarding a network
* interface.
*
* Only used for exporting data from the interface.
*/
struct if_data {
/* generic interface information */
u_char ifi_type; /* ethernet, tokenring, etc. */
u_char ifi_addrlen; /* media address length */
u_char ifi_hdrlen; /* media header length */
int ifi_link_state; /* current link state */
uint64_t ifi_mtu; /* maximum transmission unit */
uint64_t ifi_metric; /* routing metric (external only) */
uint64_t ifi_baudrate; /* linespeed */
/* volatile statistics */
uint64_t ifi_ipackets; /* packets received on interface */
uint64_t ifi_ierrors; /* input errors on interface */
uint64_t ifi_opackets; /* packets sent on interface */
uint64_t ifi_oerrors; /* output errors on interface */
uint64_t ifi_collisions; /* collisions on csma interfaces */
uint64_t ifi_ibytes; /* total number of octets received */
uint64_t ifi_obytes; /* total number of octets sent */
uint64_t ifi_imcasts; /* packets received via multicast */
uint64_t ifi_omcasts; /* packets sent via multicast */
uint64_t ifi_iqdrops; /* dropped on input, this interface */
uint64_t ifi_noproto; /* destined for unsupported protocol */
struct timespec ifi_lastchange;/* last operational state change */
};
/*
* Values for if_link_state.
*/
#define LINK_STATE_UNKNOWN 0 /* link invalid/unknown */
#define LINK_STATE_DOWN 1 /* link is down */
#define LINK_STATE_UP 2 /* link is up */
/*
* Status bit descriptions for the various interface types.
*/
struct if_status_description {
unsigned char ifs_type;
unsigned char ifs_state;
const char *ifs_string;
};
#define LINK_STATE_DESC_MATCH(_ifs, _t, _s) \
(((_ifs)->ifs_type == (_t) || (_ifs)->ifs_type == 0) && \
(_ifs)->ifs_state == (_s))
#define LINK_STATE_DESCRIPTIONS { \
{ IFT_ETHER, LINK_STATE_DOWN, "no carrier" }, \
{ IFT_IEEE80211, LINK_STATE_DOWN, "no network" }, \
{ IFT_PPP, LINK_STATE_DOWN, "no carrier" }, \
{ IFT_CARP, LINK_STATE_DOWN, "backup" }, \
{ IFT_CARP, LINK_STATE_UP, "master" }, \
{ 0, LINK_STATE_UP, "active" }, \
{ 0, LINK_STATE_UNKNOWN, "unknown" }, \
{ 0, LINK_STATE_DOWN, "down" }, \
{ 0, 0, NULL } \
}
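/*
 * Illustrative sketch (the helper name is hypothetical): translating an
 * (interface type, link state) pair into a human-readable string by
 * scanning the description table above, roughly as ifconfig(8) does.
 * The IFT_* constants used by LINK_STATE_DESCRIPTIONS come from
 * <net/if_types.h>.
 */
#if 0
static const char *
example_link_state_string(unsigned char iftype, unsigned char state)
{
	static const struct if_status_description ifstatus[] =
	    LINK_STATE_DESCRIPTIONS;
	const struct if_status_description *ifs;

	for (ifs = ifstatus; ifs->ifs_string != NULL; ifs++) {
		if (LINK_STATE_DESC_MATCH(ifs, iftype, state))
			return ifs->ifs_string;
	}
	return "unknown";
}
#endif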
/*
* Structure defining a queue for a network interface.
*/
struct ifqueue {
struct mbuf *ifq_head;
struct mbuf *ifq_tail;
int ifq_len;
int ifq_maxlen;
uint64_t ifq_drops;
kmutex_t *ifq_lock;
};
#ifdef _KERNEL
#include <sys/percpu.h>
#include <sys/callout.h>
#include <sys/rwlock.h>
#include <sys/workqueue.h>
#endif /* _KERNEL */
/*
* Structure defining a queue for a network interface.
*
* (Would like to call this struct ``if'', but C isn't PL/1.)
*/
TAILQ_HEAD(ifnet_head, ifnet); /* the actual queue head */
struct bridge_softc;
struct bridge_iflist;
struct callout;
struct krwlock;
struct if_percpuq;
struct if_deferred_start;
struct in6_multi;
typedef unsigned short if_index_t;
/*
* Interface. Field markings and the corresponding locks:
*
* i: IFNET_LOCK (a.k.a., if_ioctl_lock)
* q: ifq_lock (struct ifaltq)
* a: if_afdata_lock
* 6: in6_multilock (global lock)
* :: unlocked, stable
* ?: unknown, maybe unsafe
*
* Lock order: IFNET_LOCK => in6_multilock => if_afdata_lock => ifq_lock
* Note that currently if_afdata_lock and ifq_lock aren't held
* at the same time, but define the order anyway.
*
* Lock order of IFNET_LOCK with other locks:
* softnet_lock => solock => IFNET_LOCK => ND6_LOCK, in_multilock
*/
typedef struct ifnet {
void *if_softc; /* :: lower-level data for this if */
/* DEPRECATED. Keep it to avoid breaking kvm(3) users */
TAILQ_ENTRY(ifnet)
if_list; /* i: all struct ifnets are chained */
TAILQ_HEAD(, ifaddr)
if_addrlist; /* i: linked list of addresses per if */
char if_xname[IFNAMSIZ];
/* :: external name (name + unit) */
int if_pcount; /* i: number of promiscuous listeners */
struct bpf_if *if_bpf; /* :: packet filter structure */
if_index_t if_index; /* :: numeric abbreviation for this if */
short if_timer; /* ?: time 'til if_slowtimo called */
unsigned short if_flags; /* i: up/down, broadcast, etc. */
short if_extflags; /* :: if_output MP-safe, etc. */
u_char if_type; /* :: ethernet, tokenring, etc. */
u_char if_addrlen; /* :: media address length */
u_char if_hdrlen; /* :: media header length */
/* XXX audit :? fields here. */
int if_link_state; /* :? current link state */
uint64_t if_mtu; /* :? maximum transmission unit */
uint64_t if_metric; /* :? routing metric (external only) */
uint64_t if_baudrate; /* :? linespeed */
struct timespec if_lastchange; /* :? last operational state change */
#ifdef _KERNEL
percpu_t *if_stats; /* :: statistics */
#else
void *if_stats; /* opaque to user-space */
#endif /* _KERNEL */
/*
* Procedure handles. If you add more of these, don't forget the
* corresponding NULL stub in if.c.
*/
int (*if_output) /* :: output routine (enqueue) */
(struct ifnet *, struct mbuf *, const struct sockaddr *,
const struct rtentry *);
void (*_if_input) /* :: input routine (from h/w driver) */
(struct ifnet *, struct mbuf *);
void (*if_start) /* :: initiate output routine */
(struct ifnet *);
int (*if_transmit) /* :: output routine, must be MP-safe */
(struct ifnet *, struct mbuf *);
int (*if_ioctl) /* :: ioctl routine */
(struct ifnet *, u_long, void *);
int (*if_init) /* :: init routine */
(struct ifnet *);
void (*if_stop) /* :: stop routine */
(struct ifnet *, int);
void (*if_slowtimo) /* :: timer routine */
(struct ifnet *);
#define if_watchdog if_slowtimo
void (*if_drain) /* :: routine to release resources */
(struct ifnet *);
void (*if_bpf_mtap) /* :: bpf routine */
(struct bpf_if *, struct mbuf *, u_int);
struct ifaltq if_snd; /* q: output queue (includes altq) */
struct ifaddr *if_dl; /* i: identity of this interface. */
const struct sockaddr_dl
*if_sadl; /* i: pointer to sockaddr_dl of if_dl */
/*
* May be NULL. If not NULL, it is the address assigned
 * to the interface by the manufacturer, so it is very likely
* to be unique. It MUST NOT be deleted. It is highly
* suitable for deriving the EUI64 for the interface.
*/
struct ifaddr *if_hwdl; /* i: h/w identity */
const uint8_t *if_broadcastaddr;
/* :: linklevel broadcast bytestring */
struct bridge_softc
*if_bridge; /* i: bridge glue */
struct bridge_iflist
*if_bridgeif; /* i: shortcut to interface list entry */
int if_dlt; /* :: data link type (<net/dlt.h>) */
pfil_head_t * if_pfil; /* :: filtering point */
uint64_t if_capabilities;
/* i: interface capabilities */
uint64_t if_capenable; /* i: capabilities enabled */
union {
void * carp_s; /* carp structure (used by !carp ifs) */
struct ifnet *carp_d;/* ptr to carpdev (used by carp ifs) */
} if_carp_ptr; /* ?: */
#define if_carp if_carp_ptr.carp_s
#define if_carpdev if_carp_ptr.carp_d
/*
 * These are pre-computed based on an interface's enabled
* capabilities, for speed elsewhere.
*/
int if_csum_flags_tx;
/* i: M_CSUM_* flags for Tx */
int if_csum_flags_rx;
/* i: M_CSUM_* flags for Rx */
void *if_afdata[AF_MAX];
/* a: */
struct mowner *if_mowner; /* ?: who owns mbufs for this interface */
void *if_lagg; /* :: lagg or agr structure */
void *if_npf_private;/* ?: associated NPF context */
/*
* pf specific data, used only when #if NPF > 0.
*/
void *if_pf_kif; /* ?: pf interface abstraction */
void *if_pf_groups; /* ?: pf interface groups */
/*
* During an ifnet's lifetime, it has only one if_index, but
* an if_index is not sufficient to identify an ifnet
* because during the lifetime of the system, many ifnets may occupy a
* given if_index. Let us tell different ifnets at the same
* if_index apart by their if_index_gen, a unique number that each ifnet
* is assigned when it if_attach()s. Now, the kernel can use the
* pair (if_index, if_index_gen) as a weak reference to an ifnet.
*/
uint64_t if_index_gen; /* :: generation number for the ifnet
* at if_index: if two ifnets' index
* and generation number are both the
* same, they are the same ifnet.
*/
struct sysctllog
*if_sysctl_log; /* :: */
int (*if_initaddr) /* :: */
(struct ifnet *, struct ifaddr *, bool);
int (*if_setflags) /* :: */
(struct ifnet *, const u_short);
kmutex_t *if_ioctl_lock; /* :: */
char *if_description; /* i: interface description */
#ifdef _KERNEL /* XXX kvm(3) */
struct if_slowtimo_data *if_slowtimo_data; /* :: */
struct krwlock *if_afdata_lock;/* :: */
struct if_percpuq
*if_percpuq; /* :: we should remove it in the future */
struct work if_link_work; /* q: linkage on link state work queue */
uint16_t if_link_queue; /* q: masked link state change queue */
/* q: is link state work scheduled? */
bool if_link_scheduled;
struct pslist_entry
if_pslist_entry;/* i: */
struct psref_target
if_psref; /* :: */
struct pslist_head
if_addr_pslist; /* i: */
struct if_deferred_start
*if_deferred_start;
/* :: */
/* XXX should be protocol independent */
LIST_HEAD(, in6_multi)
if_multiaddrs; /* 6: */
khook_list_t *if_linkstate_hooks; /* :: */
#endif
} ifnet_t;
#include <net/if_stats.h>
#define if_name(ifp) ((ifp)->if_xname)
#define IFF_UP 0x0001 /* interface is up */
#define IFF_BROADCAST 0x0002 /* broadcast address valid */
#define IFF_DEBUG 0x0004 /* turn on debugging */
#define IFF_LOOPBACK 0x0008 /* is a loopback net */
#define IFF_POINTOPOINT 0x0010 /* interface is point-to-point link */
#if 0
/* 0x0020 was IFF_NOTRAILERS */
#else
/*
 * sys/compat/svr4 was removed on 19 Dec 2018.
 * IFF_NOTRAILERS itself was then removed by if.h:r1.268 on 5 Feb 2019.
*/
#define IFF_UNNUMBERED 0x0020 /* explicit unnumbered */
#endif
#define IFF_RUNNING 0x0040 /* resources allocated */
#define IFF_NOARP 0x0080 /* no address resolution protocol */
#define IFF_PROMISC 0x0100 /* receive all packets */
#define IFF_ALLMULTI 0x0200 /* OBSOLETE -- DO NOT USE */
/*
* IFF_ALLMULTI obsoleted on 2019-05-15 -- existing non-MP-safe drivers
* can use it for themselves under IFNET_LOCK, but they should be
* converted to use ETHER_F_ALLMULTI under ETHER_LOCK instead. For
* compatibility with existing drivers, if_ethersubr and if_arcsubr
* will set IFF_ALLMULTI according to other flags, but you should not
* rely on this.
*/
#define IFF_OACTIVE 0x0400 /* transmission in progress */
#define IFF_SIMPLEX 0x0800 /* can't hear own transmissions */
#define IFF_LINK0 0x1000 /* per link layer defined bit */
#define IFF_LINK1 0x2000 /* per link layer defined bit */
#define IFF_LINK2 0x4000 /* per link layer defined bit */
#define IFF_MULTICAST 0x8000 /* supports multicast */
#define IFEF_MPSAFE __BIT(0) /* handlers can run in parallel (see below) */
/*
* The guidelines for converting an interface to IFEF_MPSAFE are as follows
*
* Enabling IFEF_MPSAFE on an interface suppresses taking KERNEL_LOCK when
* calling the following handlers:
* - if_start
* - Note that if_transmit is always called without KERNEL_LOCK
* - if_output
* - if_ioctl
* - if_init
* - if_stop
*
* This means that an interface with IFEF_MPSAFE must make the above handlers
* MP-safe or take KERNEL_LOCK by itself inside handlers that aren't MP-safe
* yet.
*
* There are some additional restrictions to access member variables of struct
* ifnet:
* - if_flags
* - Must be updated with holding IFNET_LOCK
* - You cannot use the flag in Tx/Rx paths anymore because there is no
* synchronization on the flag except for IFNET_LOCK
* - Note that IFNET_LOCK can't be taken in softint because it's known
* that it causes a deadlock
 * - Some synchronization mechanisms such as pserialize_perform are called
 *   with IFNET_LOCK held and also require context switches on every CPU,
 *   which means softints must finish; so trying to take IFNET_LOCK in
 *   softint might block on IFNET_LOCK and prevent such synchronization
 *   mechanisms from being completed
* - Currently the deadlock occurs only if NET_MPSAFE is enabled, however,
* we should deal with the restriction because NET_MPSAFE will be enabled
* by default in the future
* - if_watchdog and if_timer
* - The watchdog framework works only for non-IFEF_MPSAFE interfaces
* that rely on KERNEL_LOCK
 * - Interfaces with IFEF_MPSAFE have to provide their own watchdog mechanism
* if needed
* - Keep if_watchdog NULL when calling if_attach
*/
#ifdef _KERNEL
static __inline bool
if_is_mpsafe(struct ifnet *ifp)
{
return ((ifp->if_extflags & IFEF_MPSAFE) != 0);
}
static __inline int
if_output_lock(struct ifnet *cifp, struct ifnet *ifp, struct mbuf *m,
const struct sockaddr *dst, const struct rtentry *rt)
{
if (if_is_mpsafe(cifp)) {
return (*cifp->if_output)(ifp, m, dst, rt);
} else {
int ret;
KERNEL_LOCK(1, NULL);
ret = (*cifp->if_output)(ifp, m, dst, rt);
KERNEL_UNLOCK_ONE(NULL);
return ret;
}
}
static __inline void
if_start_lock(struct ifnet *ifp)
{
if (if_is_mpsafe(ifp)) {
(*ifp->if_start)(ifp);
} else {
KERNEL_LOCK(1, NULL);
(*ifp->if_start)(ifp);
KERNEL_UNLOCK_ONE(NULL);
}
}
#define KERNEL_LOCK_IF_IFP_MPSAFE(ifp) \
do { if (if_is_mpsafe(ifp)) { KERNEL_LOCK(1, NULL); } } while (0)
#define KERNEL_UNLOCK_IF_IFP_MPSAFE(ifp) \
do { if (if_is_mpsafe(ifp)) { KERNEL_UNLOCK_ONE(NULL); } } while (0)
#define KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp) \
do { if (!if_is_mpsafe(ifp)) { KERNEL_LOCK(1, NULL); } } while (0)
#define KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp) \
do { if (!if_is_mpsafe(ifp)) { KERNEL_UNLOCK_ONE(NULL); } } while (0)
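/*
 * Illustrative sketch (hypothetical wrapper, mirroring if_output_lock()
 * and if_start_lock() above): code that may face both MP-safe and
 * legacy interfaces brackets a handler call with the macros above, so
 * that KERNEL_LOCK is taken only for non-IFEF_MPSAFE interfaces.
 */
#if 0
static __inline int
example_if_init(struct ifnet *ifp)
{
	int error;

	KERNEL_LOCK_UNLESS_IFP_MPSAFE(ifp);
	error = (*ifp->if_init)(ifp);
	KERNEL_UNLOCK_UNLESS_IFP_MPSAFE(ifp);
	return error;
}
#endif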
#ifdef _KERNEL_OPT
#include "opt_net_mpsafe.h"
#endif
/* XXX explore a better place to define */
#ifdef NET_MPSAFE
#define KERNEL_LOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define KERNEL_UNLOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_LOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_UNLOCK_UNLESS_NET_MPSAFE() do { } while (0)
#define SOFTNET_LOCK_IF_NET_MPSAFE() \
do { mutex_enter(softnet_lock); } while (0)
#define SOFTNET_UNLOCK_IF_NET_MPSAFE() \
do { mutex_exit(softnet_lock); } while (0)
#else /* NET_MPSAFE */
#define KERNEL_LOCK_UNLESS_NET_MPSAFE() \
do { KERNEL_LOCK(1, NULL); } while (0)
#define KERNEL_UNLOCK_UNLESS_NET_MPSAFE() \
do { KERNEL_UNLOCK_ONE(NULL); } while (0)
#define SOFTNET_LOCK_UNLESS_NET_MPSAFE() \
do { mutex_enter(softnet_lock); } while (0)
#define SOFTNET_UNLOCK_UNLESS_NET_MPSAFE() \
do { mutex_exit(softnet_lock); } while (0)
#define SOFTNET_LOCK_IF_NET_MPSAFE() do { } while (0)
#define SOFTNET_UNLOCK_IF_NET_MPSAFE() do { } while (0)
#endif /* NET_MPSAFE */
#define SOFTNET_KERNEL_LOCK_UNLESS_NET_MPSAFE() \
do { \
SOFTNET_LOCK_UNLESS_NET_MPSAFE(); \
KERNEL_LOCK_UNLESS_NET_MPSAFE(); \
} while (0)
#define SOFTNET_KERNEL_UNLOCK_UNLESS_NET_MPSAFE() \
do { \
KERNEL_UNLOCK_UNLESS_NET_MPSAFE(); \
SOFTNET_UNLOCK_UNLESS_NET_MPSAFE(); \
} while (0)
#endif /* _KERNEL */
#define IFFBITS \
"\020\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5POINTOPOINT\6UNNUMBERED" \
"\7RUNNING\10NOARP\11PROMISC\12ALLMULTI\13OACTIVE\14SIMPLEX" \
"\15LINK0\16LINK1\17LINK2\20MULTICAST"
/* flags set internally only: */
#define IFF_CANTCHANGE \
(IFF_BROADCAST|IFF_POINTOPOINT|IFF_RUNNING|IFF_OACTIVE|\
IFF_SIMPLEX|IFF_MULTICAST|IFF_ALLMULTI|IFF_PROMISC)
/*
* Some convenience macros used for setting ifi_baudrate.
*/
#define IF_Kbps(x) ((x) * 1000ULL) /* kilobits/sec. */
#define IF_Mbps(x) (IF_Kbps((x) * 1000ULL)) /* megabits/sec. */
#define IF_Gbps(x) (IF_Mbps((x) * 1000ULL)) /* gigabits/sec. */
/* Capabilities that interfaces can advertise. */
/* 0x01 .. 0x40 were previously used */
#define IFCAP_TSOv4 0x00080 /* can do TCPv4 segmentation offload */
#define IFCAP_CSUM_IPv4_Rx 0x00100 /* can do IPv4 header checksums (Rx) */
#define IFCAP_CSUM_IPv4_Tx 0x00200 /* can do IPv4 header checksums (Tx) */
#define IFCAP_CSUM_TCPv4_Rx 0x00400 /* can do IPv4/TCP checksums (Rx) */
#define IFCAP_CSUM_TCPv4_Tx 0x00800 /* can do IPv4/TCP checksums (Tx) */
#define IFCAP_CSUM_UDPv4_Rx 0x01000 /* can do IPv4/UDP checksums (Rx) */
#define IFCAP_CSUM_UDPv4_Tx 0x02000 /* can do IPv4/UDP checksums (Tx) */
#define IFCAP_CSUM_TCPv6_Rx 0x04000 /* can do IPv6/TCP checksums (Rx) */
#define IFCAP_CSUM_TCPv6_Tx 0x08000 /* can do IPv6/TCP checksums (Tx) */
#define IFCAP_CSUM_UDPv6_Rx 0x10000 /* can do IPv6/UDP checksums (Rx) */
#define IFCAP_CSUM_UDPv6_Tx 0x20000 /* can do IPv6/UDP checksums (Tx) */
#define IFCAP_TSOv6 0x40000 /* can do TCPv6 segmentation offload */
#define IFCAP_LRO 0x80000 /* can do Large Receive Offload */
#define IFCAP_MASK 0xfff80 /* currently valid capabilities */
#define IFCAPBITS \
"\020" \
"\10TSO4" \
"\11IP4CSUM_Rx" \
"\12IP4CSUM_Tx" \
"\13TCP4CSUM_Rx" \
"\14TCP4CSUM_Tx" \
"\15UDP4CSUM_Rx" \
"\16UDP4CSUM_Tx" \
"\17TCP6CSUM_Rx" \
"\20TCP6CSUM_Tx" \
"\21UDP6CSUM_Rx" \
"\22UDP6CSUM_Tx" \
"\23TSO6" \
"\24LRO" \
#define IF_AFDATA_LOCK_INIT(ifp) \
do {(ifp)->if_afdata_lock = rw_obj_alloc();} while (0)
#define IF_AFDATA_LOCK_DESTROY(ifp) rw_obj_free((ifp)->if_afdata_lock)
#define IF_AFDATA_WLOCK(ifp) rw_enter((ifp)->if_afdata_lock, RW_WRITER)
#define IF_AFDATA_RLOCK(ifp) rw_enter((ifp)->if_afdata_lock, RW_READER)
#define IF_AFDATA_WUNLOCK(ifp) rw_exit((ifp)->if_afdata_lock)
#define IF_AFDATA_RUNLOCK(ifp) rw_exit((ifp)->if_afdata_lock)
#define IF_AFDATA_LOCK(ifp) IF_AFDATA_WLOCK(ifp)
#define IF_AFDATA_UNLOCK(ifp) IF_AFDATA_WUNLOCK(ifp)
#define IF_AFDATA_TRYLOCK(ifp) rw_tryenter((ifp)->if_afdata_lock, RW_WRITER)
#define IF_AFDATA_LOCK_ASSERT(ifp) \
KASSERT(rw_lock_held((ifp)->if_afdata_lock))
#define IF_AFDATA_RLOCK_ASSERT(ifp) \
KASSERT(rw_read_held((ifp)->if_afdata_lock))
#define IF_AFDATA_WLOCK_ASSERT(ifp) \
KASSERT(rw_write_held((ifp)->if_afdata_lock))
/*
* Output queues (ifp->if_snd) and internetwork datagram level (pup level 1)
* input routines have queues of messages stored on ifqueue structures
* (defined above). Entries are added to and deleted from these structures
* by these macros, which should be called with ipl raised to splnet().
*/
#define IF_QFULL(ifq) ((ifq)->ifq_len >= (ifq)->ifq_maxlen)
#define IF_DROP(ifq) ((ifq)->ifq_drops++)
#define IF_ENQUEUE(ifq, m) do { \
(m)->m_nextpkt = 0; \
if ((ifq)->ifq_tail == 0) \
(ifq)->ifq_head = m; \
else \
(ifq)->ifq_tail->m_nextpkt = m; \
(ifq)->ifq_tail = m; \
(ifq)->ifq_len++; \
} while (/*CONSTCOND*/0)
#define IF_PREPEND(ifq, m) do { \
(m)->m_nextpkt = (ifq)->ifq_head; \
if ((ifq)->ifq_tail == 0) \
(ifq)->ifq_tail = (m); \
(ifq)->ifq_head = (m); \
(ifq)->ifq_len++; \
} while (/*CONSTCOND*/0)
#define IF_DEQUEUE(ifq, m) do { \
(m) = (ifq)->ifq_head; \
if (m) { \
if (((ifq)->ifq_head = (m)->m_nextpkt) == 0) \
(ifq)->ifq_tail = 0; \
(m)->m_nextpkt = 0; \
(ifq)->ifq_len--; \
} \
} while (/*CONSTCOND*/0)
#define IF_POLL(ifq, m) ((m) = (ifq)->ifq_head)
#define IF_PURGE(ifq) \
do { \
struct mbuf *__m0; \
\
for (;;) { \
IF_DEQUEUE((ifq), __m0); \
if (__m0 == NULL) \
break; \
else \
m_freem(__m0); \
} \
} while (/*CONSTCOND*/ 0)
#define IF_IS_EMPTY(ifq) ((ifq)->ifq_len == 0)
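/*
 * Illustrative sketch of the classic pattern the comment above
 * describes (the helper name is hypothetical): check for a full queue,
 * account for the drop, otherwise enqueue, all with the ipl raised to
 * splnet().
 */
#if 0
static void
example_enqueue(struct ifqueue *ifq, struct mbuf *m)
{
	int s;

	s = splnet();
	if (IF_QFULL(ifq)) {
		IF_DROP(ifq);
		m_freem(m);		/* queue full: drop the packet */
	} else
		IF_ENQUEUE(ifq, m);
	splx(s);
}
#endif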
#ifndef IFQ_MAXLEN
#define IFQ_MAXLEN 256
#endif
#define IFNET_SLOWHZ 1 /* granularity is 1 second */
/*
* Structure defining statistics and other data kept regarding an address
* on a network interface.
*/
struct ifaddr_data {
int64_t ifad_inbytes;
int64_t ifad_outbytes;
};
/*
* The ifaddr structure contains information about one address
* of an interface. They are maintained by the different address families,
* are allocated and attached when an address is set, and are linked
* together so all addresses for an interface can be located.
*/
struct ifaddr {
struct sockaddr *ifa_addr; /* address of interface */
struct sockaddr *ifa_dstaddr; /* other end of p-to-p link */
#define ifa_broadaddr ifa_dstaddr /* broadcast address interface */
struct sockaddr *ifa_netmask; /* used to determine subnet */
struct ifnet *ifa_ifp; /* back-pointer to interface */
TAILQ_ENTRY(ifaddr) ifa_list; /* list of addresses for interface */
struct ifaddr_data ifa_data; /* statistics on the address */
void (*ifa_rtrequest) /* check or clean routes (+ or -)'d */
(int, struct rtentry *, const struct rt_addrinfo *);
u_int ifa_flags; /* mostly rt_flags for cloning */
int ifa_refcnt; /* count of references */
int ifa_metric; /* cost of going out this interface */
struct ifaddr *(*ifa_getifa)(struct ifaddr *,
const struct sockaddr *);
uint32_t *ifa_seqno;
int16_t ifa_preference; /* preference level for this address */
#ifdef _KERNEL
struct pslist_entry ifa_pslist_entry;
struct psref_target ifa_psref;
#endif
};
#define IFA_ROUTE RTF_UP /* (0x01) route installed */
#define IFA_DESTROYING 0x2
/*
* Message format for use in obtaining information about interfaces from
* sysctl and the routing socket. We need to force 64-bit alignment if we
* aren't using compatibility definitions.
*/
#if !defined(_KERNEL) || !defined(COMPAT_RTSOCK)
#define __align64 __aligned(sizeof(uint64_t))
#else
#define __align64
#endif
struct if_msghdr {
u_short ifm_msglen __align64;
/* to skip over non-understood messages */
u_char ifm_version; /* future binary compatibility */
u_char ifm_type; /* message type */
int ifm_addrs; /* like rtm_addrs */
int ifm_flags; /* value of if_flags */
u_short ifm_index; /* index for associated ifp */
struct if_data ifm_data __align64;
/* statistics and other data about if */
};
/*
* Message format for use in obtaining information about interface addresses
* from sysctl and the routing socket.
*/
struct ifa_msghdr {
u_short ifam_msglen __align64;
/* to skip over non-understood messages */
u_char ifam_version; /* future binary compatibility */
u_char ifam_type; /* message type */
u_short ifam_index; /* index for associated ifp */
int ifam_flags; /* value of ifa_flags */
int ifam_addrs; /* like rtm_addrs */
pid_t ifam_pid; /* identify sender */
int ifam_addrflags; /* family specific address flags */
int ifam_metric; /* value of ifa_metric */
};
/*
* Message format announcing the arrival or departure of a network interface.
*/
struct if_announcemsghdr {
u_short ifan_msglen __align64;
/* to skip over non-understood messages */
u_char ifan_version; /* future binary compatibility */
u_char ifan_type; /* message type */
u_short ifan_index; /* index for associated ifp */
char ifan_name[IFNAMSIZ]; /* if name, e.g. "en0" */
u_short ifan_what; /* what type of announcement */
};
#define IFAN_ARRIVAL 0 /* interface arrival */
#define IFAN_DEPARTURE 1 /* interface departure */
#undef __align64
/*
* Interface request structure used for socket
* ioctl's. All interface ioctl's must have parameter
* definitions which begin with ifr_name. The
* remainder may be interface specific.
*/
struct ifreq {
char ifr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
union {
struct sockaddr ifru_addr;
struct sockaddr ifru_dstaddr;
struct sockaddr ifru_broadaddr;
struct sockaddr_storage ifru_space;
short ifru_flags;
int ifru_addrflags;
int ifru_metric;
int ifru_mtu;
int ifru_dlt;
u_int ifru_value;
void * ifru_data;
struct {
uint32_t b_buflen;
void *b_buf;
} ifru_b;
} ifr_ifru;
#define ifr_addr ifr_ifru.ifru_addr /* address */
#define ifr_dstaddr ifr_ifru.ifru_dstaddr /* other end of p-to-p link */
#define ifr_broadaddr ifr_ifru.ifru_broadaddr /* broadcast address */
#define ifr_space ifr_ifru.ifru_space /* sockaddr_storage */
#define ifr_flags ifr_ifru.ifru_flags /* flags */
#define ifr_addrflags ifr_ifru.ifru_addrflags /* addr flags */
#define ifr_metric ifr_ifru.ifru_metric /* metric */
#define ifr_mtu ifr_ifru.ifru_mtu /* mtu */
#define ifr_dlt ifr_ifru.ifru_dlt /* data link type (DLT_*) */
#define ifr_value ifr_ifru.ifru_value /* generic value */
#define ifr_media ifr_ifru.ifru_metric /* media options (overload) */
#define ifr_data ifr_ifru.ifru_data /* for use by interface
* XXX deprecated
*/
#define ifr_buf ifr_ifru.ifru_b.b_buf /* new interface ioctls */
#define ifr_buflen ifr_ifru.ifru_b.b_buflen
#define ifr_index ifr_ifru.ifru_value /* interface index, BSD */
#define ifr_ifindex ifr_index /* interface index, linux */
};
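/*
 * Illustrative sketch (not part of the original header): minimal userland
 * use of struct ifreq with an interface ioctl (SIOCGIFFLAGS).  The
 * example_get_flags name is invented; error handling is reduced to the
 * essentials.
 */
#if 0 /* example only */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>
#include <unistd.h>

static int
example_get_flags(const char *name, short *flagsp)
{
    struct ifreq ifr;
    int s, rv = 0;

    memset(&ifr, 0, sizeof(ifr));
    strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));

    s = socket(AF_INET, SOCK_DGRAM, 0);
    if (s == -1)
        return -1;
    if (ioctl(s, SIOCGIFFLAGS, &ifr) == -1)
        rv = -1;
    else
        *flagsp = ifr.ifr_flags;    /* short, see ifru_flags above */
    close(s);
    return rv;
}
#endif /* example only */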
#ifdef _KERNEL
#define ifreq_setdstaddr ifreq_setaddr
#define ifreq_setbroadaddr ifreq_setaddr
#define ifreq_getdstaddr ifreq_getaddr
#define ifreq_getbroadaddr ifreq_getaddr
static __inline const struct sockaddr *
/*ARGSUSED*/
ifreq_getaddr(u_long cmd, const struct ifreq *ifr)
{
return &ifr->ifr_addr;
}
#endif /* _KERNEL */
struct ifcapreq {
char ifcr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
uint64_t ifcr_capabilities; /* supported capabilities */
uint64_t ifcr_capenable; /* capabilities enabled */
};
struct ifaliasreq {
char ifra_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct sockaddr ifra_addr;
struct sockaddr ifra_dstaddr;
#define ifra_broadaddr ifra_dstaddr
struct sockaddr ifra_mask;
};
struct ifdatareq {
char ifdr_name[IFNAMSIZ]; /* if name, e.g. "en0" */
struct if_data ifdr_data;
};
struct ifmediareq {
char ifm_name[IFNAMSIZ]; /* if name, e.g. "en0" */
int ifm_current; /* IFMWD: current media options */
int ifm_mask; /* IFMWD: don't care mask */
int ifm_status; /* media status */
int ifm_active; /* IFMWD: active options */
int ifm_count; /* # entries in ifm_ulist
array */
int *ifm_ulist; /* array of ifmedia words */
};
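/*
 * Illustrative sketch (not part of the original header): querying the
 * active media word with SIOCGIFMEDIA.  Leaving ifm_ulist NULL and
 * ifm_count 0 fetches only the status/active words plus the list size;
 * a second call with a buffer would retrieve the full media list.  The
 * example_get_media name is invented.
 */
#if 0 /* example only */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <string.h>

static int
example_get_media(int s, const char *name, int *activep)
{
    struct ifmediareq ifmr;

    memset(&ifmr, 0, sizeof(ifmr));
    strlcpy(ifmr.ifm_name, name, sizeof(ifmr.ifm_name));
    if (ioctl(s, SIOCGIFMEDIA, &ifmr) == -1)
        return -1;
    *activep = ifmr.ifm_active;     /* IFM_* word of the active media */
    return 0;
}
#endif /* example only */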
struct ifdrv {
char ifd_name[IFNAMSIZ]; /* if name, e.g. "en0" */
unsigned long ifd_cmd;
size_t ifd_len;
void *ifd_data;
};
#define IFLINKSTR_QUERYLEN 0x01
#define IFLINKSTR_UNSET 0x02
/*
* Structure used in SIOCGIFCONF request.
* Used to retrieve interface configuration
* for machine (useful for programs which
* must know all networks accessible).
*/
struct ifconf {
int ifc_len; /* size of associated buffer */
union {
void * ifcu_buf;
struct ifreq *ifcu_req;
} ifc_ifcu;
#define ifc_buf ifc_ifcu.ifcu_buf /* buffer address */
#define ifc_req ifc_ifcu.ifcu_req /* array of structures returned */
};
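/*
 * Illustrative sketch (not part of the original header): the usual
 * two-pass SIOCGIFCONF idiom -- call once with ifc_req == NULL so the
 * kernel reports the required length in ifc_len, then call again with a
 * buffer.  Walking the result must step by each sockaddr's sa_len (not
 * shown); getifaddrs(3) is normally preferred in new code.  The
 * example_getifconf name is invented.
 */
#if 0 /* example only */
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdlib.h>
#include <string.h>

static struct ifreq *
example_getifconf(int s, int *lenp)
{
    struct ifconf ifc;

    memset(&ifc, 0, sizeof(ifc));
    if (ioctl(s, SIOCGIFCONF, &ifc) == -1)  /* sizing pass */
        return NULL;
    ifc.ifc_buf = malloc(ifc.ifc_len);
    if (ifc.ifc_buf == NULL)
        return NULL;
    if (ioctl(s, SIOCGIFCONF, &ifc) == -1) {
        free(ifc.ifc_buf);
        return NULL;
    }
    *lenp = ifc.ifc_len;            /* bytes actually filled in */
    return ifc.ifc_req;
}
#endif /* example only */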
/*
* Structure for SIOC[AGD]LIFADDR
*/
struct if_laddrreq {
char iflr_name[IFNAMSIZ];
unsigned int flags;
#define IFLR_PREFIX 0x8000 /* in: prefix given out: kernel fills id */
#define IFLR_ACTIVE 0x4000 /* in/out: link-layer address activation */
#define IFLR_FACTORY 0x2000 /* in/out: factory link-layer address */
unsigned int prefixlen; /* in/out */
struct sockaddr_storage addr; /* in/out */
struct sockaddr_storage dstaddr; /* out */
};
/*
* Structure for SIOC[SG]IFADDRPREF
*/
struct if_addrprefreq {
char ifap_name[IFNAMSIZ];
int16_t ifap_preference; /* in/out */
struct sockaddr_storage ifap_addr; /* in/out */
};
#include <net/if_arp.h>
#endif /* _NETBSD_SOURCE */
#ifdef _KERNEL
#ifdef ALTQ
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_ENQUEUE((ifq), (m), (err)); \
else { \
if (IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
IF_ENQUEUE((ifq), (m)); \
(err) = 0; \
} \
} \
if ((err)) \
(ifq)->ifq_drops++; \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue((ifq), ALTDQ_REMOVE); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_DEQUEUE((ifq), (m)); \
else \
IF_DEQUEUE((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_POLL(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (TBR_IS_ENABLED(ifq)) \
(m) = tbr_dequeue((ifq), ALTDQ_POLL); \
else if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_POLL((ifq), (m)); \
else \
IF_POLL((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_PURGE(ifq) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) \
ALTQ_PURGE(ifq); \
else \
IF_PURGE(ifq); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_SET_READY(ifq) \
do { \
(ifq)->altq_flags |= ALTQF_READY; \
} while (/*CONSTCOND*/ 0)
#define IFQ_CLASSIFY(ifq, m, af) \
do { \
KASSERT(((m)->m_flags & M_PKTHDR) != 0); \
mutex_enter((ifq)->ifq_lock); \
if (ALTQ_IS_ENABLED(ifq)) { \
if (ALTQ_NEEDS_CLASSIFY(ifq)) \
(m)->m_pkthdr.pattr_class = (*(ifq)->altq_classify) \
((ifq)->altq_clfier, (m), (af)); \
(m)->m_pkthdr.pattr_af = (af); \
(m)->m_pkthdr.pattr_hdr = mtod((m), void *); \
} \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#else /* ! ALTQ */
#define IFQ_ENQUEUE(ifq, m, err) \
do { \
mutex_enter((ifq)->ifq_lock); \
if (IF_QFULL(ifq)) { \
m_freem(m); \
(err) = ENOBUFS; \
} else { \
IF_ENQUEUE((ifq), (m)); \
(err) = 0; \
} \
if (err) \
(ifq)->ifq_drops++; \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_DEQUEUE(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_DEQUEUE((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_POLL(ifq, m) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_POLL((ifq), (m)); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_PURGE(ifq) \
do { \
mutex_enter((ifq)->ifq_lock); \
IF_PURGE(ifq); \
mutex_exit((ifq)->ifq_lock); \
} while (/*CONSTCOND*/ 0)
#define IFQ_SET_READY(ifq) /* nothing */
#define IFQ_CLASSIFY(ifq, m, af) /* nothing */
#endif /* ALTQ */
#define IFQ_LOCK_INIT(ifq) (ifq)->ifq_lock = \
mutex_obj_alloc(MUTEX_DEFAULT, IPL_NET)
#define IFQ_LOCK_DESTROY(ifq) mutex_obj_free((ifq)->ifq_lock)
#define IFQ_LOCK(ifq) mutex_enter((ifq)->ifq_lock)
#define IFQ_UNLOCK(ifq) mutex_exit((ifq)->ifq_lock)
#define IFQ_IS_EMPTY(ifq) IF_IS_EMPTY(ifq)
#define IFQ_INC_LEN(ifq) ((ifq)->ifq_len++)
#define IFQ_DEC_LEN(ifq) (--(ifq)->ifq_len)
#define IFQ_INC_DROPS(ifq) ((ifq)->ifq_drops++)
#define IFQ_SET_MAXLEN(ifq, len) ((ifq)->ifq_maxlen = (len))
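/*
 * Illustrative sketch (not part of the original header): the traditional
 * driver if_start loop draining ifp->if_snd with IFQ_DEQUEUE.  xx_start
 * and xx_encap are invented names; real drivers also handle OACTIVE,
 * bpf taps and error statistics.
 */
#if 0 /* example only */
static int xx_encap(struct ifnet *, struct mbuf *);    /* hypothetical */

static void
xx_start(struct ifnet *ifp)
{
    struct mbuf *m;

    for (;;) {
        IFQ_DEQUEUE(&ifp->if_snd, m);
        if (m == NULL)
            break;
        if (xx_encap(ifp, m) != 0) {
            m_freem(m);     /* error accounting omitted */
            break;
        }
    }
}
#endif /* example only */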
#include <sys/mallocvar.h>
MALLOC_DECLARE(M_IFADDR);
MALLOC_DECLARE(M_IFMADDR);
int ifreq_setaddr(u_long, struct ifreq *, const struct sockaddr *);
struct ifnet *if_alloc(u_char);
void if_free(struct ifnet *);
void if_initname(struct ifnet *, const char *, int);
struct ifaddr *if_dl_create(const struct ifnet *, const struct sockaddr_dl **);
void if_activate_sadl(struct ifnet *, struct ifaddr *,
const struct sockaddr_dl *);
void if_set_sadl(struct ifnet *, const void *, u_char, bool);
void if_alloc_sadl(struct ifnet *);
void if_free_sadl(struct ifnet *, int);
void if_initialize(struct ifnet *);
void if_register(struct ifnet *);
void if_attach(struct ifnet *); /* Deprecated. Use if_initialize and if_register */
void if_attachdomain(void);
void if_deactivate(struct ifnet *);
bool if_is_deactivated(const struct ifnet *);
void if_export_if_data(struct ifnet *, struct if_data *, bool);
void if_purgeaddrs(struct ifnet *, int, void (*)(struct ifaddr *));
void if_detach(struct ifnet *);
void if_down(struct ifnet *);
void if_down_locked(struct ifnet *);
void if_link_state_change(struct ifnet *, int);
void if_domain_link_state_change(struct ifnet *, int);
void if_up(struct ifnet *);
void ifinit(void);
void ifinit1(void);
void ifinit_post(void);
int ifaddrpref_ioctl(struct socket *, u_long, void *, struct ifnet *);
extern int (*ifioctl)(struct socket *, u_long, void *, struct lwp *);
int ifioctl_common(struct ifnet *, u_long, void *);
int ifpromisc(struct ifnet *, int);
int ifpromisc_locked(struct ifnet *, int);
int if_addr_init(ifnet_t *, struct ifaddr *, bool);
int if_do_dad(struct ifnet *);
int if_mcast_op(ifnet_t *, const unsigned long, const struct sockaddr *);
int if_flags_set(struct ifnet *, const u_short);
int if_clone_list(int, char *, int *);
int if_ioctl(struct ifnet *, u_long, void *);
int if_init(struct ifnet *);
void if_stop(struct ifnet *, int);
struct ifnet *ifunit(const char *);
struct ifnet *if_get(const char *, struct psref *);
ifnet_t *if_byindex(u_int);
ifnet_t *_if_byindex(u_int);
ifnet_t *if_get_byindex(u_int, struct psref *);
ifnet_t *if_get_bylla(const void *, unsigned char, struct psref *);
void if_put(const struct ifnet *, struct psref *);
void if_acquire(struct ifnet *, struct psref *);
#define if_release if_put
int if_tunnel_check_nesting(struct ifnet *, struct mbuf *, int);
percpu_t *if_tunnel_alloc_ro_percpu(void);
void if_tunnel_free_ro_percpu(percpu_t *);
void if_tunnel_ro_percpu_rtcache_free(percpu_t *);
struct tunnel_ro {
struct route *tr_ro;
kmutex_t *tr_lock;
};
static inline void
if_tunnel_get_ro(percpu_t *ro_percpu, struct route **ro, kmutex_t **lock)
{
struct tunnel_ro *tro;
tro = percpu_getref(ro_percpu);
*ro = tro->tr_ro;
*lock = tro->tr_lock;
mutex_enter(*lock);
}
static inline void
if_tunnel_put_ro(percpu_t *ro_percpu, kmutex_t *lock)
{
mutex_exit(lock);
percpu_putref(ro_percpu);
}
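/*
 * Illustrative sketch (not part of the original header): bracketing use
 * of a tunnel's per-cpu cached route with if_tunnel_get_ro and
 * if_tunnel_put_ro.  xx_use_cached_route is an invented name; a real
 * tunnel would look up or validate the route via rtcache_* between the
 * two calls.
 */
#if 0 /* example only */
static void
xx_use_cached_route(percpu_t *ro_percpu)
{
    struct route *ro;
    kmutex_t *lock;

    if_tunnel_get_ro(ro_percpu, &ro, &lock);
    /* ... rtcache_lookup()/rtcache_validate() on ro ... */
    if_tunnel_put_ro(ro_percpu, lock);
}
#endif /* example only */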
static __inline if_index_t
if_get_index(const struct ifnet *ifp)
{
return ifp != NULL ? ifp->if_index : 0;
}
bool if_held(struct ifnet *);
void if_input(struct ifnet *, struct mbuf *);
struct if_percpuq *
if_percpuq_create(struct ifnet *);
void if_percpuq_destroy(struct if_percpuq *);
void
if_percpuq_enqueue(struct if_percpuq *, struct mbuf *);
void if_deferred_start_init(struct ifnet *, void (*)(struct ifnet *));
void if_schedule_deferred_start(struct ifnet *);
void ifa_insert(struct ifnet *, struct ifaddr *);
void ifa_remove(struct ifnet *, struct ifaddr *);
void ifa_psref_init(struct ifaddr *);
void ifa_acquire(struct ifaddr *, struct psref *);
void ifa_release(struct ifaddr *, struct psref *);
bool ifa_held(struct ifaddr *);
bool ifa_is_destroying(struct ifaddr *);
void ifaref(struct ifaddr *);
void ifafree(struct ifaddr *);
struct ifaddr *ifa_ifwithaddr(const struct sockaddr *);
struct ifaddr *ifa_ifwithaddr_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifa_ifwithaf(int);
struct ifaddr *ifa_ifwithdstaddr(const struct sockaddr *);
struct ifaddr *ifa_ifwithdstaddr_psref(const struct sockaddr *,
struct psref *);
struct ifaddr *ifa_ifwithnet(const struct sockaddr *);
struct ifaddr *ifa_ifwithnet_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifa_ifwithladdr(const struct sockaddr *);
struct ifaddr *ifa_ifwithladdr_psref(const struct sockaddr *, struct psref *);
struct ifaddr *ifaof_ifpforaddr(const struct sockaddr *, struct ifnet *);
struct ifaddr *ifaof_ifpforaddr_psref(const struct sockaddr *, struct ifnet *,
struct psref *);
void link_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
void p2p_rtrequest(int, struct rtentry *, const struct rt_addrinfo *);
void if_clone_attach(struct if_clone *);
void if_clone_detach(struct if_clone *);
int if_transmit_lock(struct ifnet *, struct mbuf *);
int ifq_enqueue(struct ifnet *, struct mbuf *);
int ifq_enqueue2(struct ifnet *, struct ifqueue *, struct mbuf *);
int loioctl(struct ifnet *, u_long, void *);
void loopattach(int);
void loopinit(void);
int looutput(struct ifnet *,
struct mbuf *, const struct sockaddr *, const struct rtentry *);
void * if_linkstate_change_establish(struct ifnet *,
void (*)(void *), void *);
void if_linkstate_change_disestablish(struct ifnet *,
void *, kmutex_t *);
/*
* These are exported because they're an easy way to tell if
* an interface is going away without having to burn a flag.
*/
int if_nulloutput(struct ifnet *, struct mbuf *,
const struct sockaddr *, const struct rtentry *);
void if_nullinput(struct ifnet *, struct mbuf *);
void if_nullstart(struct ifnet *);
int if_nulltransmit(struct ifnet *, struct mbuf *);
int if_nullioctl(struct ifnet *, u_long, void *);
int if_nullinit(struct ifnet *);
void if_nullstop(struct ifnet *, int);
void if_nullslowtimo(struct ifnet *);
#define if_nullwatchdog if_nullslowtimo
void if_nulldrain(struct ifnet *);
#else
struct if_nameindex {
unsigned int if_index; /* 1, 2, ... */
char *if_name; /* null terminated name: "le0", ... */
};
#include <sys/cdefs.h>
__BEGIN_DECLS
unsigned int if_nametoindex(const char *);
char * if_indextoname(unsigned int, char *);
struct if_nameindex * if_nameindex(void);
void if_freenameindex(struct if_nameindex *);
__END_DECLS
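/*
 * Illustrative sketch (not part of the original header): typical userland
 * use of the name/index functions declared above.  example_print_index
 * is an invented name.
 */
#if 0 /* example only */
#include <net/if.h>
#include <stdio.h>

static void
example_print_index(const char *name)
{
    char buf[IF_NAMESIZE];
    unsigned int idx;

    idx = if_nametoindex(name);
    if (idx == 0) {
        printf("%s: no such interface\n", name);
        return;
    }
    if (if_indextoname(idx, buf) != NULL)
        printf("%s has index %u\n", buf, idx);
}
#endif /* example only */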
#endif /* _KERNEL */ /* XXX really ALTQ? */
#ifdef _KERNEL
#define IFADDR_FIRST(__ifp) TAILQ_FIRST(&(__ifp)->if_addrlist)
#define IFADDR_NEXT(__ifa) TAILQ_NEXT((__ifa), ifa_list)
#define IFADDR_FOREACH(__ifa, __ifp) TAILQ_FOREACH(__ifa, \
&(__ifp)->if_addrlist, ifa_list)
#define IFADDR_FOREACH_SAFE(__ifa, __ifp, __nifa) \
TAILQ_FOREACH_SAFE(__ifa, \
&(__ifp)->if_addrlist, ifa_list, __nifa)
#define IFADDR_EMPTY(__ifp) TAILQ_EMPTY(&(__ifp)->if_addrlist)
#define IFADDR_ENTRY_INIT(__ifa) \
PSLIST_ENTRY_INIT((__ifa), ifa_pslist_entry)
#define IFADDR_ENTRY_DESTROY(__ifa) \
PSLIST_ENTRY_DESTROY((__ifa), ifa_pslist_entry)
#define IFADDR_READER_EMPTY(__ifp) \
(PSLIST_READER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry) == NULL)
#define IFADDR_READER_FIRST(__ifp) \
PSLIST_READER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry)
#define IFADDR_READER_NEXT(__ifa) \
PSLIST_READER_NEXT((__ifa), struct ifaddr, ifa_pslist_entry)
#define IFADDR_READER_FOREACH(__ifa, __ifp) \
PSLIST_READER_FOREACH((__ifa), &(__ifp)->if_addr_pslist, struct ifaddr,\
ifa_pslist_entry)
#define IFADDR_WRITER_INSERT_HEAD(__ifp, __ifa) \
PSLIST_WRITER_INSERT_HEAD(&(__ifp)->if_addr_pslist, (__ifa), \
ifa_pslist_entry)
#define IFADDR_WRITER_REMOVE(__ifa) \
PSLIST_WRITER_REMOVE((__ifa), ifa_pslist_entry)
#define IFADDR_WRITER_FOREACH(__ifa, __ifp) \
PSLIST_WRITER_FOREACH((__ifa), &(__ifp)->if_addr_pslist, struct ifaddr,\
ifa_pslist_entry)
#define IFADDR_WRITER_NEXT(__ifp) \
PSLIST_WRITER_NEXT((__ifp), struct ifaddr, ifa_pslist_entry)
#define IFADDR_WRITER_INSERT_AFTER(__ifp, __new) \
PSLIST_WRITER_INSERT_AFTER((__ifp), (__new), ifa_pslist_entry)
#define IFADDR_WRITER_EMPTY(__ifp) \
(PSLIST_WRITER_FIRST(&(__ifp)->if_addr_pslist, struct ifaddr, \
ifa_pslist_entry) == NULL)
#define IFADDR_WRITER_INSERT_TAIL(__ifp, __new) \
do { \
if (IFADDR_WRITER_EMPTY(__ifp)) { \
IFADDR_WRITER_INSERT_HEAD((__ifp), (__new)); \
} else { \
struct ifaddr *__ifa; \
IFADDR_WRITER_FOREACH(__ifa, (__ifp)) { \
if (IFADDR_WRITER_NEXT(__ifa) == NULL) {\
IFADDR_WRITER_INSERT_AFTER(__ifa,\
(__new)); \
break; \
} \
} \
} \
} while (0)
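/*
 * Illustrative sketch (not part of the original header): walking an
 * interface's address list with the pserialize-safe reader macros and
 * taking a psref on the result so it can be used after the read section
 * ends.  xx_first_af_addr is an invented name.
 */
#if 0 /* example only */
static struct ifaddr *
xx_first_af_addr(struct ifnet *ifp, int af, struct psref *psref)
{
    struct ifaddr *ifa;
    int s;

    s = pserialize_read_enter();
    IFADDR_READER_FOREACH(ifa, ifp) {
        if (ifa->ifa_addr->sa_family == af) {
            ifa_acquire(ifa, psref);
            break;
        }
    }
    pserialize_read_exit(s);
    return ifa;     /* NULL if no address of family af was found */
}
#endif /* example only */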
#define IFNET_GLOBAL_LOCK() mutex_enter(&ifnet_mtx)
#define IFNET_GLOBAL_UNLOCK() mutex_exit(&ifnet_mtx)
#define IFNET_GLOBAL_LOCKED() mutex_owned(&ifnet_mtx)
#define IFNET_READER_EMPTY() \
(PSLIST_READER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) == NULL)
#define IFNET_READER_FIRST() \
PSLIST_READER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry)
#define IFNET_READER_NEXT(__ifp) \
PSLIST_READER_NEXT((__ifp), struct ifnet, if_pslist_entry)
#define IFNET_READER_FOREACH(__ifp) \
PSLIST_READER_FOREACH((__ifp), &ifnet_pslist, struct ifnet, \
if_pslist_entry)
#define IFNET_WRITER_INSERT_HEAD(__ifp) \
PSLIST_WRITER_INSERT_HEAD(&ifnet_pslist, (__ifp), if_pslist_entry)
#define IFNET_WRITER_REMOVE(__ifp) \
PSLIST_WRITER_REMOVE((__ifp), if_pslist_entry)
#define IFNET_WRITER_FOREACH(__ifp) \
PSLIST_WRITER_FOREACH((__ifp), &ifnet_pslist, struct ifnet, \
if_pslist_entry)
#define IFNET_WRITER_NEXT(__ifp) \
PSLIST_WRITER_NEXT((__ifp), struct ifnet, if_pslist_entry)
#define IFNET_WRITER_INSERT_AFTER(__ifp, __new) \
PSLIST_WRITER_INSERT_AFTER((__ifp), (__new), if_pslist_entry)
#define IFNET_WRITER_EMPTY() \
(PSLIST_WRITER_FIRST(&ifnet_pslist, struct ifnet, if_pslist_entry) == NULL)
#define IFNET_WRITER_INSERT_TAIL(__new) \
do { \
if (IFNET_WRITER_EMPTY()) { \
IFNET_WRITER_INSERT_HEAD(__new); \
} else { \
struct ifnet *__ifp; \
IFNET_WRITER_FOREACH(__ifp) { \
if (IFNET_WRITER_NEXT(__ifp) == NULL) { \
IFNET_WRITER_INSERT_AFTER(__ifp,\
(__new)); \
break; \
} \
} \
} \
} while (0)
#define IFNET_LOCK(ifp) mutex_enter((ifp)->if_ioctl_lock)
#define IFNET_UNLOCK(ifp) mutex_exit((ifp)->if_ioctl_lock)
#define IFNET_LOCKED(ifp) mutex_owned((ifp)->if_ioctl_lock)
#define IFNET_ASSERT_UNLOCKED(ifp) \
KDASSERT(mutex_ownable((ifp)->if_ioctl_lock))
extern struct pslist_head ifnet_pslist;
extern kmutex_t ifnet_mtx;
extern struct ifnet *lo0ifp;
/*
* ifq sysctl support
*/
int sysctl_ifq(int *name, u_int namelen, void *oldp,
size_t *oldlenp, void *newp, size_t newlen,
struct ifqueue *ifq);
/* symbolic names for terminal (per-protocol) CTL_IFQ_ nodes */
#define IFQCTL_LEN 1
#define IFQCTL_MAXLEN 2
#define IFQCTL_PEAK 3
#define IFQCTL_DROPS 4
/*
* Hook for if_vlan - needed by if_agr
*/
MODULE_HOOK(if_vlan_vlan_input_hook,
struct mbuf *, (struct ifnet *, struct mbuf *));
#endif /* _KERNEL */
#endif /* !_NET_IF_H_ */
/* $NetBSD: kern_resource_43.c,v 1.23 2021/09/07 11:43:02 riastradh Exp $ */
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_resource_43.c,v 1.23 2021/09/07 11:43:02 riastradh Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/resourcevar.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/common/compat_mod.h>
static struct syscall_package kern_resource_43_syscalls[] = {
{ SYS_compat_43_ogetrlimit, 0, (sy_call_t *)compat_43_sys_getrlimit },
{ SYS_compat_43_osetrlimit, 0, (sy_call_t *)compat_43_sys_setrlimit },
{ 0, 0, NULL }
};
/* ARGSUSED */
int
compat_43_sys_getrlimit(struct lwp *l, const struct compat_43_sys_getrlimit_args *uap, register_t *retval)
{
/* {
syscallarg(int) which;
syscallarg(struct orlimit *) rlp;
} */
struct proc *p = l->l_proc;
int which = SCARG(uap, which);
struct orlimit olim;
if ((u_int)which >= RLIM_NLIMITS)
return (EINVAL);
memset(&olim, 0, sizeof(olim));
olim.rlim_cur = p->p_rlimit[which].rlim_cur;
if (olim.rlim_cur == -1)
olim.rlim_cur = 0x7fffffff;
olim.rlim_max = p->p_rlimit[which].rlim_max;
if (olim.rlim_max == -1)
olim.rlim_max = 0x7fffffff;
return copyout(&olim, SCARG(uap, rlp), sizeof(olim));
}
/* ARGSUSED */
int
compat_43_sys_setrlimit(struct lwp *l, const struct compat_43_sys_setrlimit_args *uap, register_t *retval)
{
/* {
syscallarg(int) which;
syscallarg(const struct orlimit *) rlp;
} */
int which = SCARG(uap, which);
struct orlimit olim;
struct rlimit lim;
int error;
error = copyin(SCARG(uap, rlp), &olim, sizeof(struct orlimit));
if (error)
return (error);
lim.rlim_cur = olim.rlim_cur;
lim.rlim_max = olim.rlim_max;
return (dosetrlimit(l, l->l_proc, which, &lim));
}
int
kern_resource_43_init(void)
{
return syscall_establish(NULL, kern_resource_43_syscalls);
}
int
kern_resource_43_fini(void)
{
return syscall_disestablish(NULL, kern_resource_43_syscalls);
}
/* $NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)spec_vnops.c 8.15 (Berkeley) 7/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: spec_vnops.c,v 1.218 2023/04/22 15:32:49 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/buf.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode_impl.h>
#include <sys/stat.h>
#include <sys/errno.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/file.h>
#include <sys/disklabel.h>
#include <sys/disk.h>
#include <sys/lockf.h>
#include <sys/tty.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/module.h>
#include <sys/atomic.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>
#ifdef DDB
#include <ddb/ddb.h>
#endif
/*
* Lock order:
*
* vnode lock
* -> device_lock
* -> struct vnode::v_interlock
*/
/* symbolic sleep message strings for devices */
const char devopn[] = "devopn";
const char devio[] = "devio";
const char devwait[] = "devwait";
const char devin[] = "devin";
const char devout[] = "devout";
const char devioc[] = "devioc";
const char devcls[] = "devcls";
#define SPECHSZ 64
#if ((SPECHSZ&(SPECHSZ-1)) == 0)
#define SPECHASH(rdev) (((rdev>>5)+(rdev))&(SPECHSZ-1))
#else
#define SPECHASH(rdev) (((unsigned)((rdev>>5)+(rdev)))%SPECHSZ)
#endif
static vnode_t *specfs_hash[SPECHSZ];
extern struct mount *dead_rootmount;
/*
* This vnode operations vector is used for special device nodes
* created from whole cloth by the kernel. For the ops vector for
* vnodes built from special devices found in a filesystem, see (e.g)
* ffs_specop_entries[] in ffs_vnops.c or the equivalent for other
* filesystems.
*/
int (**spec_vnodeop_p)(void *);
const struct vnodeopv_entry_desc spec_vnodeop_entries[] = {
{ &vop_default_desc, vn_default_error },
{ &vop_parsepath_desc, genfs_parsepath }, /* parsepath */
{ &vop_lookup_desc, spec_lookup }, /* lookup */
{ &vop_create_desc, genfs_badop }, /* create */
{ &vop_mknod_desc, genfs_badop }, /* mknod */
{ &vop_open_desc, spec_open }, /* open */
{ &vop_close_desc, spec_close }, /* close */
{ &vop_access_desc, genfs_ebadf }, /* access */
{ &vop_accessx_desc, genfs_ebadf }, /* accessx */
{ &vop_getattr_desc, genfs_ebadf }, /* getattr */
{ &vop_setattr_desc, genfs_ebadf }, /* setattr */
{ &vop_read_desc, spec_read }, /* read */
{ &vop_write_desc, spec_write }, /* write */
{ &vop_fallocate_desc, genfs_eopnotsupp }, /* fallocate */
{ &vop_fdiscard_desc, spec_fdiscard }, /* fdiscard */
{ &vop_fcntl_desc, genfs_fcntl }, /* fcntl */
{ &vop_ioctl_desc, spec_ioctl }, /* ioctl */
{ &vop_poll_desc, spec_poll }, /* poll */
{ &vop_kqfilter_desc, spec_kqfilter }, /* kqfilter */
{ &vop_revoke_desc, genfs_revoke }, /* revoke */
{ &vop_mmap_desc, spec_mmap }, /* mmap */
{ &vop_fsync_desc, spec_fsync }, /* fsync */
{ &vop_seek_desc, spec_seek }, /* seek */
{ &vop_remove_desc, genfs_badop }, /* remove */
{ &vop_link_desc, genfs_badop }, /* link */
{ &vop_rename_desc, genfs_badop }, /* rename */
{ &vop_mkdir_desc, genfs_badop }, /* mkdir */
{ &vop_rmdir_desc, genfs_badop }, /* rmdir */
{ &vop_symlink_desc, genfs_badop }, /* symlink */
{ &vop_readdir_desc, genfs_badop }, /* readdir */
{ &vop_readlink_desc, genfs_badop }, /* readlink */
{ &vop_abortop_desc, genfs_badop }, /* abortop */
{ &vop_inactive_desc, spec_inactive }, /* inactive */
{ &vop_reclaim_desc, spec_reclaim }, /* reclaim */
{ &vop_lock_desc, genfs_lock }, /* lock */
{ &vop_unlock_desc, genfs_unlock }, /* unlock */
{ &vop_bmap_desc, spec_bmap }, /* bmap */
{ &vop_strategy_desc, spec_strategy }, /* strategy */
{ &vop_print_desc, spec_print }, /* print */
{ &vop_islocked_desc, genfs_islocked }, /* islocked */
{ &vop_pathconf_desc, spec_pathconf }, /* pathconf */
{ &vop_advlock_desc, spec_advlock }, /* advlock */
{ &vop_bwrite_desc, vn_bwrite }, /* bwrite */
{ &vop_getpages_desc, genfs_getpages }, /* getpages */
{ &vop_putpages_desc, genfs_putpages }, /* putpages */
{ NULL, NULL }
};
const struct vnodeopv_desc spec_vnodeop_opv_desc =
{ &spec_vnodeop_p, spec_vnodeop_entries };
static kauth_listener_t rawio_listener;
static struct kcondvar specfs_iocv;
/*
* Returns true if vnode is /dev/mem or /dev/kmem.
*/
bool
iskmemvp(struct vnode *vp)
{

    return ((vp->v_type == VCHR) && iskmemdev(vp->v_rdev));
}
/*
* Returns true if dev is /dev/mem or /dev/kmem.
*/
int
iskmemdev(dev_t dev)
{
/* mem_no is emitted by config(8) to generated devsw.c */
extern const int mem_no;
/* minor 14 is /dev/io on i386 with COMPAT_10 */
return (major(dev) == mem_no && (minor(dev) < 2 || minor(dev) == 14));
}
static int
rawio_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
int result;
result = KAUTH_RESULT_DEFER;
if ((action != KAUTH_DEVICE_RAWIO_SPEC) &&
(action != KAUTH_DEVICE_RAWIO_PASSTHRU))
return result;
/* Access is mandated by permissions. */
result = KAUTH_RESULT_ALLOW;
return result;
}
void
spec_init(void)
{
rawio_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
rawio_listener_cb, NULL);
cv_init(&specfs_iocv, "specio");
}
/*
* spec_io_enter(vp, &sn, &dev)
*
* Enter an operation that may not hold vp's vnode lock or an
* fstrans on vp's mount. Until spec_io_exit, the vnode will not
* be revoked.
*
* On success, set sn to the specnode pointer and dev to the dev_t
* number and return zero. Caller must later call spec_io_exit
* when done.
*
* On failure, return ENXIO -- the device has been revoked and no
* longer exists.
*/
static int
spec_io_enter(struct vnode *vp, struct specnode **snp, dev_t *devp)
{
dev_t dev;
struct specnode *sn;
unsigned iocnt;
int error = 0;
mutex_enter(vp->v_interlock);
/*
* Extract all the info we need from the vnode, unless the
* vnode has already been reclaimed. This can happen if the
* underlying device has been removed and all the device nodes
* for it have been revoked. The caller may not hold a vnode
* lock or fstrans to prevent this from happening before it has
* had an opportunity to notice the vnode is dead.
*/
if (vdead_check(vp, VDEAD_NOWAIT) != 0 ||
    (sn = vp->v_specnode) == NULL ||
(dev = vp->v_rdev) == NODEV) {
error = ENXIO;
goto out;
}
/*
* Notify spec_close that we are doing an I/O operation which
* may not be bracketed by fstrans(9) and thus is not
* blocked by vfs suspension.
*
* We could hold this reference with psref(9) instead, but we
* already have to take the interlock for vdead_check, so
* there's not much more cost here to another atomic operation.
*/
do {
iocnt = atomic_load_relaxed(&sn->sn_dev->sd_iocnt);
if (__predict_false(iocnt == UINT_MAX)) {
/*
* The I/O count is limited by the number of
* LWPs (which will never overflow this) --
* unless one driver uses another driver via
* specfs, which is rather unusual, but which
* could happen via pud(4) userspace drivers.
* We could use a 64-bit count, but can't use
* atomics for that on all platforms.
* (Probably better to switch to psref or
* localcount instead.)
*/
error = EBUSY;
goto out;
}
} while (atomic_cas_uint(&sn->sn_dev->sd_iocnt, iocnt, iocnt + 1)
!= iocnt);
/* Success! */
*snp = sn;
*devp = dev;
error = 0;
out: mutex_exit(vp->v_interlock);
return error;
}
/*
* spec_io_exit(vp, sn)
*
* Exit an operation entered with a successful spec_io_enter --
* allow concurrent spec_node_revoke to proceed. The argument sn
* must match the struct specnode pointer returned by spec_io_enter
* for vp.
*/
static void
spec_io_exit(struct vnode *vp, struct specnode *sn)
{
struct specdev *sd = sn->sn_dev;
unsigned iocnt;
KASSERT(vp->v_specnode == sn);
/*
* We are done. Notify spec_close if appropriate. The
* transition of 1 -> 0 must happen under device_lock so
* spec_close doesn't miss a wakeup.
*/
do {
iocnt = atomic_load_relaxed(&sd->sd_iocnt);
KASSERT(iocnt > 0);
if (iocnt == 1) {
mutex_enter(&device_lock);
if (atomic_dec_uint_nv(&sd->sd_iocnt) == 0)
    cv_broadcast(&specfs_iocv);
mutex_exit(&device_lock);
break;
}
} while (atomic_cas_uint(&sd->sd_iocnt, iocnt, iocnt - 1) != iocnt);
}
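/*
 * Illustrative sketch (not part of the original file): the canonical
 * spec_io_enter/spec_io_exit bracket used by the vops below -- take the
 * I/O reference, perform the device operation without the vnode lock,
 * then drop the reference so spec_close/revoke can drain.  The
 * example_cdev_op name is invented; spec_ioctl below is the real thing.
 */
#if 0 /* example only */
static int
example_cdev_op(struct vnode *vp, u_long cmd, void *data, int fflag)
{
    struct specnode *sn;
    dev_t dev;
    int error;

    error = spec_io_enter(vp, &sn, &dev);
    if (error)
        return error;   /* device has been revoked */
    error = cdev_ioctl(dev, cmd, data, fflag, curlwp);
    spec_io_exit(vp, sn);
    return error;
}
#endif /* example only */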
/*
* spec_io_drain(sd)
*
* Wait for all existing spec_io_enter/exit sections to complete.
* Caller must ensure spec_io_enter will fail at this point.
*/
static void
spec_io_drain(struct specdev *sd)
{
/*
* I/O at the same time as closing is unlikely -- it often
* indicates an application bug.
*/
if (__predict_true(atomic_load_relaxed(&sd->sd_iocnt) == 0))
return;
mutex_enter(&device_lock);
while (atomic_load_relaxed(&sd->sd_iocnt) > 0)
    cv_wait(&specfs_iocv, &device_lock);
mutex_exit(&device_lock);
}
/*
* Initialize a vnode that represents a device.
*/
void
spec_node_init(vnode_t *vp, dev_t rdev)
{
specnode_t *sn;
specdev_t *sd;
vnode_t *vp2;
vnode_t **vpp;
KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
KASSERT(vp->v_specnode == NULL);
/*
* Search the hash table for this device. If known, add a
* reference to the device structure. If not known, create
* a new entry to represent the device. In all cases add
* the vnode to the hash table.
*/
sn = kmem_alloc(sizeof(*sn), KM_SLEEP);
sd = kmem_alloc(sizeof(*sd), KM_SLEEP);
mutex_enter(&device_lock);
vpp = &specfs_hash[SPECHASH(rdev)];
for (vp2 = *vpp; vp2 != NULL; vp2 = vp2->v_specnext) {
    KASSERT(vp2->v_specnode != NULL);
    if (rdev == vp2->v_rdev && vp->v_type == vp2->v_type) {
break;
}
}
if (vp2 == NULL) {
/* No existing record, create a new one. */
sd->sd_mountpoint = NULL;
sd->sd_lockf = NULL;
sd->sd_refcnt = 1;
sd->sd_opencnt = 0;
sd->sd_bdevvp = NULL;
sd->sd_iocnt = 0;
sd->sd_opened = false;
sd->sd_closing = false;
sn->sn_dev = sd;
sd = NULL;
} else {
/* Use the existing record. */
sn->sn_dev = vp2->v_specnode->sn_dev;
sn->sn_dev->sd_refcnt++;
}
/* Insert vnode into the hash chain. */
sn->sn_opencnt = 0;
sn->sn_rdev = rdev;
sn->sn_gone = false;
vp->v_specnode = sn;
vp->v_specnext = *vpp;
*vpp = vp;
mutex_exit(&device_lock);
/* Free the record we allocated if unused. */
if (sd != NULL) {
    kmem_free(sd, sizeof(*sd));
}
}
/*
* Lookup a vnode by device number and return it referenced.
*/
int
spec_node_lookup_by_dev(enum vtype type, dev_t dev, int flags, vnode_t **vpp)
{
int error;
vnode_t *vp;
top: mutex_enter(&device_lock);
for (vp = specfs_hash[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
    if (type == vp->v_type && dev == vp->v_rdev) {
mutex_enter(vp->v_interlock);
/* If clean or being cleaned, then ignore it. */
if (vdead_check(vp, VDEAD_NOWAIT) == 0)
break;
if ((flags & VDEAD_NOWAIT) == 0) {
mutex_exit(&device_lock);
/*
* It may be being revoked as we speak,
* and the caller wants to wait until
* all revocation has completed. Let
* vcache_vget wait for it to finish
* dying; as a side effect, vcache_vget
* releases vp->v_interlock. Note that
* vcache_vget cannot succeed at this
* point because vdead_check already
* failed.
*/
error = vcache_vget(vp);
KASSERT(error);
goto top;
}
mutex_exit(vp->v_interlock);
}
}
KASSERT(vp == NULL || mutex_owned(vp->v_interlock));
if (vp == NULL) {
mutex_exit(&device_lock);
return ENOENT;
}
/*
* If it is an opened block device return the opened vnode.
*/
if (type == VBLK && vp->v_specnode->sn_dev->sd_bdevvp != NULL) {
    mutex_exit(vp->v_interlock);
vp = vp->v_specnode->sn_dev->sd_bdevvp;
mutex_enter(vp->v_interlock);
}
mutex_exit(&device_lock);
error = vcache_vget(vp);
if (error)
return error;
*vpp = vp;
return 0;
}
/*
* Lookup a vnode by file system mounted on and return it referenced.
*/
int
spec_node_lookup_by_mount(struct mount *mp, vnode_t **vpp)
{
int i, error;
vnode_t *vp, *vq;
mutex_enter(&device_lock);
for (i = 0, vq = NULL; i < SPECHSZ && vq == NULL; i++) {
for (vp = specfs_hash[i]; vp; vp = vp->v_specnext) {
if (vp->v_type != VBLK)
continue;
vq = vp->v_specnode->sn_dev->sd_bdevvp;
if (vq != NULL &&
vq->v_specnode->sn_dev->sd_mountpoint == mp)
break;
vq = NULL;
}
}
if (vq == NULL) {
mutex_exit(&device_lock);
return ENOENT;
}
mutex_enter(vq->v_interlock);
mutex_exit(&device_lock);
error = vcache_vget(vq);
if (error)
return error;
*vpp = vq;
return 0;
}
/*
* Get the file system mounted on this block device.
*
* XXX Caller should hold the vnode lock -- shared or exclusive -- so
* that this can't be changed, and the vnode can't be revoked while we
* examine it. But not all callers do, and they're scattered through a
* lot of file systems, so we can't assert this yet.
*/
struct mount *
spec_node_getmountedfs(vnode_t *devvp)
{
struct mount *mp;
KASSERT(devvp->v_type == VBLK);
mp = devvp->v_specnode->sn_dev->sd_mountpoint;
return mp;
}
/*
* Set the file system mounted on this block device.
*
* XXX Caller should hold the vnode lock exclusively so this can't be
* changed or assumed by spec_node_getmountedfs while we change it, and
* the vnode can't be revoked while we handle it. But not all callers
* do, and they're scattered through a lot of file systems, so we can't
* assert this yet. Instead, for now, we'll take an I/O reference so
* at least the ioctl doesn't race with revoke/detach.
*
* If you do change this to assert an exclusive vnode lock, you must
* also do vdead_check before trying bdev_ioctl, because the vnode may
* have been revoked by the time the caller locked it, and this is
* _not_ a vop -- calls to spec_node_setmountedfs don't go through
* v_op, so revoking the vnode doesn't prevent further calls.
*
* XXX Caller should additionally have the vnode open, at least if mp
* is nonnull, but I'm not sure all callers do that -- need to audit.
* Currently udf closes the vnode before clearing the mount.
*/
void
spec_node_setmountedfs(vnode_t *devvp, struct mount *mp)
{
struct dkwedge_info dkw;
struct specnode *sn;
dev_t dev;
int error;
KASSERT(devvp->v_type == VBLK);
error = spec_io_enter(devvp, &sn, &dev);
if (error)
return;
KASSERT(sn->sn_dev->sd_mountpoint == NULL || mp == NULL);
sn->sn_dev->sd_mountpoint = mp;
if (mp == NULL)
goto out;
error = bdev_ioctl(dev, DIOCGWEDGEINFO, &dkw, FREAD, curlwp);
if (error)
goto out;
strlcpy(mp->mnt_stat.f_mntfromlabel, dkw.dkw_wname,
sizeof(mp->mnt_stat.f_mntfromlabel));
out: spec_io_exit(devvp, sn);
}
/*
* A vnode representing a special device is going away. Close
* the device if the vnode holds it open.
*/
void
spec_node_revoke(vnode_t *vp)
{
specnode_t *sn;
specdev_t *sd;
struct vnode **vpp;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
sn = vp->v_specnode;
sd = sn->sn_dev;
KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
KASSERT(vp->v_specnode != NULL);
KASSERT(sn->sn_gone == false);
mutex_enter(&device_lock);
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sn->sn_gone = true;
if (sn->sn_opencnt != 0) {
sd->sd_opencnt -= (sn->sn_opencnt - 1);
sn->sn_opencnt = 1;
mutex_exit(&device_lock);
VOP_CLOSE(vp, FNONBLOCK, NOCRED);
mutex_enter(&device_lock);
KASSERT(sn->sn_opencnt == 0);
}
/*
* We may have revoked the vnode in this thread while another
* thread was in the middle of spec_close, in the window when
* spec_close releases the vnode lock to call .d_close for the
* last close. In that case, wait for the concurrent
* spec_close to complete.
*/
while (sd->sd_closing)
cv_wait(&specfs_iocv, &device_lock);
/*
* Remove from the hash so lookups stop returning this
* specnode. We will dissociate it from the specdev -- and
* possibly free the specdev -- in spec_node_destroy.
*/
KASSERT(sn->sn_gone);
KASSERT(sn->sn_opencnt == 0);
for (vpp = &specfs_hash[SPECHASH(vp->v_rdev)];;
    vpp = &(*vpp)->v_specnext) {
    if (*vpp == vp) {
*vpp = vp->v_specnext;
vp->v_specnext = NULL;
break;
}
}
mutex_exit(&device_lock);
}
/*
* A vnode representing a special device is being recycled.
* Destroy the specfs component.
*/
void
spec_node_destroy(vnode_t *vp)
{
specnode_t *sn;
specdev_t *sd;
int refcnt;
sn = vp->v_specnode;
sd = sn->sn_dev;
KASSERT(vp->v_type == VBLK || vp->v_type == VCHR);
KASSERT(vp->v_specnode != NULL);
KASSERT(sn->sn_opencnt == 0);
mutex_enter(&device_lock);
sn = vp->v_specnode;
vp->v_specnode = NULL;
refcnt = sd->sd_refcnt--;
KASSERT(refcnt > 0);
mutex_exit(&device_lock);
/* If the device is no longer in use, destroy our record. */
if (refcnt == 1) {
    KASSERT(sd->sd_iocnt == 0);
    KASSERT(sd->sd_opencnt == 0);
    KASSERT(sd->sd_bdevvp == NULL);
kmem_free(sd, sizeof(*sd));
}
kmem_free(sn, sizeof(*sn));
}
/*
* Trivial lookup routine that always fails.
*/
int
spec_lookup(void *v)
{
struct vop_lookup_v2_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
struct componentname *a_cnp;
} */ *ap = v;
*ap->a_vpp = NULL;
return ENOTDIR;
}
typedef int (*spec_ioctl_t)(dev_t, u_long, void *, int, struct lwp *);
/*
* Open a special file.
*/
/* ARGSUSED */
int
spec_open(void *v)
{
struct vop_open_args /* {
struct vnode *a_vp;
int a_mode;
kauth_cred_t a_cred;
} */ *ap = v;
struct lwp *l = curlwp;
struct vnode *vp = ap->a_vp;
dev_t dev, dev1;
int error;
enum kauth_device_req req;
specnode_t *sn, *sn1;
specdev_t *sd;
spec_ioctl_t ioctl;
u_int gen = 0;
const char *name = NULL;
bool needclose = false;
struct partinfo pi;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
KASSERTMSG(vp->v_type == VBLK || vp->v_type == VCHR, "type=%d",
vp->v_type);
dev = vp->v_rdev;
sn = vp->v_specnode;
sd = sn->sn_dev;
/*
* Don't allow open if fs is mounted -nodev.
*/
if (vp->v_mount && (vp->v_mount->mnt_flag & MNT_NODEV))
return ENXIO;
switch (ap->a_mode & (FREAD | FWRITE)) {
case FREAD | FWRITE:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_RW;
break;
case FWRITE:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_WRITE;
break;
default:
req = KAUTH_REQ_DEVICE_RAWIO_SPEC_READ;
break;
}
error = kauth_authorize_device_spec(ap->a_cred, req, vp);
if (error)
return error;
/*
* Acquire an open reference -- as long as we hold onto it, and
* the vnode isn't revoked, it can't be closed, and the vnode
* can't be revoked until we release the vnode lock.
*/
mutex_enter(&device_lock);
KASSERT(!sn->sn_gone);
switch (vp->v_type) {
case VCHR:
/*
* Character devices can accept opens from multiple
* vnodes. But first, wait for any close to finish.
* Wait under the vnode lock so we don't have to worry
* about the vnode being revoked while we wait.
*/
while (sd->sd_closing) {
error = cv_wait_sig(&specfs_iocv, &device_lock);
if (error)
break;
}
if (error)
break;
sd->sd_opencnt++;
sn->sn_opencnt++;
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
break;
case VBLK:
/*
* For block devices, permit only one open. The buffer
* cache cannot remain self-consistent with multiple
* vnodes holding a block device open.
*
* Treat zero opencnt with non-NULL mountpoint as open.
* This may happen after forced detach of a mounted device.
*
* Also treat sd_closing, meaning there is a concurrent
* close in progress, as still open.
*/
if (sd->sd_opencnt != 0 || sd->sd_mountpoint != NULL ||
sd->sd_closing) {
error = EBUSY;
break;
}
KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u",
sn->sn_opencnt);
sn->sn_opencnt = 1;
sd->sd_opencnt = 1;
sd->sd_bdevvp = vp;
break;
default:
panic("invalid specfs vnode type: %d", vp->v_type);
}
mutex_exit(&device_lock);
if (error)
return error;
/*
* Set VV_ISTTY if this is a tty cdev.
*
* XXX This does the wrong thing if the module has to be
* autoloaded. We should maybe set this after autoloading
* modules and calling .d_open successfully, except (a) we need
* the vnode lock to touch it, and (b) once we acquire the
* vnode lock again, the vnode may have been revoked, and
* deadfs's dead_read needs VV_ISTTY to be already set in order
* to return the right answer. So this needs some additional
* synchronization to be made to work correctly with tty driver
* module autoload. For now, let's just hope it doesn't cause
* too much trouble for a tty from an autoloaded driver module
* to fail with EIO instead of returning EOF.
*/
if (vp->v_type == VCHR) {
    if (cdev_type(dev) == D_TTY)
        vp->v_vflag |= VV_ISTTY;
}
/*
* Because opening the device may block indefinitely, e.g. when
* opening a tty, and loading a module may cross into many
* other subsystems, we must not hold the vnode lock while
* calling .d_open, so release it now and reacquire it when
* done.
*
* Take an I/O reference so that any concurrent spec_close via
* spec_node_revoke will wait for us to finish calling .d_open.
* The vnode can't be dead at this point because we have it
* locked. Note that if revoked, the driver must interrupt
* .d_open before spec_close starts waiting for I/O to drain so
* this doesn't deadlock.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn1, &dev1);
if (error) {
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
}
KASSERT(sn1 == sn);
KASSERT(dev1 == dev);
/*
* Open the device. If .d_open returns ENXIO (device not
* configured), the driver may not be loaded, so try
* autoloading a module and then try .d_open again if anything
* got loaded.
*/
switch (vp->v_type) {
case VCHR:
do {
const struct cdevsw *cdev;
gen = module_gen;
error = cdev_open(dev, ap->a_mode, S_IFCHR, l);
if (error != ENXIO)
break;
/* Check if we already have a valid driver */
mutex_enter(&device_lock);
cdev = cdevsw_lookup(dev);
mutex_exit(&device_lock);
if (cdev != NULL)
break;
/* Get device name from devsw_conv array */
if ((name = cdevsw_getname(major(dev))) == NULL)
break;
/* Try to autoload device module */
(void)module_autoload(name, MODULE_CLASS_DRIVER);
} while (gen != module_gen);
break;
case VBLK:
do {
const struct bdevsw *bdev;
gen = module_gen;
error = bdev_open(dev, ap->a_mode, S_IFBLK, l);
if (error != ENXIO)
break;
/* Check if we already have a valid driver */
mutex_enter(&device_lock);
bdev = bdevsw_lookup(dev);
mutex_exit(&device_lock);
if (bdev != NULL)
break;
/* Get device name from devsw_conv array */
if ((name = bdevsw_getname(major(dev))) == NULL)
break;
/* Try to autoload device module */
(void)module_autoload(name, MODULE_CLASS_DRIVER);
} while (gen != module_gen);
break;
default:
__unreachable();
}
/*
* Release the I/O reference now that we have called .d_open,
* and reacquire the vnode lock. At this point, the device may
* have been revoked, so we must tread carefully. However, sn
* and sd remain valid pointers until we drop our reference.
*/
spec_io_exit(vp, sn);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
KASSERT(vp->v_specnode == sn);
/*
* If it has been revoked since we released the vnode lock and
* reacquired it, then spec_node_revoke has closed it, and we
* must fail with EBADF.
*
* Otherwise, if opening it failed, back out and release the
* open reference. If it was ever successfully opened and we
* got the last reference this way, it's now our job to close
* it. This might happen in the following scenario:
*
* Thread 1 Thread 2
* VOP_OPEN
* ...
* .d_open -> 0 (success)
* acquire vnode lock
* do stuff VOP_OPEN
* release vnode lock ...
* .d_open -> EBUSY
* VOP_CLOSE
* acquire vnode lock
* --sd_opencnt != 0
* => no .d_close
* release vnode lock
* acquire vnode lock
* --sd_opencnt == 0
*
* We can't resolve this by making spec_close wait for .d_open
* to complete before examining sd_opencnt, because .d_open can
* hang indefinitely, e.g. for a tty.
*/
mutex_enter(&device_lock);
if (sn->sn_gone) {
if (error == 0)
error = EBADF;
} else if (error == 0) {
/*
* Device has not been revoked, so our opencnt can't
* have gone away at this point -- transition to
* sn_gone=true happens before transition to
* sn_opencnt=0 in spec_node_revoke.
*/
KASSERT(sd->sd_opencnt);
KASSERT(sn->sn_opencnt);
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
KASSERT(!sd->sd_closing);
sd->sd_opened = true;
} else if (sd->sd_opencnt == 1 && sd->sd_opened) {
/*
* We're the last reference to a _previous_ open even
* though this one failed, so we have to close it.
* Don't decrement the reference count here --
* spec_close will do that.
*/
KASSERT(sn->sn_opencnt == 1);
needclose = true;
} else {
KASSERT(sd->sd_opencnt);
KASSERT(sn->sn_opencnt);
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sd->sd_opencnt--;
sn->sn_opencnt--;
if (vp->v_type == VBLK)
    sd->sd_bdevvp = NULL;
}
mutex_exit(&device_lock);
/*
* If this open failed, but the device was previously opened,
* and another thread concurrently closed the vnode while we
* were in the middle of reopening it, the other thread will
* see sd_opencnt > 0 and thus decide not to call .d_close --
* it is now our responsibility to do so.
*
* XXX The flags passed to VOP_CLOSE here are wrong, but
* drivers can't rely on FREAD|FWRITE anyway -- e.g., consider
* a device opened by thread 0 with O_READ, then opened by
* thread 1 with O_WRITE, then closed by thread 0, and finally
* closed by thread 1; the last .d_close call will have FWRITE
* but not FREAD. We should just eliminate the FREAD/FWRITE
* parameter to .d_close altogether.
*/
if (needclose) {
KASSERT(error);
VOP_CLOSE(vp, FNONBLOCK, NOCRED);
}
/* If anything went wrong, we're done. */
if (error)
return error;
/*
* For disk devices, automagically set the vnode size to the
* partition size, if we can. This applies to block devices
* and character devices alike -- every block device must have
* a corresponding character device. And if the module is
* loaded it will remain loaded until we're done here (it is
* forbidden to devsw_detach until closed). So it is safe to
* query cdev_type unconditionally here.
*/
if (cdev_type(dev) == D_DISK) {
ioctl = vp->v_type == VCHR ? cdev_ioctl : bdev_ioctl;
if ((*ioctl)(dev, DIOCGPARTINFO, &pi, FREAD, curlwp) == 0)
uvm_vnp_setsize(vp,
(voff_t)pi.pi_secsize * pi.pi_size);
}
/* Success! */
return 0;
}
/*
* Vnode op for read
*/
/* ARGSUSED */
int
spec_read(void *v)
{
struct vop_read_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
struct specnode *sn;
dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
struct partinfo pi;
int n, on;
int error = 0;
int i, nra;
daddr_t lastbn, *rablks;
int *rasizes;
int nrablks, ratogo;
KASSERT(uio->uio_rw == UIO_READ);
KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
uio->uio_vmspace == curproc->p_vmspace),
"vmspace belongs to neither kernel nor curproc");
if (uio->uio_resid == 0)
return 0;
switch (vp->v_type) {
case VCHR:
/*
* Release the lock while we sleep -- possibly
* indefinitely, if this is, e.g., a tty -- in
* cdev_read, so we don't hold up everything else that
* might want access to the vnode.
*
* But before we issue the read, take an I/O reference
* to the specnode so close will know when we're done
* reading. Note that the moment we release the lock,
* the vnode's identity may change; hence spec_io_enter
* may fail, and the caller may have a dead vnode on
* their hands, if the file system on which vp lived
* has been unmounted.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
error = cdev_read(dev, uio, ap->a_ioflag);
spec_io_exit(vp, sn);
out: /* XXX What if the caller held an exclusive lock? */
vn_lock(vp, LK_SHARED | LK_RETRY);
return error;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
if (uio->uio_offset < 0)
return EINVAL;
if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
    bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
else
bsize = BLKDEV_IOSIZE;
bscale = bsize >> DEV_BSHIFT;
nra = uimax(16 * MAXPHYS / bsize - 1, 511);
rablks = kmem_alloc(nra * sizeof(*rablks), KM_SLEEP);
rasizes = kmem_alloc(nra * sizeof(*rasizes), KM_SLEEP);
lastbn = ((uio->uio_offset + uio->uio_resid - 1) >> DEV_BSHIFT)
&~ (bscale - 1);
nrablks = ratogo = 0;
do {
bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
on = uio->uio_offset % bsize;
n = uimin((unsigned)(bsize - on), uio->uio_resid);
if (ratogo == 0) {
nrablks = uimin((lastbn - bn) / bscale, nra);
ratogo = nrablks;
for (i = 0; i < nrablks; ++i) {
rablks[i] = bn + (i+1) * bscale;
rasizes[i] = bsize;
}
error = breadn(vp, bn, bsize,
rablks, rasizes, nrablks,
0, &bp);
} else {
if (ratogo > 0)
--ratogo;
error = bread(vp, bn, bsize, 0, &bp);
}
if (error)
break;
n = uimin(n, bsize - bp->b_resid);
error = uiomove((char *)bp->b_data + on, n, uio);
brelse(bp, 0);
} while (error == 0 && uio->uio_resid > 0 && n != 0);
kmem_free(rablks, nra * sizeof(*rablks));
kmem_free(rasizes, nra * sizeof(*rasizes));
return error;
default:
panic("spec_read type");
}
/* NOTREACHED */
}
/*
* Vnode op for write
*/
/* ARGSUSED */
int
spec_write(void *v)
{
struct vop_write_args /* {
struct vnode *a_vp;
struct uio *a_uio;
int a_ioflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct uio *uio = ap->a_uio;
struct lwp *l = curlwp;
struct specnode *sn;
dev_t dev;
struct buf *bp;
daddr_t bn;
int bsize, bscale;
struct partinfo pi;
int n, on;
int error = 0;
KASSERT(uio->uio_rw == UIO_WRITE);
KASSERTMSG((VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ||
uio->uio_vmspace == curproc->p_vmspace),
"vmspace belongs to neither kernel nor curproc");
switch (vp->v_type) {
case VCHR:
/*
* Release the lock while we sleep -- possibly
* indefinitely, if this is, e.g., a tty -- in
* cdev_write, so we don't hold up everything else that
* might want access to the vnode.
*
* But before we issue the write, take an I/O reference
* to the specnode so close will know when we're done
* writing. Note that the moment we release the lock,
* the vnode's identity may change; hence spec_io_enter
* may fail, and the caller may have a dead vnode on
* their hands, if the file system on which vp lived
* has been unmounted.
*/
VOP_UNLOCK(vp);
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
error = cdev_write(dev, uio, ap->a_ioflag);
spec_io_exit(vp, sn);
out: vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
if (uio->uio_resid == 0)
return 0;
if (uio->uio_offset < 0)
return EINVAL;
if (bdev_ioctl(vp->v_rdev, DIOCGPARTINFO, &pi, FREAD, l) == 0)
    bsize = imin(imax(pi.pi_bsize, DEV_BSIZE), MAXBSIZE);
else
bsize = BLKDEV_IOSIZE;
bscale = bsize >> DEV_BSHIFT;
do {
bn = (uio->uio_offset >> DEV_BSHIFT) &~ (bscale - 1);
on = uio->uio_offset % bsize;
n = uimin((unsigned)(bsize - on), uio->uio_resid);
if (n == bsize)
bp = getblk(vp, bn, bsize, 0, 0);
else
error = bread(vp, bn, bsize, B_MODIFY, &bp);
if (error) {
return error;
}
n = uimin(n, bsize - bp->b_resid);
error = uiomove((char *)bp->b_data + on, n, uio);
if (error)
brelse(bp, 0);
else {
if (n + on == bsize)
bawrite(bp);
else
bdwrite(bp);
error = bp->b_error;
}
} while (error == 0 && uio->uio_resid > 0 && n != 0);
return error;
default:
panic("spec_write type");
}
/* NOTREACHED */
}
/*
* fdiscard, which on disk devices becomes TRIM.
*/
int
spec_fdiscard(void *v)
{
struct vop_fdiscard_args /* {
struct vnode *a_vp;
off_t a_pos;
off_t a_len;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
dev_t dev;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
dev = vp->v_rdev;
switch (vp->v_type) {
case VCHR:
#if 0 /* This is not stored for character devices. */
KASSERT(vp == vp->v_specnode->sn_dev->sd_cdevvp);
#endif
return cdev_discard(dev, ap->a_pos, ap->a_len);
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
return bdev_discard(dev, ap->a_pos, ap->a_len);
default:
panic("spec_fdiscard: not a device\n");
}
}
/*
* Device ioctl operation.
*/
/* ARGSUSED */
int
spec_ioctl(void *v)
{
struct vop_ioctl_args /* {
struct vnode *a_vp;
u_long a_command;
void *a_data;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
switch (vp->v_type) {
case VCHR:
error = cdev_ioctl(dev, ap->a_command, ap->a_data,
ap->a_fflag, curlwp);
break;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
error = bdev_ioctl(dev, ap->a_command, ap->a_data,
ap->a_fflag, curlwp);
break;
default:
panic("spec_ioctl");
/* NOTREACHED */
}
spec_io_exit(vp, sn);
return error;
}
/* ARGSUSED */
int
spec_poll(void *v)
{
struct vop_poll_args /* {
struct vnode *a_vp;
int a_events;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int revents;
if (spec_io_enter(vp, &sn, &dev) != 0)
return POLLERR;
switch (vp->v_type) {
case VCHR:
revents = cdev_poll(dev, ap->a_events, curlwp);
break;
default:
revents = genfs_poll(v);
break;
}
spec_io_exit(vp, sn);
return revents;
}
/* ARGSUSED */
int
spec_kqfilter(void *v)
{
struct vop_kqfilter_args /* {
struct vnode *a_vp;
struct knote *a_kn;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
switch (vp->v_type) {
case VCHR:
error = cdev_kqfilter(dev, ap->a_kn);
break;
default:
/*
* Block devices don't support kqfilter, and refuse it
* for any other files (like those vflush()ed) too.
*/
error = EOPNOTSUPP;
break;
}
spec_io_exit(vp, sn);
return error;
}
/*
* Allow mapping of only D_DISK. This is called only for VBLK.
*/
int
spec_mmap(void *v)
{
struct vop_mmap_args /* {
struct vnode *a_vp;
vm_prot_t a_prot;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct specnode *sn;
dev_t dev;
int error;
KASSERT(vp->v_type == VBLK);
error = spec_io_enter(vp, &sn, &dev);
if (error)
return error;
error = bdev_type(dev) == D_DISK ? 0 : EINVAL;
spec_io_exit(vp, sn);
return error;
}
/*
* Synch buffers associated with a block device
*/
/* ARGSUSED */
int
spec_fsync(void *v)
{
struct vop_fsync_args /* {
struct vnode *a_vp;
kauth_cred_t a_cred;
int a_flags;
off_t offlo;
off_t offhi;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct mount *mp;
int error;
if (vp->v_type == VBLK) {
if ((mp = spec_node_getmountedfs(vp)) != NULL) {
error = VFS_FSYNC(mp, vp, ap->a_flags);
if (error != EOPNOTSUPP)
return error;
}
return vflushbuf(vp, ap->a_flags);
}
return 0;
}
/*
* Just call the device strategy routine
*/
int
spec_strategy(void *v)
{
struct vop_strategy_args /* {
struct vnode *a_vp;
struct buf *a_bp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct buf *bp = ap->a_bp;
struct specnode *sn = NULL;
dev_t dev;
int error;
error = spec_io_enter(vp, &sn, &dev);
if (error)
goto out;
bp->b_dev = dev;
if (!(bp->b_flags & B_READ)) {
#ifdef DIAGNOSTIC
if (bp->b_vp && bp->b_vp->v_type == VBLK) {
struct mount *mp = spec_node_getmountedfs(bp->b_vp);
if (mp && (mp->mnt_flag & MNT_RDONLY)) {
printf("%s blk %"PRId64" written while ro!\n",
mp->mnt_stat.f_mntonname, bp->b_blkno);
#ifdef DDB
db_stacktrace();
#endif
}
}
#endif /* DIAGNOSTIC */
error = fscow_run(bp, false);
if (error)
goto out;
}
bdev_strategy(bp);
error = 0;
out: if (sn)
spec_io_exit(vp, sn);
if (error) {
bp->b_error = error;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
return error;
}
int
spec_inactive(void *v)
{
struct vop_inactive_v2_args /* {
struct vnode *a_vp;
bool *a_recycle;
} */ *ap = v;
KASSERT(ap->a_vp->v_mount == dead_rootmount);
*ap->a_recycle = true;
return 0;
}
int
spec_reclaim(void *v)
{
struct vop_reclaim_v2_args /* {
struct vnode *a_vp;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
KASSERT(vp->v_specnode->sn_opencnt == 0);
VOP_UNLOCK(vp);
KASSERT(vp->v_mount == dead_rootmount);
return 0;
}
/*
* This is a noop, simply returning what one has been given.
*/
int
spec_bmap(void *v)
{
struct vop_bmap_args /* {
struct vnode *a_vp;
daddr_t a_bn;
struct vnode **a_vpp;
daddr_t *a_bnp;
int *a_runp;
} */ *ap = v;
if (ap->a_vpp != NULL)
*ap->a_vpp = ap->a_vp;
if (ap->a_bnp != NULL)
*ap->a_bnp = ap->a_bn;
if (ap->a_runp != NULL)
*ap->a_runp = (MAXBSIZE >> DEV_BSHIFT) - 1;
return 0;
}
/*
* Device close routine
*/
/* ARGSUSED */
int
spec_close(void *v)
{
struct vop_close_args /* {
struct vnode *a_vp;
int a_fflag;
kauth_cred_t a_cred;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
struct session *sess;
dev_t dev;
int flags = ap->a_fflag;
int mode, error, count;
specnode_t *sn;
specdev_t *sd;
KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
mutex_enter(vp->v_interlock);
sn = vp->v_specnode;
dev = vp->v_rdev;
sd = sn->sn_dev;
/*
* If we're going away soon, make this non-blocking.
* Also ensures that we won't wedge in vn_lock below.
*/
if (vdead_check(vp, VDEAD_NOWAIT) != 0)
flags |= FNONBLOCK;
mutex_exit(vp->v_interlock);
switch (vp->v_type) {
case VCHR:
/*
* Hack: a tty device that is a controlling terminal
* has a reference from the session structure. We
* cannot easily tell that a character device is a
* controlling terminal, unless it is the closing
* process' controlling terminal. In that case, if the
* open count is 1 release the reference from the
* session. Also, remove the link from the tty back to
* the session and pgrp.
*
* XXX V. fishy.
*/
mutex_enter(&proc_lock);
sess = curlwp->l_proc->p_session;
if (sn->sn_opencnt == 1 && vp == sess->s_ttyvp) {
mutex_spin_enter(&tty_lock);
sess->s_ttyvp = NULL;
if (sess->s_ttyp->t_session != NULL) {
sess->s_ttyp->t_pgrp = NULL;
sess->s_ttyp->t_session = NULL;
mutex_spin_exit(&tty_lock);
/* Releases proc_lock. */
proc_sessrele(sess);
} else {
mutex_spin_exit(&tty_lock);
if (sess->s_ttyp->t_pgrp != NULL)
panic("spec_close: spurious pgrp ref");
mutex_exit(&proc_lock);
}
vrele(vp);
} else
mutex_exit(&proc_lock);
/*
* If the vnode is locked, then we are in the midst
* of forcibly closing the device, otherwise we only
* close on last reference.
*/
mode = S_IFCHR;
break;
case VBLK:
KASSERT(vp == vp->v_specnode->sn_dev->sd_bdevvp);
/*
* On last close of a block device (that isn't mounted)
* we must invalidate any in core blocks, so that
* we can, for instance, change floppy disks.
*/
error = vinvalbuf(vp, V_SAVE, ap->a_cred, curlwp, 0, 0);
if (error)
return error;
/*
* We do not want to really close the device if it
* is still in use unless we are trying to close it
* forcibly. Since every use (buffer, vnode, swap, cmap)
* holds a reference to the vnode, and because we mark
* any other vnodes that alias this device, when the
* sum of the reference counts on all the aliased
* vnodes descends to one, we are on last close.
*/
mode = S_IFBLK;
break;
default:
panic("spec_close: not special");
}
/*
* Decrement the open reference count of this node and the
* device. For block devices, the open reference count must be
* 1 at this point. If the device's open reference count goes
* to zero, we're the last one out so get the lights.
*
* We may find --sd->sd_opencnt gives zero, and yet
* sd->sd_opened is false. This happens if the vnode is
* revoked at the same time as it is being opened, which can
* happen when opening a tty blocks indefinitely. In that
* case, we still must call close -- it is the job of close to
* interrupt the open. Either way, the device will be no
* longer opened, so we have to clear sd->sd_opened; subsequent
* opens will have responsibility for issuing close.
*
* This has the side effect that the sequence of opens might
* happen out of order -- we might end up doing open, open,
* close, close, instead of open, close, open, close. This is
* unavoidable with the current devsw API, where open is
* allowed to block and close must be able to run concurrently
* to interrupt it. It is the driver's responsibility to
* ensure that close is idempotent so that this works. Drivers
* requiring per-open state and exact 1:1 correspondence
* between open and close can use fd_clone.
*/
mutex_enter(&device_lock);
KASSERT(sn->sn_opencnt);
KASSERT(sd->sd_opencnt);
KASSERTMSG(sn->sn_opencnt <= sd->sd_opencnt,
"sn_opencnt=%u > sd_opencnt=%u",
sn->sn_opencnt, sd->sd_opencnt);
sn->sn_opencnt--;
count = --sd->sd_opencnt;
if (vp->v_type == VBLK) {
KASSERTMSG(count == 0, "block device with %u opens",
count + 1);
sd->sd_bdevvp = NULL;
}
if (count == 0) {
KASSERTMSG(sn->sn_opencnt == 0, "sn_opencnt=%u",
sn->sn_opencnt);
KASSERT(!sd->sd_closing);
sd->sd_opened = false;
sd->sd_closing = true;
}
mutex_exit(&device_lock);
if (count != 0)
return 0;
/*
* If we're able to block, release the vnode lock & reacquire. We
* might end up sleeping for someone else who wants our queues. They
* won't get them if we hold the vnode locked.
*/
if (!(flags & FNONBLOCK))
VOP_UNLOCK(vp);
/*
* If we can cancel all outstanding I/O, then wait for it to
* drain before we call .d_close. Drivers that split up
* .d_cancel and .d_close this way need not have any internal
* mechanism for waiting in .d_close for I/O to drain.
*/
if (vp->v_type == VBLK)
error = bdev_cancel(dev, flags, mode, curlwp);
else
error = cdev_cancel(dev, flags, mode, curlwp);
if (error == 0)
spec_io_drain(sd);
else
KASSERTMSG(error == ENODEV, "cancel dev=0x%lx failed with %d",
(unsigned long)dev, error);
if (vp->v_type == VBLK)
error = bdev_close(dev, flags, mode, curlwp);
else
error = cdev_close(dev, flags, mode, curlwp);
/*
* Wait for all other devsw operations to drain. After this
* point, no bdev/cdev_* can be active for this specdev.
*/
spec_io_drain(sd);
/*
* Wake any spec_open calls waiting for close to finish -- do
* this before reacquiring the vnode lock, because spec_open
* holds the vnode lock while waiting, so doing this after
* reacquiring the lock would deadlock.
*/
mutex_enter(&device_lock);
KASSERT(!sd->sd_opened);
KASSERT(sd->sd_closing);
sd->sd_closing = false;
cv_broadcast(&specfs_iocv);
mutex_exit(&device_lock);
if (!(flags & FNONBLOCK))
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
return error;
}
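/*
 * Illustrative sketch (not part of this file): one way a driver can keep its
 * close routine idempotent, as the comment in spec_close() above requires
 * when open and close are allowed to run out of order.  The softc layout and
 * the names example_softc/example_close are hypothetical.
 */
#if 0
struct example_softc {
	kmutex_t	sc_lock;
	bool		sc_open;	/* set by open, cleared by close */
};

static int
example_close(struct example_softc *sc)
{

	mutex_enter(&sc->sc_lock);
	if (!sc->sc_open) {
		/* Second (or concurrent) close: nothing left to do. */
		mutex_exit(&sc->sc_lock);
		return 0;
	}
	sc->sc_open = false;
	mutex_exit(&sc->sc_lock);

	/* ... quiesce the hardware and free per-open state exactly once ... */
	return 0;
}
#endif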
/*
* Print out the contents of a special device vnode.
*/
int
spec_print(void *v)
{
struct vop_print_args /* {
struct vnode *a_vp;
} */ *ap = v;
printf("dev %llu, %llu\n", (unsigned long long)major(ap->a_vp->v_rdev),
(unsigned long long)minor(ap->a_vp->v_rdev));
return 0;
}
/*
* Return POSIX pathconf information applicable to special devices.
*/
int
spec_pathconf(void *v)
{
struct vop_pathconf_args /* {
struct vnode *a_vp;
int a_name;
register_t *a_retval;
} */ *ap = v;
switch (ap->a_name) {
case _PC_LINK_MAX:
*ap->a_retval = LINK_MAX;
return 0;
case _PC_MAX_CANON:
*ap->a_retval = MAX_CANON;
return 0;
case _PC_MAX_INPUT:
*ap->a_retval = MAX_INPUT;
return 0;
case _PC_PIPE_BUF:
*ap->a_retval = PIPE_BUF;
return 0;
case _PC_CHOWN_RESTRICTED:
*ap->a_retval = 1;
return 0;
case _PC_VDISABLE:
*ap->a_retval = _POSIX_VDISABLE;
return 0;
case _PC_SYNC_IO:
*ap->a_retval = 1;
return 0;
default:
return genfs_pathconf(ap);
}
/* NOTREACHED */
}
/*
* Advisory record locking support.
*/
int
spec_advlock(void *v)
{
struct vop_advlock_args /* {
struct vnode *a_vp;
void *a_id;
int a_op;
struct flock *a_fl;
int a_flags;
} */ *ap = v;
struct vnode *vp = ap->a_vp;
return lf_advlock(ap, &vp->v_speclockf, (off_t)0);
}
/* $NetBSD: dbregs.c,v 1.15 2020/01/31 08:55:38 maxv Exp $ */
/*
* Copyright (c) 2016 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/param.h>
#include <sys/types.h>
#include <sys/lwp.h>
#include <sys/pool.h>
#include <x86/cpufunc.h>
#include <x86/dbregs.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_pmap.h>
#include <machine/pmap.h>
struct pool x86_dbregspl;
static struct dbreg initdbstate;
#define X86_BREAKPOINT_CONDITION_DETECTED ( \
X86_DR6_DR0_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR1_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR2_BREAKPOINT_CONDITION_DETECTED | \
X86_DR6_DR3_BREAKPOINT_CONDITION_DETECTED )
#define X86_GLOBAL_BREAKPOINT ( \
X86_DR7_GLOBAL_DR0_BREAKPOINT | \
X86_DR7_GLOBAL_DR1_BREAKPOINT | \
X86_DR7_GLOBAL_DR2_BREAKPOINT | \
X86_DR7_GLOBAL_DR3_BREAKPOINT )
void
x86_dbregs_init(void)
{
/* DR0-DR3 should always be 0 */
initdbstate.dr[0] = rdr0();
initdbstate.dr[1] = rdr1();
initdbstate.dr[2] = rdr2();
initdbstate.dr[3] = rdr3();
/* DR4-DR5 are reserved - skip */
/* DR6 and DR7 contain predefined nonzero bits */
initdbstate.dr[6] = rdr6();
initdbstate.dr[7] = rdr7();
/* DR8-DR15 are reserved - skip */
/*
* Explicitly reset some bits just in case they could be
* set by brave software/hardware before the kernel boot.
*/
initdbstate.dr[6] &= ~X86_BREAKPOINT_CONDITION_DETECTED;
initdbstate.dr[7] &= ~X86_DR7_GENERAL_DETECT_ENABLE;
pool_init(&x86_dbregspl, sizeof(struct dbreg), 16, 0, 0, "dbregs",
NULL, IPL_NONE);
}
static void
x86_dbregs_reset(void)
{
/*
* It's sufficient to just disable Debug Control Register (DR7).
* It will deactivate hardware watchpoints.
*/
ldr7(0);
/*
* However at some point we need to clear Debug Status Registers
* (DR6). The CPU will never do it automatically.
*
* Clear BREAKPOINT_CONDITION_DETECTED bits and ignore the rest.
*/
ldr6(rdr6() & ~X86_BREAKPOINT_CONDITION_DETECTED);
}
void
x86_dbregs_clear(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
struct dbreg *dbregs;
KASSERT(l == curlwp);
if (__predict_true(pcb->pcb_dbregs == NULL)) {
KASSERT((pcb->pcb_flags & PCB_DBREGS) == 0);
return;
}
dbregs = pcb->pcb_dbregs;
kpreempt_disable();
pcb->pcb_dbregs = NULL;
pcb->pcb_flags &= ~PCB_DBREGS;
x86_dbregs_reset();
kpreempt_enable();
pool_put(&x86_dbregspl, dbregs);
}
void
x86_dbregs_abandon(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
kpreempt_disable();
pcb->pcb_flags &= ~PCB_DBREGS;
x86_dbregs_reset();
kpreempt_enable();
}
void
x86_dbregs_read(struct lwp *l, struct dbreg *regs)
{
struct pcb *pcb = lwp_getpcb(l);
if (pcb->pcb_dbregs == NULL) {
pcb->pcb_dbregs = pool_get(&x86_dbregspl, PR_WAITOK);
memcpy(pcb->pcb_dbregs, &initdbstate, sizeof(initdbstate));
pcb->pcb_flags |= PCB_DBREGS;
}
memcpy(regs, pcb->pcb_dbregs, sizeof(*regs));
}
void
x86_dbregs_save(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
if (!(pcb->pcb_flags & PCB_DBREGS)) {
return;
}
KASSERT(pcb->pcb_dbregs != NULL);
pcb->pcb_dbregs->dr[0] = rdr0();
pcb->pcb_dbregs->dr[1] = rdr1();
pcb->pcb_dbregs->dr[2] = rdr2();
pcb->pcb_dbregs->dr[3] = rdr3();
pcb->pcb_dbregs->dr[6] = rdr6();
pcb->pcb_dbregs->dr[7] = rdr7();
}
void
x86_dbregs_restore(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
if (!(pcb->pcb_flags & PCB_DBREGS)) {
return;
}
KASSERT(pcb->pcb_dbregs != NULL);
ldr0(pcb->pcb_dbregs->dr[0]);
ldr1(pcb->pcb_dbregs->dr[1]);
ldr2(pcb->pcb_dbregs->dr[2]);
ldr3(pcb->pcb_dbregs->dr[3]);
ldr6(pcb->pcb_dbregs->dr[6]);
ldr7(pcb->pcb_dbregs->dr[7]);
}
void
x86_dbregs_store_dr6(struct lwp *l)
{
struct pcb *pcb = lwp_getpcb(l);
KASSERT(l == curlwp);
KASSERT(pcb->pcb_dbregs != NULL);
pcb->pcb_dbregs->dr[6] = rdr6();
}
int
x86_dbregs_user_trap(void)
{
register_t dr7, dr6;
register_t bp;
dr7 = rdr7();
if ((dr7 & X86_GLOBAL_BREAKPOINT) == 0) {
/*
* All Global Breakpoint bits are zero, thus the trap couldn't
* have been caused by the hardware debug registers.
*/
return 0;
}
dr6 = rdr6();
bp = dr6 & X86_BREAKPOINT_CONDITION_DETECTED;
if (!bp) {
/*
* None of the breakpoint bits are set, meaning this
* trap was not caused by any of the debug registers.
*/
return 0;
}
/*
* At least one of the breakpoints was hit, check to see
* which ones and if any of them are user space addresses.
*/
if (bp & X86_DR6_DR0_BREAKPOINT_CONDITION_DETECTED)
if (rdr0() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR1_BREAKPOINT_CONDITION_DETECTED)
if (rdr1() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR2_BREAKPOINT_CONDITION_DETECTED)
if (rdr2() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
if (bp & X86_DR6_DR3_BREAKPOINT_CONDITION_DETECTED)
if (rdr3() < (vaddr_t)VM_MAXUSER_ADDRESS)
return 1;
return 0;
}
int
x86_dbregs_validate(const struct dbreg *regs)
{
size_t i;
/* Check that DR0-DR3 contain user-space address */
for (i = 0; i < X86_DBREGS; i++) {
if (regs->dr[i] >= (vaddr_t)VM_MAXUSER_ADDRESS)
return EINVAL;
}
#ifndef i386
if (regs->dr[6] & X86_DR6_MBZ) {
return EINVAL;
}
if (regs->dr[7] & X86_DR7_MBZ) {
return EINVAL;
}
#endif
if (regs->dr[7] & X86_DR7_GENERAL_DETECT_ENABLE) {
return EINVAL;
}
/*
* Skip checks for reserved registers (DR4-DR5, DR8-DR15).
*/
return 0;
}
void
x86_dbregs_write(struct lwp *l, const struct dbreg *regs)
{
struct pcb *pcb = lwp_getpcb(l);
if (pcb->pcb_dbregs == NULL) {
pcb->pcb_dbregs = pool_get(&x86_dbregspl, PR_WAITOK);
}
memcpy(pcb->pcb_dbregs, regs, sizeof(*regs));
pcb->pcb_flags |= PCB_DBREGS;
}
/*
* Called with preemption disabled.
*/
void
x86_dbregs_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
struct pcb *oldpcb, *newpcb;
bool olddb, newdb;
oldpcb = lwp_getpcb(oldlwp);
newpcb = lwp_getpcb(newlwp);
olddb = (oldpcb->pcb_flags & PCB_DBREGS) != 0;
newdb = (newpcb->pcb_flags & PCB_DBREGS) != 0;
if (__predict_true(!olddb && !newdb)) {
/* fast path */
return;
}
if (olddb) {
x86_dbregs_save(oldlwp);
}
if (newdb) {
x86_dbregs_restore(newlwp);
} else if (olddb) {
x86_dbregs_reset();
}
}
/* $NetBSD: prop_stack.c,v 1.3 2019/05/08 02:25:50 thorpej Exp $ */
/*-
* Copyright (c) 2007 Joerg Sonnenberger <joerg@NetBSD.org>.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include "prop_object_impl.h"
#include "prop_stack.h"
void
_prop_stack_init(prop_stack_t stack)
{
stack->used_intern_elems = 0;
SLIST_INIT(&stack->extern_elems);
}
bool
_prop_stack_push(prop_stack_t stack, prop_object_t obj, void *data1,
void *data2, void *data3)
{
struct _prop_stack_extern_elem *eelem;
struct _prop_stack_intern_elem *ielem;
if (stack->used_intern_elems == PROP_STACK_INTERN_ELEMS) {
eelem = _PROP_MALLOC(sizeof(*eelem), M_TEMP);
if (eelem == NULL)
return false;
eelem->object = obj;
eelem->object_data[0] = data1;
eelem->object_data[1] = data2;
eelem->object_data[2] = data3;
SLIST_INSERT_HEAD(&stack->extern_elems, eelem, stack_link);
return true;
}
_PROP_ASSERT(stack->used_intern_elems < PROP_STACK_INTERN_ELEMS);
_PROP_ASSERT(SLIST_EMPTY(&stack->extern_elems));
ielem = &stack->intern_elems[stack->used_intern_elems];
ielem->object = obj;
ielem->object_data[0] = data1;
ielem->object_data[1] = data2;
ielem->object_data[2] = data3;
++stack->used_intern_elems;
return true;
}
bool
_prop_stack_pop(prop_stack_t stack, prop_object_t *obj, void **data1,
void **data2, void **data3)
{
struct _prop_stack_extern_elem *eelem;
struct _prop_stack_intern_elem *ielem;
if (stack->used_intern_elems == 0)
return false;
if ((eelem = SLIST_FIRST(&stack->extern_elems)) != NULL) {
_PROP_ASSERT(stack->used_intern_elems == PROP_STACK_INTERN_ELEMS);
SLIST_REMOVE_HEAD(&stack->extern_elems, stack_link);
if (obj)
*obj = eelem->object;
if (data1)
*data1 = eelem->object_data[0];
if (data2)
*data2 = eelem->object_data[1];
if (data3)
*data3 = eelem->object_data[2];
_PROP_FREE(eelem, M_TEMP);
return true;
}
--stack->used_intern_elems;
ielem = &stack->intern_elems[stack->used_intern_elems];
if (obj)
*obj = ielem->object;
if (data1)
*data1 = ielem->object_data[0];
if (data2)
*data2 = ielem->object_data[1];
if (data3)
*data3 = ielem->object_data[2];
return true;
}
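/*
 * Illustrative sketch (not part of this file): how a caller might drive the
 * push/pop pair above.  Only the signatures defined in this file are relied
 * on; the local `struct _prop_stack' declaration assumes the tag used by
 * prop_stack.h, and example_walk itself is a made-up name.
 */
#if 0
static void
example_walk(prop_object_t root)
{
	struct _prop_stack stack;
	prop_object_t obj;
	void *d1, *d2, *d3;

	_prop_stack_init(&stack);

	/* Push the root object with three slots of per-object context. */
	if (!_prop_stack_push(&stack, root, NULL, NULL, NULL))
		return;		/* external element allocation failed */

	/* Pop until empty; order is LIFO and the data slots travel along. */
	while (_prop_stack_pop(&stack, &obj, &d1, &d2, &d3)) {
		/* ... process obj, possibly pushing its children ... */
	}
}
#endif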
/* $NetBSD: vfs_syscalls_40.c,v 1.5 2019/01/27 02:08:39 pgoyette Exp $ */
/*-
* Copyright (c) 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_syscalls.c 8.42 (Berkeley) 7/31/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_syscalls_40.c,v 1.5 2019/01/27 02:08:39 pgoyette Exp $");
#if defined(_KERNEL_OPT)
#include "opt_compat_netbsd.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mount.h>
#include <sys/syscall.h>
#include <sys/syscallvar.h>
#include <sys/syscallargs.h>
#include <compat/common/compat_mod.h>
static const struct syscall_package vfs_syscalls_40_syscalls[] = {
{ SYS_compat_40_mount, 0, (sy_call_t *)compat_40_sys_mount },
{ 0, 0, NULL },
};
int
compat_40_sys_mount(struct lwp *l, const struct compat_40_sys_mount_args *uap, register_t *retval)
{
/* {
syscallarg(const char *) type;
syscallarg(const char *) path;
syscallarg(int) flags;
syscallarg(void *) data;
} */
register_t dummy;
return do_sys_mount(l, SCARG(uap, type), UIO_USERSPACE, SCARG(uap, path),
SCARG(uap, flags), SCARG(uap, data), UIO_USERSPACE, 0, &dummy);
}
int
vfs_syscalls_40_init(void)
{
return syscall_establish(NULL, vfs_syscalls_40_syscalls);
}
int
vfs_syscalls_40_fini(void)
{
return syscall_disestablish(NULL, vfs_syscalls_40_syscalls);
}
/* $NetBSD: kern_uidinfo.c,v 1.13 2021/12/28 13:28:24 riastradh Exp $ */
/*-
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_uidinfo.c,v 1.13 2021/12/28 13:28:24 riastradh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kmem.h>
#include <sys/proc.h>
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/cpu.h>
static SLIST_HEAD(uihashhead, uidinfo) *uihashtbl;
static u_long uihash;
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
static int
sysctl_kern_uidinfo_cnt(SYSCTLFN_ARGS)
{
static const struct {
const char *name;
u_int value;
} nv[] = {
#define _MEM(n) { # n, offsetof(struct uidinfo, ui_ ## n) }
_MEM(proccnt),
_MEM(lwpcnt),
_MEM(lockcnt),
_MEM(semcnt),
_MEM(sbsize),
#undef _MEM
};
for (size_t i = 0; i < __arraycount(nv); i++)
if (strcmp(nv[i].name, rnode->sysctl_name) == 0) {
uint64_t cnt;
struct sysctlnode node = *rnode;
struct uidinfo *uip;
node.sysctl_data = &cnt;
uip = uid_find(kauth_cred_geteuid(l->l_cred));
*(uint64_t *)node.sysctl_data =
*(u_long *)((char *)uip + nv[i].value);
return sysctl_lookup(SYSCTLFN_CALL(&node));
}
return EINVAL;
}
static struct sysctllog *kern_uidinfo_sysctllog;
static void
sysctl_kern_uidinfo_setup(void)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(&kern_uidinfo_sysctllog, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "uidinfo",
SYSCTL_DESCR("Resource usage per uid"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "proccnt",
SYSCTL_DESCR("Number of processes for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "lwpcnt",
SYSCTL_DESCR("Number of lwps for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "lockcnt",
SYSCTL_DESCR("Number of locks for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "semcnt",
SYSCTL_DESCR("Number of semaphores used for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(&kern_uidinfo_sysctllog, 0, &rnode, &cnode,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "sbsize",
SYSCTL_DESCR("Socket buffers used for the current user"),
sysctl_kern_uidinfo_cnt, 0, NULL, 0,
CTL_CREATE, CTL_EOL);
}
static int
uid_stats(struct hashstat_sysctl *hs, bool fill)
{
struct uidinfo *uip;
uint64_t chain;
strlcpy(hs->hash_name, "uihash", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "user info (uid->used proc) hash",
sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = uihash + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
SLIST_FOREACH(uip, &uihashtbl[i], ui_hash) {
membar_datadep_consumer();
chain++;
}
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
}
return 0;
}
void
uid_init(void)
{
/*
* On an MP system, SLIST_FOREACH would force a cache line
* write-back for every modified 'uidinfo', so we try to keep the
* lists short.
*/
const u_int uihash_sz = (maxcpus > 1 ? 1024 : 64);
uihashtbl = hashinit(uihash_sz, HASH_SLIST, true, &uihash);
/*
* Ensure that uid 0 is always in the user hash table, as
* sbreserve() expects it available from interrupt context.
*/
(void)uid_find(0);
sysctl_kern_uidinfo_setup();
hashstat_register("uihash", uid_stats);
}
struct uidinfo *
uid_find(uid_t uid)
{
struct uidinfo *uip, *uip_first, *newuip;
struct uihashhead *uipp;
uipp = UIHASH(uid);
newuip = NULL;
/*
* To make the insertion atomic, we violate the SLIST abstraction
* and operate on the list head directly.
*/
uip_first = uipp->slh_first;
again:
SLIST_FOREACH(uip, uipp, ui_hash) {
membar_datadep_consumer();
if (uip->ui_uid != uid)
continue;
if (newuip != NULL)
kmem_free(newuip, sizeof(*newuip));
return uip;
}
if (newuip == NULL)
newuip = kmem_zalloc(sizeof(*newuip), KM_SLEEP);
newuip->ui_uid = uid;
/*
* If the atomic insert is unsuccessful, another thread may have
* inserted this 'uid' already, so a full re-check is needed.
*/
newuip->ui_hash.sle_next = uip_first;
membar_producer();
uip = atomic_cas_ptr(&uipp->slh_first, uip_first, newuip);
if (uip != uip_first) {
uip_first = uip;
goto again;
}
return newuip;
}
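/*
 * Illustrative sketch (not part of this file): a userland analogue of the
 * insert-if-absent retry loop in uid_find() above, using C11 atomics in
 * place of atomic_cas_ptr()/membar_producer().  The names node, bucket_head
 * and find_or_insert are hypothetical.
 */
#if 0
#include <stdatomic.h>
#include <stdlib.h>

struct node {
	unsigned key;
	struct node *next;
};

static _Atomic(struct node *) bucket_head;

static struct node *
find_or_insert(unsigned key)
{
	struct node *first = atomic_load(&bucket_head);
	struct node *newn = NULL;

	for (;;) {
		/* Scan the current snapshot of the (prepend-only) list. */
		for (struct node *n = first; n != NULL; n = n->next) {
			if (n->key == key) {
				free(newn);	/* lost the race, or found it */
				return n;
			}
		}
		if (newn == NULL) {
			newn = calloc(1, sizeof(*newn));
			if (newn == NULL)
				return NULL;
			newn->key = key;
		}
		newn->next = first;
		/*
		 * Publish the new head.  On failure `first' is reloaded with
		 * the current head and the scan above is repeated.
		 */
		if (atomic_compare_exchange_weak(&bucket_head, &first, newn))
			return newn;
	}
}
#endif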
/*
* Change the count associated with number of processes
* a given user is using.
*/
int
chgproccnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long proccnt;
uip = uid_find(uid);
proccnt = atomic_add_long_nv(&uip->ui_proccnt, diff);
KASSERTMSG(proccnt >= 0, "uid=%d diff=%d proccnt=%ld",
uid, diff, proccnt);
return proccnt;
}
/*
* Change the count associated with number of lwps
* a given user is using.
*/
int
chglwpcnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long lwpcnt;
uip = uid_find(uid);
lwpcnt = atomic_add_long_nv(&uip->ui_lwpcnt, diff);
KASSERTMSG(lwpcnt >= 0, "uid=%d diff=%d lwpcnt=%ld",
uid, diff, lwpcnt);
return lwpcnt;
}
/*
* Change the count associated with number of semaphores
* a given user is using.
*/
int
chgsemcnt(uid_t uid, int diff)
{
struct uidinfo *uip;
long semcnt;
uip = uid_find(uid);
semcnt = atomic_add_long_nv(&uip->ui_semcnt, diff);
KASSERTMSG(semcnt >= 0, "uid=%d diff=%d semcnt=%ld",
uid, diff, semcnt);
return semcnt;
}
int
chgsbsize(struct uidinfo *uip, u_long *hiwat, u_long to, rlim_t xmax)
{
rlim_t nsb;
const long diff = to - *hiwat;
nsb = (rlim_t)atomic_add_long_nv((long *)&uip->ui_sbsize, diff);
if (diff > 0 && nsb > xmax) {
atomic_add_long((long *)&uip->ui_sbsize, -diff);
return 0;
}
*hiwat = to;
return 1;
}
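/*
 * Illustrative sketch (not part of this file): the optimistic
 * charge-then-back-out pattern used by chgsbsize() above, written with C11
 * atomics for a userland counter.  `usage', `try_charge' and `limit' are
 * hypothetical names.
 */
#if 0
#include <stdatomic.h>
#include <stdbool.h>

static _Atomic long usage;

static bool
try_charge(long diff, long limit)
{
	/* Add first, then check whether the new total exceeds the limit. */
	long nsb = atomic_fetch_add(&usage, diff) + diff;

	if (diff > 0 && nsb > limit) {
		/* Overshot: undo the charge and report failure. */
		atomic_fetch_sub(&usage, diff);
		return false;
	}
	return true;
}
#endif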
/* $NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $ */
/*-
* Copyright (c) 2002, 2006, 2007, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Turnstiles are described in detail in:
*
* Solaris Internals: Core Kernel Architecture, Jim Mauro and
* Richard McDougall.
*
* Turnstiles are kept in a hash table. There are likely to be many more
* synchronisation objects than there are threads. Since a thread can block
* on only one lock at a time, we only need one turnstile per thread, and
* so they are allocated at thread creation time.
*
* When a thread decides it needs to block on a lock, it looks up the
* active turnstile for that lock. If no active turnstile exists, then
* the thread lends its turnstile to the lock. If there is already an
* active turnstile for the lock, the thread places its turnstile on a
* list of free turnstiles, and references the active one instead.
*
* The act of looking up the turnstile acquires an interlock on the sleep
* queue. If a thread decides it doesn't need to block after all, then this
* interlock must be released by explicitly aborting the turnstile
* operation.
*
* When a thread is awakened, it needs to get its turnstile back. If there
* are still other threads waiting in the active turnstile, the thread
* grabs a free turnstile off the free list. Otherwise, it can take back
* the active turnstile from the lock (thus deactivating the turnstile).
*
* Turnstiles are where we do priority inheritance.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_turnstile.c,v 1.55 2023/10/15 10:30:20 riastradh Exp $");
#include <sys/param.h>
#include <sys/lockdebug.h>
#include <sys/lwp.h>
#include <sys/proc.h>
#include <sys/sleepq.h>
#include <sys/sleeptab.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* Shift of 6 aligns to typical cache line size of 64 bytes; there's no
* point having two turnstile locks to back two lock objects that share one
* cache line.
*/
#define TS_HASH_SIZE 128
#define TS_HASH_MASK (TS_HASH_SIZE - 1)
#define TS_HASH(obj) (((uintptr_t)(obj) >> 6) & TS_HASH_MASK)
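/*
 * Illustrative sketch (not part of this file): because TS_HASH() discards
 * the low 6 bits, two lock objects inside the same 64-byte cache line land
 * in the same chain and share one turnstile lock.  TS_HASH_EX mirrors
 * TS_HASH with the mask written out, and the addresses are made up.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define TS_HASH_EX(obj)	(((uintptr_t)(obj) >> 6) & 127)

int
main(void)
{
	uintptr_t a = 0x80001000;	/* some lock object */
	uintptr_t b = 0x80001020;	/* another object in the same line */
	uintptr_t c = 0x80001040;	/* first object of the next line */

	/* prints "64 64 65": a and b share a bucket, c does not */
	printf("%zu %zu %zu\n",
	    (size_t)TS_HASH_EX(a), (size_t)TS_HASH_EX(b),
	    (size_t)TS_HASH_EX(c));
	return 0;
}
#endif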
static tschain_t turnstile_chains[TS_HASH_SIZE] __cacheline_aligned;
static union {
kmutex_t lock;
uint8_t pad[COHERENCY_UNIT];
} turnstile_locks[TS_HASH_SIZE] __cacheline_aligned;
/*
* turnstile_init:
*
* Initialize the turnstile mechanism.
*/
void
turnstile_init(void)
{
int i;
for (i = 0; i < TS_HASH_SIZE; i++) {
LIST_INIT(&turnstile_chains[i]);
mutex_init(&turnstile_locks[i].lock, MUTEX_DEFAULT, IPL_SCHED);
}
turnstile_ctor(&turnstile0);
}
/*
* turnstile_ctor:
*
* Constructor for turnstiles.
*/
void
turnstile_ctor(turnstile_t *ts)
{
memset(ts, 0, sizeof(*ts));
sleepq_init(&ts->ts_sleepq[TS_READER_Q]);
sleepq_init(&ts->ts_sleepq[TS_WRITER_Q]);
}
/*
* turnstile_remove:
*
* Remove an LWP from a turnstile sleep queue and wake it.
*/
static inline void
turnstile_remove(turnstile_t *ts, lwp_t *l, int q)
{
turnstile_t *nts;
KASSERT(l->l_ts == ts);
/*
* This process is no longer using the active turnstile.
* Find an inactive one on the free list to give to it.
*/
if ((nts = ts->ts_free) != NULL) {
KASSERT(TS_ALL_WAITERS(ts) > 1);
l->l_ts = nts;
ts->ts_free = nts->ts_free;
nts->ts_free = NULL;
} else {
/*
* If the free list is empty, this is the last
* waiter.
*/
KASSERT(TS_ALL_WAITERS(ts) == 1);
LIST_REMOVE(ts, ts_chain);
}
ts->ts_waiters[q]--;
sleepq_remove(&ts->ts_sleepq[q], l, true);
}
/*
* turnstile_lookup:
*
* Look up the turnstile for the specified lock. This acquires and
* holds the turnstile chain lock (sleep queue interlock).
*/
turnstile_t *
turnstile_lookup(wchan_t obj)
{
turnstile_t *ts;
tschain_t *tc;
u_int hash;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
mutex_spin_enter(&turnstile_locks[hash].lock);
LIST_FOREACH(ts, tc, ts_chain)
if (ts->ts_obj == obj)
return (ts);
/*
* No turnstile yet for this lock. No problem, turnstile_block()
* handles this by fetching the turnstile from the blocking thread.
*/
return (NULL);
}
/*
* turnstile_exit:
*
* Abort a turnstile operation.
*/
void
turnstile_exit(wchan_t obj)
{
mutex_spin_exit(&turnstile_locks[TS_HASH(obj)].lock);
}
/*
* turnstile_lendpri:
*
* Lend our priority to lwps on the blocking chain.
*
* If the current owner of the lock (l->l_wchan, set by sleepq_enqueue)
* has a priority lower than ours (lwp_eprio(l)), lend our priority to
* it to avoid priority inversions.
*/
static void
turnstile_lendpri(lwp_t *cur)
{
lwp_t * l = cur;
pri_t prio;
/*
* NOTE: if you get a panic in this code block, it is likely that
* a lock has been destroyed or corrupted while still in use. Try
* compiling a kernel with LOCKDEBUG to pinpoint the problem.
*/
LOCKDEBUG_BARRIER(l->l_mutex, 1);
KASSERT(l == curlwp);
prio = lwp_eprio(l);
for (;;) {
lwp_t *owner;
turnstile_t *ts;
bool dolock;
if (l->l_wchan == NULL)
break;
/*
* Ask syncobj the owner of the lock.
*/
owner = (*l->l_syncobj->sobj_owner)(l->l_wchan);
if (owner == NULL)
break;
/*
* The owner may have changed as we have dropped the tc lock.
*/
if (cur == owner) {
/*
* We own the lock: stop here, sleepq_block()
* should wake up immediately.
*/
break;
}
/*
* Acquire owner->l_mutex if we don't have it yet.
* Because we already have another LWP lock (l->l_mutex) held,
* we need to play a try lock dance to avoid deadlock.
*/
dolock = l->l_mutex != atomic_load_relaxed(&owner->l_mutex);
if (l == owner || (dolock && !lwp_trylock(owner))) {
/*
* The owner was changed behind us or trylock failed.
* Restart from curlwp.
*
* Note that there may be a livelock here:
* the owner may try grabbing cur's lock (which is the
* tc lock) while we're trying to grab the owner's lock.
*/
lwp_unlock(l);
l = cur;
lwp_lock(l);
prio = lwp_eprio(l);
continue;
}
/*
* If the owner's priority is already higher than ours,
* there's nothing to do anymore.
*/
if (prio <= lwp_eprio(owner)) {
if (dolock)
lwp_unlock(owner);
break;
}
/*
* Lend our priority to the 'owner' LWP.
*
* Update lenders info for turnstile_unlendpri.
*/
ts = l->l_ts;
KASSERT(ts->ts_inheritor == owner || ts->ts_inheritor == NULL);
if (ts->ts_inheritor == NULL) {
ts->ts_inheritor = owner;
ts->ts_eprio = prio;
SLIST_INSERT_HEAD(&owner->l_pi_lenders, ts, ts_pichain);
lwp_lendpri(owner, prio);
} else if (prio > ts->ts_eprio) {
ts->ts_eprio = prio;
lwp_lendpri(owner, prio);
}
if (dolock)
lwp_unlock(l);
LOCKDEBUG_BARRIER(owner->l_mutex, 1);
l = owner;
}
LOCKDEBUG_BARRIER(l->l_mutex, 1);
if (cur->l_mutex != atomic_load_relaxed(&l->l_mutex)) {
lwp_unlock(l);
lwp_lock(cur);
}
LOCKDEBUG_BARRIER(cur->l_mutex, 1);
}
/*
* turnstile_unlendpri: undo turnstile_lendpri
*/
static void
turnstile_unlendpri(turnstile_t *ts)
{
lwp_t * const l = curlwp;
turnstile_t *iter;
turnstile_t *next;
turnstile_t *prev = NULL;
pri_t prio;
bool dolock;
KASSERT(ts->ts_inheritor != NULL);
ts->ts_inheritor = NULL;
dolock = (atomic_load_relaxed(&l->l_mutex) ==
l->l_cpu->ci_schedstate.spc_lwplock);
if (dolock) {
lwp_lock(l);
}
/*
* the following loop does two things.
*
* - remove ts from the list.
*
* - from the rest of the list, find the highest priority.
*/
prio = -1;
KASSERT(!SLIST_EMPTY(&l->l_pi_lenders));
for (iter = SLIST_FIRST(&l->l_pi_lenders);
iter != NULL; iter = next) {
KASSERT(lwp_eprio(l) >= ts->ts_eprio);
next = SLIST_NEXT(iter, ts_pichain);
if (iter == ts) {
if (prev == NULL) {
SLIST_REMOVE_HEAD(&l->l_pi_lenders,
ts_pichain);
} else {
SLIST_REMOVE_AFTER(prev, ts_pichain);
}
} else if (prio < iter->ts_eprio) {
prio = iter->ts_eprio;
}
prev = iter;
}
lwp_lendpri(l, prio);
if (dolock) {
lwp_unlock(l);
}
}
/*
* turnstile_block:
*
* Enter an object into the turnstile chain and prepare the current
* LWP for sleep.
*/
void
turnstile_block(turnstile_t *ts, int q, wchan_t obj, syncobj_t *sobj)
{
lwp_t * const l = curlwp; /* cached curlwp */
turnstile_t *ots;
tschain_t *tc;
kmutex_t *lock;
sleepq_t *sq;
u_int hash;
int nlocks;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
lock = &turnstile_locks[hash].lock;
KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
KASSERT(mutex_owned(lock));
KASSERT(l != NULL);
KASSERT(l->l_ts != NULL);
if (ts == NULL) {
/*
* We are the first thread to wait for this object;
* lend our turnstile to it.
*/
ts = l->l_ts;
KASSERT(TS_ALL_WAITERS(ts) == 0);
KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]));
KASSERT(LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
ts->ts_obj = obj;
ts->ts_inheritor = NULL;
LIST_INSERT_HEAD(tc, ts, ts_chain);
} else {
/*
* Object already has a turnstile. Put our turnstile
* onto the free list, and reference the existing
* turnstile instead.
*/
ots = l->l_ts;
KASSERT(ots->ts_free == NULL);
ots->ts_free = ts->ts_free;
ts->ts_free = ots;
l->l_ts = ts;
KASSERT(ts->ts_obj == obj);
KASSERT(TS_ALL_WAITERS(ts) != 0);
KASSERT(!LIST_EMPTY(&ts->ts_sleepq[TS_READER_Q]) ||
!LIST_EMPTY(&ts->ts_sleepq[TS_WRITER_Q]));
}
sq = &ts->ts_sleepq[q];
ts->ts_waiters[q]++;
nlocks = sleepq_enter(sq, l, lock);
LOCKDEBUG_BARRIER(lock, 1);
sleepq_enqueue(sq, obj, sobj->sobj_name, sobj, false);
/*
* Disable preemption across this entire block, as we may drop
* scheduler locks (allowing preemption), and would prefer not
* to be interrupted while in a state of flux.
*/
KPREEMPT_DISABLE(l);
KASSERT(lock == l->l_mutex);
turnstile_lendpri(l);
sleepq_block(0, false, sobj, nlocks);
KPREEMPT_ENABLE(l);
}
/*
* turnstile_wakeup:
*
* Wake up the specified number of threads that are blocked
* in a turnstile.
*/
void
turnstile_wakeup(turnstile_t *ts, int q, int count, lwp_t *nl)
{
sleepq_t *sq;
kmutex_t *lock;
u_int hash;
lwp_t *l;
hash = TS_HASH(ts->ts_obj);
lock = &turnstile_locks[hash].lock;
sq = &ts->ts_sleepq[q];
KASSERT(q == TS_READER_Q || q == TS_WRITER_Q);
KASSERT(count > 0);
KASSERT(count <= TS_WAITERS(ts, q));
KASSERT(mutex_owned(lock));
KASSERT(ts->ts_inheritor == curlwp || ts->ts_inheritor == NULL);
/*
* restore inherited priority if necessary.
*/
if (ts->ts_inheritor != NULL) {
turnstile_unlendpri(ts);
}
if (nl != NULL) {
#if defined(DEBUG) || defined(LOCKDEBUG)
LIST_FOREACH(l, sq, l_sleepchain) {
if (l == nl)
break;
}
if (l == NULL)
panic("turnstile_wakeup: nl not on sleepq");
#endif
turnstile_remove(ts, nl, q);
} else {
while (count-- > 0) {
l = LIST_FIRST(sq);
KASSERT(l != NULL);
turnstile_remove(ts, l, q);
}
}
mutex_spin_exit(lock);
}
/*
* turnstile_unsleep:
*
* Remove an LWP from the turnstile. This is called when the LWP has
* not been awoken normally but instead interrupted: for example, if it
* has received a signal. It's not a valid action for turnstiles,
* since LWPs blocking on a turnstile are not interruptible.
*/
void
turnstile_unsleep(lwp_t *l, bool cleanup)
{
lwp_unlock(l);
panic("turnstile_unsleep");
}
/*
* turnstile_changepri:
*
* Adjust the priority of an LWP residing on a turnstile.
*/
void
turnstile_changepri(lwp_t *l, pri_t pri)
{
/* XXX priority inheritance */
sleepq_changepri(l, pri);
}
#if defined(LOCKDEBUG)
/*
* turnstile_print:
*
* Given the address of a lock object, print the contents of a
* turnstile.
*/
void
turnstile_print(volatile void *obj, void (*pr)(const char *, ...))
{
turnstile_t *ts;
tschain_t *tc;
sleepq_t *rsq, *wsq;
u_int hash;
lwp_t *l;
hash = TS_HASH(obj);
tc = &turnstile_chains[hash];
LIST_FOREACH(ts, tc, ts_chain)
if (ts->ts_obj == obj)
break;
if (ts == NULL) {
(*pr)("Turnstile: no active turnstile for this lock.\n");
return;
}
rsq = &ts->ts_sleepq[TS_READER_Q];
wsq = &ts->ts_sleepq[TS_WRITER_Q];
(*pr)("Turnstile:\n");
(*pr)("=> %d waiting readers:", TS_WAITERS(ts, TS_READER_Q));
LIST_FOREACH(l, rsq, l_sleepchain) {
(*pr)(" %p", l);
}
(*pr)("\n");
(*pr)("=> %d waiting writers:", TS_WAITERS(ts, TS_WRITER_Q));
LIST_FOREACH(l, wsq, l_sleepchain) {
(*pr)(" %p", l);
}
(*pr)("\n");
}
#endif /* LOCKDEBUG */
/* $NetBSD: kern_time.c,v 1.221 2023/02/23 02:57:17 riastradh Exp $ */
/*-
* Copyright (c) 2000, 2004, 2005, 2007, 2008, 2009, 2020
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Christopher G. Demetriou, by Andrew Doran, and by Jason R. Thorpe.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_time.c 8.4 (Berkeley) 5/26/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_time.c,v 1.221 2023/02/23 02:57:17 riastradh Exp $");
#include <sys/param.h>
#include <sys/resourcevar.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/signalvar.h>
#include <sys/syslog.h>
#include <sys/timetc.h>
#include <sys/timevar.h>
#include <sys/timex.h>
#include <sys/kauth.h>
#include <sys/mount.h>
#include <sys/syscallargs.h>
#include <sys/cpu.h>
kmutex_t itimer_mutex __cacheline_aligned; /* XXX static */
static struct itlist itimer_realtime_changed_notify;
static void itimer_callout(void *);
static void ptimer_intr(void *);
static void *ptimer_sih __read_mostly;
static TAILQ_HEAD(, ptimer) ptimer_queue;
#define CLOCK_VIRTUAL_P(clockid) \
((clockid) == CLOCK_VIRTUAL || (clockid) == CLOCK_PROF)
CTASSERT(ITIMER_REAL == CLOCK_REALTIME);
CTASSERT(ITIMER_VIRTUAL == CLOCK_VIRTUAL);
CTASSERT(ITIMER_PROF == CLOCK_PROF);
CTASSERT(ITIMER_MONOTONIC == CLOCK_MONOTONIC);
#define DELAYTIMER_MAX 32
/*
* Initialize timekeeping.
*/
void
time_init(void)
{
mutex_init(&itimer_mutex, MUTEX_DEFAULT, IPL_SCHED);
LIST_INIT(&itimer_realtime_changed_notify);
TAILQ_INIT(&ptimer_queue);
ptimer_sih = softint_establish(SOFTINT_CLOCK | SOFTINT_MPSAFE,
ptimer_intr, NULL);
}
/*
* Check if the time will wrap if set to ts.
*
* ts - timespec describing the new time
* delta - the delta between the current time and ts
*/
bool
time_wraps(struct timespec *ts, struct timespec *delta)
{
/*
* Don't allow the time to be set forward so far it
* will wrap and become negative, thus allowing an
* attacker to bypass the next check below. The
* cutoff is 1 year before rollover occurs, so even
* if the attacker uses adjtime(2) to move the time
* past the cutoff, it will take a very long time
* to get to the wrap point.
*/
if ((ts->tv_sec > LLONG_MAX - 365*24*60*60) ||
(delta->tv_sec < 0 || delta->tv_nsec < 0))
return true;
return false;
}
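/*
 * Illustrative sketch (not part of this file): how time_wraps() classifies
 * two requested settings, assuming the arbitrary example values below.
 */
#if 0
static void
time_wraps_example(void)
{
	struct timespec now, req, delta;

	nanotime(&now);

	req.tv_sec = now.tv_sec + 60;		/* one minute ahead: fine */
	req.tv_nsec = now.tv_nsec;
	timespecsub(&req, &now, &delta);
	KASSERT(time_wraps(&req, &delta) == false);

	req.tv_sec = LLONG_MAX - 100;		/* within a year of rollover */
	timespecsub(&req, &now, &delta);
	KASSERT(time_wraps(&req, &delta) == true);
}
#endif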
/*
* itimer_lock:
*
* Acquire the interval timer data lock.
*/
void
itimer_lock(void)
{
mutex_spin_enter(&itimer_mutex);
}
/*
* itimer_unlock:
*
* Release the interval timer data lock.
*/
void
itimer_unlock(void)
{
mutex_spin_exit(&itimer_mutex);
}
/*
* itimer_lock_held:
*
* Check that the interval timer lock is held for diagnostic
* assertions.
*/
inline bool __diagused
itimer_lock_held(void)
{
return mutex_owned(&itimer_mutex);
}
/*
* Time of day and interval timer support.
*
* These routines provide the kernel entry points to get and set
* the time-of-day and per-process interval timers. Subroutines
* here provide support for adding and subtracting timeval structures
* and decrementing interval timers, optionally reloading the interval
* timers when they expire.
*/
/* This function is used by clock_settime and settimeofday */
static int
settime1(struct proc *p, const struct timespec *ts, bool check_kauth)
{
struct timespec delta, now;
/*
* The time being set to an unreasonable value will cause
* unreasonable system behaviour.
*/
if (ts->tv_sec < 0 || ts->tv_sec > (1LL << 36))
return EINVAL;
nanotime(&now);
timespecsub(ts, &now, &delta);
if (check_kauth && kauth_authorize_system(kauth_cred_get(),
KAUTH_SYSTEM_TIME, KAUTH_REQ_SYSTEM_TIME_SYSTEM, __UNCONST(ts),
&delta, KAUTH_ARG(check_kauth ? false : true)) != 0) {
return EPERM;
}
#ifdef notyet
if ((delta.tv_sec < 86400) && securelevel > 0) { /* XXX elad - notyet */
return EPERM;
}
#endif
tc_setclock(ts);
resettodr();
/*
* Notify pending CLOCK_REALTIME timers about the real time change.
* There may be inactive timers on this list, but this happens
* comparatively less often than timers firing, and so it's better
* to put the extra checks here than to complicate the other code
* path.
*/
struct itimer *it;
itimer_lock();
LIST_FOREACH(it, &itimer_realtime_changed_notify, it_rtchgq) {
KASSERT(it->it_ops->ito_realtime_changed != NULL);
if (timespecisset(&it->it_time.it_value)) {
(*it->it_ops->ito_realtime_changed)(it);
}
}
itimer_unlock();
return 0;
}
int
settime(struct proc *p, struct timespec *ts)
{
return settime1(p, ts, true);
}
/* ARGSUSED */
int
sys___clock_gettime50(struct lwp *l,
const struct sys___clock_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
} */
int error;
struct timespec ats;
error = clock_gettime1(SCARG(uap, clock_id), &ats);
if (error != 0)
return error;
return copyout(&ats, SCARG(uap, tp), sizeof(ats));
}
/* ARGSUSED */
int
sys___clock_settime50(struct lwp *l,
const struct sys___clock_settime50_args *uap, register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(const struct timespec *) tp;
} */
int error;
struct timespec ats;
if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0)
return error;
return clock_settime1(l->l_proc, SCARG(uap, clock_id), &ats, true);
}
int
clock_settime1(struct proc *p, clockid_t clock_id, const struct timespec *tp,
bool check_kauth)
{
int error;
if (tp->tv_nsec < 0 || tp->tv_nsec >= 1000000000L)
return EINVAL;
switch (clock_id) {
case CLOCK_REALTIME:
if ((error = settime1(p, tp, check_kauth)) != 0)
return error;
break;
case CLOCK_MONOTONIC:
return EINVAL; /* read-only clock */
default:
return EINVAL;
}
return 0;
}
int
sys___clock_getres50(struct lwp *l, const struct sys___clock_getres50_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct timespec *) tp;
} */
struct timespec ts;
int error;
if ((error = clock_getres1(SCARG(uap, clock_id), &ts)) != 0)
return error;
if (SCARG(uap, tp))
error = copyout(&ts, SCARG(uap, tp), sizeof(ts));
return error;
}
int
clock_getres1(clockid_t clock_id, struct timespec *ts)
{
switch (clock_id) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
ts->tv_sec = 0;
if (tc_getfrequency() > 1000000000)
ts->tv_nsec = 1;
else
ts->tv_nsec = 1000000000 / tc_getfrequency();
break;
default:
return EINVAL;
}
return 0;
}
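/*
 * Worked example (not part of this file): for a 1 MHz timecounter,
 * clock_getres1() reports 1000000000 / 1000000 = 1000 ns (1 us); for a
 * 2.5 GHz counter the frequency exceeds 1000000000, so the resolution is
 * clamped to 1 ns.
 */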
/* ARGSUSED */
int
sys___nanosleep50(struct lwp *l, const struct sys___nanosleep50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct timespec *) rqtp;
syscallarg(struct timespec *) rmtp;
} */
struct timespec rmt, rqt;
int error, error1;
error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec));
if (error)
return error;
error = nanosleep1(l, CLOCK_MONOTONIC, 0, &rqt,
SCARG(uap, rmtp) ? &rmt : NULL);
if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR))
return error;
error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt));
return error1 ? error1 : error;
}
/* ARGSUSED */
int
sys_clock_nanosleep(struct lwp *l, const struct sys_clock_nanosleep_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(int) flags;
syscallarg(struct timespec *) rqtp;
syscallarg(struct timespec *) rmtp;
} */
struct timespec rmt, rqt;
int error, error1;
error = copyin(SCARG(uap, rqtp), &rqt, sizeof(struct timespec));
if (error)
goto out;
error = nanosleep1(l, SCARG(uap, clock_id), SCARG(uap, flags), &rqt,
SCARG(uap, rmtp) ? &rmt : NULL);
if (SCARG(uap, rmtp) == NULL || (error != 0 && error != EINTR))
goto out;
if ((SCARG(uap, flags) & TIMER_ABSTIME) == 0 &&
(error1 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt))) != 0)
error = error1;
out:
*retval = error;
return 0;
}
int
nanosleep1(struct lwp *l, clockid_t clock_id, int flags, struct timespec *rqt,
struct timespec *rmt)
{
struct timespec rmtstart;
int error, timo;
if ((error = ts2timo(clock_id, flags, rqt, &timo, &rmtstart)) != 0) {
if (error == ETIMEDOUT) {
error = 0;
if (rmt != NULL)
rmt->tv_sec = rmt->tv_nsec = 0;
}
return error;
}
/*
* Avoid inadvertently sleeping forever
*/
if (timo == 0)
timo = 1;
again:
error = kpause("nanoslp", true, timo, NULL);
if (error == EWOULDBLOCK)
error = 0;
if (rmt != NULL || error == 0) {
struct timespec rmtend;
struct timespec t0;
struct timespec *t;
int err;
err = clock_gettime1(clock_id, &rmtend);
if (err != 0)
return err;
t = (rmt != NULL) ? rmt : &t0;
if (flags & TIMER_ABSTIME) {
timespecsub(rqt, &rmtend, t);
} else {
if (timespeccmp(&rmtend, &rmtstart, <))
timespecclear(t); /* clock wound back */
else
timespecsub(&rmtend, &rmtstart, t);
if (timespeccmp(rqt, t, <))
timespecclear(t);
else
timespecsub(rqt, t, t);
}
if (t->tv_sec < 0)
timespecclear(t);
if (error == 0) {
timo = tstohz(t);
if (timo > 0)
goto again;
}
}
if (error == ERESTART)
error = EINTR;
return error;
}
int
sys_clock_getcpuclockid2(struct lwp *l,
const struct sys_clock_getcpuclockid2_args *uap,
register_t *retval)
{
/* {
syscallarg(idtype_t) idtype;
syscallarg(id_t) id;
syscallarg(clockid_t *) clock_id;
} */
pid_t pid;
lwpid_t lid;
clockid_t clock_id;
id_t id = SCARG(uap, id);
switch (SCARG(uap, idtype)) {
case P_PID:
pid = id == 0 ? l->l_proc->p_pid : id;
clock_id = CLOCK_PROCESS_CPUTIME_ID | pid;
break;
case P_LWPID:
lid = id == 0 ? l->l_lid : id;
clock_id = CLOCK_THREAD_CPUTIME_ID | lid;
break;
default:
return EINVAL;
}
return copyout(&clock_id, SCARG(uap, clock_id), sizeof(clock_id));
}
/* ARGSUSED */
int
sys___gettimeofday50(struct lwp *l, const struct sys___gettimeofday50_args *uap,
register_t *retval)
{
/* {
syscallarg(struct timeval *) tp;
syscallarg(void *) tzp; really "struct timezone *";
} */
struct timeval atv;
int error = 0;
struct timezone tzfake;
if (SCARG(uap, tp)) {
memset(&atv, 0, sizeof(atv));
microtime(&atv);
error = copyout(&atv, SCARG(uap, tp), sizeof(atv));
if (error)
return error;
}
if (SCARG(uap, tzp)) {
/*
* NetBSD has no kernel notion of time zone, so we just
* fake up a timezone struct and return it if demanded.
*/
tzfake.tz_minuteswest = 0;
tzfake.tz_dsttime = 0;
error = copyout(&tzfake, SCARG(uap, tzp), sizeof(tzfake));
}
return error;
}
/* ARGSUSED */
int
sys___settimeofday50(struct lwp *l, const struct sys___settimeofday50_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct timeval *) tv;
syscallarg(const void *) tzp; really "const struct timezone *";
} */
return settimeofday1(SCARG(uap, tv), true, SCARG(uap, tzp), l, true);
}
int
settimeofday1(const struct timeval *utv, bool userspace,
const void *utzp, struct lwp *l, bool check_kauth)
{
struct timeval atv;
struct timespec ts;
int error;
/* Verify all parameters before changing time. */
/*
* NetBSD has no kernel notion of time zone, and only an
* obsolete program would try to set it, so we log a warning.
*/
if (utzp)
log(LOG_WARNING, "pid %d attempted to set the "
"(obsolete) kernel time zone\n", l->l_proc->p_pid); if (utv == NULL)
return 0;
if (userspace) {
if ((error = copyin(utv, &atv, sizeof(atv))) != 0)
return error;
utv = &atv;
}
if (utv->tv_usec < 0 || utv->tv_usec >= 1000000)
return EINVAL;
TIMEVAL_TO_TIMESPEC(utv, &ts);
return settime1(l->l_proc, &ts, check_kauth);
}
int time_adjusted; /* set if an adjustment is made */
/* ARGSUSED */
int
sys___adjtime50(struct lwp *l, const struct sys___adjtime50_args *uap,
register_t *retval)
{
/* {
syscallarg(const struct timeval *) delta;
syscallarg(struct timeval *) olddelta;
} */
int error;
struct timeval atv, oldatv;
if ((error = kauth_authorize_system(l->l_cred, KAUTH_SYSTEM_TIME,
KAUTH_REQ_SYSTEM_TIME_ADJTIME, NULL, NULL, NULL)) != 0)
return error;
if (SCARG(uap, delta)) {
error = copyin(SCARG(uap, delta), &atv,
sizeof(*SCARG(uap, delta)));
if (error)
return error;
}
adjtime1(SCARG(uap, delta) ? &atv : NULL,
SCARG(uap, olddelta) ? &oldatv : NULL, l->l_proc);
if (SCARG(uap, olddelta))
error = copyout(&oldatv, SCARG(uap, olddelta),
sizeof(*SCARG(uap, olddelta)));
return error;
}
void
adjtime1(const struct timeval *delta, struct timeval *olddelta, struct proc *p)
{
if (olddelta) {
memset(olddelta, 0, sizeof(*olddelta));
mutex_spin_enter(&timecounter_lock);
olddelta->tv_sec = time_adjtime / 1000000;
olddelta->tv_usec = time_adjtime % 1000000;
if (olddelta->tv_usec < 0) {
olddelta->tv_usec += 1000000;
olddelta->tv_sec--;
}
mutex_spin_exit(&timecounter_lock);
}
if (delta) {
mutex_spin_enter(&timecounter_lock);
/*
* XXX This should maybe just report failure to
* userland for nonsense deltas.
*/
if (delta->tv_sec > INT64_MAX/1000000 - 1) {
time_adjtime = INT64_MAX;
} else if (delta->tv_sec < INT64_MIN/1000000 + 1) {
time_adjtime = INT64_MIN;
} else {
time_adjtime = delta->tv_sec * 1000000
+ MAX(-999999, MIN(999999, delta->tv_usec));
}
if (time_adjtime) {
/* We need to save the system time during shutdown */
time_adjusted |= 1;
}
mutex_spin_exit(&timecounter_lock);
}
}
/*
* Interval timer support.
*
* The itimer_*() routines provide generic support for interval timers,
* both real (CLOCK_REALTIME, CLOCK_MONOTONIC), and virtual (CLOCK_VIRTUAL,
* CLOCK_PROF).
*
* Real timers keep their deadline as an absolute time, and are fired
* by a callout. Virtual timers are kept as a linked-list of deltas,
* and are processed by hardclock().
*
* Because the real time timer callout may be delayed in real time due
* to interrupt processing on the system, it is possible for the real
* time timeout routine (itimer_callout()) to run past its deadline.
* It does not suffice, therefore, to reload the real timer .it_value
* from the timer's .it_interval. Rather, we compute the next deadline
* in absolute time based on the current time and the .it_interval value,
* and report any overruns.
*
* Note that while the virtual timers are supported in a generic fashion
* here, they only (currently) make sense as per-process timers, and thus
* only really work for that case.
*/
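/*
 * Illustrative sketch (not part of the build): the late-callout path in
 * itimer_callout() below works in nanoseconds, so the next absolute
 * deadline and the overrun count are both derived from the stale
 * deadline in one step:
 *
 *	now_ns   = timespec2ns(&now);
 *	last_val = timespec2ns(&it->it_time.it_value);
 *	interval = timespec2ns(&it->it_time.it_interval);
 *	next_val = now_ns + (now_ns - last_val + interval - 1) % interval;
 *	it->it_overruns += (now_ns - last_val) / interval;
 *
 * e.g. with a 1s interval and a deadline that is 2.4s in the past, two
 * whole intervals were missed, so two overruns are recorded.
 */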
/*
* itimer_init:
*
* Initialize the common data for an interval timer.
*/
void
itimer_init(struct itimer * const it, const struct itimer_ops * const ops,
clockid_t const id, struct itlist * const itl)
{
KASSERT(itimer_lock_held());
KASSERT(ops != NULL);
timespecclear(&it->it_time.it_value);
it->it_ops = ops;
it->it_clockid = id;
it->it_overruns = 0;
it->it_dying = false;
if (!CLOCK_VIRTUAL_P(id)) {
KASSERT(itl == NULL);
callout_init(&it->it_ch, CALLOUT_MPSAFE);
callout_setfunc(&it->it_ch, itimer_callout, it);
if (id == CLOCK_REALTIME && ops->ito_realtime_changed != NULL) {
LIST_INSERT_HEAD(&itimer_realtime_changed_notify,
it, it_rtchgq);
}
} else {
KASSERT(itl != NULL);
it->it_vlist = itl;
it->it_active = false;
}
}
/*
* itimer_poison:
*
* Poison an interval timer, preventing it from being scheduled
* or processed, in preparation for freeing the timer.
*/
void
itimer_poison(struct itimer * const it)
{
KASSERT(itimer_lock_held());
it->it_dying = true;
/*
* For non-virtual timers, stop the callout, or wait for it to
* run if it has already fired. It cannot restart again after
* this point: the callout won't restart itself when dying, no
* other users holding the lock can restart it, and any other
* users waiting for callout_halt concurrently (itimer_settime)
* will restart from the top.
*/
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
callout_halt(&it->it_ch, &itimer_mutex);
if (it->it_clockid == CLOCK_REALTIME &&
it->it_ops->ito_realtime_changed != NULL) {
LIST_REMOVE(it, it_rtchgq);
}
}
}
/*
* itimer_fini:
*
* Release resources used by an interval timer.
*
* N.B. itimer_lock must be held on entry, and is released on exit.
*/
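/*
 * Illustrative sketch of the expected teardown sequence (mirrors
 * ptimer_free() below):
 *
 *	itimer_lock();
 *	itimer_poison(it);	// stop or drain the callout
 *	itimer_fini(it);	// drops itimer_lock
 *	kmem_free(container, sizeof(*container));	// containing object
 */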
void
itimer_fini(struct itimer * const it)
{
KASSERT(itimer_lock_held());
/* All done with the global state. */
itimer_unlock();
/* Destroy the callout, if needed. */
if (!CLOCK_VIRTUAL_P(it->it_clockid))
callout_destroy(&it->it_ch);
}
/*
* itimer_decr:
*
* Decrement an interval timer by a specified number of nanoseconds,
* which must be less than a second, i.e. < 1000000000. If the timer
* expires, then reload it. In this case, carry over (nsec - old value)
* to reduce the value reloaded into the timer so that the timer does
* not drift. This routine assumes that it is called in a context where
* the timers on which it is operating cannot change in value.
*
* Returns true if the timer has expired.
*/
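/*
 * Worked example (assumed HZ=100, i.e. a 10000000 ns hardclock tick):
 * with it_value = { 0, 400000 } the timer expires on this tick with
 * 9600000 ns of the tick left over; that remainder is subtracted from
 * the reloaded it_interval below, so a repeating timer does not drift.
 */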
static bool
itimer_decr(struct itimer *it, int nsec)
{
struct itimerspec *itp;
int error __diagused;
KASSERT(itimer_lock_held());
KASSERT(CLOCK_VIRTUAL_P(it->it_clockid));
itp = &it->it_time;
if (itp->it_value.tv_nsec < nsec) {
if (itp->it_value.tv_sec == 0) {
/* expired, and already in next interval */
nsec -= itp->it_value.tv_nsec;
goto expire;
}
itp->it_value.tv_nsec += 1000000000;
itp->it_value.tv_sec--;
}
itp->it_value.tv_nsec -= nsec;
nsec = 0;
if (timespecisset(&itp->it_value))
return false;
/* expired, exactly at end of interval */
expire:
if (timespecisset(&itp->it_interval)) {
itp->it_value = itp->it_interval;
itp->it_value.tv_nsec -= nsec;
if (itp->it_value.tv_nsec < 0) {
itp->it_value.tv_nsec += 1000000000;
itp->it_value.tv_sec--;
}
error = itimer_settime(it);
KASSERT(error == 0); /* virtual, never fails */
} else
itp->it_value.tv_nsec = 0; /* sec is already 0 */
return true;
}
/*
* itimer_arm_real:
*
* Arm a non-virtual timer.
*/
static void
itimer_arm_real(struct itimer * const it)
{
KASSERT(!it->it_dying);
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
KASSERT(!callout_pending(&it->it_ch));
/*
* No need to check the tshzto() return value here;
* callout_schedule() does it for us.
*/
callout_schedule(&it->it_ch,
(it->it_clockid == CLOCK_MONOTONIC
? tshztoup(&it->it_time.it_value) : tshzto(&it->it_time.it_value)));
}
/*
* itimer_callout:
*
* Callout to expire a non-virtual timer. Queue it up for processing,
* and then reload, if it is configured to do so.
*
* N.B. A delay in processing this callout causes multiple
* SIGALRM calls to be compressed into one.
*/
static void
itimer_callout(void *arg)
{
uint64_t last_val, next_val, interval, now_ns;
struct timespec now, next;
struct itimer * const it = arg;
int backwards;
itimer_lock();
(*it->it_ops->ito_fire)(it);
if (!timespecisset(&it->it_time.it_interval)) {
timespecclear(&it->it_time.it_value);
itimer_unlock();
return;
}
if (it->it_clockid == CLOCK_MONOTONIC) {
getnanouptime(&now);
} else {
getnanotime(&now);
}
backwards = (timespeccmp(&it->it_time.it_value, &now, >));
/* Nonnegative interval guaranteed by itimerfix. */
KASSERT(it->it_time.it_interval.tv_sec >= 0);
KASSERT(it->it_time.it_interval.tv_nsec >= 0);
/* Handle the easy case of non-overflown timers first. */
if (!backwards &&
timespecaddok(&it->it_time.it_value, &it->it_time.it_interval)) {
timespecadd(&it->it_time.it_value, &it->it_time.it_interval,
&next);
it->it_time.it_value = next;
} else {
now_ns = timespec2ns(&now);
last_val = timespec2ns(&it->it_time.it_value);
interval = timespec2ns(&it->it_time.it_interval);
next_val = now_ns +
(now_ns - last_val + interval - 1) % interval;
if (backwards)
next_val += interval;
else
it->it_overruns += (now_ns - last_val) / interval;
it->it_time.it_value.tv_sec = next_val / 1000000000;
it->it_time.it_value.tv_nsec = next_val % 1000000000;
}
/*
* Reset the callout, if it's not going away.
*/
if (!it->it_dying)
itimer_arm_real(it);
itimer_unlock();
}
/*
* itimer_settime:
*
* Set up the given interval timer. The value in it->it_time.it_value
* is taken to be an absolute time for CLOCK_REALTIME/CLOCK_MONOTONIC
* timers and a relative time for CLOCK_VIRTUAL/CLOCK_PROF timers.
*
* If the callout had already fired but not yet run, fails with
* ERESTART -- caller must restart from the top to look up a timer.
*/
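/*
 * Callers therefore retry under itimer_lock, re-looking the timer up
 * each time; a sketch of the pattern used by dotimer_settime() and
 * dosetitimer() below:
 *
 *	itimer_lock();
 * restart:
 *	it = pts->pts_timers[timerid];
 *	// ...validate it and update it->it_time...
 *	error = itimer_settime(it);
 *	if (error == ERESTART)
 *		goto restart;
 *	itimer_unlock();
 */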
int
itimer_settime(struct itimer *it)
{
struct itimer *itn, *pitn;
struct itlist *itl;
KASSERT(itimer_lock_held());
KASSERT(!it->it_dying);
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
/*
* Try to stop the callout. However, if it had already
* fired, we have to drop the lock to wait for it, so
* the world may have changed and the timer may not be there
* any more. In that case, tell the caller to start
* over from the top.
*/
if (callout_halt(&it->it_ch, &itimer_mutex))
return ERESTART;
KASSERT(!it->it_dying);
/* Now we can touch it and start it up again. */
if (timespecisset(&it->it_time.it_value))
itimer_arm_real(it);
} else {
if (it->it_active) {
itn = LIST_NEXT(it, it_list);
LIST_REMOVE(it, it_list);
for ( ; itn; itn = LIST_NEXT(itn, it_list))
timespecadd(&it->it_time.it_value,
&itn->it_time.it_value,
&itn->it_time.it_value);
}
if (timespecisset(&it->it_time.it_value)) {
itl = it->it_vlist;
for (itn = LIST_FIRST(itl), pitn = NULL;
itn && timespeccmp(&it->it_time.it_value,
&itn->it_time.it_value, >);
pitn = itn, itn = LIST_NEXT(itn, it_list))
timespecsub(&it->it_time.it_value,
&itn->it_time.it_value,
&it->it_time.it_value);
if (pitn)
LIST_INSERT_AFTER(pitn, it, it_list);
else
LIST_INSERT_HEAD(itl, it, it_list);
for ( ; itn ; itn = LIST_NEXT(itn, it_list))
timespecsub(&itn->it_time.it_value,
&it->it_time.it_value,
&itn->it_time.it_value);
it->it_active = true;
} else {
it->it_active = false;
}
}
/* Success! */
return 0;
}
/*
* itimer_gettime:
*
* Return the remaining time of an interval timer.
*/
void
itimer_gettime(const struct itimer *it, struct itimerspec *aits)
{
struct timespec now;
struct itimer *itn;
KASSERT(itimer_lock_held());
KASSERT(!it->it_dying);
*aits = it->it_time;
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
/*
* Convert from absolute to relative time in .it_value
* part of real time timer. If time for real time
* timer has passed return 0, else return difference
* between current time and time for the timer to go
* off.
*/
if (timespecisset(&aits->it_value)) {
if (it->it_clockid == CLOCK_REALTIME) {
getnanotime(&now);
} else { /* CLOCK_MONOTONIC */
getnanouptime(&now);
}
if (timespeccmp(&aits->it_value, &now, <))
timespecclear(&aits->it_value);
else
timespecsub(&aits->it_value, &now,
&aits->it_value);
}
} else if (it->it_active) {
for (itn = LIST_FIRST(it->it_vlist); itn && itn != it;
itn = LIST_NEXT(itn, it_list))
timespecadd(&aits->it_value,
&itn->it_time.it_value, &aits->it_value);
KASSERT(itn != NULL); /* it should be findable on the list */
} else
timespecclear(&aits->it_value);
}
/*
* Per-process timer support.
*
* Both the BSD getitimer() family and the POSIX timer_*() family of
* routines are supported.
*
* All timers are kept in an array pointed to by p_timers, which is
* allocated on demand - many processes don't use timers at all. The
* first four elements in this array are reserved for the BSD timers:
* element 0 is ITIMER_REAL, element 1 is ITIMER_VIRTUAL, element
* 2 is ITIMER_PROF, and element 3 is ITIMER_MONOTONIC. The rest may be
* allocated by the timer_create() syscall.
*
* These timers are a "sub-class" of interval timer.
*/
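/*
 * Layout sketch of p->p_timers->pts_timers[]:
 *
 *	[0] ITIMER_REAL			setitimer(2)/getitimer(2)
 *	[1] ITIMER_VIRTUAL		setitimer(2)/getitimer(2)
 *	[2] ITIMER_PROF			setitimer(2)/getitimer(2)
 *	[3] ITIMER_MONOTONIC		setitimer(2)/getitimer(2)
 *	[TIMER_MIN..TIMER_MAX-1]	slots handed out by timer_create(2)
 */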
/*
* ptimer_free:
*
* Free the per-process timer at the specified index.
*/
static void
ptimer_free(struct ptimers *pts, int index)
{
struct itimer *it;
struct ptimer *pt;
KASSERT(itimer_lock_held());
it = pts->pts_timers[index];
pt = container_of(it, struct ptimer, pt_itimer);
pts->pts_timers[index] = NULL;
itimer_poison(it);
/*
* Remove it from the queue to be signalled. Must be done
* after itimer is poisoned, because we may have had to wait
* for the callout to complete.
*/
if (pt->pt_queued) {
TAILQ_REMOVE(&ptimer_queue, pt, pt_chain);
pt->pt_queued = false;
}
itimer_fini(it); /* releases itimer_lock */
kmem_free(pt, sizeof(*pt));
}
/*
* ptimers_alloc:
*
* Allocate a ptimers for the specified process.
*/
static struct ptimers *
ptimers_alloc(struct proc *p)
{
struct ptimers *pts;
int i;
pts = kmem_alloc(sizeof(*pts), KM_SLEEP);
LIST_INIT(&pts->pts_virtual);
LIST_INIT(&pts->pts_prof);
for (i = 0; i < TIMER_MAX; i++)
pts->pts_timers[i] = NULL;
itimer_lock();
if (p->p_timers == NULL) {
p->p_timers = pts;
itimer_unlock();
return pts;
}
itimer_unlock();
kmem_free(pts, sizeof(*pts));
return p->p_timers;
}
/*
* ptimers_free:
*
* Clean up the per-process timers. If "which" is set to TIMERS_ALL,
* then clean up all timers and free all the data structures. If
* "which" is set to TIMERS_POSIX, only clean up the timers allocated
* by timer_create(), not the BSD setitimer() timers, and only free the
* structure if none of those remain.
*
* This function is exported because it is needed in the exec and
* exit code paths.
*/
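/*
 * Usage sketch: the exec path is expected to pass TIMERS_POSIX, since
 * timer_create() timers do not survive execve(), while the exit path
 * passes TIMERS_ALL to tear down every timer and free the ptimers
 * structure; see the exec and exit code for the actual calls.
 */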
void
ptimers_free(struct proc *p, int which)
{
struct ptimers *pts;
struct itimer *itn;
struct timespec ts;
int i;
if (p->p_timers == NULL)
return;
pts = p->p_timers;
itimer_lock();
if (which == TIMERS_ALL) {
p->p_timers = NULL;
i = 0;
} else {
timespecclear(&ts);
for (itn = LIST_FIRST(&pts->pts_virtual);
itn && itn != pts->pts_timers[ITIMER_VIRTUAL];
itn = LIST_NEXT(itn, it_list)) {
KASSERT(itn->it_clockid == CLOCK_VIRTUAL);
timespecadd(&ts, &itn->it_time.it_value, &ts);
}
LIST_FIRST(&pts->pts_virtual) = NULL;
if (itn) {
KASSERT(itn->it_clockid == CLOCK_VIRTUAL);
timespecadd(&ts, &itn->it_time.it_value,
&itn->it_time.it_value);
LIST_INSERT_HEAD(&pts->pts_virtual, itn, it_list);
}
timespecclear(&ts);
for (itn = LIST_FIRST(&pts->pts_prof);
itn && itn != pts->pts_timers[ITIMER_PROF];
itn = LIST_NEXT(itn, it_list)) {
KASSERT(itn->it_clockid == CLOCK_PROF);
timespecadd(&ts, &itn->it_time.it_value, &ts);
}
LIST_FIRST(&pts->pts_prof) = NULL;
if (itn) {
KASSERT(itn->it_clockid == CLOCK_PROF);
timespecadd(&ts, &itn->it_time.it_value,
&itn->it_time.it_value);
LIST_INSERT_HEAD(&pts->pts_prof, itn, it_list);
}
i = TIMER_MIN;
}
for ( ; i < TIMER_MAX; i++) {
if (pts->pts_timers[i] != NULL) {
/* Free the timer and release the lock. */
ptimer_free(pts, i);
/* Reacquire the lock for the next one. */
itimer_lock();
}
}
if (pts->pts_timers[0] == NULL && pts->pts_timers[1] == NULL &&
pts->pts_timers[2] == NULL && pts->pts_timers[3] == NULL) {
p->p_timers = NULL;
itimer_unlock();
kmem_free(pts, sizeof(*pts));
} else
itimer_unlock();
}
/*
* ptimer_fire:
*
* Fire a per-process timer.
*/
static void
ptimer_fire(struct itimer *it)
{
struct ptimer *pt = container_of(it, struct ptimer, pt_itimer);
KASSERT(itimer_lock_held());
/*
* XXX Can overrun, but we don't do signal queueing yet, anyway.
* XXX Relying on the clock interrupt is stupid.
*/
if (pt->pt_ev.sigev_notify != SIGEV_SIGNAL) {
return;
}
if (!pt->pt_queued) {
TAILQ_INSERT_TAIL(&ptimer_queue, pt, pt_chain);
pt->pt_queued = true;
softint_schedule(ptimer_sih);
}
}
/*
* Operations vector for per-process timers (BSD and POSIX).
*/
static const struct itimer_ops ptimer_itimer_ops = {
.ito_fire = ptimer_fire,
};
/*
* sys_timer_create:
*
* System call to create a POSIX timer.
*/
int
sys_timer_create(struct lwp *l, const struct sys_timer_create_args *uap,
register_t *retval)
{
/* {
syscallarg(clockid_t) clock_id;
syscallarg(struct sigevent *) evp;
syscallarg(timer_t *) timerid;
} */
return timer_create1(SCARG(uap, timerid), SCARG(uap, clock_id),
SCARG(uap, evp), copyin, l);
}
int
timer_create1(timer_t *tid, clockid_t id, struct sigevent *evp,
copyin_t fetch_event, struct lwp *l)
{
int error;
timer_t timerid;
struct itlist *itl;
struct ptimers *pts;
struct ptimer *pt;
struct proc *p;
p = l->l_proc;
if ((u_int)id > CLOCK_MONOTONIC)
return EINVAL;
if ((pts = p->p_timers) == NULL)
pts = ptimers_alloc(p);
pt = kmem_zalloc(sizeof(*pt), KM_SLEEP);
if (evp != NULL) {
if (((error =
(*fetch_event)(evp, &pt->pt_ev, sizeof(pt->pt_ev))) != 0) ||
((pt->pt_ev.sigev_notify < SIGEV_NONE) ||
(pt->pt_ev.sigev_notify > SIGEV_SA)) ||
(pt->pt_ev.sigev_notify == SIGEV_SIGNAL &&
(pt->pt_ev.sigev_signo <= 0 ||
pt->pt_ev.sigev_signo >= NSIG))) {
kmem_free(pt, sizeof(*pt));
return (error ? error : EINVAL);
}
}
/* Find a free timer slot, skipping those reserved for setitimer(). */
itimer_lock();
for (timerid = TIMER_MIN; timerid < TIMER_MAX; timerid++)
if (pts->pts_timers[timerid] == NULL)
break;
if (timerid == TIMER_MAX) {
itimer_unlock();
kmem_free(pt, sizeof(*pt));
return EAGAIN;
}
if (evp == NULL) {
pt->pt_ev.sigev_notify = SIGEV_SIGNAL;
switch (id) {
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
pt->pt_ev.sigev_signo = SIGALRM;
break;
case CLOCK_VIRTUAL:
pt->pt_ev.sigev_signo = SIGVTALRM;
break;
case CLOCK_PROF:
pt->pt_ev.sigev_signo = SIGPROF;
break;
}
pt->pt_ev.sigev_value.sival_int = timerid;
}
switch (id) {
case CLOCK_VIRTUAL:
itl = &pts->pts_virtual;
break;
case CLOCK_PROF:
itl = &pts->pts_prof;
break;
default:
itl = NULL;
}
itimer_init(&pt->pt_itimer, &ptimer_itimer_ops, id, itl);
pt->pt_proc = p;
pt->pt_poverruns = 0;
pt->pt_entry = timerid;
pt->pt_queued = false;
pts->pts_timers[timerid] = &pt->pt_itimer;
itimer_unlock();
return copyout(&timerid, tid, sizeof(timerid));
}
/*
* sys_timer_delete:
*
* System call to delete a POSIX timer.
*/
int
sys_timer_delete(struct lwp *l, const struct sys_timer_delete_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
} */
struct proc *p = l->l_proc;
timer_t timerid;
struct ptimers *pts;
struct itimer *it, *itn;
timerid = SCARG(uap, timerid);
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
if (CLOCK_VIRTUAL_P(it->it_clockid)) {
if (it->it_active) {
itn = LIST_NEXT(it, it_list);
LIST_REMOVE(it, it_list);
for ( ; itn; itn = LIST_NEXT(itn, it_list))
timespecadd(&it->it_time.it_value,
&itn->it_time.it_value,
&itn->it_time.it_value);
it->it_active = false;
}
}
/* Free the timer and release the lock. */
ptimer_free(pts, timerid);
return 0;
}
/*
* sys___timer_settime50:
*
* System call to set/arm a POSIX timer.
*/
int
sys___timer_settime50(struct lwp *l,
const struct sys___timer_settime50_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
syscallarg(int) flags;
syscallarg(const struct itimerspec *) value;
syscallarg(struct itimerspec *) ovalue;
} */
int error;
struct itimerspec value, ovalue, *ovp = NULL;
if ((error = copyin(SCARG(uap, value), &value,
sizeof(struct itimerspec))) != 0)
return error;
if (SCARG(uap, ovalue))
ovp = &ovalue;
if ((error = dotimer_settime(SCARG(uap, timerid), &value, ovp,
SCARG(uap, flags), l->l_proc)) != 0)
return error;
if (ovp)
return copyout(&ovalue, SCARG(uap, ovalue),
sizeof(struct itimerspec));
return 0;
}
int
dotimer_settime(int timerid, struct itimerspec *value,
struct itimerspec *ovalue, int flags, struct proc *p)
{
struct timespec now;
struct itimerspec val, oval;
struct ptimers *pts;
struct itimer *it;
int error;
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
val = *value;
if ((error = itimespecfix(&val.it_value)) != 0 ||
(error = itimespecfix(&val.it_interval)) != 0)
return error;
itimer_lock();
restart:
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
oval = it->it_time;
it->it_time = val;
/*
* If we've been passed a relative time for a realtime timer,
* convert it to absolute; if an absolute time for a virtual
* timer, convert it to relative and make sure we don't set it
* to zero, which would cancel the timer, or let it go
* negative, which would confuse the comparison tests.
*/
if (timespecisset(&it->it_time.it_value)) {
if (!CLOCK_VIRTUAL_P(it->it_clockid)) {
if ((flags & TIMER_ABSTIME) == 0) {
if (it->it_clockid == CLOCK_REALTIME) {
getnanotime(&now);
} else { /* CLOCK_MONOTONIC */
getnanouptime(&now);
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
}
} else {
if ((flags & TIMER_ABSTIME) != 0) {
getnanotime(&now);
timespecsub(&it->it_time.it_value, &now,
&it->it_time.it_value);
if (!timespecisset(&it->it_time.it_value) ||
it->it_time.it_value.tv_sec < 0) {
it->it_time.it_value.tv_sec = 0;
it->it_time.it_value.tv_nsec = 1;
}
}
}
}
error = itimer_settime(it);
if (error == ERESTART) {
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
goto restart;
}
KASSERT(error == 0);
itimer_unlock();
if (ovalue)
*ovalue = oval;
return 0;
}
/*
* sys___timer_gettime50:
*
* System call to return the time remaining until a POSIX timer fires.
*/
int
sys___timer_gettime50(struct lwp *l,
const struct sys___timer_gettime50_args *uap, register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
syscallarg(struct itimerspec *) value;
} */
struct itimerspec its;
int error;
if ((error = dotimer_gettime(SCARG(uap, timerid), l->l_proc,
&its)) != 0)
return error;
return copyout(&its, SCARG(uap, value), sizeof(its));
}
int
dotimer_gettime(int timerid, struct proc *p, struct itimerspec *its)
{
struct itimer *it;
struct ptimers *pts;
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
itimer_gettime(it, its);
itimer_unlock();
return 0;
}
/*
* sys_timer_getoverrun:
*
* System call to return the number of times a POSIX timer has
* expired while a notification was already pending. The counter
* is reset when a timer expires and a notification can be posted.
*/
int
sys_timer_getoverrun(struct lwp *l, const struct sys_timer_getoverrun_args *uap,
register_t *retval)
{
/* {
syscallarg(timer_t) timerid;
} */
struct proc *p = l->l_proc;
struct ptimers *pts;
int timerid;
struct itimer *it;
struct ptimer *pt;
timerid = SCARG(uap, timerid);
pts = p->p_timers;
if (pts == NULL || timerid < 2 || timerid >= TIMER_MAX)
return EINVAL;
itimer_lock();
if ((it = pts->pts_timers[timerid]) == NULL) {
itimer_unlock();
return EINVAL;
}
pt = container_of(it, struct ptimer, pt_itimer);
*retval = pt->pt_poverruns;
if (*retval >= DELAYTIMER_MAX)
*retval = DELAYTIMER_MAX;
itimer_unlock();
return 0;
}
/*
* sys___getitimer50:
*
* System call to get the time remaining before a BSD timer fires.
*/
int
sys___getitimer50(struct lwp *l, const struct sys___getitimer50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */
struct proc *p = l->l_proc;
struct itimerval aitv;
int error;
memset(&aitv, 0, sizeof(aitv));
error = dogetitimer(p, SCARG(uap, which), &aitv);
if (error)
return error;
return copyout(&aitv, SCARG(uap, itv), sizeof(struct itimerval));
}
int
dogetitimer(struct proc *p, int which, struct itimerval *itvp)
{
struct ptimers *pts;
struct itimer *it;
struct itimerspec its;
if ((u_int)which > ITIMER_MONOTONIC)
return EINVAL;
itimer_lock();
pts = p->p_timers;
if (pts == NULL || (it = pts->pts_timers[which]) == NULL) {
timerclear(&itvp->it_value);
timerclear(&itvp->it_interval);
} else {
itimer_gettime(it, &its);
TIMESPEC_TO_TIMEVAL(&itvp->it_value, &its.it_value);
TIMESPEC_TO_TIMEVAL(&itvp->it_interval, &its.it_interval);
}
itimer_unlock();
return 0;
}
/*
* sys___setitimer50:
*
* System call to set/arm a BSD timer.
*/
int
sys___setitimer50(struct lwp *l, const struct sys___setitimer50_args *uap,
register_t *retval)
{
/* {
syscallarg(int) which;
syscallarg(const struct itimerval *) itv;
syscallarg(struct itimerval *) oitv;
} */
struct proc *p = l->l_proc;
int which = SCARG(uap, which);
struct sys___getitimer50_args getargs;
const struct itimerval *itvp;
struct itimerval aitv;
int error;
itvp = SCARG(uap, itv);
if (itvp &&
(error = copyin(itvp, &aitv, sizeof(struct itimerval))) != 0)
return error;
if (SCARG(uap, oitv) != NULL) {
SCARG(&getargs, which) = which;
SCARG(&getargs, itv) = SCARG(uap, oitv);
if ((error = sys___getitimer50(l, &getargs, retval)) != 0)
return error;
}
if (itvp == 0)
return 0;
return dosetitimer(p, which, &aitv);
}
int
dosetitimer(struct proc *p, int which, struct itimerval *itvp)
{
struct timespec now;
struct ptimers *pts;
struct ptimer *spare;
struct itimer *it;
struct itlist *itl;
int error;
if ((u_int)which > ITIMER_MONOTONIC)
return EINVAL;
if (itimerfix(&itvp->it_value) || itimerfix(&itvp->it_interval))
return EINVAL;
/*
* Don't bother allocating data structures if the process just
* wants to clear the timer.
*/
spare = NULL;
pts = p->p_timers;
retry:
if (!timerisset(&itvp->it_value) && (pts == NULL ||
pts->pts_timers[which] == NULL))
return 0;
if (pts == NULL)
pts = ptimers_alloc(p);
itimer_lock();
restart:
it = pts->pts_timers[which];
if (it == NULL) {
struct ptimer *pt;
if (spare == NULL) {
itimer_unlock();
spare = kmem_zalloc(sizeof(*spare), KM_SLEEP);
goto retry;
}
pt = spare;
spare = NULL;
it = &pt->pt_itimer;
pt->pt_ev.sigev_notify = SIGEV_SIGNAL;
pt->pt_ev.sigev_value.sival_int = which;
switch (which) {
case ITIMER_REAL:
case ITIMER_MONOTONIC:
itl = NULL;
pt->pt_ev.sigev_signo = SIGALRM;
break;
case ITIMER_VIRTUAL:
itl = &pts->pts_virtual;
pt->pt_ev.sigev_signo = SIGVTALRM;
break;
case ITIMER_PROF:
itl = &pts->pts_prof;
pt->pt_ev.sigev_signo = SIGPROF;
break;
default:
panic("%s: can't happen %d", __func__, which);
}
itimer_init(it, &ptimer_itimer_ops, which, itl);
pt->pt_proc = p;
pt->pt_entry = which;
pts->pts_timers[which] = it;
}
TIMEVAL_TO_TIMESPEC(&itvp->it_value, &it->it_time.it_value);
TIMEVAL_TO_TIMESPEC(&itvp->it_interval, &it->it_time.it_interval);
error = 0;
if (timespecisset(&it->it_time.it_value)) {
/* Convert to absolute time */
/* XXX need to wrap in splclock for timecounters case? */
switch (which) {
case ITIMER_REAL:
getnanotime(&now);
if (!timespecaddok(&it->it_time.it_value, &now)) {
error = EINVAL;
goto out;
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
break;
case ITIMER_MONOTONIC:
getnanouptime(&now);
if (!timespecaddok(&it->it_time.it_value, &now)) {
error = EINVAL;
goto out;
}
timespecadd(&it->it_time.it_value, &now,
&it->it_time.it_value);
break;
default:
break;
}
}
error = itimer_settime(it);
if (error == ERESTART) {
KASSERT(!CLOCK_VIRTUAL_P(it->it_clockid));
goto restart;
}
KASSERT(error == 0);
out:
itimer_unlock();
if (spare != NULL)
kmem_free(spare, sizeof(*spare));
return error;
}
/*
* ptimer_tick:
*
* Called from hardclock() to decrement per-process virtual timers.
*/
void
ptimer_tick(lwp_t *l, bool user)
{
struct ptimers *pts;
struct itimer *it;
proc_t *p;
p = l->l_proc;
if (p->p_timers == NULL)
return;
itimer_lock();
if ((pts = l->l_proc->p_timers) != NULL) {
/*
* Run current process's virtual and profile time, as needed.
*/
if (user && (it = LIST_FIRST(&pts->pts_virtual)) != NULL)
if (itimer_decr(it, tick * 1000))
(*it->it_ops->ito_fire)(it);
if ((it = LIST_FIRST(&pts->pts_prof)) != NULL)
if (itimer_decr(it, tick * 1000))
(*it->it_ops->ito_fire)(it);
}
itimer_unlock();
}
/*
* ptimer_intr:
*
* Software interrupt handler for processing per-process
* timer expiration.
*/
static void
ptimer_intr(void *cookie)
{
ksiginfo_t ksi;
struct itimer *it;
struct ptimer *pt;
proc_t *p;
mutex_enter(&proc_lock);
itimer_lock();
while ((pt = TAILQ_FIRST(&ptimer_queue)) != NULL) {
it = &pt->pt_itimer;
TAILQ_REMOVE(&ptimer_queue, pt, pt_chain);
KASSERT(pt->pt_queued);
pt->pt_queued = false;
p = pt->pt_proc;
if (p->p_timers == NULL) {
/* Process is dying. */
continue;
}
if (pt->pt_ev.sigev_notify != SIGEV_SIGNAL) {
continue;
}
if (sigismember(&p->p_sigpend.sp_set, pt->pt_ev.sigev_signo)) {
it->it_overruns++;
continue;
}
KSI_INIT(&ksi);
ksi.ksi_signo = pt->pt_ev.sigev_signo;
ksi.ksi_code = SI_TIMER;
ksi.ksi_value = pt->pt_ev.sigev_value;
pt->pt_poverruns = it->it_overruns;
it->it_overruns = 0;
itimer_unlock();
kpsignal(p, &ksi, NULL);
itimer_lock();
}
itimer_unlock();
mutex_exit(&proc_lock);
}
/* $NetBSD: msgbuf.h,v 1.18 2022/10/26 23:28:43 riastradh Exp $ */
/*
* Copyright (c) 1981, 1984, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)msgbuf.h 8.1 (Berkeley) 6/2/93
*/
#ifndef _SYS_MSGBUF_H_
#define _SYS_MSGBUF_H_
struct kern_msgbuf {
#define MSG_MAGIC 0x063061
long msg_magic;
long msg_bufx; /* write pointer */
long msg_bufr; /* read pointer */
long msg_bufs; /* real msg_bufc size (bytes) */
char msg_bufc[1]; /* buffer */
};
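/*
 * Illustrative sketch (assumed helper, not the in-tree implementation):
 * msg_bufc is treated as a ring of msg_bufs bytes, with writers
 * advancing msg_bufx and readers consuming from msg_bufr:
 *
 *	static __inline void
 *	msgbuf_put_sketch(struct kern_msgbuf *mbp, char c)
 *	{
 *		mbp->msg_bufc[mbp->msg_bufx++] = c;
 *		if (mbp->msg_bufx >= mbp->msg_bufs)
 *			mbp->msg_bufx = 0;		// wrap the write pointer
 *		if (mbp->msg_bufx == mbp->msg_bufr) {	// buffer full:
 *			mbp->msg_bufr++;		// drop the oldest byte
 *			if (mbp->msg_bufr >= mbp->msg_bufs)
 *				mbp->msg_bufr = 0;
 *		}
 *	}
 */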
#ifdef _KERNEL
extern int msgbufmapped; /* is the message buffer mapped */
extern int msgbufenabled; /* is logging to the buffer enabled */
extern struct kern_msgbuf *msgbufp; /* the mapped buffer, itself. */
extern int log_open; /* is /dev/klog open? */
void initmsgbuf(void *, size_t);
void loginit(void);
void logputchar(int);
static __inline int
logenabled(const struct kern_msgbuf *mbp)
{
return msgbufenabled && mbp->msg_magic == MSG_MAGIC;
}
#endif
#endif /* !_SYS_MSGBUF_H_ */
/* $NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $ */
/* $KAME: in6_pcb.c,v 1.84 2001/02/08 18:02:08 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)in_pcb.c 8.2 (Berkeley) 1/4/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in6_pcb.c,v 1.177 2022/11/04 09:04:27 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/proc.h>
#include <sys/kauth.h>
#include <sys/domain.h>
#include <sys/once.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet/portalgo.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/scope6_var.h>
#include "faith.h"
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec6.h>
#include <netipsec/key.h>
#endif /* IPSEC */
#include <netinet/tcp_vtw.h>
const struct in6_addr zeroin6_addr;
#define IN6PCBHASH_PORT(table, lport) \
&(table)->inpt_porthashtbl[ntohs(lport) & (table)->inpt_porthash]
#define IN6PCBHASH_BIND(table, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
(((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \
(laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + ntohs(lport)) & \
(table)->inpt_bindhash]
#define IN6PCBHASH_CONNECT(table, faddr, fport, laddr, lport) \
&(table)->inpt_bindhashtbl[ \
((((faddr)->s6_addr32[0] ^ (faddr)->s6_addr32[1] ^ \
(faddr)->s6_addr32[2] ^ (faddr)->s6_addr32[3]) + ntohs(fport)) + \
(((laddr)->s6_addr32[0] ^ (laddr)->s6_addr32[1] ^ \
(laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) + \
ntohs(lport))) & (table)->inpt_bindhash]
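/*
 * Usage sketch: the macros above select a hash chain head out of the pcb
 * table, e.g. to walk every pcb bound to local port `lport' (the same
 * pattern as in6pcb_lookup_local() below):
 *
 *	struct inpcbhead *head = IN6PCBHASH_PORT(table, lport);
 *	struct inpcb *inp;
 *
 *	LIST_FOREACH(inp, head, inp_lhash) {
 *		if (inp->inp_af == AF_INET6 && inp->inp_lport == lport)
 *			;	// candidate match
 *	}
 */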
int ip6_anonportmin = IPV6PORT_ANONMIN;
int ip6_anonportmax = IPV6PORT_ANONMAX;
int ip6_lowportmin = IPV6PORT_RESERVEDMIN;
int ip6_lowportmax = IPV6PORT_RESERVEDMAX;
void
in6pcb_init(struct inpcbtable *table, int bindhashsize, int connecthashsize)
{
inpcb_init(table, bindhashsize, connecthashsize);
table->inpt_lastport = (in_port_t)ip6_anonportmax;
}
/*
* Bind address from sin6 to inp.
*/
static int
in6pcb_bind_addr(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l)
{
int error;
int s;
/*
* We should check the family, but old programs
* incorrectly fail to initialize it.
*/
if (sin6->sin6_family != AF_INET6)
return EAFNOSUPPORT;
#ifndef INET
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr))
return EADDRNOTAVAIL;
#endif
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
return error;
s = pserialize_read_enter();
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
error = EINVAL;
goto out;
}
if (sin6->sin6_addr.s6_addr32[3]) {
struct sockaddr_in sin;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
bcopy(&sin6->sin6_addr.s6_addr32[3],
&sin.sin_addr, sizeof(sin.sin_addr));
if (!IN_MULTICAST(sin.sin_addr.s_addr)) {
struct ifaddr *ifa;
ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
if (ifa == NULL &&
(inp->inp_flags & IN6P_BINDANY) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
}
}
} else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
// succeed
} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
struct ifaddr *ifa = NULL;
if ((inp->inp_flags & IN6P_FAITH) == 0) {
ifa = ifa_ifwithaddr(sin6tosa(sin6));
if (ifa == NULL &&
(inp->inp_flags & IN6P_BINDANY) == 0) {
error = EADDRNOTAVAIL;
goto out;
}
}
/*
* Binding to an anycast address might accidentally
* cause a packet to be sent with an anycast source
* address, so we forbid it.
*
* We should allow binding to a deprecated address,
* since the application dares to use it.
* But, can we assume that they are careful enough
* to check if the address is deprecated or not?
* Maybe, as a safeguard, we should have a setsockopt
* flag to control the bind(2) behavior against
* deprecated addresses (default: forbid bind(2)).
*/
if (ifa &&
ifatoia6(ifa)->ia6_flags &
(IN6_IFF_ANYCAST | IN6_IFF_DUPLICATED)) {
error = EADDRNOTAVAIL;
goto out;
}
}
in6p_laddr(inp) = sin6->sin6_addr;
error = 0;
out:
pserialize_read_exit(s);
return error;
}
/*
* Bind port from sin6 to inp.
*/
static int
in6pcb_bind_port(struct inpcb *inp, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcbtable *table = inp->inp_table;
struct socket *so = inp->inp_socket;
int wild = 0, reuseport = (so->so_options & SO_REUSEPORT);
int error;
if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0 &&
((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0 ||
(so->so_options & SO_ACCEPTCONN) == 0))
wild = 1;
if (sin6->sin6_port != 0) {
enum kauth_network_req req;
#ifndef IPNOPRIVPORTS
if (ntohs(sin6->sin6_port) < IPV6PORT_RESERVED)
req = KAUTH_REQ_NETWORK_BIND_PRIVPORT;
else
#endif /* IPNOPRIVPORTS */
req = KAUTH_REQ_NETWORK_BIND_PORT;
error = kauth_authorize_network(l->l_cred, KAUTH_NETWORK_BIND,
req, so, sin6, NULL);
if (error)
return EACCES;
}
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) {
/*
* Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
* allow complete duplication of binding if
* SO_REUSEPORT is set, or if SO_REUSEADDR is set
* and a multicast address is bound on both
* new and duplicated sockets.
*/
if (so->so_options & (SO_REUSEADDR | SO_REUSEPORT))
reuseport = SO_REUSEADDR|SO_REUSEPORT;
}
if (sin6->sin6_port != 0) {
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
#ifdef INET
struct inpcb *t;
struct vestigial_inpcb vestige;
t = inpcb_lookup_local(table,
*(struct in_addr *)&sin6->sin6_addr.s6_addr32[3],
sin6->sin6_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
#else
return EADDRNOTAVAIL;
#endif
}
{
struct inpcb *t;
struct vestigial_inpcb vestige;
t = in6pcb_lookup_local(table, &sin6->sin6_addr,
sin6->sin6_port, wild, &vestige);
if (t && (reuseport & t->inp_socket->so_options) == 0)
return EADDRINUSE;
if (!t
&& vestige.valid
&& !(reuseport && vestige.reuse_port))
return EADDRINUSE;
}
}
if (sin6->sin6_port == 0) {
int e;
e = in6pcb_set_port(sin6, inp, l);
if (e != 0)
return e;
} else {
inp->inp_lport = sin6->sin6_port;
inpcb_set_state(inp, INP_BOUND);
}
LIST_REMOVE(inp, inp_lhash);
LIST_INSERT_HEAD(IN6PCBHASH_PORT(table, inp->inp_lport),
inp, inp_lhash);
return 0;
}
int
in6pcb_bind(void *v, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcb *inp = v;
struct sockaddr_in6 lsin6;
int error;
if (inp->inp_af != AF_INET6)
return EINVAL;
/*
* If we already have a local port or a local address, it means we're
* already bound.
*/
if (inp->inp_lport || !(IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ||
(IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
in6p_laddr(inp).s6_addr32[3] == 0)))
return EINVAL;
if (NULL != sin6) {
/* We were provided a sockaddr_in6 to use. */
if (sin6->sin6_len != sizeof(*sin6))
return EINVAL;
} else {
/* We always bind to *something*, even if it's "anything". */
lsin6 = *((const struct sockaddr_in6 *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
sin6 = &lsin6;
}
/* Bind address. */
error = in6pcb_bind_addr(inp, sin6, l);
if (error)
return error;
/* Bind port. */
error = in6pcb_bind_port(inp, sin6, l);
if (error) {
/*
* Reset the address here to "any" so we don't "leak" the
* inpcb.
*/
in6p_laddr(inp) = in6addr_any;
return error;
}
#if 0
in6p_flowinfo(inp) = 0; /* XXX */
#endif
return 0;
}
/*
* Connect from a socket to a specified address.
* Both address and port must be specified in argument sin6.
* If we don't have a local address for this socket yet,
* then pick one.
*/
int
in6pcb_connect(void *v, struct sockaddr_in6 *sin6, struct lwp *l)
{
struct inpcb *inp = v;
struct in6_addr *in6a = NULL;
struct in6_addr ia6;
struct ifnet *ifp = NULL; /* outgoing interface */
int error = 0;
int scope_ambiguous = 0;
#ifdef INET
struct in6_addr mapped;
#endif
struct sockaddr_in6 tmp;
struct vestigial_inpcb vestige;
struct psref psref;
int bound;
(void)&in6a; /* XXX fool gcc */
if (inp->inp_af != AF_INET6)
return EINVAL;
if (sin6->sin6_len != sizeof(*sin6))
return EINVAL;
if (sin6->sin6_family != AF_INET6)
return EAFNOSUPPORT;
if (sin6->sin6_port == 0)
return EADDRNOTAVAIL;
if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) &&
inp->inp_socket->so_type == SOCK_STREAM)
return EADDRNOTAVAIL;
if (sin6->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
return error;
/* sanity check for mapped address case */
if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
return EINVAL;
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)))
in6p_laddr(inp).s6_addr16[5] = htons(0xffff);
if (!IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)))
return EINVAL;
} else
{
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)))
return EINVAL;
}
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
bound = curlwp_bind();
/* Source address selection. */
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
in6p_laddr(inp).s6_addr32[3] == 0) {
#ifdef INET
struct sockaddr_in sin;
struct in_ifaddr *ia4;
struct psref _psref;
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
memcpy(&sin.sin_addr, &sin6->sin6_addr.s6_addr32[3],
sizeof(sin.sin_addr));
ia4 = in_selectsrc(&sin, &inp->inp_route,
inp->inp_socket->so_options, NULL, &error, &_psref);
if (ia4 == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
curlwp_bindx(bound);
return error;
}
memset(&mapped, 0, sizeof(mapped));
mapped.s6_addr16[5] = htons(0xffff);
memcpy(&mapped.s6_addr32[3], &IA_SIN(ia4)->sin_addr,
sizeof(IA_SIN(ia4)->sin_addr));
ia4_release(ia4, &_psref);
in6a = &mapped;
#else
curlwp_bindx(bound);
return EADDRNOTAVAIL;
#endif
} else {
/*
* XXX: in6_selectsrc might replace the bound local address
* with the address specified by setsockopt(IPV6_PKTINFO).
* Is it the intended behavior?
*/
error = in6_selectsrc(sin6, in6p_outputopts(inp),
in6p_moptions(inp), &inp->inp_route, &in6p_laddr(inp),
&ifp, &psref, &ia6);
if (error == 0)
in6a = &ia6;
if (ifp && scope_ambiguous &&
(error = in6_setscope(&sin6->sin6_addr, ifp, NULL)) != 0) {
if_put(ifp, &psref);
curlwp_bindx(bound);
return error;
}
if (in6a == NULL) {
if_put(ifp, &psref);
curlwp_bindx(bound);
if (error == 0)
error = EADDRNOTAVAIL;
return error;
}
}
if (ifp != NULL) {
in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim(inp, ifp);
if_put(ifp, &psref);
} else
in6p_ip6(inp).ip6_hlim = (u_int8_t)in6pcb_selecthlim_rt(inp);
curlwp_bindx(bound);
if (in6pcb_lookup(inp->inp_table, &sin6->sin6_addr,
sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ? in6a : &in6p_laddr(inp),
inp->inp_lport, 0, &vestige)
|| vestige.valid)
return EADDRINUSE;
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) ||
(IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp)) &&
in6p_laddr(inp).s6_addr32[3] == 0))
{
if (inp->inp_lport == 0) {
error = in6pcb_bind(inp, NULL, l);
if (error != 0)
return error;
}
in6p_laddr(inp) = *in6a;
}
in6p_faddr(inp) = sin6->sin6_addr;
inp->inp_fport = sin6->sin6_port;
/* Late bind, if needed */
if (inp->inp_bindportonsend) {
struct sockaddr_in6 lsin = *((const struct sockaddr_in6 *)
inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin.sin6_addr = in6p_laddr(inp);
lsin.sin6_port = 0;
if ((error = in6pcb_bind_port(inp, &lsin, l)) != 0)
return error;
}
inpcb_set_state(inp, INP_CONNECTED);
in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK;
if (ip6_auto_flowlabel)
in6p_flowinfo(inp) |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
#if defined(IPSEC)
if (ipsec_enabled && inp->inp_socket->so_type == SOCK_STREAM)
ipsec_pcbconn(inp->inp_sp);
#endif
return 0;
}
void
in6pcb_disconnect(struct inpcb *inp)
{
memset((void *)&in6p_faddr(inp), 0, sizeof(in6p_faddr(inp)));
inp->inp_fport = 0;
inpcb_set_state(inp, INP_BOUND);
in6p_flowinfo(inp) &= ~IPV6_FLOWLABEL_MASK;
#if defined(IPSEC)
if (ipsec_enabled)
ipsec_pcbdisconn(inp->inp_sp);
#endif
if (inp->inp_socket->so_state & SS_NOFDREF)
inpcb_destroy(inp);
}
void
in6pcb_fetch_sockaddr(struct inpcb *inp, struct sockaddr_in6 *sin6)
{
if (inp->inp_af != AF_INET6)
return;
sockaddr_in6_init(sin6, &in6p_laddr(inp), inp->inp_lport, 0, 0);
(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
}
void
in6pcb_fetch_peeraddr(struct inpcb *inp, struct sockaddr_in6 *sin6)
{
if (inp->inp_af != AF_INET6)
return;
sockaddr_in6_init(sin6, &in6p_faddr(inp), inp->inp_fport, 0, 0);
(void)sa6_recoverscope(sin6); /* XXX: should catch errors */
}
/*
* Pass some notification to all connections of a protocol
* associated with address dst. The local address and/or port numbers
* may be specified to limit the search. The "usual action" will be
* taken, depending on the ctlinput cmd. The caller must filter any
* cmds that are uninteresting (e.g., no error in the map).
* Call the protocol specific routine (if any) to report
* any errors for each matching socket.
*
* Must be called at splsoftnet.
*
* Note: src (4th arg) carries the flowlabel value on the original IPv6
* header, in sin6_flowinfo member.
*/
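/*
 * Usage sketch (hypothetical caller state): an ICMPv6 handler that has
 * extracted the offending addresses and ports could fan the error out to
 * matching sockets with, e.g.:
 *
 *	nmatch = in6pcb_notify(table, (struct sockaddr *)&dst6, fport,
 *	    (struct sockaddr *)&src6, lport, cmd, NULL, in6pcb_rtchange);
 *
 * where `table', `dst6', `src6', `fport', `lport' and `cmd' stand in for
 * the caller's own state; the return value is the number of pcbs matched.
 */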
int
in6pcb_notify(struct inpcbtable *table, const struct sockaddr *dst,
u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd,
void *cmdarg, void (*notify)(struct inpcb *, int))
{
struct inpcb *inp;
struct sockaddr_in6 sa6_src;
const struct sockaddr_in6 *sa6_dst;
in_port_t fport = fport_arg, lport = lport_arg;
int errno;
int nmatch = 0;
u_int32_t flowinfo;
if ((unsigned)cmd >= PRC_NCMDS || dst->sa_family != AF_INET6)
return 0;
sa6_dst = (const struct sockaddr_in6 *)dst;
if (IN6_IS_ADDR_UNSPECIFIED(&sa6_dst->sin6_addr))
return 0;
/*
* Note that src can be NULL when we are notified by local fragmentation.
*/
sa6_src = (src == NULL) ? sa6_any : *(const struct sockaddr_in6 *)src;
flowinfo = sa6_src.sin6_flowinfo;
/*
* Redirects go to all references to the destination,
* and use in6pcb_rtchange to invalidate the route cache.
* Dead host indications: also use in6pcb_rtchange to invalidate
* the cache, and deliver the error to all the sockets.
* Otherwise, if we have knowledge of the local port and address,
* deliver only to that socket.
*/
if (PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) {
fport = 0;
lport = 0;
memset((void *)&sa6_src.sin6_addr, 0, sizeof(sa6_src.sin6_addr));
if (cmd != PRC_HOSTDEAD)
notify = in6pcb_rtchange;
}
errno = inet6ctlerrmap[cmd];
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
struct rtentry *rt = NULL;
if (inp->inp_af != AF_INET6)
continue;
/*
* Under the following condition, notify of redirects
* to the pcb, without making address matches against inpcb.
* - a redirect notification has arrived.
* - the inpcb is unconnected.
* - the inpcb is caching !RTF_HOST routing entry.
* - the ICMPv6 notification is from the gateway cached in the
* inpcb. i.e. ICMPv6 notification is from nexthop gateway
* the inpcb used very recently.
*
* This is to improve interaction between netbsd/openbsd
* redirect handling code, and inpcb route cache code.
* without the clause, !RTF_HOST routing entry (which carries
* gateway used by inpcb right before the ICMPv6 redirect)
* will be cached forever in unconnected inpcb.
*
* There is still a question as to what is TRT:
* - On bsdi/freebsd, RTF_HOST (cloned) routing entry will be
* generated on packet output. inpcb will always cache
* RTF_HOST routing entry so there's no need for the clause
* (ICMPv6 redirect will update RTF_HOST routing entry,
* and inpcb is caching it already).
* However, bsdi/freebsd are vulnerable to local DoS attacks
* due to the cloned routing entries.
* - Specwise, "destination cache" is mentioned in RFC2461.
* Jinmei says that it implies bsdi/freebsd behavior, itojun
* is not really convinced.
* - Having hiwat/lowat on # of cloned host route (redirect/
* pmtud) may be a good idea. netbsd/openbsd has it. see
* icmp6_mtudisc_update().
*/
if ((PRC_IS_REDIRECT(cmd) || cmd == PRC_HOSTDEAD) &&
IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
(rt = rtcache_validate(&inp->inp_route)) != NULL &&
!(rt->rt_flags & RTF_HOST)) {
const struct sockaddr_in6 *dst6;
dst6 = (const struct sockaddr_in6 *)
rtcache_getdst(&inp->inp_route);
if (dst6 == NULL)
;
else if (IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr,
&sa6_dst->sin6_addr)) {
rtcache_unref(rt, &inp->inp_route);
goto do_notify;
}
}
rtcache_unref(rt, &inp->inp_route);
/*
* If the error designates a new path MTU for a destination
* and the application (associated with this socket) wanted to
* know the value, notify. Note that we notify for all
* disconnected sockets if the corresponding application
* wanted it. This is because some UDP applications keep
* sending on disconnected sockets.
* XXX: should we avoid notifying the value to TCP sockets?
*/
if (cmd == PRC_MSGSIZE && (inp->inp_flags & IN6P_MTU) != 0 &&
(IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)) ||
IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), &sa6_dst->sin6_addr))) {
ip6_notify_pmtu(inp, (const struct sockaddr_in6 *)dst,
(u_int32_t *)cmdarg);
}
/*
* Detect if we should notify the error. If no source and
* destination ports are specified, but non-zero flowinfo and
* local address match, notify the error. This is the case
* when the error is delivered with an encrypted buffer
* by ESP. Otherwise, just compare addresses and ports
* as usual.
*/
if (lport == 0 && fport == 0 && flowinfo &&
inp->inp_socket != NULL &&
flowinfo == (in6p_flowinfo(inp) & IPV6_FLOWLABEL_MASK) &&
IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &sa6_src.sin6_addr))
goto do_notify;
else if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp),
&sa6_dst->sin6_addr) ||
inp->inp_socket == NULL ||
(lport && inp->inp_lport != lport) ||
(!IN6_IS_ADDR_UNSPECIFIED(&sa6_src.sin6_addr) &&
!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
&sa6_src.sin6_addr)) ||
(fport && inp->inp_fport != fport))
continue;
do_notify:
if (notify)
(*notify)(inp, errno);
nmatch++;
}
return nmatch;
}
void
in6pcb_purgeif0(struct inpcbtable *table, struct ifnet *ifp)
{
struct inpcb *inp;
struct ip6_moptions *im6o;
struct in6_multi_mship *imm, *nimm;
KASSERT(ifp != NULL);
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
bool need_unlock = false;
if (inp->inp_af != AF_INET6)
continue;
/* The caller holds one of the inp's locks. */
if (!inp_locked(inp)) {
inp_lock(inp);
need_unlock = true;
}
im6o = in6p_moptions(inp);
if (im6o) {
/*
* Unselect the outgoing interface if it is being
* detached.
*/
if (im6o->im6o_multicast_if_index == ifp->if_index)
im6o->im6o_multicast_if_index = 0;
/*
* Drop multicast group membership if we joined
* through the interface being detached.
* XXX controversial - is it really legal for kernel
* to force this?
*/
LIST_FOREACH_SAFE(imm, &im6o->im6o_memberships,
i6mm_chain, nimm) {
if (imm->i6mm_maddr->in6m_ifp == ifp) {
LIST_REMOVE(imm, i6mm_chain);
in6_leavegroup(imm);
}
}
}
in_purgeifmcast(inp->inp_moptions, ifp);
if (need_unlock)
inp_unlock(inp);
}
}
void
in6pcb_purgeif(struct inpcbtable *table, struct ifnet *ifp)
{
struct rtentry *rt;
struct inpcb *inp;
TAILQ_FOREACH(inp, &table->inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if ((rt = rtcache_validate(&inp->inp_route)) != NULL &&
rt->rt_ifp == ifp) {
rtcache_unref(rt, &inp->inp_route);
in6pcb_rtchange(inp, 0);
} else
rtcache_unref(rt, &inp->inp_route);
}
}
/*
* After a routing change, flush old routing. A new route can be
* allocated the next time output is attempted.
*/
void
in6pcb_rtchange(struct inpcb *inp, int errno)
{
if (inp->inp_af != AF_INET6)
return;
rtcache_free(&inp->inp_route);
/*
* A new route can be allocated the next time
* output is attempted.
*/
}
struct inpcb *
in6pcb_lookup_local(struct inpcbtable *table, struct in6_addr *laddr6,
u_int lport_arg, int lookup_wildcard, struct vestigial_inpcb *vp)
{
struct inpcbhead *head;
struct inpcb *inp, *match = NULL;
int matchwild = 3, wildcard;
in_port_t lport = lport_arg;
	if (vp)
		vp->valid = 0;
head = IN6PCBHASH_PORT(table, lport);
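	/*
	 * Scan the port hash chain.  "wildcard" counts the address
	 * components that do not match exactly (one side unspecified,
	 * or a connected PCB considered for a local-only lookup); the
	 * candidate with the lowest count wins, and a count of zero
	 * ends the search immediately.
	 */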
	LIST_FOREACH(inp, head, inp_lhash) {
		if (inp->inp_af != AF_INET6)
continue;
if (inp->inp_lport != lport)
continue;
wildcard = 0;
		if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
wildcard++;
		if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) {
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (!IN6_IS_ADDR_V4MAPPED(laddr6))
continue;
/* duplicate of IPv4 logic */
wildcard = 0;
			if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)) &&
			    in6p_faddr(inp).s6_addr32[3])
wildcard++;
if (!in6p_laddr(inp).s6_addr32[3]) {
if (laddr6->s6_addr32[3])
wildcard++;
} else {
if (!laddr6->s6_addr32[3])
wildcard++;
else {
if (in6p_laddr(inp).s6_addr32[3] !=
laddr6->s6_addr32[3])
continue;
}
}
		} else if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
			if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(laddr6))
wildcard++;
} else {
			if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
				if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
}
			if (IN6_IS_ADDR_UNSPECIFIED(laddr6))
				wildcard++;
else {
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
laddr6))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
match = inp;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (match && matchwild == 0)
return match;
if (vp && table->vestige && table->vestige->init_ports6) {
struct vestigial_inpcb better;
bool has_better = false;
void *state;
state = (*table->vestige->init_ports6)(laddr6,
lport_arg,
lookup_wildcard);
		while (table->vestige &&
		    (*table->vestige->next_port6)(state, vp)) {
			if (vp->lport != lport)
continue;
wildcard = 0;
if (!IN6_IS_ADDR_UNSPECIFIED(&vp->faddr.v6))
wildcard++;
			if (IN6_IS_ADDR_UNSPECIFIED(&vp->laddr.v6)) {
				if (!IN6_IS_ADDR_UNSPECIFIED(laddr6))
wildcard++;
} else {
				if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
					if (vp->v6only)
continue;
}
				if (IN6_IS_ADDR_UNSPECIFIED(laddr6))
					wildcard++;
else {
if (!IN6_ARE_ADDR_EQUAL(&vp->laddr.v6, laddr6))
continue;
}
}
if (wildcard && !lookup_wildcard)
continue;
if (wildcard < matchwild) {
better = *vp;
has_better = true;
matchwild = wildcard;
if (matchwild == 0)
break;
}
}
if (has_better) {
*vp = better;
return 0;
}
}
return match;
}
/*
 * WARNING: the returned rtentry may be an IPv4 route if the inpcb is
 * connected to an IPv4-mapped address.
*/
struct rtentry *
in6pcb_rtentry(struct inpcb *inp)
{
struct rtentry *rt;
struct route *ro;
union {
const struct sockaddr *sa;
const struct sockaddr_in6 *sa6;
#ifdef INET
const struct sockaddr_in *sa4;
#endif
} cdst;
ro = &inp->inp_route;
if (inp->inp_af != AF_INET6)
return NULL;
	cdst.sa = rtcache_getdst(ro);
	if (cdst.sa == NULL)
;
#ifdef INET
else if (cdst.sa->sa_family == AF_INET) {
		KASSERT(IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)));
		if (cdst.sa4->sin_addr.s_addr !=
		    in6p_faddr(inp).s6_addr32[3])
rtcache_free(ro);
}
#endif
else {
if (!IN6_ARE_ADDR_EQUAL(&cdst.sa6->sin6_addr,
&in6p_faddr(inp)))
rtcache_free(ro);
}
if ((rt = rtcache_validate(ro)) == NULL)
rt = rtcache_update(ro, 1);
#ifdef INET
if (rt == NULL && IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
union {
struct sockaddr dst;
struct sockaddr_in dst4;
} u;
struct in_addr addr;
addr.s_addr = in6p_faddr(inp).s6_addr32[3];
sockaddr_in_init(&u.dst4, &addr, 0);
if (rtcache_setdst(ro, &u.dst) != 0)
return NULL;
rt = rtcache_init(ro);
} else
#endif
if (rt == NULL && !IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
union {
struct sockaddr dst;
struct sockaddr_in6 dst6;
} u;
sockaddr_in6_init(&u.dst6, &in6p_faddr(inp), 0, 0, 0);
if (rtcache_setdst(ro, &u.dst) != 0)
return NULL;
rt = rtcache_init(ro);
}
return rt;
}
void
in6pcb_rtentry_unref(struct rtentry *rt, struct inpcb *inp)
{
rtcache_unref(rt, &inp->inp_route);
}
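/*
 * Look up a connected PCB by the full 4-tuple (foreign address/port,
 * local address/port).  Only exact matches are returned; if none is
 * found and a vestigial-PCB hook is registered on the table, consult
 * it -- any hit is reported through *vp and NULL is still returned.
 */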
struct inpcb *
in6pcb_lookup(struct inpcbtable *table, const struct in6_addr *faddr6,
u_int fport_arg, const struct in6_addr *laddr6, u_int lport_arg,
int faith,
struct vestigial_inpcb *vp)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t fport = fport_arg, lport = lport_arg;
	if (vp)
		vp->valid = 0;
head = IN6PCBHASH_CONNECT(table, faddr6, fport, laddr6, lport);
	LIST_FOREACH(inp, head, inp_hash) {
		if (inp->inp_af != AF_INET6)
continue;
/* find exact match on both source and dest */
if (inp->inp_fport != fport)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
continue;
if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp), faddr6))
continue;
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)))
continue;
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6))
continue;
if ((IN6_IS_ADDR_V4MAPPED(laddr6) || IN6_IS_ADDR_V4MAPPED(faddr6)) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
return inp;
}
	if (vp && table->vestige) {
		if ((*table->vestige->lookup6)(faddr6, fport_arg,
laddr6, lport_arg, vp))
return NULL;
}
return NULL;
}
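/*
 * Look up a bound (listening) PCB for a local address and port.  The
 * search tries three hash chains in order of decreasing specificity:
 * the exact local address, then (for IPv4-mapped addresses) the
 * v4-mapped wildcard ::ffff:0.0.0.0, and finally the unspecified
 * address.  A hit is moved to the head of its chain.
 */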
struct inpcb *
in6pcb_lookup_bound(struct inpcbtable *table, const struct in6_addr *laddr6,
u_int lport_arg, int faith)
{
struct inpcbhead *head;
struct inpcb *inp;
in_port_t lport = lport_arg;
#ifdef INET
struct in6_addr zero_mapped;
#endif
head = IN6PCBHASH_BIND(table, laddr6, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_V4MAPPED(laddr6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), laddr6))
goto out;
}
#ifdef INET
if (IN6_IS_ADDR_V4MAPPED(laddr6)) {
memset(&zero_mapped, 0, sizeof(zero_mapped));
zero_mapped.s6_addr16[5] = 0xffff;
head = IN6PCBHASH_BIND(table, &zero_mapped, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zero_mapped))
goto out;
}
}
#endif
head = IN6PCBHASH_BIND(table, &zeroin6_addr, lport);
LIST_FOREACH(inp, head, inp_hash) {
if (inp->inp_af != AF_INET6)
continue;
if (faith && (inp->inp_flags & IN6P_FAITH) == 0)
continue;
if (inp->inp_fport != 0)
continue;
if (inp->inp_lport != lport)
continue;
if (IN6_IS_ADDR_V4MAPPED(laddr6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY) != 0)
continue;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp), &zeroin6_addr))
goto out;
}
return NULL;
out:
if (inp != LIST_FIRST(head)) {
LIST_REMOVE(inp, inp_hash);
LIST_INSERT_HEAD(head, inp, inp_hash);
}
return inp;
}
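/*
 * Update the PCB's state and rehash it accordingly: if it was already
 * hashed, remove it from its old chain, then insert it on the bind
 * hash for INP_BOUND or the connect hash for INP_CONNECTED.
 */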
void
in6pcb_set_state(struct inpcb *inp, int state)
{

	if (inp->inp_af != AF_INET6)
return;
	if (inp->inp_state > INP_ATTACHED)
		LIST_REMOVE(inp, inp_hash);

	switch (state) {
case INP_BOUND:
LIST_INSERT_HEAD(IN6PCBHASH_BIND(inp->inp_table,
&in6p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
case INP_CONNECTED:
LIST_INSERT_HEAD(IN6PCBHASH_CONNECT(inp->inp_table,
&in6p_faddr(inp), inp->inp_fport,
&in6p_laddr(inp), inp->inp_lport), inp,
inp_hash);
break;
}
inp->inp_state = state;
}
/* $NetBSD: vfs_getcwd.c,v 1.61 2021/06/29 22:39:21 dholland Exp $ */
/*-
* Copyright (c) 1999, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Bill Sommerfeld.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_getcwd.c,v 1.61 2021/06/29 22:39:21 dholland Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/uio.h>
#include <sys/kmem.h>
#include <sys/dirent.h>
#include <sys/kauth.h>
#include <ufs/ufs/dir.h> /* XXX only for DIRBLKSIZ */
#include <sys/syscallargs.h>
/*
* Vnode variable naming conventions in this file:
*
* rvp: the current root we're aiming towards.
* lvp, *lvpp: the "lower" vnode
* uvp, *uvpp: the "upper" vnode.
*
* Since all the vnodes we're dealing with are directories, and the
* lookups are going *up* in the filesystem rather than *down*, the
* usual "pvp" (parent) or "dvp" (directory) naming conventions are
* too confusing.
*/
/*
 * XXX Will loop forever in certain cases if a directory read reliably
 * returns EINVAL on the last block.
* XXX is EINVAL the right thing to return if a directory is malformed?
*/
/*
* XXX Untested vs. mount -o union; probably does the wrong thing.
*/
/*
* Find parent vnode of *lvpp, return in *uvpp
*
* If we care about the name, scan it looking for name of directory
* entry pointing at lvp.
*
* Place the name in the buffer which starts at bufp, immediately
* before *bpp, and move bpp backwards to point at the start of it.
*
 * On entry, lvp is a locked vnode reference; it is unlocked before return.
 * On exit, *uvpp is either NULL or an unlocked, referenced vnode.
*/
static int
getcwd_scandir(struct vnode *lvp, struct vnode **uvpp, char **bpp,
char *bufp, struct lwp *l)
{
int error = 0;
int eofflag;
off_t off;
int tries;
struct uio uio;
struct iovec iov;
char *dirbuf = NULL;
int dirbuflen;
ino_t fileno;
struct vattr va;
struct vnode *uvp = NULL;
kauth_cred_t cred = l->l_cred;
struct componentname cn;
int len, reclen;
tries = 0;
/* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */
KASSERT(VOP_ISLOCKED(lvp) == LK_EXCLUSIVE);
/*
* If we want the filename, get some info we need while the
* current directory is still locked.
*/
if (bufp != NULL) {
error = VOP_GETATTR(lvp, &va, cred);
		if (error) {
			VOP_UNLOCK(lvp);
*uvpp = NULL;
return error;
}
}
/*
* Ok, we have to do it the hard way..
* Next, get parent vnode using lookup of ..
*/
cn.cn_nameiop = LOOKUP;
cn.cn_flags = ISLASTCN | ISDOTDOT | RDONLY;
cn.cn_cred = cred;
cn.cn_nameptr = "..";
cn.cn_namelen = 2;
/* At this point, lvp is locked */
error = VOP_LOOKUP(lvp, uvpp, &cn);
VOP_UNLOCK(lvp);
if (error) {
*uvpp = NULL;
return error;
}
uvp = *uvpp;
/* If we don't care about the pathname, we're done */
if (bufp == NULL) {
return 0;
}
fileno = va.va_fileid;
/* I guess UFS_DIRBLKSIZ is a good guess at a good size to use? */
dirbuflen = UFS_DIRBLKSIZ;
if (dirbuflen < va.va_blocksize)
dirbuflen = va.va_blocksize;
dirbuf = kmem_alloc(dirbuflen, KM_SLEEP);
/* Now lvp is unlocked, try to lock uvp */
error = vn_lock(uvp, LK_SHARED);
if (error) {
vrele(uvp);
*uvpp = NULL;
return error;
}
#if 0
unionread:
#endif
off = 0;
do {
/* call VOP_READDIR of parent */
iov.iov_base = dirbuf;
iov.iov_len = dirbuflen;
uio.uio_iov = &iov;
uio.uio_iovcnt = 1;
uio.uio_offset = off;
uio.uio_resid = dirbuflen;
uio.uio_rw = UIO_READ;
UIO_SETUP_SYSSPACE(&uio);
eofflag = 0;
error = VOP_READDIR(uvp, &uio, cred, &eofflag, 0, 0);
off = uio.uio_offset;
/*
* Try again if NFS tosses its cookies.
* XXX this can still loop forever if the directory is busted
* such that the second or subsequent page of it always
* returns EINVAL
*/
if ((error == EINVAL) && (tries < 3)) {
off = 0;
tries++;
continue; /* once more, with feeling */
}
if (!error) {
char *cpos;
struct dirent *dp;
cpos = dirbuf;
tries = 0;
/* scan directory page looking for matching vnode */
for (len = (dirbuflen - uio.uio_resid); len > 0;
len -= reclen) {
dp = (struct dirent *) cpos;
reclen = dp->d_reclen;
/* check for malformed directory.. */
if (reclen < _DIRENT_MINSIZE(dp) ||
reclen > len) {
error = EINVAL;
goto out;
}
/*
* XXX should perhaps do VOP_LOOKUP to
* check that we got back to the right place,
* but getting the locking games for that
* right would be heinous.
*/
if ((dp->d_type != DT_WHT) &&
(dp->d_fileno == fileno)) {
char *bp = *bpp;
bp -= dp->d_namlen;
if (bp <= bufp) {
error = ERANGE;
goto out;
}
memcpy(bp, dp->d_name, dp->d_namlen);
error = 0;
*bpp = bp;
goto out;
}
cpos += reclen;
}
} else
goto out;
} while (!eofflag);
#if 0
/*
* Deal with mount -o union, which unions only the
* root directory of the mount.
*/
if ((uvp->v_vflag & VV_ROOT) &&
(uvp->v_mount->mnt_flag & MNT_UNION)) {
struct vnode *tvp = uvp;
uvp = uvp->v_mount->mnt_vnodecovered;
vput(tvp);
vref(uvp);
*uvpp = uvp;
vn_lock(uvp, LK_SHARED | LK_RETRY);
goto unionread;
}
#endif
error = ENOENT;
out:
VOP_UNLOCK(uvp);
kmem_free(dirbuf, dirbuflen);
return error;
}
/*
* common routine shared by sys___getcwd() and vn_isunder()
*/
int
getcwd_common(struct vnode *lvp, struct vnode *rvp, char **bpp, char *bufp,
int limit, int flags, struct lwp *l)
{
struct cwdinfo *cwdi = l->l_proc->p_cwdi;
kauth_cred_t cred = l->l_cred;
struct vnode *uvp = NULL;
char *bp = NULL;
int error;
accmode_t accmode = VEXEC;
error = 0;
	if (rvp == NULL) {
		rvp = cwdi->cwdi_rdir;
if (rvp == NULL)
rvp = rootvnode;
}
vref(rvp);
vref(lvp);
/*
* Error handling invariant:
* Before a `goto out':
* lvp is either NULL, or held.
* uvp is either NULL, or held.
*/
if (bufp)
bp = *bpp;
/*
* this loop will terminate when one of the following happens:
* - we hit the root
* - getdirentries or lookup fails
* - we run out of space in the buffer.
*/
	if (lvp == rvp) {
		if (bp)
			*(--bp) = '/';
goto out;
}
do {
/*
* access check here is optional, depending on
* whether or not caller cares.
*/
int chkaccess = (flags & GETCWD_CHECK_ACCESS);
bool locked = false;
/*
* step up if we're a covered vnode..
* check access on the first vnode only.
*/
if (lvp->v_vflag & VV_ROOT) {
vn_lock(lvp, LK_SHARED | LK_RETRY);
if (chkaccess) {
error = VOP_ACCESS(lvp, accmode, cred);
if (error) {
VOP_UNLOCK(lvp);
goto out;
}
chkaccess = 0;
}
while (lvp->v_vflag & VV_ROOT) {
struct vnode *tvp;
if (lvp == rvp) {
VOP_UNLOCK(lvp);
goto out;
}
tvp = lvp->v_mount->mnt_vnodecovered;
/*
				 * hodie natus est radici frater
				 * ("today a brother is born to the root")
*/
if (tvp == NULL) {
VOP_UNLOCK(lvp);
error = ENOENT;
goto out;
}
vref(tvp);
vput(lvp);
lvp = tvp;
				if (lvp->v_vflag & VV_ROOT)
					vn_lock(lvp, LK_SHARED | LK_RETRY);
}
}
/* Do we need to check access to the directory? */
if (chkaccess && !cache_have_id(lvp)) {
/* Need exclusive for UFS VOP_GETATTR (itimes) & VOP_LOOKUP. */
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
error = VOP_ACCESS(lvp, accmode, cred);
if (error) {
VOP_UNLOCK(lvp);
goto out;
}
chkaccess = 0;
locked = true;
}
/*
* Look in the name cache; if that fails, look in the
* directory..
*/
error = cache_revlookup(lvp, &uvp, &bp, bufp, chkaccess,
accmode);
if (error == -1) {
if (!locked) {
locked = true;
vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
}
if (lvp->v_type != VDIR) {
VOP_UNLOCK(lvp);
error = ENOTDIR;
goto out;
}
error = getcwd_scandir(lvp, &uvp, &bp, bufp, l);
/* lvp now unlocked */
} else if (locked) {
VOP_UNLOCK(lvp);
}
if (error)
goto out;
#if DIAGNOSTIC
if (bufp && (bp <= bufp)) {
panic("getcwd: oops, went back too far");
}
#endif
accmode = VEXEC | VREAD;
		if (bp)
			*(--bp) = '/';
vrele(lvp);
lvp = uvp;
uvp = NULL;
limit--;
} while ((lvp != rvp) && (limit > 0));
out:
if (bpp)
*bpp = bp;
	if (uvp)
		vrele(uvp);
	if (lvp)
		vrele(lvp);
vrele(rvp);
return error;
}
/*
* Check if one directory can be found inside another in the directory
* hierarchy.
*
* Intended to be used in chroot, chdir, fchdir, etc., to ensure that
* chroot() actually means something.
*/
int
vn_isunder(struct vnode *lvp, struct vnode *rvp, struct lwp *l)
{
int error;
error = getcwd_common(lvp, rvp, NULL, NULL, MAXPATHLEN / 2, 0, l);
if (!error)
return 1;
else
return 0;
}
/*
 * Returns true if proc p1's root directory is equal to or under p2's
* root directory.
*
* Intended to be used from ptrace/procfs sorts of things.
*/
int
proc_isunder(struct proc *p1, struct lwp *l2)
{
struct vnode *r1 = p1->p_cwdi->cwdi_rdir;
struct vnode *r2 = l2->l_proc->p_cwdi->cwdi_rdir;
if (r1 == NULL)
return (r2 == NULL);
else if (r2 == NULL)
return 1;
else
return vn_isunder(r1, r2, l2);
}
/*
* Find pathname of process's current directory.
*
* Use vfs vnode-to-name reverse cache; if that fails, fall back
* to reading directory contents.
*/
int
sys___getcwd(struct lwp *l, const struct sys___getcwd_args *uap, register_t *retval)
{
/* {
syscallarg(char *) bufp;
syscallarg(size_t) length;
} */
int error;
char *path;
char *bp, *bend;
int len = SCARG(uap, length);
int lenused;
struct cwdinfo *cwdi;
if (len > MAXPATHLEN * 4)
len = MAXPATHLEN * 4;
else if (len < 2)
return ERANGE;
path = kmem_alloc(len, KM_SLEEP);
bp = &path[len];
bend = bp;
*(--bp) = '\0';
/*
* 5th argument here is "max number of vnodes to traverse".
* Since each entry takes up at least 2 bytes in the output buffer,
* limit it to N/2 vnodes for an N byte buffer.
*/
cwdi = l->l_proc->p_cwdi;
rw_enter(&cwdi->cwdi_lock, RW_READER);
error = getcwd_common(cwdi->cwdi_cdir, NULL, &bp, path,
len/2, GETCWD_CHECK_ACCESS, l);
rw_exit(&cwdi->cwdi_lock);
if (error)
goto out;
lenused = bend - bp;
*retval = lenused;
/* put the result into user buffer */
error = copyout(bp, SCARG(uap, bufp), lenused);
out:
kmem_free(path, len);
return error;
}
/*
* Try to find a pathname for a vnode. Since there is no mapping vnode ->
* parent directory, this needs the namecache to succeed. Caller holds a
* reference to the vnode.
*/
int
vnode_to_path(char *path, size_t len, struct vnode *vp, struct lwp *curl,
struct proc *p)
{
struct proc *curp = curl->l_proc;
int error, lenused, elen;
char *bp, *bend;
struct vnode *dvp;
KASSERT(vrefcnt(vp) > 0);
bp = bend = &path[len];
*(--bp) = '\0';
error = cache_revlookup(vp, &dvp, &bp, path, false, 0);
if (error != 0)
return (error == -1 ? ENOENT : error);
*(--bp) = '/';
error = getcwd_common(dvp, NULL, &bp, path, len / 2,
GETCWD_CHECK_ACCESS, curl);
vrele(dvp);
if (error != 0)
return error;
/*
* Strip off emulation path for emulated processes looking at
* the maps file of a process of the same emulation. (Won't
* work if /emul/xxx is a symlink..)
*/
if (curp->p_emul == p->p_emul && curp->p_emul->e_path != NULL) {
elen = strlen(curp->p_emul->e_path);
		if (!strncmp(bp, curp->p_emul->e_path, elen))
			bp = &bp[elen];
}
lenused = bend - bp;
memcpy(path, bp, lenused);
path[lenused] = '\0';
return 0;
}
/* $NetBSD: cpu.c,v 1.210 2024/04/22 23:07:47 andvar Exp $ */
/*
* Copyright (c) 2000-2020 NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Bill Sommerfeld of RedBack Networks Inc, and by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1999 Stefan Grefen
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR AND CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.210 2024/04/22 23:07:47 andvar Exp $");
#include "opt_ddb.h"
#include "opt_mpbios.h" /* for MPDEBUG */
#include "opt_mtrr.h"
#include "opt_multiprocessor.h"
#include "opt_svs.h"
#include "lapic.h"
#include "ioapic.h"
#include "acpica.h"
#include "hpet.h"
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/device.h>
#include <sys/cpu.h>
#include <sys/cpufreq.h>
#include <sys/idle.h>
#include <sys/atomic.h>
#include <sys/reboot.h>
#include <sys/csan.h>
#include <uvm/uvm.h>
#include "acpica.h" /* for NACPICA, for mp_verbose */
#include <x86/machdep.h>
#include <machine/cpufunc.h>
#include <machine/cpuvar.h>
#include <machine/pmap.h>
#include <machine/vmparam.h>
#if defined(MULTIPROCESSOR)
#include <machine/mpbiosvar.h>
#endif
#include <machine/mpconfig.h> /* for mp_verbose */
#include <machine/pcb.h>
#include <machine/specialreg.h>
#include <machine/segments.h>
#include <machine/gdt.h>
#include <machine/mtrr.h>
#include <machine/pio.h>
#include <machine/cpu_counter.h>
#include <machine/pmap_private.h>
#include <x86/fpu.h>
#if NACPICA > 0
#include <dev/acpi/acpi_srat.h>
#endif
#if NLAPIC > 0
#include <machine/apicvar.h>
#include <machine/i82489reg.h>
#include <machine/i82489var.h>
#endif
#include <dev/ic/mc146818reg.h>
#include <dev/ic/hpetvar.h>
#include <i386/isa/nvram.h>
#include <dev/isa/isareg.h>
#include "tsc.h"
#ifndef XENPV
#include "hyperv.h"
#if NHYPERV > 0
#include <x86/x86/hypervvar.h>
#endif
#endif
#ifdef XEN
#include <xen/hypervisor.h>
#endif
static int cpu_match(device_t, cfdata_t, void *);
static void cpu_attach(device_t, device_t, void *);
static void cpu_defer(device_t);
static int cpu_rescan(device_t, const char *, const int *);
static void cpu_childdetached(device_t, device_t);
static bool cpu_stop(device_t);
static bool cpu_suspend(device_t, const pmf_qual_t *);
static bool cpu_resume(device_t, const pmf_qual_t *);
static bool cpu_shutdown(device_t, int);
struct cpu_softc {
device_t sc_dev; /* device tree glue */
struct cpu_info *sc_info; /* pointer to CPU info */
bool sc_wasonline;
};
#ifdef MULTIPROCESSOR
int mp_cpu_start(struct cpu_info *, paddr_t);
void mp_cpu_start_cleanup(struct cpu_info *);
const struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
mp_cpu_start_cleanup };
#endif
CFATTACH_DECL2_NEW(cpu, sizeof(struct cpu_softc),
cpu_match, cpu_attach, NULL, NULL, cpu_rescan, cpu_childdetached);
/*
* Statically-allocated CPU info for the primary CPU (or the only
* CPU, on uniprocessors). The CPU info list is initialized to
* point at it.
*/
struct cpu_info cpu_info_primary __aligned(CACHE_LINE_SIZE) = {
.ci_dev = 0,
.ci_self = &cpu_info_primary,
.ci_idepth = -1,
.ci_curlwp = &lwp0,
.ci_curldt = -1,
.ci_kfpu_spl = -1,
};
struct cpu_info *cpu_info_list = &cpu_info_primary;
#ifdef i386
void cpu_set_tss_gates(struct cpu_info *);
#endif
static void cpu_init_idle_lwp(struct cpu_info *);
uint32_t cpu_feature[7] __read_mostly; /* X86 CPUID feature bits */
/* [0] basic features cpuid.1:%edx
* [1] basic features cpuid.1:%ecx (CPUID2_xxx bits)
* [2] extended features cpuid:80000001:%edx
* [3] extended features cpuid:80000001:%ecx
* [4] VIA padlock features
* [5] structured extended features cpuid.7:%ebx
* [6] structured extended features cpuid.7:%ecx
*/
#ifdef MULTIPROCESSOR
bool x86_mp_online;
paddr_t mp_trampoline_paddr = MP_TRAMPOLINE;
#endif
#if NLAPIC > 0
static vaddr_t cmos_data_mapping;
#endif
struct cpu_info *cpu_starting;
#ifdef MULTIPROCESSOR
void cpu_hatch(void *);
static void cpu_boot_secondary(struct cpu_info *ci);
static void cpu_start_secondary(struct cpu_info *ci);
#if NLAPIC > 0
static void cpu_copy_trampoline(paddr_t);
#endif
#endif /* MULTIPROCESSOR */
/*
* Runs once per boot once multiprocessor goo has been detected and
* the local APIC on the boot processor has been mapped.
*
* Called from lapic_boot_init() (from mpbios_scan()).
*/
#if NLAPIC > 0
void
cpu_init_first(void)
{
cpu_info_primary.ci_cpuid = lapic_cpu_number();
cmos_data_mapping = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_VAONLY);
if (cmos_data_mapping == 0)
panic("No KVA for page 0");
pmap_kenter_pa(cmos_data_mapping, 0, VM_PROT_READ|VM_PROT_WRITE, 0);
pmap_update(pmap_kernel());
}
#endif
static int
cpu_match(device_t parent, cfdata_t match, void *aux)
{
return 1;
}
#ifdef __HAVE_PCPU_AREA
void
cpu_pcpuarea_init(struct cpu_info *ci)
{
struct vm_page *pg;
size_t i, npages;
vaddr_t base, va;
paddr_t pa;
CTASSERT(sizeof(struct pcpu_entry) % PAGE_SIZE == 0);
npages = sizeof(struct pcpu_entry) / PAGE_SIZE;
base = (vaddr_t)&pcpuarea->ent[cpu_index(ci)];
for (i = 0; i < npages; i++) {
pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
if (pg == NULL) {
panic("failed to allocate pcpu PA");
}
va = base + i * PAGE_SIZE;
pa = VM_PAGE_TO_PHYS(pg);
pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
}
pmap_update(pmap_kernel());
}
#endif
static void
cpu_vm_init(struct cpu_info *ci)
{
unsigned int ncolors = 2;
/*
	 * XXX: for APs the cache info has not been initialized yet,
	 * but that does not matter because uvm only pays attention
	 * to the maximum.  We should fix this once CPUs can have
	 * different cache sizes.
*/
for (unsigned int i = CAI_ICACHE; i <= CAI_L2CACHE; i++) {
struct x86_cache_info *cai;
unsigned int tcolors;
cai = &ci->ci_cinfo[i];
tcolors = atop(cai->cai_totalsize);
switch (cai->cai_associativity) {
case 0xff:
tcolors = 1; /* fully associative */
break;
case 0:
case 1:
break;
default:
tcolors /= cai->cai_associativity;
}
if (tcolors <= ncolors)
continue;
ncolors = tcolors;
}
/*
* If the desired number of colors is not a power of
* two, it won't be good. Find the greatest power of
* two which is an even divisor of the number of colors,
* to preserve even coloring of pages.
*/
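	/*
	 * For example, ncolors = 12 is not a power of two; the loop
	 * below tries 1, 2, 4 and 8, of which 4 is the largest that
	 * divides 12 evenly, so we recolor with 4 colors.
	 */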
if (ncolors & (ncolors - 1) ) {
unsigned int try, picked = 1;
for (try = 1; try < ncolors; try *= 2) {
			if (ncolors % try == 0)
				picked = try;
}
if (picked == 1) {
panic("desired number of cache colors %u is "
" > 1, but not even!", ncolors);
}
ncolors = picked;
}
/*
* Knowing the size of the largest cache on this CPU, potentially
* re-color our pages.
*/
aprint_debug_dev(ci->ci_dev, "%d page colors\n", ncolors);
uvm_page_recolor(ncolors);
pmap_tlb_cpu_init(ci);
#ifndef __HAVE_DIRECT_MAP
pmap_vpage_cpu_init(ci);
#endif
}
static void
cpu_attach(device_t parent, device_t self, void *aux)
{
struct cpu_softc *sc = device_private(self);
struct cpu_attach_args *caa = aux;
struct cpu_info *ci;
uintptr_t ptr;
#if NLAPIC > 0
int cpunum = caa->cpu_number;
#endif
static bool again;
sc->sc_dev = self;
if (ncpu > maxcpus) {
#ifndef _LP64
aprint_error(": too many CPUs, please use NetBSD/amd64\n");
#else
aprint_error(": too many CPUs\n");
#endif
return;
}
/*
* If we're an Application Processor, allocate a cpu_info
* structure, otherwise use the primary's.
*/
if (caa->cpu_role == CPU_ROLE_AP) {
if ((boothowto & RB_MD1) != 0) {
aprint_error(": multiprocessor boot disabled\n");
if (!pmf_device_register(self, NULL, NULL))
aprint_error_dev(self,
"couldn't establish power handler\n");
return;
}
aprint_naive(": Application Processor\n");
ptr = (uintptr_t)uvm_km_alloc(kernel_map,
sizeof(*ci) + CACHE_LINE_SIZE - 1, 0,
UVM_KMF_WIRED|UVM_KMF_ZERO);
ci = (struct cpu_info *)roundup2(ptr, CACHE_LINE_SIZE);
ci->ci_curldt = -1;
} else {
aprint_naive(": %s Processor\n",
caa->cpu_role == CPU_ROLE_SP ? "Single" : "Boot");
ci = &cpu_info_primary;
#if NLAPIC > 0
if (cpunum != lapic_cpu_number()) {
/* XXX should be done earlier. */
uint32_t reg;
aprint_verbose("\n");
aprint_verbose_dev(self, "running CPU at apic %d"
" instead of at expected %d", lapic_cpu_number(),
cpunum);
reg = lapic_readreg(LAPIC_ID);
lapic_writereg(LAPIC_ID, (reg & ~LAPIC_ID_MASK) |
(cpunum << LAPIC_ID_SHIFT));
}
if (cpunum != lapic_cpu_number()) {
aprint_error_dev(self, "unable to reset apic id\n");
}
#endif
}
ci->ci_self = ci;
sc->sc_info = ci;
ci->ci_dev = self;
ci->ci_acpiid = caa->cpu_id;
ci->ci_cpuid = caa->cpu_number;
ci->ci_func = caa->cpu_func;
ci->ci_kfpu_spl = -1;
aprint_normal("\n");
/* Must be before mi_cpu_attach(). */
cpu_vm_init(ci);
if (caa->cpu_role == CPU_ROLE_AP) {
int error;
error = mi_cpu_attach(ci);
if (error != 0) {
aprint_error_dev(self,
"mi_cpu_attach failed with %d\n", error);
return;
}
#ifdef __HAVE_PCPU_AREA
cpu_pcpuarea_init(ci);
#endif
cpu_init_tss(ci);
} else {
KASSERT(ci->ci_data.cpu_idlelwp != NULL);
#if NACPICA > 0
/* Parse out NUMA info for cpu_identify(). */
acpisrat_init();
#endif
}
#ifdef SVS
cpu_svs_init(ci);
#endif
pmap_reference(pmap_kernel());
ci->ci_pmap = pmap_kernel();
ci->ci_tlbstate = TLBSTATE_STALE;
/*
* Boot processor may not be attached first, but the below
* must be done to allow booting other processors.
*/
if (!again) {
/* Make sure DELAY() (likely i8254_delay()) is initialized. */
DELAY(1);
/*
* Basic init. Compute an approximate frequency for the TSC
* using the i8254. If there's a HPET we'll redo it later.
*/
atomic_or_32(&ci->ci_flags, CPUF_PRESENT | CPUF_PRIMARY);
cpu_intr_init(ci);
tsc_setfunc(ci);
cpu_get_tsc_freq(ci);
cpu_init(ci);
#ifdef i386
cpu_set_tss_gates(ci);
#endif
pmap_cpu_init_late(ci);
#if NLAPIC > 0
if (caa->cpu_role != CPU_ROLE_SP) {
/* Enable lapic. */
lapic_enable();
lapic_set_lvt();
if (!vm_guest_is_xenpvh_or_pvhvm())
lapic_calibrate_timer(false);
}
#endif
kcsan_cpu_init(ci);
again = true;
}
/* further PCB init done later. */
switch (caa->cpu_role) {
case CPU_ROLE_SP:
atomic_or_32(&ci->ci_flags, CPUF_SP);
cpu_identify(ci);
x86_errata();
x86_cpu_idle_init();
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
break;
case CPU_ROLE_BP:
atomic_or_32(&ci->ci_flags, CPUF_BSP);
cpu_identify(ci);
x86_errata();
x86_cpu_idle_init();
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
break;
#ifdef MULTIPROCESSOR
case CPU_ROLE_AP:
/*
* report on an AP
*/
cpu_intr_init(ci);
idt_vec_init_cpu_md(&ci->ci_idtvec, cpu_index(ci));
gdt_alloc_cpu(ci);
#ifdef i386
cpu_set_tss_gates(ci);
#endif
pmap_cpu_init_late(ci);
cpu_start_secondary(ci);
if (ci->ci_flags & CPUF_PRESENT) {
struct cpu_info *tmp;
cpu_identify(ci);
tmp = cpu_info_list;
while (tmp->ci_next)
tmp = tmp->ci_next;
tmp->ci_next = ci;
}
break;
#endif
default:
panic("unknown processor type??\n");
}
pat_init(ci);
if (!pmf_device_register1(self, cpu_suspend, cpu_resume, cpu_shutdown))
aprint_error_dev(self, "couldn't establish power handler\n");
#ifdef MULTIPROCESSOR
if (mp_verbose) {
struct lwp *l = ci->ci_data.cpu_idlelwp;
struct pcb *pcb = lwp_getpcb(l);
aprint_verbose_dev(self,
"idle lwp at %p, idle sp at %p\n",
l,
#ifdef i386
(void *)pcb->pcb_esp
#else
(void *)pcb->pcb_rsp
#endif
);
}
#endif
/*
* Postpone the "cpufeaturebus" scan.
* It is safe to scan the pseudo-bus
* only after all CPUs have attached.
*/
(void)config_defer(self, cpu_defer);
}
static void
cpu_defer(device_t self)
{
cpu_rescan(self, NULL, NULL);
}
static int
cpu_rescan(device_t self, const char *ifattr, const int *locators)
{
struct cpu_softc *sc = device_private(self);
struct cpufeature_attach_args cfaa;
struct cpu_info *ci = sc->sc_info;
/*
* If we booted with RB_MD1 to disable multiprocessor, the
* auto-configuration data still contains the additional
* CPUs. But their initialization was mostly bypassed
* during attach, so we have to make sure we don't look at
* their featurebus info, since it wasn't retrieved.
*/
if (ci == NULL)
return 0;
memset(&cfaa, 0, sizeof(cfaa));
cfaa.ci = ci;
if (ifattr_match(ifattr, "cpufeaturebus")) {
if (ci->ci_frequency == NULL) {
cfaa.name = "frequency";
ci->ci_frequency =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_padlock == NULL) {
cfaa.name = "padlock";
ci->ci_padlock =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_temperature == NULL) {
cfaa.name = "temperature";
ci->ci_temperature =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
if (ci->ci_vm == NULL) {
cfaa.name = "vm";
ci->ci_vm =
config_found(self, &cfaa, NULL,
CFARGS(.iattr = "cpufeaturebus"));
}
}
return 0;
}
static void
cpu_childdetached(device_t self, device_t child)
{
struct cpu_softc *sc = device_private(self);
struct cpu_info *ci = sc->sc_info;
if (ci->ci_frequency == child)
ci->ci_frequency = NULL;
if (ci->ci_padlock == child)
ci->ci_padlock = NULL;
if (ci->ci_temperature == child)
ci->ci_temperature = NULL;
if (ci->ci_vm == child)
ci->ci_vm = NULL;
}
/*
* Initialize the processor appropriately.
*/
void
cpu_init(struct cpu_info *ci)
{
extern int x86_fpu_save;
uint32_t cr4 = 0;
lcr0(rcr0() | CR0_WP);
/* If global TLB caching is supported, enable it */
if (cpu_feature[0] & CPUID_PGE)
cr4 |= CR4_PGE;
/*
* If we have FXSAVE/FXRESTOR, use them.
*/
if (cpu_feature[0] & CPUID_FXSR) {
cr4 |= CR4_OSFXSR;
/*
* If we have SSE/SSE2, enable XMM exceptions.
*/
if (cpu_feature[0] & (CPUID_SSE|CPUID_SSE2))
cr4 |= CR4_OSXMMEXCPT;
}
/* If xsave is supported, enable it */
if (cpu_feature[1] & CPUID2_XSAVE)
cr4 |= CR4_OSXSAVE;
/* If SMEP is supported, enable it */
if (cpu_feature[5] & CPUID_SEF_SMEP)
cr4 |= CR4_SMEP;
/* If SMAP is supported, enable it */
if (cpu_feature[5] & CPUID_SEF_SMAP)
cr4 |= CR4_SMAP;
#ifdef SVS
/* If PCID is supported, enable it */
if (svs_pcid)
cr4 |= CR4_PCIDE;
#endif
if (cr4) {
cr4 |= rcr4();
lcr4(cr4);
}
/*
* Changing CR4 register may change cpuid values. For example, setting
* CR4_OSXSAVE sets CPUID2_OSXSAVE. The CPUID2_OSXSAVE is in
* ci_feat_val[1], so update it.
* XXX Other than ci_feat_val[1] might be changed.
*/
if (cpuid_level >= 1) {
u_int descs[4];
x86_cpuid(1, descs);
ci->ci_feat_val[1] = descs[2];
}
if (CPU_IS_PRIMARY(ci) &&
x86_fpu_save >= FPU_SAVE_FXSAVE) {
fpuinit_mxcsr_mask();
}
/* If xsave is enabled, enable all fpu features */
if (cr4 & CR4_OSXSAVE)
wrxcr(0, x86_xsave_features & XCR0_FPU);
#ifdef MTRR
/*
* On a P6 or above, initialize MTRR's if the hardware supports them.
*/
if (cpu_feature[0] & CPUID_MTRR) {
if ((ci->ci_flags & CPUF_AP) == 0)
i686_mtrr_init_first();
mtrr_init_cpu(ci);
}
#ifdef i386
if (strcmp((char *)(ci->ci_vendor), "AuthenticAMD") == 0) {
/*
* Must be a K6-2 Step >= 7 or a K6-III.
*/
if (CPUID_TO_FAMILY(ci->ci_signature) == 5) {
if (CPUID_TO_MODEL(ci->ci_signature) > 8 ||
(CPUID_TO_MODEL(ci->ci_signature) == 8 &&
CPUID_TO_STEPPING(ci->ci_signature) >= 7)) {
mtrr_funcs = &k6_mtrr_funcs;
k6_mtrr_init_first();
mtrr_init_cpu(ci);
}
}
}
#endif /* i386 */
#endif /* MTRR */
if (ci != &cpu_info_primary) {
/* Synchronize TSC */
atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
tsc_sync_ap(ci);
} else {
atomic_or_32(&ci->ci_flags, CPUF_RUNNING);
}
}
#ifdef MULTIPROCESSOR
void
cpu_boot_secondary_processors(void)
{
struct cpu_info *ci;
kcpuset_t *cpus;
u_long i;
/* Now that we know the number of CPUs, patch the text segment. */
x86_patch(false);
#if NACPICA > 0
/* Finished with NUMA info for now. */
acpisrat_exit();
#endif
kcpuset_create(&cpus, true);
kcpuset_set(cpus, cpu_index(curcpu()));
for (i = 0; i < maxcpus; i++) {
ci = cpu_lookup(i);
if (ci == NULL)
continue;
if (ci->ci_data.cpu_idlelwp == NULL)
continue;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
continue;
if (ci->ci_flags & (CPUF_BSP|CPUF_SP|CPUF_PRIMARY))
continue;
cpu_boot_secondary(ci);
kcpuset_set(cpus, cpu_index(ci));
}
while (!kcpuset_match(cpus, kcpuset_running))
;
kcpuset_destroy(cpus);
x86_mp_online = true;
/* Now that we know about the TSC, attach the timecounter. */
tsc_tc_init();
}
#endif
static void
cpu_init_idle_lwp(struct cpu_info *ci)
{
struct lwp *l = ci->ci_data.cpu_idlelwp;
struct pcb *pcb = lwp_getpcb(l);
pcb->pcb_cr0 = rcr0();
}
void
cpu_init_idle_lwps(void)
{
struct cpu_info *ci;
u_long i;
for (i = 0; i < maxcpus; i++) {
ci = cpu_lookup(i);
if (ci == NULL)
continue;
if (ci->ci_data.cpu_idlelwp == NULL)
continue;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
continue;
cpu_init_idle_lwp(ci);
}
}
#ifdef MULTIPROCESSOR
void
cpu_start_secondary(struct cpu_info *ci)
{
u_long psl;
int i;
#if NLAPIC > 0
paddr_t mp_pdirpa;
mp_pdirpa = pmap_init_tmp_pgtbl(mp_trampoline_paddr);
cpu_copy_trampoline(mp_pdirpa);
#endif
atomic_or_32(&ci->ci_flags, CPUF_AP);
ci->ci_curlwp = ci->ci_data.cpu_idlelwp;
if (CPU_STARTUP(ci, mp_trampoline_paddr) != 0) {
return;
}
/*
* Wait for it to become ready. Setting cpu_starting opens the
* initial gate and allows the AP to start soft initialization.
*/
KASSERT(cpu_starting == NULL);
cpu_starting = ci;
for (i = 100000; (!(ci->ci_flags & CPUF_PRESENT)) && i > 0; i--) {
delay_func(10);
}
if ((ci->ci_flags & CPUF_PRESENT) == 0) {
aprint_error_dev(ci->ci_dev, "failed to become ready\n");
#if defined(MPDEBUG) && defined(DDB)
printf("dropping into debugger; continue from here to resume boot\n");
Debugger();
#endif
} else {
/*
* Synchronize time stamp counters. Invalidate cache and do
* twice (in tsc_sync_bp) to minimize possible cache effects.
* Disable interrupts to try and rule out any external
* interference.
*/
psl = x86_read_psl();
x86_disable_intr();
tsc_sync_bp(ci);
x86_write_psl(psl);
}
CPU_START_CLEANUP(ci);
cpu_starting = NULL;
}
void
cpu_boot_secondary(struct cpu_info *ci)
{
int64_t drift;
u_long psl;
int i;
atomic_or_32(&ci->ci_flags, CPUF_GO);
for (i = 100000; (!(ci->ci_flags & CPUF_RUNNING)) && i > 0; i--) {
delay_func(10);
}
if ((ci->ci_flags & CPUF_RUNNING) == 0) {
aprint_error_dev(ci->ci_dev, "failed to start\n");
#if defined(MPDEBUG) && defined(DDB)
printf("dropping into debugger; continue from here to resume boot\n");
Debugger();
#endif
} else {
/* Synchronize TSC again, check for drift. */
drift = ci->ci_data.cpu_cc_skew;
psl = x86_read_psl();
x86_disable_intr();
tsc_sync_bp(ci);
x86_write_psl(psl);
drift -= ci->ci_data.cpu_cc_skew;
aprint_debug_dev(ci->ci_dev, "TSC skew=%lld drift=%lld\n",
(long long)ci->ci_data.cpu_cc_skew, (long long)drift);
tsc_sync_drift(drift);
}
}
/*
* The CPU ends up here when it's ready to run.
* This is called from code in mptramp.s; at this point, we are running
* in the idle pcb/idle stack of the new CPU. When this function returns,
* this processor will enter the idle loop and start looking for work.
*/
void
cpu_hatch(void *v)
{
struct cpu_info *ci = (struct cpu_info *)v;
struct pcb *pcb;
int s, i;
/* ------------------------------------------------------------- */
/*
* This section of code must be compiled with SSP disabled, to
* prevent a race against cpu0. See sys/conf/ssp.mk.
*/
/*
* Initialize MSRs on this CPU:
*
* - On amd64: Enables SYSCALL/SYSRET.
*
* - On amd64: Sets up %fs and %gs so that %gs points to the
* current struct cpu_info as needed for CPUVAR(...),
* curcpu(), and curlwp.
*
* (On i386, CPUVAR(...), curcpu(), and curlwp are made to
	 * work first by the configuration of segment descriptors in
* the Global Descriptor Table (GDT) in initgdt.)
*
* - Enables the no-execute bit if supported.
*
* Thus, after this point, CPUVAR(...), curcpu(), and curlwp
* will work on this CPU.
*
* Note: The call to cpu_init_msrs for cpu0 happens in
* init386/init_x86_64.
*/
cpu_init_msrs(ci, true);
cpu_probe(ci);
cpu_speculation_init(ci);
#if NHYPERV > 0
hyperv_init_cpu(ci);
#endif
ci->ci_data.cpu_cc_freq = cpu_info_primary.ci_data.cpu_cc_freq;
/* cpu_get_tsc_freq(ci); */
KDASSERT((ci->ci_flags & CPUF_PRESENT) == 0);
/*
* Synchronize the TSC for the first time. Note that interrupts are
* off at this point.
*/
atomic_or_32(&ci->ci_flags, CPUF_PRESENT);
tsc_sync_ap(ci);
/* ------------------------------------------------------------- */
/*
* Wait to be brought online.
*
* Use MONITOR/MWAIT if available. These instructions put the CPU in
* a low consumption mode (C-state), and if the TSC is not invariant,
* this causes the TSC to drift. We want this to happen, so that we
* can later detect (in tsc_tc_init) any abnormal drift with invariant
* TSCs. That's just for safety; by definition such drifts should
* never occur with invariant TSCs.
*
* If not available, try PAUSE. We'd like to use HLT, but we have
* interrupts off.
*/
while ((ci->ci_flags & CPUF_GO) == 0) {
if ((cpu_feature[1] & CPUID2_MONITOR) != 0) {
x86_monitor(&ci->ci_flags, 0, 0);
if ((ci->ci_flags & CPUF_GO) != 0) {
continue;
}
x86_mwait(0, 0);
} else {
/*
* XXX The loop repetition count could be a lot higher, but
* XXX currently qemu emulator takes a _very_long_time_ to
* XXX execute the pause instruction. So for now, use a low
* XXX value to allow the cpu to hatch before timing out.
*/
for (i = 50; i != 0; i--) {
x86_pause();
}
}
}
/* Because the text may have been patched in x86_patch(). */
wbinvd();
x86_flush();
tlbflushg();
KASSERT((ci->ci_flags & CPUF_RUNNING) == 0);
#ifdef PAE
pd_entry_t * l3_pd = ci->ci_pae_l3_pdir;
for (i = 0 ; i < PDP_SIZE; i++) {
l3_pd[i] = pmap_kernel()->pm_pdirpa[i] | PTE_P;
}
lcr3(ci->ci_pae_l3_pdirpa);
#else
lcr3(pmap_pdirpa(pmap_kernel(), 0));
#endif
pcb = lwp_getpcb(curlwp);
pcb->pcb_cr3 = rcr3();
pcb = lwp_getpcb(ci->ci_data.cpu_idlelwp);
lcr0(pcb->pcb_cr0);
cpu_init_idt(ci);
gdt_init_cpu(ci);
#if NLAPIC > 0
lapic_enable();
lapic_set_lvt();
#endif
fpuinit(ci);
lldt(GSYSSEL(GLDT_SEL, SEL_KPL));
ltr(ci->ci_tss_sel);
/*
* cpu_init will re-synchronize the TSC, and will detect any abnormal
* drift that would have been caused by the use of MONITOR/MWAIT
* above.
*/
cpu_init(ci);
#ifdef XENPVHVM
xen_hvm_init_cpu(ci);
#endif
(*x86_initclock_func)();
cpu_get_tsc_freq(ci);
s = splhigh();
#if NLAPIC > 0
lapic_write_tpri(0);
#endif
x86_enable_intr();
splx(s);
x86_errata();
aprint_debug_dev(ci->ci_dev, "running\n");
kcsan_cpu_init(ci);
idle_loop(NULL);
KASSERT(false);
}
#endif
#if defined(DDB)
#include <ddb/db_output.h>
#include <machine/db_machdep.h>
/*
* Dump CPU information from ddb.
*/
void
cpu_debug_dump(void)
{
struct cpu_info *ci;
CPU_INFO_ITERATOR cii;
const char sixtyfour64space[] =
#ifdef _LP64
" "
#endif
"";
db_printf("addr %sdev id flags ipis spl curlwp "
"\n", sixtyfour64space);
for (CPU_INFO_FOREACH(cii, ci)) {
db_printf("%p %s %ld %x %x %d %10p\n",
ci,
ci->ci_dev == NULL ? "BOOT" : device_xname(ci->ci_dev),
(long)ci->ci_cpuid,
ci->ci_flags, ci->ci_ipis, ci->ci_ilevel,
ci->ci_curlwp);
}
}
#endif
#ifdef MULTIPROCESSOR
#if NLAPIC > 0
static void
cpu_copy_trampoline(paddr_t pdir_pa)
{
extern uint32_t nox_flag;
extern u_char cpu_spinup_trampoline[];
extern u_char cpu_spinup_trampoline_end[];
vaddr_t mp_trampoline_vaddr;
struct {
uint32_t large;
uint32_t nox;
uint32_t pdir;
} smp_data;
CTASSERT(sizeof(smp_data) == 3 * 4);
smp_data.large = (pmap_largepages != 0);
smp_data.nox = nox_flag;
smp_data.pdir = (uint32_t)(pdir_pa & 0xFFFFFFFF);
/* Enter the physical address */
mp_trampoline_vaddr = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
UVM_KMF_VAONLY);
pmap_kenter_pa(mp_trampoline_vaddr, mp_trampoline_paddr,
VM_PROT_READ | VM_PROT_WRITE, 0);
pmap_update(pmap_kernel());
/* Copy boot code */
memcpy((void *)mp_trampoline_vaddr,
cpu_spinup_trampoline,
cpu_spinup_trampoline_end - cpu_spinup_trampoline);
/* Copy smp_data at the end */
memcpy((void *)(mp_trampoline_vaddr + PAGE_SIZE - sizeof(smp_data)),
&smp_data, sizeof(smp_data));
pmap_kremove(mp_trampoline_vaddr, PAGE_SIZE);
pmap_update(pmap_kernel());
uvm_km_free(kernel_map, mp_trampoline_vaddr, PAGE_SIZE, UVM_KMF_VAONLY);
}
#endif
int
mp_cpu_start(struct cpu_info *ci, paddr_t target)
{
#if NLAPIC > 0
int error;
/*
* Bootstrap code must be addressable in real mode
* and it must be page aligned.
*/
KASSERT(target < 0x10000 && target % PAGE_SIZE == 0);
/*
* "The BSP must initialize CMOS shutdown code to 0Ah ..."
*/
outb(IO_RTC, NVRAM_RESET);
outb(IO_RTC+1, NVRAM_RESET_JUMP);
/*
* "and the warm reset vector (DWORD based at 40:67) to point
* to the AP startup code ..."
*/
unsigned short dwordptr[2];
dwordptr[0] = 0;
dwordptr[1] = target >> 4;
memcpy((uint8_t *)cmos_data_mapping + 0x467, dwordptr, 4);
if ((cpu_feature[0] & CPUID_APIC) == 0) {
aprint_error("mp_cpu_start: CPU does not have APIC\n");
return ENODEV;
}
/*
* ... prior to executing the following sequence:". We'll also add in
* local cache flush, in case the BIOS has left the AP with its cache
* disabled. It may not be able to cope with MP coherency.
*/
wbinvd();
if (ci->ci_flags & CPUF_AP) {
error = x86_ipi_init(ci->ci_cpuid);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (1)\n",
__func__);
return error;
}
delay_func(10000);
error = x86_ipi_startup(ci->ci_cpuid, target / PAGE_SIZE);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (2)\n",
__func__);
return error;
}
delay_func(200);
error = x86_ipi_startup(ci->ci_cpuid, target / PAGE_SIZE);
if (error != 0) {
aprint_error_dev(ci->ci_dev, "%s: IPI not taken (3)\n",
__func__);
return error;
}
delay_func(200);
}
return 0;
#else
return ENODEV;
#endif /* NLAPIC > 0 */
}
void
mp_cpu_start_cleanup(struct cpu_info *ci)
{
/*
* Ensure the NVRAM reset byte contains something vaguely sane.
*/
outb(IO_RTC, NVRAM_RESET);
outb(IO_RTC+1, NVRAM_RESET_RST);
}
#endif
#ifdef __x86_64__
typedef void (vector)(void);
extern vector Xsyscall, Xsyscall32, Xsyscall_svs;
#endif
/*
* cpu_init_msrs(ci, full)
*
* Initialize some Model-Specific Registers (MSRs) on the current
* CPU, whose struct cpu_info pointer is ci, for:
*
* - SYSCALL/SYSRET.
* - %fs/%gs on amd64 if `full' is true; needed to make
* CPUVAR(...), curcpu(), and curlwp work. (We do this at boot,
* but skip it on ACPI wakeup.)
* - No-execute bit, if supported.
*
* References:
*
* - Intel 64 and IA-32 Architectures Software Developer's Manual,
* Volume 3: System Programming Guide, Order Number 325384,
* April 2022, Sec. 5.8.8 `Fast System Calls in 64-Bit Mode',
* pp. 5-22 through 5-23.
*
* - Intel 64 and IA-32 Architectures Software Developer's Manual,
* Volume 4: Model-Specific Registers, Order Number 335592,
* April 2022, Sec. 2.1 `Architectural MSRs', Table 2-2,
* pp. 2-60 through 2-61.
*/
void
cpu_init_msrs(struct cpu_info *ci, bool full)
{
#ifdef __x86_64__
/*
* On amd64, set up the syscall target address registers
* for SYSCALL/SYSRET:
*
* - IA32_STAR, c000_0081h (MSR_STAR): System Call Target
* Address. Code and stack segment selectors for SYSRET
* (bits 48:63) and SYSCALL (bits 32:47).
*
* - IA32_LSTAR, c000_0082h (MSR_LSTAR): IA-32e Mode System
* Call Target Address. Target rip for SYSCALL when executed
* in 64-bit mode.
*
* - IA32_CSTAR, c000_0083h (MSR_CSTAR): IA-32e Mode System
* Call Target Address. Target rip for SYSCALL when executed
* in compatibility mode. (XXX Manual says this is `[n]ot
* used, as the SYSCALL instruction is not recognized in
* compatibility mode', so why do we set it?)
*
* - IA32_FMASK, c000_0084h (MSR_SFMASK): System Call Flag
* Mask. Mask for the RFLAGS register on SYSCALL.
*/
wrmsr(MSR_STAR,
((uint64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
((uint64_t)LSEL(LSYSRETBASE_SEL, SEL_UPL) << 48));
wrmsr(MSR_LSTAR, (uint64_t)Xsyscall);
wrmsr(MSR_CSTAR, (uint64_t)Xsyscall32);
wrmsr(MSR_SFMASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_AC);
#ifdef SVS
if (svs_enabled)
wrmsr(MSR_LSTAR, (uint64_t)Xsyscall_svs);
#endif
/*
* On amd64 if `full' is true -- used at boot, but not on ACPI
* wakeup -- then additionally set up %fs and %gs:
*
* - IA32_FS_BASE, c000_0100h (MSR_FSBASE): Base address of
* %fs. Not used in NetBSD kernel, so zero it.
*
* - IA32_GS_BASE, c000_0101h (MSR_GSBASE): Base address of
* %gs. Used in NetBSD kernel by CPUVAR(...), curcpu(), and
* curlwp for access to the CPU-local area, so set it to ci.
*
* - IA32_KERNEL_GS_BASE, c000_0102h (MSR_KERNELGSBASE): Base
* address of what swapgs will leave in %gs when switching to
* userland. Zero for now; will be set to pcb->pcb_gs in
* cpu_switchto for user threads.
*/
if (full) {
wrmsr(MSR_FSBASE, 0);
wrmsr(MSR_GSBASE, (uint64_t)ci);
wrmsr(MSR_KERNELGSBASE, 0);
}
#endif /* __x86_64__ */
/*
* If the no-execute bit is supported, enable it in:
*
* - IA32_EFER, c000_0080h (MSR_EFER): Extended Feature
* Enables.
*/
if (cpu_feature[2] & CPUID_NOX)
wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_NXE);
}
void
cpu_offline_md(void)
{
return;
}
/* XXX joerg restructure and restart CPUs individually */
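/*
 * Take a secondary CPU offline for suspend/shutdown, remembering in
 * sc_wasonline whether it was online so cpu_resume() can bring it
 * back.  The primary CPU is left alone.
 */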
static bool
cpu_stop(device_t dv)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
int err;
KASSERT((ci->ci_flags & CPUF_PRESENT) != 0);
if (CPU_IS_PRIMARY(ci))
return true;
if (ci->ci_data.cpu_idlelwp == NULL)
return true;
sc->sc_wasonline = !(ci->ci_schedstate.spc_flags & SPCF_OFFLINE);
if (sc->sc_wasonline) {
mutex_enter(&cpu_lock);
err = cpu_setstate(ci, false);
mutex_exit(&cpu_lock);
if (err != 0)
return false;
}
return true;
}
static bool
cpu_suspend(device_t dv, const pmf_qual_t *qual)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
else {
cpufreq_suspend(ci);
}
return cpu_stop(dv);
}
static bool
cpu_resume(device_t dv, const pmf_qual_t *qual)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
int err = 0;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
if (CPU_IS_PRIMARY(ci))
goto out;
if (ci->ci_data.cpu_idlelwp == NULL)
goto out;
if (sc->sc_wasonline) {
mutex_enter(&cpu_lock);
err = cpu_setstate(ci, true);
mutex_exit(&cpu_lock);
}
out:
if (err != 0)
return false;
cpufreq_resume(ci);
return true;
}
static bool
cpu_shutdown(device_t dv, int how)
{
struct cpu_softc *sc = device_private(dv);
struct cpu_info *ci = sc->sc_info;
if ((ci->ci_flags & CPUF_BSP) != 0)
return false;
if ((ci->ci_flags & CPUF_PRESENT) == 0)
return true;
return cpu_stop(dv);
}
/* Get the TSC frequency and set it to ci->ci_data.cpu_cc_freq. */
void
cpu_get_tsc_freq(struct cpu_info *ci)
{
uint64_t freq = 0, freq_from_cpuid, t0, t1;
int64_t overhead;
if (CPU_IS_PRIMARY(ci) && cpu_hascounter()) {
/*
* If it's the first call of this function, try to get TSC
* freq from CPUID by calling cpu_tsc_freq_cpuid().
		 * The function also sets the lapic_per_second variable if it's
		 * known.  This is required for Intel's Comet Lake and newer
* processors to set LAPIC timer correctly.
*/
if (ci->ci_data.cpu_cc_freq == 0)
freq = freq_from_cpuid = cpu_tsc_freq_cpuid(ci);
if (freq != 0)
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from CPUID %" PRIu64 " Hz\n", freq);
#if NHPET > 0
if (freq == 0) {
freq = hpet_tsc_freq();
if (freq != 0)
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from HPET %" PRIu64 " Hz\n", freq);
}
#endif
if (freq == 0) {
/*
* Work out the approximate overhead involved below.
* Discard the result of the first go around the
* loop.
*/
overhead = 0;
for (int i = 0; i <= 8; i++) {
const int s = splhigh();
t0 = cpu_counter();
delay_func(0);
t1 = cpu_counter();
splx(s);
if (i > 0) {
overhead += (t1 - t0);
}
}
overhead >>= 3;
/*
* Now do the calibration.
*/
freq = 0;
for (int i = 0; i < 1000; i++) {
const int s = splhigh();
t0 = cpu_counter();
delay_func(100);
t1 = cpu_counter();
splx(s);
freq += t1 - t0 - overhead;
}
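			/*
			 * 1000 iterations of 100us each sample the TSC
			 * over a total of 0.1s, so multiplying the
			 * accumulated count by 10 yields ticks per
			 * second.
			 */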
freq = freq * 10;
aprint_debug_dev(ci->ci_dev, "TSC freq "
"from delay %" PRIu64 " Hz\n", freq);
}
if (ci->ci_data.cpu_cc_freq != 0) {
freq_from_cpuid = cpu_tsc_freq_cpuid(ci);
if ((freq_from_cpuid != 0)
&& (freq != freq_from_cpuid))
aprint_verbose_dev(ci->ci_dev, "TSC freq "
"calibrated %" PRIu64 " Hz\n", freq);
}
} else {
freq = cpu_info_primary.ci_data.cpu_cc_freq;
}
ci->ci_data.cpu_cc_freq = freq;
}
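/*
 * Idle using MONITOR/MWAIT: arm the monitor on ci_want_resched and
 * sleep in mwait unless a reschedule was already requested.
 */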
void
x86_cpu_idle_mwait(void)
{
struct cpu_info *ci = curcpu();
KASSERT(ci->ci_ilevel == IPL_NONE);
x86_monitor(&ci->ci_want_resched, 0, 0);
if (__predict_false(ci->ci_want_resched)) {
return;
}
x86_mwait(0, 0);
}
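/*
 * Idle using HLT: disable interrupts and, if no reschedule is pending,
 * atomically re-enable them and halt; otherwise just re-enable
 * interrupts and return.
 */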
void
x86_cpu_idle_halt(void)
{
struct cpu_info *ci = curcpu();
KASSERT(ci->ci_ilevel == IPL_NONE);
x86_disable_intr();
if (!__predict_false(ci->ci_want_resched)) {
x86_stihlt();
} else {
x86_enable_intr();
}
}
/*
* Loads pmap for the current CPU.
*/
void
cpu_load_pmap(struct pmap *pmap, struct pmap *oldpmap)
{

	KASSERT(kpreempt_disabled());
#ifdef SVS
if (svs_enabled && pmap_is_user(pmap)) {
svs_pdir_switch(pmap);
}
#endif
#ifdef PAE
struct cpu_info *ci = curcpu();
bool interrupts_enabled;
pd_entry_t *l3_pd = ci->ci_pae_l3_pdir;
int i;
/*
	 * Disable interrupts to block TLB shootdowns, which can reload cr3.
	 * While this doesn't block NMIs, it's probably ok as NMIs are
	 * unlikely to reload cr3.
*/
interrupts_enabled = (x86_read_flags() & PSL_I) != 0;
if (interrupts_enabled)
x86_disable_intr();
for (i = 0 ; i < PDP_SIZE; i++) {
l3_pd[i] = pmap->pm_pdirpa[i] | PTE_P;
}
if (interrupts_enabled)
x86_enable_intr();
tlbflush();
#else
lcr3(pmap_pdirpa(pmap, 0));
#endif
}
/*
* Notify all other cpus to halt.
*/
void
cpu_broadcast_halt(void)
{
x86_broadcast_ipi(X86_IPI_HALT);
}
/*
* Send a dummy ipi to a cpu to force it to run splraise()/spllower(),
* and trigger an AST on the running LWP.
*/
void
cpu_kick(struct cpu_info *ci)
{
x86_send_ipi(ci, X86_IPI_AST);
}
/* $NetBSD: rtc.c,v 1.2 2022/12/30 21:40:20 jakllsch Exp $ */
/*-
* Copyright (c) 1990 The Regents of the University of California.
* All rights reserved.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz and Don Ahn.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)clock.c 7.2 (Berkeley) 5/12/91
*/
/*-
* Copyright (c) 1993, 1994 Charles M. Hannum.
*
* This code is derived from software contributed to Berkeley by
* William Jolitz and Don Ahn.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)clock.c 7.2 (Berkeley) 5/12/91
*/
/*
* Mach Operating System
* Copyright (c) 1991,1990,1989 Carnegie Mellon University
* All Rights Reserved.
*
* Permission to use, copy, modify and distribute this software and its
* documentation is hereby granted, provided that both the copyright
* notice and this permission notice appear in all copies of the
* software, derivative works or modified versions, and any portions
* thereof, and that both notices appear in supporting documentation.
*
* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
*
* Carnegie Mellon requests users of this software to return to
*
* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
* School of Computer Science
* Carnegie Mellon University
* Pittsburgh PA 15213-3890
*
* any improvements or extensions that they make and grant Carnegie Mellon
* the rights to redistribute these changes.
*/
/*
Copyright 1988, 1989 by Intel Corporation, Santa Clara, California.
All Rights Reserved
Permission to use, copy, modify, and distribute this software and
its documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies and that both the copyright notice and this permission notice
appear in supporting documentation, and that the name of Intel
not be used in advertising or publicity pertaining to distribution
of the software without specific, written prior permission.
INTEL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE
INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS,
IN NO EVENT SHALL INTEL BE LIABLE FOR ANY SPECIAL, INDIRECT, OR
CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN ACTION OF CONTRACT,
NEGLIGENCE, OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION
WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
/*
* Primitive RTC chip routines.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: rtc.c,v 1.2 2022/12/30 21:40:20 jakllsch Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/timetc.h>
#include <sys/kernel.h>
#include <dev/isa/isareg.h>
#include <dev/isa/isavar.h>
#include <i386/isa/nvram.h>
#include <machine/pio.h>
#include <dev/ic/mc146818reg.h>
#include <x86/rtc.h>
#ifndef __x86_64__
#include "mca.h"
#endif
#if NMCA > 0
#include <machine/mca_machdep.h> /* for MCA_system */
#endif
#include "acpica.h"
#if NACPICA > 0
#include <dev/acpi/acpivar.h>
#endif
static void rtcinit(void);
static int rtcget(mc_todregs *);
static void rtcput(mc_todregs *);
static int cmoscheck(void);
static int clock_expandyear(int);
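/*
 * Access the MC146818 RTC/NVRAM registers through the ISA index/data
 * port pair at IO_RTC / IO_RTC+1.
 */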
/* XXX use sc? */
u_int
mc146818_read(void *sc, u_int reg)
{
outb(IO_RTC, reg);
return (inb(IO_RTC+1));
}
void
mc146818_write(void *sc, u_int reg, u_int datum)
{
outb(IO_RTC, reg);
outb(IO_RTC+1, datum);
}
static void
rtcinit(void)
{
static int first_rtcopen_ever = 1;
if (!first_rtcopen_ever)
return;
first_rtcopen_ever = 0;
mc146818_write(NULL, MC_REGA, /* XXX softc */
MC_BASE_32_KHz | MC_RATE_1024_Hz);
mc146818_write(NULL, MC_REGB, MC_REGB_24HR); /* XXX softc */
}
static int
rtcget(mc_todregs *regs)
{
rtcinit();
if ((mc146818_read(NULL, MC_REGD) & MC_REGD_VRT) == 0) /* XXX softc */
return (-1);
MC146818_GETTOD(NULL, regs); /* XXX softc */
return (0);
}
static void
rtcput(mc_todregs *regs)
{
rtcinit();
MC146818_PUTTOD(NULL, regs); /* XXX softc */
}
/*
 * Check whether the CMOS layout is "standard"-like (i.e., not PS/2-like).
 * To be called at splclock().
*/
static int
cmoscheck(void)
{
int i;
unsigned short cksum = 0;
for (i = 0x10; i <= 0x2d; i++)
cksum += mc146818_read(NULL, i); /* XXX softc */
return (cksum == (mc146818_read(NULL, 0x2e) << 8)
+ mc146818_read(NULL, 0x2f));
}
#if NMCA > 0
/*
* Check whether the CMOS layout is PS/2 like, to be called at splclock().
*/
static int cmoscheckps2(void);
static int
cmoscheckps2(void)
{
#if 0
/* Disabled until I find out the CRC checksum algorithm IBM uses */
int i;
unsigned short cksum = 0;
for (i = 0x10; i <= 0x31; i++)
cksum += mc146818_read(NULL, i); /* XXX softc */
return (cksum == (mc146818_read(NULL, 0x32) << 8)
+ mc146818_read(NULL, 0x33));
#else
/* Check 'incorrect checksum' bit of IBM PS/2 Diagnostic Status Byte */
return ((mc146818_read(NULL, NVRAM_DIAG) & (1<<6)) == 0);
#endif
}
#endif /* NMCA > 0 */
/*
 * Patchable variable to control century byte handling:
 *  1: always update
 * -1: never touch
 *  0: try to figure it out automatically
*/
int rtc_update_century = 0;
/*
* Expand a two-digit year as read from the clock chip
* into full width.
 * While here, deal with the CMOS century byte.
*/
static int centb = NVRAM_CENTURY;
static int
clock_expandyear(int clockyear)
{
int s, clockcentury, cmoscentury;
clockcentury = (clockyear < 70) ? 20 : 19;
clockyear += 100 * clockcentury;
if (rtc_update_century < 0)
return (clockyear);
s = splclock();
#if NACPICA > 0
if (acpi_active)
cmoscentury = mc146818_read(NULL,
(centb = AcpiGbl_FADT.Century));
else
#endif
if (cmoscheck())
cmoscentury = mc146818_read(NULL, NVRAM_CENTURY);
#if NMCA > 0
else if (MCA_system && cmoscheckps2())
cmoscentury = mc146818_read(NULL, (centb = 0x37));
#endif
else
cmoscentury = 0;
splx(s);
if (!cmoscentury) {
#ifdef DIAGNOSTIC
printf("clock: unknown CMOS layout\n");
#endif
return (clockyear);
}
cmoscentury = bcdtobin(cmoscentury);
if (cmoscentury != clockcentury) {
/* XXX note: saying "century is 20" might confuse the naive. */
printf("WARNING: NVRAM century is %d but RTC year is %d\n",
cmoscentury, clockyear);
/* Kludge to roll over century. */
if ((rtc_update_century > 0) ||
((cmoscentury == 19) && (clockcentury == 20) &&
(clockyear == 2000))) {
printf("WARNING: Setting NVRAM century to %d\n",
clockcentury);
s = splclock();
mc146818_write(NULL, centb, bintobcd(clockcentury));
splx(s);
}
} else if (cmoscentury == 19 && rtc_update_century == 0)
rtc_update_century = 1; /* will update later in resettodr() */
return (clockyear);
}
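/*
 * Read the current date and time from the RTC, converting each BCD
 * field to binary and expanding the two-digit year.  Returns -1 if the
 * chip reports that its time is not valid.
 */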
int
rtc_get_ymdhms(todr_chip_handle_t tch, struct clock_ymdhms *dt)
{
int s;
mc_todregs rtclk;
s = splclock();
if (rtcget(&rtclk)) {
splx(s);
return -1;
}
splx(s);
dt->dt_sec = bcdtobin(rtclk[MC_SEC]);
dt->dt_min = bcdtobin(rtclk[MC_MIN]);
dt->dt_hour = bcdtobin(rtclk[MC_HOUR]);
dt->dt_day = bcdtobin(rtclk[MC_DOM]);
dt->dt_mon = bcdtobin(rtclk[MC_MONTH]);
dt->dt_year = clock_expandyear(bcdtobin(rtclk[MC_YEAR]));
return 0;
}
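/*
 * Convert the given date and time to BCD and write it to the RTC,
 * updating the NVRAM century byte when rtc_update_century says to.
 */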
int
rtc_set_ymdhms(todr_chip_handle_t tch, struct clock_ymdhms *dt)
{
mc_todregs rtclk;
int century;
int s;
s = splclock();
	if (rtcget(&rtclk))
		memset(&rtclk, 0, sizeof(rtclk));
splx(s);
rtclk[MC_SEC] = bintobcd(dt->dt_sec);
rtclk[MC_MIN] = bintobcd(dt->dt_min);
rtclk[MC_HOUR] = bintobcd(dt->dt_hour);
rtclk[MC_DOW] = dt->dt_wday + 1;
rtclk[MC_YEAR] = bintobcd(dt->dt_year % 100);
rtclk[MC_MONTH] = bintobcd(dt->dt_mon);
rtclk[MC_DOM] = bintobcd(dt->dt_day);
#ifdef DEBUG_CLOCK
printf("setclock: %x/%x/%x %x:%x:%x\n", rtclk[MC_YEAR], rtclk[MC_MONTH],
rtclk[MC_DOM], rtclk[MC_HOUR], rtclk[MC_MIN], rtclk[MC_SEC]);
#endif
s = splclock();
	rtcput(&rtclk);
	if (rtc_update_century > 0) {
		century = bintobcd(dt->dt_year / 100);
mc146818_write(NULL, centb, century); /* XXX softc */
}
splx(s);
return 0;
}
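/*
 * Register the RTC with the machine-independent time-of-day clock
 * framework (todr_attach).
 */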
void
rtc_register(void)
{
static struct todr_chip_handle tch;
tch.todr_gettime_ymdhms = rtc_get_ymdhms;
tch.todr_settime_ymdhms = rtc_set_ymdhms;
tch.todr_setwen = NULL;
todr_attach(&tch);
}
/* $NetBSD: tmpfs_mem.c,v 1.14 2023/04/29 06:29:55 riastradh Exp $ */
/*
* Copyright (c) 2010, 2011, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Mindaugas Rasiukevicius.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* tmpfs memory allocation routines.
* Implements memory usage accounting and limiting.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tmpfs_mem.c,v 1.14 2023/04/29 06:29:55 riastradh Exp $");
#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/kmem.h>
#include <sys/namei.h>
#include <sys/pool.h>
#include <fs/tmpfs/tmpfs.h>
extern struct pool tmpfs_dirent_pool;
extern struct pool tmpfs_node_pool;
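/*
 * tmpfs_mntmem_init: initialize the per-mount memory accounting.
 */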
void
tmpfs_mntmem_init(struct tmpfs_mount *mp, uint64_t memlimit)
{
mutex_init(&mp->tm_acc_lock, MUTEX_DEFAULT, IPL_NONE);
mp->tm_mem_limit = memlimit;
mp->tm_bytes_used = 0;
}
void
tmpfs_mntmem_destroy(struct tmpfs_mount *mp)
{
	KASSERT(mp->tm_bytes_used == 0);
mutex_destroy(&mp->tm_acc_lock);
}
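/*
 * tmpfs_mntmem_set: set a new memory limit for the mount; fails with
 * EBUSY if current usage (rounded up to a page) already reaches the
 * requested limit.
 */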
int
tmpfs_mntmem_set(struct tmpfs_mount *mp, uint64_t memlimit)
{
int error;
mutex_enter(&mp->tm_acc_lock);
if (round_page(mp->tm_bytes_used) >= memlimit)
error = EBUSY;
else {
error = 0;
mp->tm_mem_limit = memlimit;
}
mutex_exit(&mp->tm_acc_lock);
return error;
}
/*
* tmpfs_mem_info: return the number of available memory pages.
*
* => If 'total' is true, then return _total_ amount of pages.
* => If false, then return the amount of _free_ memory pages.
*
* Remember to remove uvmexp.freetarg from the returned value to avoid
* excessive memory usage.
*/
size_t
tmpfs_mem_info(bool total)
{
size_t size = 0;
size += uvmexp.swpgavail;
if (!total) {
size -= uvmexp.swpgonly;
}
size += uvm_availmem(true);
size += uvmexp.filepages;
if (size > uvmexp.wired) {
size -= uvmexp.wired;
} else {
size = 0;
}
return size;
}
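/*
 * tmpfs_bytes_max: return the maximum number of bytes this mount may
 * use, i.e. the lesser of its configured limit and the bytes already
 * used plus the memory currently available (keeping uvmexp.freetarg
 * pages spare).
 */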
uint64_t
tmpfs_bytes_max(struct tmpfs_mount *mp)
{
psize_t freepages = tmpfs_mem_info(false);
int freetarg = uvmexp.freetarg; // XXX unlocked
uint64_t avail_mem;
if (freepages < freetarg) {
freepages = 0;
} else {
freepages -= freetarg;
}
avail_mem = round_page(mp->tm_bytes_used) + (freepages << PAGE_SHIFT);
return MIN(mp->tm_mem_limit, avail_mem);
}
size_t
tmpfs_pages_avail(struct tmpfs_mount *mp)
{
return (tmpfs_bytes_max(mp) - mp->tm_bytes_used) >> PAGE_SHIFT;
}
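/*
 * tmpfs_mem_incr: charge 'sz' bytes against the mount's memory limit;
 * returns false, without charging, if the limit would be exceeded.
 */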
bool
tmpfs_mem_incr(struct tmpfs_mount *mp, size_t sz)
{
uint64_t lim;
mutex_enter(&mp->tm_acc_lock);
lim = tmpfs_bytes_max(mp);
if (mp->tm_bytes_used + sz >= lim) {
mutex_exit(&mp->tm_acc_lock);
return false;
}
mp->tm_bytes_used += sz;
mutex_exit(&mp->tm_acc_lock);
return true;
}
void
tmpfs_mem_decr(struct tmpfs_mount *mp, size_t sz)
{
mutex_enter(&mp->tm_acc_lock);
KASSERT(mp->tm_bytes_used >= sz);
mp->tm_bytes_used -= sz;
mutex_exit(&mp->tm_acc_lock);
}
struct tmpfs_dirent *
tmpfs_dirent_get(struct tmpfs_mount *mp)
{
	if (!tmpfs_mem_incr(mp, sizeof(struct tmpfs_dirent))) {
return NULL;
}
return pool_get(&tmpfs_dirent_pool, PR_WAITOK);
}
void
tmpfs_dirent_put(struct tmpfs_mount *mp, struct tmpfs_dirent *de)
{
	tmpfs_mem_decr(mp, sizeof(struct tmpfs_dirent));
pool_put(&tmpfs_dirent_pool, de);
}
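/*
 * tmpfs_node_get: allocate a tmpfs node, enforcing both the node count
 * limit (tm_nodes_max) and the memory limit.
 */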
struct tmpfs_node *
tmpfs_node_get(struct tmpfs_mount *mp)
{
if (atomic_inc_uint_nv(&mp->tm_nodes_cnt) >= mp->tm_nodes_max) {
atomic_dec_uint(&mp->tm_nodes_cnt);
return NULL;
}
if (!tmpfs_mem_incr(mp, sizeof(struct tmpfs_node))) {
atomic_dec_uint(&mp->tm_nodes_cnt);
return NULL;
}
return pool_get(&tmpfs_node_pool, PR_WAITOK);
}
void
tmpfs_node_put(struct tmpfs_mount *mp, struct tmpfs_node *tn)
{
atomic_dec_uint(&mp->tm_nodes_cnt);
tmpfs_mem_decr(mp, sizeof(struct tmpfs_node));
pool_put(&tmpfs_node_pool, tn);
}
/*
 * Quantum size to round up the tmpfs names, in order to reduce re-allocations.
*/
#define TMPFS_NAME_QUANTUM (32)
char *
tmpfs_strname_alloc(struct tmpfs_mount *mp, size_t len)
{
const size_t sz = roundup2(len, TMPFS_NAME_QUANTUM);
	KASSERT(sz > 0 && sz <= 1024);
	if (!tmpfs_mem_incr(mp, sz)) {
return NULL;
}
return kmem_alloc(sz, KM_SLEEP);
}
void
tmpfs_strname_free(struct tmpfs_mount *mp, char *str, size_t len)
{
const size_t sz = roundup2(len, TMPFS_NAME_QUANTUM);
	KASSERT(sz > 0 && sz <= 1024);
	tmpfs_mem_decr(mp, sz);
kmem_free(str, sz);
}
bool
tmpfs_strname_neqlen(struct componentname *fcnp, struct componentname *tcnp)
{
const size_t fln = fcnp->cn_namelen;
const size_t tln = tcnp->cn_namelen;
return (fln != tln) || memcmp(fcnp->cn_nameptr, tcnp->cn_nameptr, fln);
}
/* $NetBSD: uvm_pdpolicy.h,v 1.9 2022/08/20 23:26:02 riastradh Exp $ */
/*-
* Copyright (c)2005, 2006 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#ifndef _UVM_PDPOLICY_H_
#define _UVM_PDPOLICY_H_
#include <sys/mutex.h>
#include <sys/stdint.h>
#include <uvm/uvm_page.h>
struct krwlock;
struct uvm_cpu;
struct vm_anon;
struct vm_page;
/*
 * These APIs are for uvm internal use only.
 * Don't use them directly from outside of /sys/uvm.
*/
void uvmpdpol_idle(struct uvm_cpu *);
void uvmpdpol_init(void);
void uvmpdpol_init_cpu(struct uvm_cpu *);
void uvmpdpol_reinit(void);
void uvmpdpol_estimatepageable(int *, int *);
bool uvmpdpol_needsscan_p(void);
void uvmpdpol_pageactivate(struct vm_page *);
void uvmpdpol_pagedeactivate(struct vm_page *);
void uvmpdpol_pagedequeue(struct vm_page *);
void uvmpdpol_pageenqueue(struct vm_page *);
bool uvmpdpol_pageactivate_p(struct vm_page *);
bool uvmpdpol_pageisqueued_p(struct vm_page *);
void uvmpdpol_pagerealize(struct vm_page *);
void uvmpdpol_anfree(struct vm_anon *);
void uvmpdpol_tune(void);
void uvmpdpol_scaninit(void);
void uvmpdpol_scanfini(void);
struct vm_page *uvmpdpol_selectvictim(struct krwlock **);
void uvmpdpol_balancequeue(int);
void uvmpdpol_sysctlsetup(void);
/*
* uvmpdpol_set_intent: set an intended state for the page, taking care not
* to overwrite any of the other flags.
*/
static inline void
uvmpdpol_set_intent(struct vm_page *pg, uint32_t i)
{
KASSERT(mutex_owned(&pg->interlock));
pg->pqflags = PQ_INTENT_SET | (pg->pqflags & ~PQ_INTENT_MASK) | i;
}
#endif /* !_UVM_PDPOLICY_H_ */
/* $NetBSD: udp6_usrreq.c,v 1.154 2022/11/04 09:01:53 ozaki-r Exp $ */
/* $KAME: udp6_usrreq.c,v 1.86 2001/05/27 17:33:00 itojun Exp $ */
/* $KAME: udp6_output.c,v 1.43 2001/10/15 09:19:52 itojun Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)udp_var.h 8.1 (Berkeley) 6/10/93
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: udp6_usrreq.c,v 1.154 2022/11/04 09:01:53 ozaki-r Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_inet_csum.h"
#include "opt_ipsec.h"
#include "opt_net_mpsafe.h"
#endif
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/syslog.h>
#include <sys/domain.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/if_types.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/in_offload.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/in_pcb.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/udp_private.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet6/ip6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/udp6_var.h>
#include <netinet6/udp6_private.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/esp.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif
#include "faith.h"
#if defined(NFAITH) && NFAITH > 0
#include <net/if_faith.h>
#endif
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
*/
extern struct inpcbtable udbtable;
percpu_t *udp6stat_percpu;
/* UDP on IP6 parameters */
static int udp6_sendspace = 9216; /* really max datagram size */
static int udp6_recvspace = 40 * (1024 + sizeof(struct sockaddr_in6));
/* 40 1K datagrams */
static void udp6_notify(struct inpcb *, int);
static void sysctl_net_inet6_udp6_setup(struct sysctllog **);
#ifdef IPSEC
static int udp6_espinudp(struct mbuf **, int);
#endif
#ifdef UDP_CSUM_COUNTERS
#include <sys/device.h>
struct evcnt udp6_hwcsum_bad = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum bad");
struct evcnt udp6_hwcsum_ok = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum ok");
struct evcnt udp6_hwcsum_data = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "hwcsum data");
struct evcnt udp6_swcsum = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
NULL, "udp6", "swcsum");
EVCNT_ATTACH_STATIC(udp6_hwcsum_bad);
EVCNT_ATTACH_STATIC(udp6_hwcsum_ok);
EVCNT_ATTACH_STATIC(udp6_hwcsum_data);
EVCNT_ATTACH_STATIC(udp6_swcsum);
#define UDP_CSUM_COUNTER_INCR(ev) (ev)->ev_count++
#else
#define UDP_CSUM_COUNTER_INCR(ev) /* nothing */
#endif
void
udp6_init(void)
{
sysctl_net_inet6_udp6_setup(NULL);
udp6stat_percpu = percpu_alloc(sizeof(uint64_t) * UDP6_NSTATS);
udp_init_common();
}
/*
 * Notify a UDP user of an asynchronous error;
 * just wake up so that it can collect error status.
*/
static void
udp6_notify(struct inpcb *inp, int errno)
{
inp->inp_socket->so_error = errno;
sorwakeup(inp->inp_socket);
sowwakeup(inp->inp_socket);
}
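/*
 * Handle an ICMPv6 control message for UDP: decode the ip6ctlparam,
 * process path MTU discovery updates, and notify the matching PCBs of
 * the error.
 */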
void *
udp6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct udphdr uh;
struct ip6_hdr *ip6;
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
struct mbuf *m;
int off;
void *cmdarg;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void (*notify)(struct inpcb *, int) = udp6_notify;
struct udp_portonly {
u_int16_t uh_sport;
u_int16_t uh_dport;
} *uhp;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
if (PRC_IS_REDIRECT(cmd))
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (cmd == PRC_MSGSIZE) {
/* special code is present, see below */
notify = in6pcb_rtchange;
}
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
off = 0;
}
if (ip6) {
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(*uhp)) {
			if (cmd == PRC_MSGSIZE)
				icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
return NULL;
}
memset(&uh, 0, sizeof(uh));
m_copydata(m, off, sizeof(*uhp), (void *)&uh);
if (cmd == PRC_MSGSIZE) {
int valid = 0;
/*
* Check to see if we have a valid UDP socket
* corresponding to the address in the ICMPv6 message
* payload.
*/
if (in6pcb_lookup(&udbtable, &sa6->sin6_addr,
uh.uh_dport, (const struct in6_addr *)&sa6_src->sin6_addr,
uh.uh_sport, 0, 0))
valid++;
#if 0
/*
* As the use of sendto(2) is fairly popular,
* we may want to allow non-connected pcb too.
* But it could be too weak against attacks...
* We should at least check if the local address (= s)
* is really ours.
*/
else if (in6pcb_lookup_bound(&udbtable, &sa6->sin6_addr,
uh.uh_dport, 0))
valid++;
#endif
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
			 * Regardless of whether we called
			 * icmp6_mtudisc_update(), we need to call
			 * in6pcb_notify() to notify userland of the
			 * path MTU change (RFC 3542), because some
* unconnected sockets may share the same
* destination and want to know the path MTU.
*/
}
(void)in6pcb_notify(&udbtable, sa, uh.uh_dport,
sin6tocsa(sa6_src), uh.uh_sport, cmd, cmdarg,
notify);
} else {
(void)in6pcb_notify(&udbtable, sa, 0,
sin6tocsa(sa6_src), 0, cmd, cmdarg, notify);
}
return NULL;
}
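/*
 * Handle UDP-level socket options (currently only UDP_ENCAP, for
 * ESP-in-UDP); options at other levels are handed to ip_ctloutput() or
 * ip6_ctloutput().
 */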
int
udp6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int s;
int error = 0;
struct inpcb *inp;
int family;
int optval;
family = so->so_proto->pr_domain->dom_family;
s = splsoftnet();
switch (family) {
#ifdef INET
case PF_INET:
		if (sopt->sopt_level != IPPROTO_UDP) {
			error = ip_ctloutput(op, so, sopt);
goto end;
}
break;
#endif
#ifdef INET6
case PF_INET6:
		if (sopt->sopt_level != IPPROTO_UDP) {
			error = ip6_ctloutput(op, so, sopt);
goto end;
}
break;
#endif
default:
error = EAFNOSUPPORT;
goto end;
}
switch (op) {
case PRCO_SETOPT:
inp = sotoinpcb(so);
switch (sopt->sopt_name) {
case UDP_ENCAP:
error = sockopt_getint(sopt, &optval);
if (error)
break;
switch(optval) {
case 0:
inp->inp_flags &= ~IN6P_ESPINUDP;
break;
case UDP_ENCAP_ESPINUDP:
inp->inp_flags |= IN6P_ESPINUDP;
break;
default:
error = EINVAL;
break;
}
break;
default:
error = ENOPROTOOPT;
break;
}
break;
default:
error = EINVAL;
break;
}
end:
splx(s);
return error;
}
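/*
 * Append a copy of the datagram, together with any requested control
 * information, to the socket's receive buffer and wake the receiver.
 */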
static void
udp6_sendup(struct mbuf *m, int off /* offset of data portion */,
struct sockaddr *src, struct socket *so)
{
struct mbuf *opts = NULL;
struct mbuf *n;
struct inpcb *inp;
KASSERT(so != NULL);
KASSERT(so->so_proto->pr_domain->dom_family == AF_INET6);
inp = sotoinpcb(so);
KASSERT(inp != NULL);
#if defined(IPSEC)
if (ipsec_used && ipsec_in_reject(m, inp)) {
if ((n = m_copypacket(m, M_DONTWAIT)) != NULL)
icmp6_error(n, ICMP6_DST_UNREACH,
ICMP6_DST_UNREACH_ADMIN, 0);
return;
}
#endif
if ((n = m_copypacket(m, M_DONTWAIT)) != NULL) {
if (inp->inp_flags & IN6P_CONTROLOPTS ||
SOOPT_TIMESTAMP(inp->inp_socket->so_options)) {
struct ip6_hdr *ip6 = mtod(n, struct ip6_hdr *);
ip6_savecontrol(inp, &opts, ip6, n);
}
m_adj(n, off);
if (sbappendaddr(&so->so_rcv, src, n, opts) == 0) {
m_freem(n);
if (opts)
m_freem(opts);
UDP6_STATINC(UDP6_STAT_FULLSOCK);
soroverflow(so);
} else
sorwakeup(so);
}
}
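/*
 * Deliver a received datagram to the matching PCB(s): multicast and
 * broadcast datagrams go to every matching socket, unicast datagrams to
 * a single PCB (handling ESP-in-UDP and other over-UDP protocols on the
 * way).  Returns the number of deliveries, or -1 on error.
 */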
int
udp6_realinput(int af, struct sockaddr_in6 *src, struct sockaddr_in6 *dst,
struct mbuf **mp, int off)
{
u_int16_t sport, dport;
int rcvcnt;
struct in6_addr src6, *dst6;
const struct in_addr *dst4;
struct inpcb *inp;
struct mbuf *m = *mp;
rcvcnt = 0;
off += sizeof(struct udphdr); /* now, offset of payload */
if (af != AF_INET && af != AF_INET6)
goto bad;
if (src->sin6_family != AF_INET6 || dst->sin6_family != AF_INET6)
goto bad;
src6 = src->sin6_addr;
if (sa6_recoverscope(src) != 0) {
/* XXX: should be impossible. */
goto bad;
}
sport = src->sin6_port;
dport = dst->sin6_port;
dst4 = (struct in_addr *)&dst->sin6_addr.s6_addr[12];
dst6 = &dst->sin6_addr;
if (IN6_IS_ADDR_MULTICAST(dst6) ||
(af == AF_INET && IN_MULTICAST(dst4->s_addr))) {
/*
* Deliver a multicast or broadcast datagram to *all* sockets
* for which the local and remote addresses and ports match
* those of the incoming datagram. This allows more than
* one process to receive multi/broadcasts on the same port.
* (This really ought to be done for unicast datagrams as
* well, but that would cause problems with existing
* applications that open both address-specific sockets and
* a wildcard socket listening to the same port -- they would
* end up receiving duplicates of every unicast datagram.
* Those applications open the multiple sockets to overcome an
* inadequacy of the UDP socket interface, but for backwards
* compatibility we avoid the problem here rather than
* fixing the interface. Maybe 4.5BSD will remedy this?)
*/
/*
* KAME note: traditionally we dropped udpiphdr from mbuf here.
* we need udpiphdr for IPsec processing so we do that later.
*/
/*
* Locate pcb(s) for datagram.
*/
TAILQ_FOREACH(inp, &udbtable.inpt_queue, inp_queue) {
if (inp->inp_af != AF_INET6)
continue;
if (inp->inp_lport != dport)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
if (!IN6_ARE_ADDR_EQUAL(&in6p_laddr(inp),
dst6))
continue;
} else {
if (IN6_IS_ADDR_V4MAPPED(dst6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
if (!IN6_ARE_ADDR_EQUAL(&in6p_faddr(inp),
&src6) || inp->inp_fport != sport)
continue;
} else {
if (IN6_IS_ADDR_V4MAPPED(&src6) &&
(inp->inp_flags & IN6P_IPV6_V6ONLY))
continue;
}
udp6_sendup(m, off, sin6tosa(src), inp->inp_socket);
rcvcnt++;
/*
* Don't look for additional matches if this one does
* not have either the SO_REUSEPORT or SO_REUSEADDR
* socket options set. This heuristic avoids searching
* through all pcbs in the common case of a non-shared
* port. It assumes that an application will never
* clear these options after setting them.
*/
if ((inp->inp_socket->so_options &
(SO_REUSEPORT|SO_REUSEADDR)) == 0)
break;
}
} else {
/*
* Locate pcb for datagram.
*/
inp = in6pcb_lookup(&udbtable, &src6, sport, dst6,
dport, 0, 0);
if (inp == NULL) {
UDP_STATINC(UDP_STAT_PCBHASHMISS);
inp = in6pcb_lookup_bound(&udbtable, dst6, dport, 0);
if (inp == NULL)
return rcvcnt;
}
#ifdef IPSEC
/* Handle ESP over UDP */
if (inp->inp_flags & IN6P_ESPINUDP) {
switch (udp6_espinudp(mp, off)) {
case -1: /* Error, m was freed */
rcvcnt = -1;
goto bad;
case 1: /* ESP over UDP */
rcvcnt++;
goto bad;
case 0: /* plain UDP */
default: /* Unexpected */
/*
* Normal UDP processing will take place,
* m may have changed.
*/
m = *mp;
break;
}
}
#endif
if (inp->inp_overudp_cb != NULL) {
int ret;
ret = inp->inp_overudp_cb(mp, off, inp->inp_socket,
sin6tosa(src), inp->inp_overudp_arg);
switch (ret) {
case -1: /* Error, m was freed */
rcvcnt = -1;
goto bad;
case 1: /* Foo over UDP */
KASSERT(*mp == NULL);
rcvcnt++;
goto bad;
case 0: /* plain UDP */
default: /* Unexpected */
/*
* Normal UDP processing will take place,
* m may have changed.
*/
break;
}
}
udp6_sendup(m, off, sin6tosa(src), inp->inp_socket);
rcvcnt++;
}
bad:
return rcvcnt;
}
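/*
 * Verify the UDP checksum, trusting hardware offload results where
 * available and falling back to in6_cksum() otherwise.  Returns 0 if
 * the checksum is good, -1 if it is bad or missing.
 */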
int
udp6_input_checksum(struct mbuf *m, const struct udphdr *uh, int off, int len)
{
/*
	 * XXX It would be better to record and check whether this mbuf
	 * has already been checksummed.
*/
if (__predict_false((m->m_flags & M_LOOP) && !udp_do_loopback_cksum)) {
goto good;
}
if (uh->uh_sum == 0) {
UDP6_STATINC(UDP6_STAT_NOSUM);
goto bad;
}
switch (m->m_pkthdr.csum_flags &
((m_get_rcvif_NOMPSAFE(m)->if_csum_flags_rx & M_CSUM_UDPv6) |
M_CSUM_TCP_UDP_BAD | M_CSUM_DATA)) {
case M_CSUM_UDPv6|M_CSUM_TCP_UDP_BAD:
UDP_CSUM_COUNTER_INCR(&udp6_hwcsum_bad);
UDP6_STATINC(UDP6_STAT_BADSUM);
goto bad;
#if 0 /* notyet */
case M_CSUM_UDPv6|M_CSUM_DATA:
#endif
case M_CSUM_UDPv6:
/* Checksum was okay. */
UDP_CSUM_COUNTER_INCR(&udp6_hwcsum_ok);
break;
default:
/*
* Need to compute it ourselves. Maybe skip checksum
* on loopback interfaces.
*/
UDP_CSUM_COUNTER_INCR(&udp6_swcsum);
if (in6_cksum(m, IPPROTO_UDP, off, len) != 0) {
UDP6_STATINC(UDP6_STAT_BADSUM);
goto bad;
}
}
good:
return 0;
bad:
return -1;
}
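/*
 * Protocol switch input routine for UDP over IPv6: validate the length
 * and checksum, construct source/destination sockaddrs, and hand the
 * datagram to udp6_realinput(); send an ICMPv6 port unreachable if no
 * socket wants it.
 */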
int
udp6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
int off = *offp;
struct sockaddr_in6 src, dst;
struct ip6_hdr *ip6;
struct udphdr *uh;
u_int32_t plen, ulen;
ip6 = mtod(m, struct ip6_hdr *);
#if defined(NFAITH) && 0 < NFAITH
if (faithprefix(&ip6->ip6_dst)) {
/* send icmp6 host unreach? */
m_freem(m);
return IPPROTO_DONE;
}
#endif
UDP6_STATINC(UDP6_STAT_IPACKETS);
	/* The jumbogram check is done in ip6_input, so we can trust pkthdr.len. */
plen = m->m_pkthdr.len - off;
IP6_EXTHDR_GET(uh, struct udphdr *, m, off, sizeof(struct udphdr));
if (uh == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return IPPROTO_DONE;
}
/*
* Enforce alignment requirements that are violated in
* some cases, see kern/50766 for details.
*/
if (ACCESSIBLE_POINTER(uh, struct udphdr) == 0) {
m = m_copyup(m, off + sizeof(struct udphdr), 0);
if (m == NULL) {
IP6_STATINC(IP6_STAT_TOOSHORT);
return IPPROTO_DONE;
}
ip6 = mtod(m, struct ip6_hdr *);
uh = (struct udphdr *)(mtod(m, char *) + off);
}
KASSERT(ACCESSIBLE_POINTER(uh, struct udphdr));
ulen = ntohs((u_short)uh->uh_ulen);
/*
	 * RFC 2675 section 4: jumbograms have 0 in the UDP length field
	 * iff the payload length > 0xffff.
*/
if (ulen == 0 && plen > 0xffff)
ulen = plen;
if (plen != ulen) {
UDP6_STATINC(UDP6_STAT_BADLEN);
goto bad;
}
/* destination port of 0 is illegal, based on RFC768. */
if (uh->uh_dport == 0)
goto bad;
/*
* Checksum extended UDP header and data. Maybe skip checksum
* on loopback interfaces.
*/
if (udp6_input_checksum(m, uh, off, ulen))
goto bad;
/*
* Construct source and dst sockaddrs.
*/
memset(&src, 0, sizeof(src));
src.sin6_family = AF_INET6;
src.sin6_len = sizeof(struct sockaddr_in6);
src.sin6_addr = ip6->ip6_src;
src.sin6_port = uh->uh_sport;
memset(&dst, 0, sizeof(dst));
dst.sin6_family = AF_INET6;
dst.sin6_len = sizeof(struct sockaddr_in6);
dst.sin6_addr = ip6->ip6_dst;
dst.sin6_port = uh->uh_dport;
if (udp6_realinput(AF_INET6, &src, &dst, &m, off) == 0) {
if (m->m_flags & M_MCAST) {
UDP6_STATINC(UDP6_STAT_NOPORTMCAST);
goto bad;
}
UDP6_STATINC(UDP6_STAT_NOPORT);
icmp6_error(m, ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0);
m = NULL;
}
bad:
if (m)
m_freem(m);
return IPPROTO_DONE;
}
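/*
 * Build the UDP header and the IPv6 header (or IPv4 header, for
 * IPv4-mapped destinations) around the payload and transmit the
 * datagram via ip6_output() or ip_output().
 */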
int
udp6_output(struct inpcb * const inp, struct mbuf *m,
struct sockaddr_in6 * const addr6, struct mbuf * const control,
struct lwp * const l)
{
u_int32_t ulen = m->m_pkthdr.len;
u_int32_t plen = sizeof(struct udphdr) + ulen;
struct ip6_hdr *ip6;
struct udphdr *udp6;
struct in6_addr _laddr, *laddr, *faddr;
struct in6_addr laddr_mapped; /* XXX ugly */
struct sockaddr_in6 *sin6 = NULL;
struct ifnet *oifp = NULL;
int scope_ambiguous = 0;
u_int16_t fport;
int error = 0;
struct ip6_pktopts *optp = NULL;
struct ip6_pktopts opt;
int af = AF_INET6, hlen = sizeof(struct ip6_hdr);
#ifdef INET
struct ip *ip;
struct udpiphdr *ui;
int flags = 0;
#endif
struct sockaddr_in6 tmp;
if (addr6) {
sin6 = addr6;
if (sin6->sin6_len != sizeof(*sin6)) {
error = EINVAL;
goto release;
}
if (sin6->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
goto release;
}
/* protect *sin6 from overwrites */
tmp = *sin6;
sin6 = &tmp;
/*
		 * The application should provide a proper zone ID, or the
		 * use of default zone IDs should be enabled.  Unfortunately,
		 * some applications do not behave as they should, so we need
		 * a workaround.  Even if an appropriate ID is not determined
		 * here, we'll see if we can determine the outgoing interface.
		 * If we can, the zone ID is determined from that interface
		 * below.
*/
if (sin6->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(sin6, ip6_use_defzone)) != 0)
goto release;
}
if (control) {
if (__predict_false(l == NULL)) {
panic("%s: control but no lwp", __func__);
}
if ((error = ip6_setpktopts(control, &opt,
in6p_outputopts(inp), l->l_cred, IPPROTO_UDP)) != 0)
goto release;
optp = &opt;
} else
optp = in6p_outputopts(inp);
if (sin6) {
/*
* Slightly different than v4 version in that we call
* in6_selectsrc and in6pcb_set_port to fill in the local
* address and port rather than inpcb_connect. inpcb_connect
* sets inp_faddr which causes EISCONN below to be hit on
* subsequent sendto.
*/
if (sin6->sin6_port == 0) {
error = EADDRNOTAVAIL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
/* how about ::ffff:0.0.0.0 case? */
error = EISCONN;
goto release;
}
faddr = &sin6->sin6_addr;
fport = sin6->sin6_port; /* allow 0 port */
if (IN6_IS_ADDR_V4MAPPED(faddr)) {
if ((inp->inp_flags & IN6P_IPV6_V6ONLY)) {
/*
* I believe we should explicitly discard the
* packet when mapped addresses are disabled,
* rather than send the packet as an IPv6 one.
* If we chose the latter approach, the packet
* might be sent out on the wire based on the
* default route, the situation which we'd
* probably want to avoid.
* (20010421 jinmei@kame.net)
*/
error = EINVAL;
goto release;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp)) &&
!IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))) {
/*
* when remote addr is an IPv4-mapped address,
* local addr should not be an IPv6 address,
* since you cannot determine how to map IPv6
* source address to IPv4.
*/
error = EINVAL;
goto release;
}
af = AF_INET;
}
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
struct psref psref;
int bound = curlwp_bind();
error = in6_selectsrc(sin6, optp,
in6p_moptions(inp),
&inp->inp_route,
&in6p_laddr(inp), &oifp, &psref, &_laddr);
if (error)
laddr = NULL;
else
laddr = &_laddr;
if (oifp && scope_ambiguous &&
(error = in6_setscope(&sin6->sin6_addr,
oifp, NULL))) {
if_put(oifp, &psref);
curlwp_bindx(bound);
goto release;
}
if_put(oifp, &psref);
curlwp_bindx(bound);
} else {
/*
* XXX: freebsd[34] does not have in_selectsrc, but
* we can omit the whole part because freebsd4 calls
* udp_output() directly in this case, and thus we'll
* never see this path.
*/
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_laddr(inp))) {
struct sockaddr_in sin_dst;
struct in_addr ina;
struct in_ifaddr *ia4;
struct psref _psref;
int bound;
memcpy(&ina, &faddr->s6_addr[12], sizeof(ina));
sockaddr_in_init(&sin_dst, &ina, 0);
bound = curlwp_bind();
ia4 = in_selectsrc(&sin_dst, &inp->inp_route,
inp->inp_socket->so_options, NULL,
&error, &_psref);
if (ia4 == NULL) {
curlwp_bindx(bound);
if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
memset(&laddr_mapped, 0, sizeof(laddr_mapped));
laddr_mapped.s6_addr16[5] = 0xffff; /* ugly */
memcpy(&laddr_mapped.s6_addr[12],
&IA_SIN(ia4)->sin_addr,
sizeof(IA_SIN(ia4)->sin_addr));
ia4_release(ia4, &_psref);
curlwp_bindx(bound);
laddr = &laddr_mapped;
} else
{
laddr = &in6p_laddr(inp); /* XXX */
}
}
		if (laddr == NULL) {
			if (error == 0)
error = EADDRNOTAVAIL;
goto release;
}
if (inp->inp_lport == 0) {
/*
* Craft a sockaddr_in6 for the local endpoint. Use the
* "any" as a base, set the address, and recover the
* scope.
*/
struct sockaddr_in6 lsin6 =
*((const struct sockaddr_in6 *)inp->inp_socket->so_proto->pr_domain->dom_sa_any);
lsin6.sin6_addr = *laddr;
error = sa6_recoverscope(&lsin6);
if (error)
goto release;
error = in6pcb_set_port(&lsin6, inp, l);
if (error) {
in6p_laddr(inp) = in6addr_any;
goto release;
}
}
} else {
if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp))) {
error = ENOTCONN;
goto release;
}
		if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
			if ((inp->inp_flags & IN6P_IPV6_V6ONLY))
{
/*
* XXX: this case would happen when the
* application sets the V6ONLY flag after
				 * connecting to the foreign address.
* Such applications should be fixed,
* so we bark here.
*/
log(LOG_INFO, "udp6_output: IPV6_V6ONLY "
"option was set for a connected socket\n");
error = EINVAL;
goto release;
} else
af = AF_INET;
}
laddr = &in6p_laddr(inp);
faddr = &in6p_faddr(inp);
fport = inp->inp_fport;
}
if (af == AF_INET)
hlen = sizeof(struct ip);
/*
* Calculate data length and get a mbuf
* for UDP and IP6 headers.
*/
M_PREPEND(m, hlen + sizeof(struct udphdr), M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto release;
}
/*
* Stuff checksum and output datagram.
*/
udp6 = (struct udphdr *)(mtod(m, char *) + hlen);
udp6->uh_sport = inp->inp_lport; /* lport is always set in the PCB */
udp6->uh_dport = fport;
	if (plen <= 0xffff)
		udp6->uh_ulen = htons((u_int16_t)plen);
else
udp6->uh_ulen = 0;
udp6->uh_sum = 0;
switch (af) {
case AF_INET6:
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
#if 0 /* ip6_plen will be filled in ip6_output. */
ip6->ip6_plen = htons((u_int16_t)plen);
#endif
ip6->ip6_nxt = IPPROTO_UDP;
ip6->ip6_hlim = in6pcb_selecthlim_rt(inp);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
udp6->uh_sum = in6_cksum_phdr(laddr, faddr,
htonl(plen), htonl(IPPROTO_UDP));
m->m_pkthdr.csum_flags = M_CSUM_UDPv6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
UDP6_STATINC(UDP6_STAT_OPACKETS);
error = ip6_output(m, optp, &inp->inp_route, 0,
in6p_moptions(inp), inp, NULL);
break;
case AF_INET:
#ifdef INET
/* can't transmit jumbogram over IPv4 */
if (plen > 0xffff) {
error = EMSGSIZE;
goto release;
}
ip = mtod(m, struct ip *);
ui = (struct udpiphdr *)ip;
memset(ui->ui_x1, 0, sizeof(ui->ui_x1));
ui->ui_pr = IPPROTO_UDP;
ui->ui_len = htons(plen);
memcpy(&ui->ui_src, &laddr->s6_addr[12], sizeof(ui->ui_src));
ui->ui_ulen = ui->ui_len;
flags = (inp->inp_socket->so_options &
(SO_DONTROUTE | SO_BROADCAST));
memcpy(&ui->ui_dst, &faddr->s6_addr[12], sizeof(ui->ui_dst));
udp6->uh_sum = in_cksum(m, hlen + plen);
if (udp6->uh_sum == 0)
udp6->uh_sum = 0xffff;
ip->ip_len = htons(hlen + plen);
ip->ip_ttl = in6pcb_selecthlim(inp, NULL); /* XXX */
ip->ip_tos = 0; /* XXX */
UDP_STATINC(UDP_STAT_OPACKETS);
error = ip_output(m, NULL, &inp->inp_route, flags /* XXX */,
inp->inp_moptions, NULL);
break;
#else
error = EAFNOSUPPORT;
goto release;
#endif
}
goto releaseopt;
release:
m_freem(m);
releaseopt:
	if (control) {
		if (optp == &opt)
			ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return (error);
}
static int
udp6_attach(struct socket *so, int proto)
{
struct inpcb *inp;
int s, error;
KASSERT(sotoinpcb(so) == NULL);
sosetlock(so);
error = soreserve(so, udp6_sendspace, udp6_recvspace);
if (error) {
return error;
}
/*
* MAPPED_ADDR implementation spec:
* Always attach for IPv6, and only when necessary for IPv4.
*/
s = splsoftnet();
error = inpcb_create(so, &udbtable);
splx(s);
if (error) {
return error;
}
inp = sotoinpcb(so);
in6p_cksum(inp) = -1; /* just to be sure */
KASSERT(solocked(so));
return 0;
}
static void
udp6_detach(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
int s;
	KASSERT(solocked(so));
	KASSERT(inp != NULL);
s = splsoftnet();
inpcb_destroy(inp);
splx(s);
}
static int
udp6_accept(struct socket *so, struct sockaddr *nam)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_bind(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
int error = 0;
int s;
	KASSERT(solocked(so));
	KASSERT(inp != NULL);
s = splsoftnet();
error = in6pcb_bind(inp, sin6, l);
splx(s);
return error;
}
static int
udp6_listen(struct socket *so, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_connect(struct socket *so, struct sockaddr *nam, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
int error = 0;
int s;
	KASSERT(solocked(so));
	KASSERT(inp != NULL);
	if (!IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
return EISCONN;
s = splsoftnet();
error = in6pcb_connect(inp, (struct sockaddr_in6 *)nam, l);
splx(s);
	if (error == 0)
		soisconnected(so);
return error;
}
static int
udp6_connect2(struct socket *so, struct socket *so2)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_disconnect(struct socket *so)
{
struct inpcb *inp = sotoinpcb(so);
int s;
	KASSERT(solocked(so));
	KASSERT(inp != NULL);
	if (IN6_IS_ADDR_UNSPECIFIED(&in6p_faddr(inp)))
return ENOTCONN;
s = splsoftnet();
in6pcb_disconnect(inp);
memset((void *)&in6p_laddr(inp), 0, sizeof(in6p_laddr(inp)));
splx(s);
so->so_state &= ~SS_ISCONNECTED; /* XXX */
in6pcb_set_state(inp, INP_BOUND); /* XXX */
return 0;
}
static int
udp6_shutdown(struct socket *so)
{
int s;
s = splsoftnet();
socantsendmore(so);
splx(s);
return 0;
}
static int
udp6_abort(struct socket *so)
{
int s;
KASSERT(solocked(so));
KASSERT(sotoinpcb(so) != NULL);
s = splsoftnet();
soisdisconnected(so);
inpcb_destroy(sotoinpcb(so));
splx(s);
return 0;
}
static int
udp6_ioctl(struct socket *so, u_long cmd, void *addr6, struct ifnet *ifp)
{
/*
* MAPPED_ADDR implementation info:
	 * Mapped address support for PRU_CONTROL is not necessary, because
	 * a typical user of PRU_CONTROL (such as ifconfig) does not
	 * associate any address with its socket.  The socket family is then
	 * only a hint about the family of the addresses being controlled,
	 * especially when getting addresses from the kernel.  So an AF_INET
	 * socket must be used to control AF_INET addresses, and an AF_INET6
	 * socket for AF_INET6 addresses.
*/
return in6_control(so, cmd, addr6, ifp);
}
static int
udp6_stat(struct socket *so, struct stat *ub)
{
KASSERT(solocked(so));
/* stat: don't bother with a blocksize */
return 0;
}
static int
udp6_peeraddr(struct socket *so, struct sockaddr *nam)
{
	KASSERT(solocked(so));
	KASSERT(sotoinpcb(so) != NULL);
	KASSERT(nam != NULL);
in6pcb_fetch_peeraddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
udp6_sockaddr(struct socket *so, struct sockaddr *nam)
{
	KASSERT(solocked(so));
	KASSERT(sotoinpcb(so) != NULL);
	KASSERT(nam != NULL);
in6pcb_fetch_sockaddr(sotoinpcb(so), (struct sockaddr_in6 *)nam);
return 0;
}
static int
udp6_rcvd(struct socket *so, int flags, struct lwp *l)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_recvoob(struct socket *so, struct mbuf *m, int flags)
{
KASSERT(solocked(so));
return EOPNOTSUPP;
}
static int
udp6_send(struct socket *so, struct mbuf *m, struct sockaddr *nam,
struct mbuf *control, struct lwp *l)
{
struct inpcb *inp = sotoinpcb(so);
int error = 0;
int s;
	KASSERT(solocked(so));
	KASSERT(inp != NULL);
	KASSERT(m != NULL);
s = splsoftnet();
error = udp6_output(inp, m, (struct sockaddr_in6 *)nam, control, l);
splx(s);
return error;
}
static int
udp6_sendoob(struct socket *so, struct mbuf *m, struct mbuf *control)
{
KASSERT(solocked(so));
m_freem(m);
m_freem(control);
return EOPNOTSUPP;
}
static int
udp6_purgeif(struct socket *so, struct ifnet *ifp)
{
mutex_enter(softnet_lock);
in6pcb_purgeif0(&udbtable, ifp);
#ifdef NET_MPSAFE
mutex_exit(softnet_lock);
#endif
in6_purgeif(ifp);
#ifdef NET_MPSAFE
mutex_enter(softnet_lock);
#endif
in6pcb_purgeif(&udbtable, ifp);
mutex_exit(softnet_lock);
return 0;
}
static int
sysctl_net_inet6_udp6_stats(SYSCTLFN_ARGS)
{
return (NETSTAT_SYSCTL(udp6stat_percpu, UDP6_NSTATS));
}
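/*
 * Create the net.inet6.udp6 sysctl subtree.
 */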
static void
sysctl_net_inet6_udp6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "udp6",
SYSCTL_DESCR("UDPv6 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "sendspace",
SYSCTL_DESCR("Default UDP send buffer size"),
NULL, 0, &udp6_sendspace, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_SENDSPACE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "recvspace",
SYSCTL_DESCR("Default UDP receive buffer size"),
NULL, 0, &udp6_recvspace, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_RECVSPACE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "do_loopback_cksum",
SYSCTL_DESCR("Perform UDP checksum on loopback"),
NULL, 0, &udp_do_loopback_cksum, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_LOOPBACKCKSUM,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pcblist",
SYSCTL_DESCR("UDP protocol control block list"),
sysctl_inpcblist, 0, &udbtable, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, CTL_CREATE,
CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("UDPv6 statistics"),
sysctl_net_inet6_udp6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_UDP, UDP6CTL_STATS,
CTL_EOL);
}
void
udp6_statinc(u_int stat)
{
KASSERT(stat < UDP6_NSTATS);
UDP6_STATINC(stat);
}
#ifdef IPSEC
/*
* Returns:
* 1 if the packet was processed
* 0 if normal UDP processing should take place
* -1 if an error occurred and m was freed
*/
static int
udp6_espinudp(struct mbuf **mp, int off)
{
const size_t skip = sizeof(struct udphdr);
size_t len;
void *data;
size_t minlen;
int ip6hdrlen;
struct ip6_hdr *ip6;
struct m_tag *tag;
struct udphdr *udphdr;
u_int16_t sport, dport;
struct mbuf *m = *mp;
uint32_t *marker;
/*
	 * Collapse the mbuf chain if the first mbuf is too short.
	 * The longest case is: UDP + non-ESP marker + ESP.
*/
minlen = off + sizeof(u_int64_t) + sizeof(struct esp);
if (minlen > m->m_pkthdr.len)
minlen = m->m_pkthdr.len;
if (m->m_len < minlen) {
if ((*mp = m_pullup(m, minlen)) == NULL) {
return -1;
}
m = *mp;
}
len = m->m_len - off;
data = mtod(m, char *) + off;
/* Ignore keepalive packets */
if ((len == 1) && (*(unsigned char *)data == 0xff)) {
m_freem(m);
*mp = NULL; /* avoid any further processing by caller ... */
return 1;
}
/* Handle Non-ESP marker (32bit). If zero, then IKE. */
marker = (uint32_t *)data;
if (len <= sizeof(uint32_t))
return 0;
if (marker[0] == 0)
return 0;
/*
* Get the UDP ports. They are handled in network
* order everywhere in IPSEC_NAT_T code.
*/
udphdr = (struct udphdr *)((char *)data - skip);
sport = udphdr->uh_sport;
dport = udphdr->uh_dport;
/*
	 * Remove the UDP header (and possibly the non-ESP marker).
	 * The IPv6 header length is ip6hdrlen.
* Before:
* <---- off --->
* +-----+------+-----+
* | IP6 | UDP | ESP |
* +-----+------+-----+
* <-skip->
* After:
* +-----+-----+
* | IP6 | ESP |
* +-----+-----+
* <-skip->
*/
ip6hdrlen = off - sizeof(struct udphdr);
memmove(mtod(m, char *) + skip, mtod(m, void *), ip6hdrlen);
m_adj(m, skip);
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - skip);
ip6->ip6_nxt = IPPROTO_ESP;
/*
	 * We have modified the packet; it is now ESP, so we should not
	 * return to UDP processing.
	 *
	 * Add a PACKET_TAG_IPSEC_NAT_T_PORTS tag to remember
	 * the source UDP port.  This is required if we want
	 * to select the right SPD for multiple hosts behind
	 * the same NAT.
*/
if ((tag = m_tag_get(PACKET_TAG_IPSEC_NAT_T_PORTS,
sizeof(sport) + sizeof(dport), M_DONTWAIT)) == NULL) {
m_freem(m);
return -1;
}
((u_int16_t *)(tag + 1))[0] = sport;
((u_int16_t *)(tag + 1))[1] = dport;
m_tag_prepend(m, tag);
if (ipsec_used)
ipsec6_common_input(&m, &ip6hdrlen, IPPROTO_ESP);
else
m_freem(m);
/* We handled it, it shouldn't be handled by UDP */
*mp = NULL; /* avoid free by caller ... */
return 1;
}
#endif /* IPSEC */
PR_WRAP_USRREQS(udp6)
#define udp6_attach udp6_attach_wrapper
#define udp6_detach udp6_detach_wrapper
#define udp6_accept udp6_accept_wrapper
#define udp6_bind udp6_bind_wrapper
#define udp6_listen udp6_listen_wrapper
#define udp6_connect udp6_connect_wrapper
#define udp6_connect2 udp6_connect2_wrapper
#define udp6_disconnect udp6_disconnect_wrapper
#define udp6_shutdown udp6_shutdown_wrapper
#define udp6_abort udp6_abort_wrapper
#define udp6_ioctl udp6_ioctl_wrapper
#define udp6_stat udp6_stat_wrapper
#define udp6_peeraddr udp6_peeraddr_wrapper
#define udp6_sockaddr udp6_sockaddr_wrapper
#define udp6_rcvd udp6_rcvd_wrapper
#define udp6_recvoob udp6_recvoob_wrapper
#define udp6_send udp6_send_wrapper
#define udp6_sendoob udp6_sendoob_wrapper
#define udp6_purgeif udp6_purgeif_wrapper
const struct pr_usrreqs udp6_usrreqs = {
.pr_attach = udp6_attach,
.pr_detach = udp6_detach,
.pr_accept = udp6_accept,
.pr_bind = udp6_bind,
.pr_listen = udp6_listen,
.pr_connect = udp6_connect,
.pr_connect2 = udp6_connect2,
.pr_disconnect = udp6_disconnect,
.pr_shutdown = udp6_shutdown,
.pr_abort = udp6_abort,
.pr_ioctl = udp6_ioctl,
.pr_stat = udp6_stat,
.pr_peeraddr = udp6_peeraddr,
.pr_sockaddr = udp6_sockaddr,
.pr_rcvd = udp6_rcvd,
.pr_recvoob = udp6_recvoob,
.pr_send = udp6_send,
.pr_sendoob = udp6_sendoob,
.pr_purgeif = udp6_purgeif,
};
/* $NetBSD: kern_core.c,v 1.39 2023/10/04 22:17:09 ad Exp $ */
/*
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)kern_sig.c 8.14 (Berkeley) 5/14/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_core.c,v 1.39 2023/10/04 22:17:09 ad Exp $");
#ifdef _KERNEL_OPT
#include "opt_execfmt.h"
#include "opt_compat_netbsd32.h"
#endif
#include <sys/param.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/acct.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/proc.h>
#include <sys/exec.h>
#include <sys/filedesc.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>
#include <sys/exec_elf.h>
#include <sys/resourcevar.h>
MODULE(MODULE_CLASS_MISC, coredump, NULL);
struct coredump_iostate {
struct lwp *io_lwp;
struct vnode *io_vp;
kauth_cred_t io_cred;
off_t io_offset;
};
static int coredump(struct lwp *, const char *);
static int coredump_buildname(struct proc *, char *, const char *, size_t);
static int coredump_write(struct coredump_iostate *, enum uio_seg segflg,
const void *, size_t);
static off_t coredump_offset(struct coredump_iostate *);
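/*
 * Module control: install the coredump hooks on load and remove them
 * on unload.
 */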
static int
coredump_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
MODULE_HOOK_SET(coredump_hook, coredump);
MODULE_HOOK_SET(coredump_write_hook, coredump_write);
MODULE_HOOK_SET(coredump_offset_hook, coredump_offset);
MODULE_HOOK_SET(coredump_netbsd_hook, real_coredump_netbsd);
#if defined(EXEC_ELF64)
MODULE_HOOK_SET(coredump_elf64_hook, real_coredump_elf64);
#elif defined(EXEC_ELF32)
MODULE_HOOK_SET(coredump_elf32_hook, real_coredump_elf32);
#endif
MODULE_HOOK_SET(uvm_coredump_walkmap_hook,
uvm_coredump_walkmap);
MODULE_HOOK_SET(uvm_coredump_count_segs_hook,
uvm_coredump_count_segs);
return 0;
case MODULE_CMD_FINI:
MODULE_HOOK_UNSET(uvm_coredump_count_segs_hook);
MODULE_HOOK_UNSET(uvm_coredump_walkmap_hook);
#if defined(EXEC_ELF64)
MODULE_HOOK_UNSET(coredump_elf64_hook);
#elif defined(EXEC_ELF32)
MODULE_HOOK_UNSET(coredump_elf32_hook);
#endif
MODULE_HOOK_UNSET(coredump_netbsd_hook);
MODULE_HOOK_UNSET(coredump_offset_hook);
MODULE_HOOK_UNSET(coredump_write_hook);
MODULE_HOOK_UNSET(coredump_hook);
return 0;
default:
return ENOTTY;
}
}
/*
* Dump core, into a file named "progname.core" or "core" (depending on the
* value of shortcorename), unless the process was setuid/setgid.
*/
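/*
* Illustrative note (an addition, not from the original source): the core
* name pattern is expanded by coredump_buildname() below, which understands
* %n (process name), %p (pid), %u (login name) and %t (process start time).
* For example, for a process "sh" with pid 123, a hypothetical pattern of
* "/var/crash/%n.%p.core" would expand to "/var/crash/sh.123.core".
*/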
static int
coredump(struct lwp *l, const char *pattern)
{
struct vnode *vp;
struct proc *p;
struct vmspace *vm;
kauth_cred_t cred = NULL;
struct pathbuf *pb;
struct vattr vattr;
struct coredump_iostate io;
struct plimit *lim;
int error, error1;
char *name, *lastslash = NULL /* XXXgcc */;
name = PNBUF_GET();
p = l->l_proc;
vm = p->p_vmspace;
mutex_enter(&proc_lock); /* p_session */
mutex_enter(p->p_lock);
/*
* Refuse to core if the data + stack + user size is larger than
* the core dump limit. XXX THIS IS WRONG, because of mapped
* data.
*/
if (USPACE + ctob(vm->vm_dsize + vm->vm_ssize) >=
p->p_rlimit[RLIMIT_CORE].rlim_cur) {
error = EFBIG; /* better error code? */
goto release;
}
/*
* It may well not be curproc, so grab a reference to its current
* credentials.
*/
cred = kauth_cred_hold(p->p_cred);
/*
* Make sure the process has not set-id, to prevent data leaks,
* unless it was specifically requested to allow set-id coredumps.
*/
if (p->p_flag & PK_SUGID) {
if (!security_setidcore_dump) {
error = EPERM;
goto release;
}
pattern = security_setidcore_path;
}
/* Lock, as p_limit and pl_corename might change. */
lim = p->p_limit;
mutex_enter(&lim->pl_lock);
if (pattern == NULL) {
pattern = lim->pl_corename;
}
error = coredump_buildname(p, name, pattern, MAXPATHLEN);
mutex_exit(&lim->pl_lock);
if (error)
goto release;
/*
* On a simple filename, see if the filesystem allows us to write
* core dumps there.
*/
lastslash = strrchr(name, '/');
if (!lastslash) {
vp = p->p_cwdi->cwdi_cdir;
if (vp->v_mount == NULL ||
(vp->v_mount->mnt_flag & MNT_NOCOREDUMP) != 0)
error = EPERM;
}
release:
mutex_exit(p->p_lock);
mutex_exit(&proc_lock);
if (error)
goto done;
/*
* On a complex filename, see if the filesystem allows us to write
* core dumps there.
*
* XXX: We should have an API that avoids double lookups
*/
if (lastslash) {
char c[2];
if (lastslash - name >= MAXPATHLEN - 2) {
error = EPERM;
goto done;
}
c[0] = lastslash[1];
c[1] = lastslash[2];
lastslash[1] = '.';
lastslash[2] = '\0';
error = namei_simple_kernel(name, NSM_FOLLOW_NOEMULROOT, &vp);
if (error)
goto done;
if (vp->v_mount == NULL ||
(vp->v_mount->mnt_flag & MNT_NOCOREDUMP) != 0)
error = EPERM;
vrele(vp);
if (error)
goto done;
lastslash[1] = c[0];
lastslash[2] = c[1];
}
pb = pathbuf_create(name);
if (pb == NULL) {
error = ENOMEM;
goto done;
}
error = vn_open(NULL, pb, 0, O_CREAT | O_NOFOLLOW | FWRITE,
S_IRUSR | S_IWUSR, &vp, NULL, NULL);
if (error != 0) {
pathbuf_destroy(pb);
goto done;
}
pathbuf_destroy(pb);
/*
* Don't dump to:
* - non-regular files
* - files with links
* - files we don't own
*/
if (vp->v_type != VREG || VOP_GETATTR(vp, &vattr, cred) || vattr.va_nlink != 1 ||
vattr.va_uid != kauth_cred_geteuid(cred)) {
error = EACCES;
goto out;
}
vattr_null(&vattr);
vattr.va_size = 0;
if ((p->p_flag & PK_SUGID) && security_setidcore_dump) {
vattr.va_uid = security_setidcore_owner;
vattr.va_gid = security_setidcore_group;
vattr.va_mode = security_setidcore_mode;
}
VOP_SETATTR(vp, &vattr, cred);
p->p_acflag |= ACORE;
io.io_lwp = l;
io.io_vp = vp;
io.io_cred = cred;
io.io_offset = 0;
/* Now dump the actual core file. */
error = (*p->p_execsw->es_coredump)(l, &io);
out:
VOP_UNLOCK(vp);
error1 = vn_close(vp, FWRITE, cred);
if (error == 0)
error = error1;
done:
if (cred != NULL)
kauth_cred_free(cred);
if (name != NULL)
PNBUF_PUT(name);
return error;
}
static int
coredump_buildname(struct proc *p, char *dst, const char *src, size_t len)
{
const char *s;
char *d, *end;
int i;
KASSERT(mutex_owned(&proc_lock));
for (s = src, d = dst, end = d + len; *s != '\0'; s++) {
if (*s == '%') {
switch (*(s + 1)) {
case 'n':
i = snprintf(d, end - d, "%s", p->p_comm);
break;
case 'p':
i = snprintf(d, end - d, "%d", p->p_pid);
break;
case 'u':
i = snprintf(d, end - d, "%.*s",
(int)sizeof p->p_pgrp->pg_session->s_login,
p->p_pgrp->pg_session->s_login);
break;
case 't':
i = snprintf(d, end - d, "%lld",
(long long)p->p_stats->p_start.tv_sec);
break;
default:
goto copy;
}
d += i;
s++;
} else {
copy: *d = *s;
d++;
}
if (d >= end)
return (ENAMETOOLONG);
}
*d = '\0';
return 0;
}
static int
coredump_write(struct coredump_iostate *io, enum uio_seg segflg,
const void *data, size_t len)
{
int error;
error = vn_rdwr(UIO_WRITE, io->io_vp, __UNCONST(data), len,
io->io_offset, segflg,
IO_NODELOCKED|IO_UNIT, io->io_cred, NULL,
segflg == UIO_USERSPACE ? io->io_lwp : NULL);
if (error) {
printf("pid %d (%s): %s write of %zu@%p at %lld failed: %d\n",
io->io_lwp->l_proc->p_pid, io->io_lwp->l_proc->p_comm,
segflg == UIO_USERSPACE ? "user" : "system",
len, data, (long long) io->io_offset, error);
return (error);
}
io->io_offset += len;
return (0);
}
static off_t
coredump_offset(struct coredump_iostate *io)
{
return io->io_offset;
}
/* $NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran, and by Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
*/
/*-
* Copyright (c) 1994 Christopher G. Demetriou
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
*/
/*
* The buffer cache subsystem.
*
* Some references:
* Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
* Leffler, et al.: The Design and Implementation of the 4.3BSD
* UNIX Operating System (Addison Wesley, 1989)
*
* Locking
*
* There are three locks:
* - bufcache_lock: protects global buffer cache state.
* - BC_BUSY: a long term per-buffer lock.
* - buf_t::b_objlock: lock on completion (biowait vs biodone).
*
* For buffers associated with vnodes (the most common case) b_objlock points
* to the vnode_t::v_interlock. Otherwise, it points to the generic buffer_lock.
*
* Lock order:
* bufcache_lock ->
* buf_t::b_objlock
*/
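/*
* Illustrative sketch (an addition, not compiled): code that needs both
* locks acquires bufcache_lock first and the buffer's b_objlock second,
* e.g.:
*
*     mutex_enter(&bufcache_lock);
*     mutex_enter(bp->b_objlock);
*     ... update buffer identity and completion state ...
*     mutex_exit(bp->b_objlock);
*     mutex_exit(&bufcache_lock);
*
* bwrite() and bdwrite() below acquire the locks in this order; the
* release order is not constrained.
*/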
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.303 2022/03/30 14:54:29 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_bufcache.h"
#include "opt_dtrace.h"
#include "opt_biohist.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
#include <sys/sysctl.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/fstrans.h>
#include <sys/intr.h>
#include <sys/cpu.h>
#include <sys/wapbl.h>
#include <sys/bitops.h>
#include <sys/cprng.h>
#include <sys/sdt.h>
#include <uvm/uvm.h> /* extern struct uvm uvm */
#include <miscfs/specfs/specdev.h>
SDT_PROVIDER_DEFINE(io);
SDT_PROBE_DEFINE4(io, kernel, , bbusy__start,
"struct buf *"/*bp*/,
"bool"/*intr*/, "int"/*timo*/, "kmutex_t *"/*interlock*/);
SDT_PROBE_DEFINE5(io, kernel, , bbusy__done,
"struct buf *"/*bp*/,
"bool"/*intr*/,
"int"/*timo*/,
"kmutex_t *"/*interlock*/,
"int"/*error*/);
SDT_PROBE_DEFINE0(io, kernel, , getnewbuf__start);
SDT_PROBE_DEFINE1(io, kernel, , getnewbuf__done, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE3(io, kernel, , getblk__start,
"struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/);
SDT_PROBE_DEFINE4(io, kernel, , getblk__done,
"struct vnode *"/*vp*/, "daddr_t"/*blkno*/, "int"/*size*/,
"struct buf *"/*bp*/);
SDT_PROBE_DEFINE2(io, kernel, , brelse, "struct buf *"/*bp*/, "int"/*set*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__start, "struct buf *"/*bp*/);
SDT_PROBE_DEFINE1(io, kernel, , wait__done, "struct buf *"/*bp*/);
#ifndef BUFPAGES
# define BUFPAGES 0
#endif
#ifdef BUFCACHE
# if (BUFCACHE < 5) || (BUFCACHE > 95)
# error BUFCACHE is not between 5 and 95
# endif
#else
# define BUFCACHE 15
#endif
u_int nbuf; /* desired number of buffer headers */
u_int bufpages = BUFPAGES; /* optional hardwired count */
u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */
/*
* Definitions for the buffer free lists.
*/
#define BQUEUES 3 /* number of free buffer queues */
#define BQ_LOCKED 0 /* super-blocks &c */
#define BQ_LRU 1 /* lru, useful buffers */
#define BQ_AGE 2 /* rubbish */
struct bqueue {
TAILQ_HEAD(, buf) bq_queue;
uint64_t bq_bytes;
buf_t *bq_marker;
};
static struct bqueue bufqueues[BQUEUES] __cacheline_aligned;
/* Function prototypes */
static void buf_setwm(void);
static int buf_trim(void);
static void *bufpool_page_alloc(struct pool *, int);
static void bufpool_page_free(struct pool *, void *);
static buf_t *bio_doread(struct vnode *, daddr_t, int, int);
static buf_t *getnewbuf(int, int, int);
static int buf_lotsfree(void);
static int buf_canrelease(void);
static u_long buf_mempoolidx(u_long);
static u_long buf_roundsize(u_long);
static void *buf_alloc(size_t);
static void buf_mrelease(void *, size_t);
static void binsheadfree(buf_t *, struct bqueue *);
static void binstailfree(buf_t *, struct bqueue *);
#ifdef DEBUG
static int checkfreelist(buf_t *, struct bqueue *, int);
#endif
static void biointr(void *);
static void biodone2(buf_t *);
static void sysctl_kern_buf_setup(void);
static void sysctl_vm_buf_setup(void);
/* Initialization for biohist */
#include <sys/biohist.h>
BIOHIST_DEFINE(biohist);
void
biohist_init(void)
{
BIOHIST_INIT(biohist, BIOHIST_SIZE);
}
/*
* Definitions for the buffer hash lists.
*/
#define BUFHASH(dvp, lbn) \
(&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long bufhash;
static int bufhash_stats(struct hashstat_sysctl *, bool);
static kcondvar_t needbuffer_cv;
/*
* Buffer queue lock.
*/
kmutex_t bufcache_lock __cacheline_aligned;
kmutex_t buffer_lock __cacheline_aligned;
/* Software ISR for completed transfers. */
static void *biodone_sih;
/* Buffer pool for I/O buffers. */
static pool_cache_t buf_cache;
static pool_cache_t bufio_cache;
#define MEMPOOL_INDEX_OFFSET (ilog2(DEV_BSIZE)) /* smallest pool is 512 bytes */
#define NMEMPOOLS (ilog2(MAXBSIZE) - MEMPOOL_INDEX_OFFSET + 1)
__CTASSERT((1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) == MAXBSIZE);
/* Buffer memory pools */
static struct pool bmempools[NMEMPOOLS];
static struct vm_map *buf_map;
/*
* Buffer memory pool allocator.
*/
static void *
bufpool_page_alloc(struct pool *pp, int flags)
{
return (void *)uvm_km_alloc(buf_map,
MAXBSIZE, MAXBSIZE,
((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT|UVM_KMF_TRYLOCK)
| UVM_KMF_WIRED);
}
static void
bufpool_page_free(struct pool *pp, void *v)
{
uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED);
}
static struct pool_allocator bufmempool_allocator = {
.pa_alloc = bufpool_page_alloc,
.pa_free = bufpool_page_free,
.pa_pagesz = MAXBSIZE,
};
/* Buffer memory management variables */
u_long bufmem_valimit;
u_long bufmem_hiwater;
u_long bufmem_lowater;
u_long bufmem;
/*
* MD code can call this to set a hard limit on the amount
* of virtual memory used by the buffer cache.
*/
int
buf_setvalimit(vsize_t sz)
{
/* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */
if (sz < NMEMPOOLS * MAXBSIZE)
return EINVAL;
bufmem_valimit = sz;
return 0;
}
static void
buf_setwm(void)
{
bufmem_hiwater = buf_memcalc();
/* lowater is approx. 2% of memory (with bufcache = 15) */
#define BUFMEM_WMSHIFT 3
#define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT)
if (bufmem_hiwater < BUFMEM_HIWMMIN)
/* Ensure a reasonable minimum value */
bufmem_hiwater = BUFMEM_HIWMMIN;
bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT;
}
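/*
* Worked example (illustrative, assuming 1 GiB of physical memory and the
* default bufcache = 15): buf_memcalc() yields a high water mark of about
* 15% of RAM, i.e. roughly 150 MiB, and shifting right by BUFMEM_WMSHIFT
* (3) gives a low water mark of roughly 19 MiB, which is the "approx. 2%
* of memory" noted above.
*/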
#ifdef DEBUG
int debug_verify_freelist = 0;
static int
checkfreelist(buf_t *bp, struct bqueue *dp, int ison)
{
buf_t *b;
if (!debug_verify_freelist)
return 1;
TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) {
if (b == bp)
return ison ? 1 : 0;
}
return ison ? 0 : 1;
}
#endif
/*
* Insq/Remq for the buffer free lists.
* Call with buffer queue locked.
*/
static void
binsheadfree(buf_t *bp, struct bqueue *dp)
{
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bp->b_freelistindex == -1);
TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes += bp->b_bufsize;
bp->b_freelistindex = dp - bufqueues;
}
static void
binstailfree(buf_t *bp, struct bqueue *dp)
{
KASSERT(mutex_owned(&bufcache_lock));
KASSERTMSG(bp->b_freelistindex == -1, "double free of buffer? "
"bp=%p, b_freelistindex=%d\n", bp, bp->b_freelistindex);
TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes += bp->b_bufsize;
bp->b_freelistindex = dp - bufqueues;
}
void
bremfree(buf_t *bp)
{
struct bqueue *dp;
int bqidx = bp->b_freelistindex;
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(bqidx != -1);
dp = &bufqueues[bqidx];
KDASSERT(checkfreelist(bp, dp, 1));
KASSERT(dp->bq_bytes >= bp->b_bufsize);
TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist);
dp->bq_bytes -= bp->b_bufsize;
/* For the sysctl helper. */
if (bp == dp->bq_marker)
dp->bq_marker = NULL;
#if defined(DIAGNOSTIC)
bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
}
/*
* note that for some ports this is used by pmap bootstrap code to
* determine kva size.
*/
u_long
buf_memcalc(void)
{
u_long n;
vsize_t mapsz = 0;
/*
* Determine the upper bound of memory to use for buffers.
*
* - If bufpages is specified, use that as the number of
* pages.
*
* - Otherwise, use bufcache as the percentage of
* physical memory.
*/
if (bufpages != 0) {
n = bufpages;
} else {
if (bufcache < 5) {
printf("forcing bufcache %d -> 5", bufcache);
bufcache = 5;
}
if (bufcache > 95) {
printf("forcing bufcache %d -> 95", bufcache);
bufcache = 95;
}
if (buf_map != NULL)
mapsz = vm_map_max(buf_map) - vm_map_min(buf_map);
n = calc_cache_size(mapsz, bufcache,
(buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT)
/ PAGE_SIZE;
}
n <<= PAGE_SHIFT;
if (bufmem_valimit != 0 && n > bufmem_valimit)
n = bufmem_valimit;
return (n);
}
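/*
* Worked example (illustrative): with bufpages = 2048 set by the operator
* and a 4 KiB page size, the code above computes 2048 << PAGE_SHIFT =
* 8 MiB, subject to the bufmem_valimit clamp if one was set via
* buf_setvalimit().
*/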
/*
* Initialize buffers and hash links for buffers.
*/
void
bufinit(void)
{
struct bqueue *dp;
int use_std;
u_int i;
biodone_vfs = biodone;
mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE);
mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&needbuffer_cv, "needbuf");
if (bufmem_valimit != 0) {
vaddr_t minaddr = 0, maxaddr;
buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
bufmem_valimit, 0, false, 0);
if (buf_map == NULL)
panic("bufinit: cannot allocate submap");
} else
buf_map = kernel_map;
/*
* Initialize buffer cache memory parameters.
*/
bufmem = 0;
buf_setwm();
/* On "small" machines use small pool page sizes where possible */
use_std = (physmem < atop(16*1024*1024));
/*
* Also use them on systems that can map the pool pages using
* a direct-mapped segment.
*/
#ifdef PMAP_MAP_POOLPAGE
use_std = 1;
#endif
buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
"bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL);
bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0,
"biopl", NULL, IPL_BIO, NULL, NULL, NULL);
for (i = 0; i < NMEMPOOLS; i++) {
struct pool_allocator *pa;
struct pool *pp = &bmempools[i];
u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET);
char *name = kmem_alloc(8, KM_SLEEP); /* XXX: never freed */
if (__predict_false(size >= 1048576))
(void)snprintf(name, 8, "buf%um", size / 1048576);
else if (__predict_true(size >= 1024))
(void)snprintf(name, 8, "buf%uk", size / 1024);
else
(void)snprintf(name, 8, "buf%ub", size);
pa = (size <= PAGE_SIZE && use_std)
? &pool_allocator_nointr
: &bufmempool_allocator;
pool_init(pp, size, DEV_BSIZE, 0, 0, name, pa, IPL_NONE);
pool_setlowat(pp, 1);
pool_sethiwat(pp, 1);
}
/* Initialize the buffer queues */
for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) {
TAILQ_INIT(&dp->bq_queue);
dp->bq_bytes = 0;
}
/*
* Estimate hash table size based on the amount of memory we
* intend to use for the buffer cache. The average buffer
* size is dependent on our clients (i.e. filesystems).
*
* For now, use an empirical 3K per buffer.
*/
nbuf = (bufmem_hiwater / 1024) / 3;
bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash);
sysctl_kern_buf_setup();
sysctl_vm_buf_setup();
hashstat_register("bufhash", bufhash_stats);
}
void
bufinit2(void)
{
biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr,
NULL);
if (biodone_sih == NULL)
panic("bufinit2: can't establish soft interrupt");
}
static int
buf_lotsfree(void)
{
u_long guess;
/* Always allocate if less than the low water mark. */
if (bufmem < bufmem_lowater)
return 1;
/* Never allocate if greater than the high water mark. */
if (bufmem > bufmem_hiwater)
return 0;
/* If there's anything on the AGE list, it should be eaten. */
if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL)
return 0;
/*
* The probability of getting a new allocation is inversely
* proportional to the current size of the cache above
* the low water mark. Divide the total first to avoid overflows
* in the product.
*/
guess = cprng_fast32() % 16;
if ((bufmem_hiwater - bufmem_lowater) / 16 * guess >=
(bufmem - bufmem_lowater))
return 1;
/* Otherwise don't allocate. */
return 0;
}
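/*
* Worked example (illustrative): if bufmem sits exactly halfway between
* bufmem_lowater and bufmem_hiwater, the comparison above succeeds only
* when guess >= 8, i.e. for 8 of the 16 possible values of guess, so a new
* allocation is granted with probability about 1/2. The closer bufmem gets
* to the high water mark, the smaller that probability becomes.
*/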
/*
* Return estimate of bytes we think need to be
* released to help resolve low memory conditions.
*
* => called with bufcache_lock held.
*/
static int
buf_canrelease(void)
{
int pagedemand, ninvalid = 0;
KASSERT(mutex_owned(&bufcache_lock));
if (bufmem < bufmem_lowater)
return 0;
if (bufmem > bufmem_hiwater)
return bufmem - bufmem_hiwater;
ninvalid += bufqueues[BQ_AGE].bq_bytes;
pagedemand = uvmexp.freetarg - uvm_availmem(false);
if (pagedemand < 0)
return ninvalid;
return MAX(ninvalid, MIN(2 * MAXBSIZE,
MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE)));
}
/*
* Buffer memory allocation helper functions
*/
static u_long
buf_mempoolidx(u_long size)
{
u_int n = 0;
size -= 1;
size >>= MEMPOOL_INDEX_OFFSET;
while (size) {
size >>= 1;
n += 1;
}
if (n >= NMEMPOOLS)
panic("buf mem pool index %d", n);
return n;
}
static u_long
buf_roundsize(u_long size)
{
/* Round up to nearest power of 2 */
return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET));
}
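/*
* Worked example (illustrative, assuming DEV_BSIZE = 512 so that
* MEMPOOL_INDEX_OFFSET = 9): for a request of 3000 bytes,
* buf_mempoolidx(3000) == 3, since (3000 - 1) >> 9 == 5 and 5 takes three
* right shifts to reach zero; buf_roundsize(3000) is therefore
* 1 << (3 + 9) == 4096, i.e. the request is served from the "buf4k" pool
* initialized in bufinit().
*/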
static void *
buf_alloc(size_t size)
{
u_int n = buf_mempoolidx(size);
void *addr;
while (1) {
addr = pool_get(&bmempools[n], PR_NOWAIT);
if (addr != NULL)
break;
/* No memory, see if we can free some. If so, try again */
mutex_enter(&bufcache_lock);
if (buf_drain(1) > 0) {
mutex_exit(&bufcache_lock);
continue;
}
if (curlwp == uvm.pagedaemon_lwp) {
mutex_exit(&bufcache_lock);
return NULL;
}
/* Wait for buffers to arrive on the LRU queue */
cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4);
mutex_exit(&bufcache_lock);
}
return addr;
}
static void
buf_mrelease(void *addr, size_t size)
{
pool_put(&bmempools[buf_mempoolidx(size)], addr);
}
/*
* bread()/breadn() helper.
*/
static buf_t *
bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
{
buf_t *bp;
struct mount *mp;
bp = getblk(vp, blkno, size, 0, 0);
/*
* getblk() may return NULL if we are the pagedaemon.
*/
if (bp == NULL) {
KASSERT(curlwp == uvm.pagedaemon_lwp);
return NULL;
}
/*
* If buffer does not have data valid, start a read.
* Note that if buffer is BC_INVAL, getblk() won't return it.
* Therefore, it's valid if its I/O has completed or been delayed.
*/
if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) {
/* Start I/O for the buffer. */
SET(bp->b_flags, B_READ | async);
if (async)
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
else
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
VOP_STRATEGY(vp, bp);
/* Pay for the read. */
curlwp->l_ru.ru_inblock++;
} else if (async)
brelse(bp, 0);
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
/*
* Collect statistics on synchronous and asynchronous reads.
* Reads from block devices are charged to their associated
* filesystem (if any).
*/
if (mp != NULL) {
if (async == 0)
mp->mnt_stat.f_syncreads++;
else
mp->mnt_stat.f_asyncreads++;
}
return (bp);
}
/*
* Read a disk block.
* This algorithm described in Bach (p.54).
*/
int
bread(struct vnode *vp, daddr_t blkno, int size, int flags, buf_t **bpp)
{
buf_t *bp;
int error;
BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
/* Get buffer for block. */
bp = *bpp = bio_doread(vp, blkno, size, 0);
if (bp == NULL)
return ENOMEM;
/* Wait for the read to complete, and return result. */
error = biowait(bp);
if (error == 0 && (flags & B_MODIFY) != 0)
error = fscow_run(bp, true);
if (error) {
brelse(bp, 0);
*bpp = NULL;
}
return error;
}
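/*
* Illustrative usage sketch (hypothetical caller, not part of this file):
* a filesystem reads one logical block and releases it when done.
*
*     buf_t *bp;
*     int error;
*
*     error = bread(vp, lblkno, bsize, 0, &bp);
*     if (error)
*         return error;
*     memcpy(dst, bp->b_data, bsize);
*     brelse(bp, 0);
*
* bread() hands back the buffer busy; the caller must release it with
* brelse() (or write it back with bwrite()) when finished. On error the
* buffer has already been released and *bpp is NULL.
*/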
/*
* Read-ahead multiple disk blocks. The first is sync, the rest async.
* Trivial modification to the breada algorithm presented in Bach (p.55).
*/
int
breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks,
int *rasizes, int nrablks, int flags, buf_t **bpp)
{
buf_t *bp;
int error, i;
BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
bp = *bpp = bio_doread(vp, blkno, size, 0);
if (bp == NULL)
return ENOMEM;
/*
* For each of the read-ahead blocks, start a read, if necessary.
*/
mutex_enter(&bufcache_lock);
for (i = 0; i < nrablks; i++) {
/* If it's in the cache, just go on to next one. */
if (incore(vp, rablks[i]))
continue;
/* Get a buffer for the read-ahead block */
mutex_exit(&bufcache_lock);
(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
mutex_enter(&bufcache_lock);
}
mutex_exit(&bufcache_lock);
/* Otherwise, we had to start a read for it; wait until it's valid. */
error = biowait(bp);
if (error == 0 && (flags & B_MODIFY) != 0)
error = fscow_run(bp, true);
if (error) {
brelse(bp, 0);
*bpp = NULL;
}
return error;
}
/*
* Block write. Described in Bach (p.56)
*/
int
bwrite(buf_t *bp)
{
int rv, sync, wasdelayed;
struct vnode *vp;
struct mount *mp;
BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
(uintptr_t)bp, 0, 0, 0);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
vp = bp->b_vp;
/*
* dholland 20160728 AFAICT vp==NULL must be impossible as it
* will crash upon reaching VOP_STRATEGY below... see further
* analysis on tech-kern.
*/
KASSERTMSG(vp != NULL, "bwrite given buffer with null vnode");
if (vp != NULL) {
KASSERT(bp->b_objlock == vp->v_interlock);
if (vp->v_type == VBLK)
mp = spec_node_getmountedfs(vp);
else
mp = vp->v_mount;
} else {
mp = NULL;
}
if (mp && mp->mnt_wapbl) {
if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
bdwrite(bp);
return 0;
}
}
/*
* Remember buffer type, to switch on it later. If the write was
* synchronous, but the file system was mounted with MNT_ASYNC,
* convert it to a delayed write.
* XXX note that this relies on delayed tape writes being converted
* to async, not sync writes (which is safe, but ugly).
*/
sync = !ISSET(bp->b_flags, B_ASYNC);
if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
bdwrite(bp);
return (0);
}
/*
* Collect statistics on synchronous and asynchronous writes.
* Writes to block devices are charged to their associated
* filesystem (if any).
*/
if (mp != NULL) {
if (sync)
mp->mnt_stat.f_syncwrites++;
else
mp->mnt_stat.f_asyncwrites++;
}
/*
* Pay for the I/O operation and make sure the buf is on the correct
* vnode queue.
*/
bp->b_error = 0;
wasdelayed = ISSET(bp->b_oflags, BO_DELWRI);
CLR(bp->b_flags, B_READ);
if (wasdelayed) {
mutex_enter(&bufcache_lock);
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
reassignbuf(bp, bp->b_vp);
/* Wake anyone trying to busy the buffer via vnode's lists. */
cv_broadcast(&bp->b_busy);
mutex_exit(&bufcache_lock);
} else {
curlwp->l_ru.ru_oublock++;
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE | BO_DELWRI);
}
if (vp != NULL)
vp->v_numoutput++;
mutex_exit(bp->b_objlock);
/* Initiate disk write. */
if (sync)
BIO_SETPRIO(bp, BPRIO_TIMECRITICAL);
else
BIO_SETPRIO(bp, BPRIO_TIMELIMITED);
VOP_STRATEGY(vp, bp);
if (sync) {
/* If I/O was synchronous, wait for it to complete. */
rv = biowait(bp);
/* Release the buffer. */
brelse(bp, 0);
return (rv);
} else {
return (0);
}
}
int
vn_bwrite(void *v)
{
struct vop_bwrite_args *ap = v;
return (bwrite(ap->a_bp));
}
/*
* Delayed write.
*
* The buffer is marked dirty, but is not queued for I/O.
* This routine should be used when the buffer is expected
* to be modified again soon, typically a small write that
* partially fills a buffer.
*
* NB: magnetic tapes cannot be delayed; they must be
* written in the order that the writes are requested.
*
* Described in Leffler, et al. (pp. 208-213).
*/
void
bdwrite(buf_t *bp)
{
BIOHIST_FUNC(__func__); BIOHIST_CALLARGS(biohist, "bp=%#jx",
(uintptr_t)bp, 0, 0, 0);
KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS ||
bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE));
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
/* If this is a tape block, write the block now. */
if (bdev_type(bp->b_dev) == D_TAPE) {
bawrite(bp);
return;
}
if (wapbl_vphaswapbl(bp->b_vp)) {
struct mount *mp = wapbl_vptomp(bp->b_vp);
if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) {
WAPBL_ADD_BUF(mp, bp);
}
}
/*
* If the block hasn't been seen before:
* (1) Mark it as having been seen,
* (2) Charge for the write,
* (3) Make sure it's on its vnode's correct block list.
*/
KASSERT(bp->b_vp == NULL || bp->b_objlock == bp->b_vp->v_interlock);
if (!ISSET(bp->b_oflags, BO_DELWRI)) {
mutex_enter(&bufcache_lock);
mutex_enter(bp->b_objlock);
SET(bp->b_oflags, BO_DELWRI);
curlwp->l_ru.ru_oublock++;
reassignbuf(bp, bp->b_vp);
/* Wake anyone trying to busy the buffer via vnode's lists. */
cv_broadcast(&bp->b_busy);
mutex_exit(&bufcache_lock);
} else {
mutex_enter(bp->b_objlock);
}
/* Otherwise, the "write" is done, so mark and release the buffer. */
CLR(bp->b_oflags, BO_DONE);
mutex_exit(bp->b_objlock);
brelse(bp, 0);
}
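/*
* Illustrative usage sketch (hypothetical caller): the typical
* read-modify-delayed-write cycle for an update that touches only part of
* a block.
*
*     error = bread(vp, lblkno, bsize, 0, &bp);
*     if (error)
*         return error;
*     memcpy((char *)bp->b_data + off, src, len);
*     bdwrite(bp);
*
* The buffer is marked BO_DELWRI and released; the write to disk happens
* later, when the buffer is reclaimed or explicitly flushed.
*/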
/*
* Asynchronous block write; just an asynchronous bwrite().
*/
void
bawrite(buf_t *bp)
{
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(bp->b_vp != NULL);
SET(bp->b_flags, B_ASYNC);
VOP_BWRITE(bp->b_vp, bp);
}
/*
* Release a buffer on to the free lists.
* Described in Bach (p. 46).
*/
void
brelsel(buf_t *bp, int set)
{
struct bqueue *bufq;
struct vnode *vp;
SDT_PROBE2(io, kernel, , brelse, bp, set);
KASSERT(bp != NULL);
KASSERT(mutex_owned(&bufcache_lock));
KASSERT(!cv_has_waiters(&bp->b_done));
SET(bp->b_cflags, set);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(bp->b_iodone == NULL);
/* Wake up any processes waiting for any buffer to become free. */
cv_signal(&needbuffer_cv);
/* Wake up any processes waiting for _this_ buffer to become free */
if (ISSET(bp->b_cflags, BC_WANTED))
CLR(bp->b_cflags, BC_WANTED|BC_AGE);
/* If it's clean clear the copy-on-write flag. */
if (ISSET(bp->b_flags, B_COWDONE)) {
mutex_enter(bp->b_objlock);
if (!ISSET(bp->b_oflags, BO_DELWRI))
CLR(bp->b_flags, B_COWDONE);
mutex_exit(bp->b_objlock);
}
/*
* Determine which queue the buffer should be on, then put it there.
*/
/* If it's locked, don't report an error; try again later. */
if (ISSET(bp->b_flags, B_LOCKED))
bp->b_error = 0;
/* If it's not cacheable, or an error, mark it invalid. */
if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0)
SET(bp->b_cflags, BC_INVAL);
if (ISSET(bp->b_cflags, BC_VFLUSH)) {
/*
* This is a delayed write buffer that was just flushed to
* disk. It is still on the LRU queue. If it's become
* invalid, then we need to move it to a different queue;
* otherwise leave it in its current position.
*/
CLR(bp->b_cflags, BC_VFLUSH);
if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) &&
!ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) {
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1));
goto already_queued;
} else {
bremfree(bp);
}
}
KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0));
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0));
KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0));
if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) {
/*
* If it's invalid or empty, dissociate it from its vnode
* and put on the head of the appropriate queue.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
if (wapbl_vphaswapbl(vp = bp->b_vp)) {
struct mount *mp = wapbl_vptomp(vp);
KASSERT(bp->b_iodone
!= mp->mnt_wapbl_op->wo_wapbl_biodone);
WAPBL_REMOVE_BUF(mp, bp);
}
}
mutex_enter(bp->b_objlock);
CLR(bp->b_oflags, BO_DONE|BO_DELWRI);
if ((vp = bp->b_vp) != NULL) {
KASSERT(bp->b_objlock == vp->v_interlock);
reassignbuf(bp, bp->b_vp);
brelvp(bp);
mutex_exit(vp->v_interlock);
} else {
KASSERT(bp->b_objlock == &buffer_lock);
mutex_exit(bp->b_objlock);
}
/* We want to dispose of the buffer, so wake everybody. */
cv_broadcast(&bp->b_busy);
if (bp->b_bufsize <= 0)
/* no data */
goto already_queued;
else
/* invalid data */
bufq = &bufqueues[BQ_AGE];
binsheadfree(bp, bufq);
} else {
/*
* It has valid data. Put it on the end of the appropriate
* queue, so that it'll stick around for as long as possible.
* If buf is AGE, but has dependencies, must put it on last
* bufqueue to be scanned, ie LRU. This protects against the
* livelock where BQ_AGE only has buffers with dependencies,
* and we thus never get to the dependent buffers in BQ_LRU.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
/* locked in core */
bufq = &bufqueues[BQ_LOCKED];
} else if (!ISSET(bp->b_cflags, BC_AGE)) {
/* valid data */
bufq = &bufqueues[BQ_LRU];
} else {
/* stale but valid data */
bufq = &bufqueues[BQ_AGE];
}
binstailfree(bp, bufq);
}
already_queued:
/* Unlock the buffer. */
CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE);
CLR(bp->b_flags, B_ASYNC);
/*
* Wake only the highest priority waiter on the lock, in order to
* prevent a thundering herd: many LWPs simultaneously awakening and
* competing for the buffer's lock. Testing in 2019 revealed this
* to reduce contention on bufcache_lock tenfold during a kernel
* compile. Here and elsewhere, when the buffer is changing
* identity, being disposed of, or moving from one list to another,
* we wake all lock requestors.
*/
if (bp->b_bufsize <= 0) {
cv_broadcast(&bp->b_busy);
buf_destroy(bp);
#ifdef DEBUG
memset((char *)bp, 0, sizeof(*bp));
#endif
pool_cache_put(buf_cache, bp);
} else
cv_signal(&bp->b_busy);
}
void
brelse(buf_t *bp, int set)
{
mutex_enter(&bufcache_lock);
brelsel(bp, set);
mutex_exit(&bufcache_lock);
}
/*
* Determine if a block is in the cache.
* Just look on what would be its hash chain. If it's there, return
* a pointer to it, unless it's marked invalid. If it's marked invalid,
* we normally don't return the buffer, unless the caller explicitly
* wants us to.
*/
buf_t *
incore(struct vnode *vp, daddr_t blkno)
{
buf_t *bp;
KASSERT(mutex_owned(&bufcache_lock));
/* Search hash chain */
LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
if (bp->b_lblkno == blkno && bp->b_vp == vp &&
!ISSET(bp->b_cflags, BC_INVAL)) {
KASSERT(bp->b_objlock == vp->v_interlock);
return (bp);
}
}
return (NULL);
}
/*
* Get a block of requested size that is associated with
* a given vnode and block offset. If it is found in the
* block cache, mark it as having been found, make it busy
* and return it. Otherwise, return an empty block of the
* correct size. It is up to the caller to ensure that the
* cached blocks are of the correct size.
*/
buf_t *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
int err, preserve;
buf_t *bp;
mutex_enter(&bufcache_lock);
SDT_PROBE3(io, kernel, , getblk__start, vp, blkno, size);
loop:
bp = incore(vp, blkno);
if (bp != NULL) {
err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL);
if (err != 0) {
if (err == EPASSTHROUGH)
goto loop;
mutex_exit(&bufcache_lock);
SDT_PROBE4(io, kernel, , getblk__done,
vp, blkno, size, NULL);
return (NULL);
}
KASSERT(!cv_has_waiters(&bp->b_done));
#ifdef DIAGNOSTIC
if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) &&
bp->b_bcount < size && vp->v_type != VBLK)
panic("getblk: block size invariant failed");
#endif
bremfree(bp);
preserve = 1;
} else {
if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL)
goto loop;
if (incore(vp, blkno) != NULL) {
/* The block has come into memory in the meantime. */
brelsel(bp, 0);
goto loop;
}
LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash);
bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
mutex_enter(vp->v_interlock);
bgetvp(vp, bp);
mutex_exit(vp->v_interlock);
preserve = 0;
}
mutex_exit(&bufcache_lock);
/*
* LFS can't track total size of B_LOCKED buffer (locked_queue_bytes)
* if we re-size buffers here.
*/
if (ISSET(bp->b_flags, B_LOCKED)) {
KASSERT(bp->b_bufsize >= size);
} else {
if (allocbuf(bp, size, preserve)) {
mutex_enter(&bufcache_lock);
LIST_REMOVE(bp, b_hash);
brelsel(bp, BC_INVAL);
mutex_exit(&bufcache_lock);
SDT_PROBE4(io, kernel, , getblk__done,
vp, blkno, size, NULL);
return NULL;
}
}
BIO_SETPRIO(bp, BPRIO_DEFAULT);
SDT_PROBE4(io, kernel, , getblk__done, vp, blkno, size, bp);
return (bp);
}
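/*
* Illustrative usage sketch (hypothetical caller): obtaining a block that
* is about to be completely overwritten, so no read from disk is needed.
*
*     bp = getblk(vp, lblkno, bsize, 0, 0);
*     if (bp == NULL)
*         return ENOMEM;
*     memset(bp->b_data, 0, bsize);
*     bwrite(bp);
*
* getblk() returns the buffer busy (or NULL, e.g. for the pagedaemon); the
* caller ends the transaction with brelse(), bdwrite() or bwrite().
*/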
/*
* Get an empty, disassociated buffer of given size.
*/
buf_t *
geteblk(int size)
{
buf_t *bp;
int error __diagused;
mutex_enter(&bufcache_lock);
while ((bp = getnewbuf(0, 0, 0)) == NULL)
;
SET(bp->b_cflags, BC_INVAL);
LIST_INSERT_HEAD(&invalhash, bp, b_hash);
mutex_exit(&bufcache_lock);
BIO_SETPRIO(bp, BPRIO_DEFAULT);
error = allocbuf(bp, size, 0);
KASSERT(error == 0);
return (bp);
}
/*
* Expand or contract the actual memory allocated to a buffer.
*
* If the buffer shrinks, data is lost, so it's up to the
* caller to have written it out *first*; this routine will not
* start a write. If the buffer grows, it is the caller's
* responsibility to fill out the buffer's additional contents.
*/
int
allocbuf(buf_t *bp, int size, int preserve)
{
void *addr;
vsize_t oldsize, desired_size;
int oldcount;
int delta;
desired_size = buf_roundsize(size);
if (desired_size > MAXBSIZE)
printf("allocbuf: buffer larger than MAXBSIZE requested");
oldcount = bp->b_bcount;
bp->b_bcount = size;
oldsize = bp->b_bufsize;
if (oldsize == desired_size) {
/*
* Do not short cut the WAPBL resize, as the buffer length
* could still have changed and this would corrupt the
* tracking of the transaction length.
*/
goto out;
}
/*
* If we want a buffer of a different size, re-allocate the
* buffer's memory; copy old content only if needed.
*/
addr = buf_alloc(desired_size);
if (addr == NULL)
return ENOMEM;
if (preserve)
memcpy(addr, bp->b_data, MIN(oldsize, desired_size));
if (bp->b_data != NULL)
buf_mrelease(bp->b_data, oldsize);
bp->b_data = addr;
bp->b_bufsize = desired_size;
/*
* Update overall buffer memory counter (protected by bufcache_lock)
*/
delta = (long)desired_size - (long)oldsize;
mutex_enter(&bufcache_lock);
if ((bufmem += delta) > bufmem_hiwater) {
/*
* Need to trim overall memory usage.
*/
while (buf_canrelease()) {
if (preempt_needed()) {
mutex_exit(&bufcache_lock);
preempt();
mutex_enter(&bufcache_lock);
}
if (buf_trim() == 0)
break;
}
}
mutex_exit(&bufcache_lock);
out:
if (wapbl_vphaswapbl(bp->b_vp))
WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount);
return 0;
}
/*
* Find a buffer which is available for use.
* Select something from a free list.
* Preference is to AGE list, then LRU list.
*
* Called with the buffer queues locked.
* Return buffer locked.
*/
static buf_t *
getnewbuf(int slpflag, int slptimeo, int from_bufq)
{
buf_t *bp;
struct vnode *vp;
struct mount *transmp = NULL;
SDT_PROBE0(io, kernel, , getnewbuf__start);
start:
KASSERT(mutex_owned(&bufcache_lock));
/*
* Get a new buffer from the pool.
*/
if (!from_bufq && buf_lotsfree()) {
mutex_exit(&bufcache_lock);
bp = pool_cache_get(buf_cache, PR_NOWAIT);
if (bp != NULL) {
memset((char *)bp, 0, sizeof(*bp));
buf_init(bp);
SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */
mutex_enter(&bufcache_lock);
#if defined(DIAGNOSTIC)
bp->b_freelistindex = -1;
#endif /* defined(DIAGNOSTIC) */
SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
return (bp);
}
mutex_enter(&bufcache_lock);
}
KASSERT(mutex_owned(&bufcache_lock));
if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL) {
KASSERT(!ISSET(bp->b_oflags, BO_DELWRI));
} else {
TAILQ_FOREACH(bp, &bufqueues[BQ_LRU].bq_queue, b_freelist) {
if (ISSET(bp->b_cflags, BC_VFLUSH) ||
!ISSET(bp->b_oflags, BO_DELWRI))
break;
if (fstrans_start_nowait(bp->b_vp->v_mount) == 0) {
KASSERT(transmp == NULL);
transmp = bp->b_vp->v_mount;
break;
}
}
}
if (bp != NULL) {
KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH));
bremfree(bp);
/* Buffer is no longer on free lists. */
SET(bp->b_cflags, BC_BUSY);
/* Wake anyone trying to lock the old identity. */
cv_broadcast(&bp->b_busy);
} else {
/*
* XXX: !from_bufq should be removed.
*/
if (!from_bufq || curlwp != uvm.pagedaemon_lwp) {
/* wait for a free buffer of any kind */
if ((slpflag & PCATCH) != 0)
(void)cv_timedwait_sig(&needbuffer_cv,
&bufcache_lock, slptimeo);
else
(void)cv_timedwait(&needbuffer_cv,
&bufcache_lock, slptimeo);
}
SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
return (NULL);
}
#ifdef DIAGNOSTIC
if (bp->b_bufsize <= 0)
panic("buffer %p: on queue but empty", bp);
#endif
if (ISSET(bp->b_cflags, BC_VFLUSH)) {
/*
* This is a delayed write buffer being flushed to disk. Make
* sure it gets aged out of the queue when it's finished, and
* leave it off the LRU queue.
*/
CLR(bp->b_cflags, BC_VFLUSH);
SET(bp->b_cflags, BC_AGE);
goto start;
}
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
KASSERT(!cv_has_waiters(&bp->b_done));
/*
* If buffer was a delayed write, start it and return NULL
* (since we might sleep while starting the write).
*/
if (ISSET(bp->b_oflags, BO_DELWRI)) {
/*
* This buffer has gone through the LRU, so make sure it gets
* reused ASAP.
*/
SET(bp->b_cflags, BC_AGE);
mutex_exit(&bufcache_lock);
bawrite(bp);
KASSERT(transmp != NULL);
fstrans_done(transmp);
mutex_enter(&bufcache_lock);
SDT_PROBE1(io, kernel, , getnewbuf__done, NULL);
return (NULL);
}
KASSERT(transmp == NULL);
vp = bp->b_vp;
/* clear out various other fields */
bp->b_cflags = BC_BUSY;
bp->b_oflags = 0;
bp->b_flags = 0;
bp->b_dev = NODEV;
bp->b_blkno = 0;
bp->b_lblkno = 0;
bp->b_rawblkno = 0;
bp->b_iodone = 0;
bp->b_error = 0;
bp->b_resid = 0;
bp->b_bcount = 0;
LIST_REMOVE(bp, b_hash);
/* Disassociate us from our vnode, if we had one... */
if (vp != NULL) {
mutex_enter(vp->v_interlock);
brelvp(bp);
mutex_exit(vp->v_interlock);
}
SDT_PROBE1(io, kernel, , getnewbuf__done, bp);
return (bp);
}
/*
* Invalidate the specified buffer if it exists.
*/
void
binvalbuf(struct vnode *vp, daddr_t blkno)
{
buf_t *bp;
int err;
mutex_enter(&bufcache_lock);
loop:
bp = incore(vp, blkno);
if (bp != NULL) {
err = bbusy(bp, 0, 0, NULL);
if (err == EPASSTHROUGH)
goto loop;
bremfree(bp);
if (ISSET(bp->b_oflags, BO_DELWRI)) {
SET(bp->b_cflags, BC_NOCACHE);
mutex_exit(&bufcache_lock);
bwrite(bp);
} else {
brelsel(bp, BC_INVAL);
mutex_exit(&bufcache_lock);
}
} else
mutex_exit(&bufcache_lock);
}
/*
* Attempt to free an aged buffer off the queues.
* Called with queue lock held.
* Returns the amount of buffer memory freed.
*/
static int
buf_trim(void)
{
buf_t *bp;
long size;
KASSERT(mutex_owned(&bufcache_lock));
/* Instruct getnewbuf() to get buffers off the queues */
if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL)
return 0;
KASSERT((bp->b_cflags & BC_WANTED) == 0);
size = bp->b_bufsize;
bufmem -= size;
if (size > 0) {
buf_mrelease(bp->b_data, size);
bp->b_bcount = bp->b_bufsize = 0;
}
/* brelse() will return the buffer to the global buffer pool */
brelsel(bp, 0);
return size;
}
int
buf_drain(int n)
{
int size = 0, sz;
KASSERT(mutex_owned(&bufcache_lock));
while (size < n && bufmem > bufmem_lowater) {
sz = buf_trim();
if (sz <= 0)
break;
size += sz;
}
return size;
}
/*
* Wait for operations on the buffer to complete.
* When they do, extract and return the I/O's error value.
*/
int
biowait(buf_t *bp)
{
BIOHIST_FUNC(__func__);
KASSERT(ISSET(bp->b_cflags, BC_BUSY));
SDT_PROBE1(io, kernel, , wait__start, bp);
mutex_enter(bp->b_objlock);
BIOHIST_CALLARGS(biohist, "bp=%#jx, oflags=0x%jx, ret_addr=%#jx",
(uintptr_t)bp, bp->b_oflags,
(uintptr_t)__builtin_return_address(0), 0);
while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) {
BIOHIST_LOG(biohist, "waiting bp=%#jx", (uintptr_t)bp, 0, 0, 0);
cv_wait(&bp->b_done, bp->b_objlock);
}
mutex_exit(bp->b_objlock);
SDT_PROBE1(io, kernel, , wait__done, bp);
BIOHIST_LOG(biohist, "return %jd", bp->b_error, 0, 0, 0);
return bp->b_error;
}
/*
* Mark I/O complete on a buffer.
*
* If a callback has been requested, e.g. the pageout
* daemon, do so. Otherwise, awaken waiting processes.
*
* [ Leffler, et al., says on p.247:
* "This routine wakes up the blocked process, frees the buffer
* for an asynchronous write, or, for a request by the pagedaemon
* process, invokes a procedure specified in the buffer structure" ]
*
* In real life, the pagedaemon (or other system processes) wants
* to do async stuff too, and doesn't want the buffer brelse()'d.
* (for swap pager, that puts swap buffers on the free lists (!!!),
* for the vn device, that puts allocated buffers on the free lists!)
*/
void
biodone(buf_t *bp)
{
int s;
BIOHIST_FUNC(__func__);
KASSERT(!ISSET(bp->b_oflags, BO_DONE));
if (cpu_intr_p()) {
/* From interrupt mode: defer to a soft interrupt. */
s = splvm();
TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq);
BIOHIST_CALLARGS(biohist, "bp=%#jx, softint scheduled",
(uintptr_t)bp, 0, 0, 0);
softint_schedule(biodone_sih);
splx(s);
} else {
/* Process now - the buffer may be freed soon. */
biodone2(bp);
}
}
SDT_PROBE_DEFINE1(io, kernel, , done, "struct buf *"/*bp*/);
static void
biodone2(buf_t *bp)
{
void (*callout)(buf_t *);
SDT_PROBE1(io, kernel, ,done, bp);
BIOHIST_FUNC(__func__);
BIOHIST_CALLARGS(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
mutex_enter(bp->b_objlock);
/* Note that the transfer is done. */
if (ISSET(bp->b_oflags, BO_DONE))
panic("biodone2 already");
CLR(bp->b_flags, B_COWDONE);
SET(bp->b_oflags, BO_DONE);
BIO_SETPRIO(bp, BPRIO_DEFAULT);
/* Wake up waiting writers. */
if (!ISSET(bp->b_flags, B_READ))
vwakeup(bp);
if ((callout = bp->b_iodone) != NULL) {
BIOHIST_LOG(biohist, "callout %#jx", (uintptr_t)callout,
0, 0, 0);
/* Note callout done, then call out. */
KASSERT(!cv_has_waiters(&bp->b_done));
bp->b_iodone = NULL;
mutex_exit(bp->b_objlock);
(*callout)(bp);
} else if (ISSET(bp->b_flags, B_ASYNC)) {
/* If async, release. */
BIOHIST_LOG(biohist, "async", 0, 0, 0, 0);
KASSERT(!cv_has_waiters(&bp->b_done));
mutex_exit(bp->b_objlock);
brelse(bp, 0);
} else {
/* Otherwise just wake up waiters in biowait(). */
BIOHIST_LOG(biohist, "wake-up", 0, 0, 0, 0);
cv_broadcast(&bp->b_done);
mutex_exit(bp->b_objlock);
}
}
static void
biointr(void *cookie)
{
struct cpu_info *ci;
buf_t *bp;
int s;
BIOHIST_FUNC(__func__); BIOHIST_CALLED(biohist);
ci = curcpu();
s = splvm();
while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) {
KASSERT(curcpu() == ci);
bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone);
TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq);
splx(s);
BIOHIST_LOG(biohist, "bp=%#jx", (uintptr_t)bp, 0, 0, 0);
biodone2(bp);
s = splvm();
}
splx(s);
}
static void
sysctl_fillbuf(const buf_t *i, struct buf_sysctl *o)
{
const bool allowaddr = get_expose_address(curproc);
memset(o, 0, sizeof(*o));
o->b_flags = i->b_flags | i->b_cflags | i->b_oflags;
o->b_error = i->b_error;
o->b_prio = i->b_prio;
o->b_dev = i->b_dev;
o->b_bufsize = i->b_bufsize;
o->b_bcount = i->b_bcount;
o->b_resid = i->b_resid;
COND_SET_VALUE(o->b_addr, PTRTOUINT64(i->b_data), allowaddr);
o->b_blkno = i->b_blkno;
o->b_rawblkno = i->b_rawblkno;
COND_SET_VALUE(o->b_iodone, PTRTOUINT64(i->b_iodone), allowaddr);
COND_SET_VALUE(o->b_proc, PTRTOUINT64(i->b_proc), allowaddr);
COND_SET_VALUE(o->b_vp, PTRTOUINT64(i->b_vp), allowaddr);
COND_SET_VALUE(o->b_saveaddr, PTRTOUINT64(i->b_saveaddr), allowaddr);
o->b_lblkno = i->b_lblkno;
}
static int
sysctl_dobuf(SYSCTLFN_ARGS)
{
buf_t *bp;
struct buf_sysctl bs;
struct bqueue *bq;
char *dp;
u_int i, op, arg;
size_t len, needed, elem_size, out_size;
int error, elem_count, retries;
if (namelen == 1 && name[0] == CTL_QUERY)
return (sysctl_query(SYSCTLFN_CALL(rnode)));
if (namelen != 4)
return (EINVAL);
retries = 100;
retry:
dp = oldp;
len = (oldp != NULL) ? *oldlenp : 0;
op = name[0];
arg = name[1];
elem_size = name[2];
elem_count = name[3];
out_size = MIN(sizeof(bs), elem_size);
/*
* at the moment, these are just "placeholders" to make the
* API for retrieving kern.buf data more extensible in the
* future.
*
* XXX kern.buf currently has "netbsd32" issues. hopefully
* these will be resolved at a later point.
*/
if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL ||
elem_size < 1 || elem_count < 0)
return (EINVAL);
if (oldp == NULL) {
/* count only, don't run through the buffer queues */
needed = pool_cache_nget(buf_cache) - pool_cache_nput(buf_cache);
*oldlenp = (needed + KERN_BUFSLOP) * elem_size;
return 0;
}
error = 0;
needed = 0;
sysctl_unlock();
mutex_enter(&bufcache_lock);
for (i = 0; i < BQUEUES; i++) {
bq = &bufqueues[i];
TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) {
bq->bq_marker = bp;
if (len >= elem_size && elem_count > 0) {
sysctl_fillbuf(bp, &bs);
mutex_exit(&bufcache_lock);
error = copyout(&bs, dp, out_size);
mutex_enter(&bufcache_lock);
if (error)
break;
if (bq->bq_marker != bp) {
/*
* This sysctl node is only for
* statistics. Retry; if the
* queue keeps changing, then
* bail out.
*/
if (retries-- == 0) {
error = EAGAIN;
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
goto retry;
}
dp += elem_size;
len -= elem_size;
}
needed += elem_size;
if (elem_count > 0 && elem_count != INT_MAX)
elem_count--;
}
if (error != 0)
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
*oldlenp = needed;
return (error);
}
static int
sysctl_bufvm_update(SYSCTLFN_ARGS)
{
int error, rv;
struct sysctlnode node;
unsigned int temp_bufcache;
unsigned long temp_water;
/* Take a copy of the supplied node and its data */
node = *rnode;
if (node.sysctl_data == &bufcache) {
node.sysctl_data = &temp_bufcache;
temp_bufcache = *(unsigned int *)rnode->sysctl_data;
} else {
node.sysctl_data = &temp_water;
temp_water = *(unsigned long *)rnode->sysctl_data;
}
/* Update the copy */
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return (error);
if (rnode->sysctl_data == &bufcache) {
if (temp_bufcache > 100)
return (EINVAL);
bufcache = temp_bufcache;
buf_setwm();
} else if (rnode->sysctl_data == &bufmem_lowater) {
if (bufmem_hiwater - temp_water < 16)
return (EINVAL);
bufmem_lowater = temp_water;
} else if (rnode->sysctl_data == &bufmem_hiwater) {
if (temp_water - bufmem_lowater < 16)
return (EINVAL);
bufmem_hiwater = temp_water;
} else
return (EINVAL);
/* Drain until below new high water mark */
sysctl_unlock();
mutex_enter(&bufcache_lock);
while (bufmem > bufmem_hiwater) {
rv = buf_drain((bufmem - bufmem_hiwater) / (2 * 1024));
if (rv <= 0)
break;
}
mutex_exit(&bufcache_lock);
sysctl_relock();
return 0;
}
static struct sysctllog *vfsbio_sysctllog;
static void
sysctl_kern_buf_setup(void)
{
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "buf",
SYSCTL_DESCR("Kernel buffer cache information"),
sysctl_dobuf, 0, NULL, 0,
CTL_KERN, KERN_BUF, CTL_EOL);
}
static void
sysctl_vm_buf_setup(void)
{
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "bufcache",
SYSCTL_DESCR("Percentage of physical memory to use for "
"buffer cache"),
sysctl_bufvm_update, 0, &bufcache, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READONLY,
CTLTYPE_LONG, "bufmem",
SYSCTL_DESCR("Amount of kernel memory used by buffer "
"cache"),
NULL, 0, &bufmem, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "bufmem_lowater",
SYSCTL_DESCR("Minimum amount of kernel memory to "
"reserve for buffer cache"),
sysctl_bufvm_update, 0, &bufmem_lowater, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
sysctl_createv(&vfsbio_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_LONG, "bufmem_hiwater",
SYSCTL_DESCR("Maximum amount of kernel memory to use "
"for buffer cache"),
sysctl_bufvm_update, 0, &bufmem_hiwater, 0,
CTL_VM, CTL_CREATE, CTL_EOL);
}
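/*
* Illustrative note (an addition): the nodes created above live under the
* vm subtree and can be inspected or tuned at run time with sysctl(8),
* for example, with sufficient privilege:
*
*     sysctl vm.bufmem
*     sysctl -w vm.bufcache=20
*
* Writes are routed through sysctl_bufvm_update() above, which validates
* the new value and then drains the cache until it is below the high
* water mark.
*/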
static int
bufhash_stats(struct hashstat_sysctl *hs, bool fill)
{
buf_t *bp;
uint64_t chain;
strlcpy(hs->hash_name, "bufhash", sizeof(hs->hash_name));
strlcpy(hs->hash_desc, "buffer hash", sizeof(hs->hash_desc));
if (!fill)
return 0;
hs->hash_size = bufhash + 1;
for (size_t i = 0; i < hs->hash_size; i++) {
chain = 0;
mutex_enter(&bufcache_lock);
LIST_FOREACH(bp, &bufhashtbl[i], b_hash) {
chain++;
}
mutex_exit(&bufcache_lock);
if (chain > 0) {
hs->hash_used++;
hs->hash_items += chain;
if (chain > hs->hash_maxchain)
hs->hash_maxchain = chain;
}
preempt_point();
}
return 0;
}
#ifdef DEBUG
/*
* Print out statistics on the current allocation of the buffer pool.
* Can be enabled to print out on every ``sync'' by setting "syncprt"
* in vfs_syscalls.c using sysctl.
*/
void
vfs_bufstats(void)
{
int i, j, count;
buf_t *bp;
struct bqueue *dp;
int counts[MAXBSIZE / MIN_PAGE_SIZE + 1];
static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" };
for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
count = 0;
memset(counts, 0, sizeof(counts));
TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) {
counts[bp->b_bufsize / PAGE_SIZE]++;
count++;
}
printf("%s: total-%d", bname[i], count);
for (j = 0; j <= MAXBSIZE / PAGE_SIZE; j++)
if (counts[j] != 0)
printf(", %d-%d", j * PAGE_SIZE, counts[j]);
printf("\n");
}
}
#endif /* DEBUG */
/* ------------------------------ */
buf_t *
getiobuf(struct vnode *vp, bool waitok)
{
buf_t *bp;
bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT));
if (bp == NULL)
return bp;
buf_init(bp);
if ((bp->b_vp = vp) != NULL) {
bp->b_objlock = vp->v_interlock;
} else {
KASSERT(bp->b_objlock == &buffer_lock);
}
return bp;
}
void
putiobuf(buf_t *bp)
{
buf_destroy(bp);
pool_cache_put(bufio_cache, bp);
}
/*
* nestiobuf_iodone: b_iodone callback for nested buffers.
*/
void
nestiobuf_iodone(buf_t *bp)
{
buf_t *mbp = bp->b_private;
int error;
int donebytes;
KASSERT(bp->b_bcount <= bp->b_bufsize);
KASSERT(mbp != bp);
error = bp->b_error;
if (bp->b_error == 0 &&
(bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) {
/*
* Not all got transferred, raise an error. We have no way to
* propagate these conditions to mbp.
*/
error = EIO;
}
donebytes = bp->b_bufsize;
putiobuf(bp);
nestiobuf_done(mbp, donebytes, error);
}
/*
* nestiobuf_setup: setup a "nested" buffer.
*
* => 'mbp' is a "master" buffer which is being divided into sub pieces.
* => 'bp' should be a buffer allocated by getiobuf.
* => 'offset' is a byte offset in the master buffer.
* => 'size' is a size in bytes of this nested buffer.
*/
void
nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size)
{
const int b_pass = mbp->b_flags & (B_READ|B_PHYS|B_RAW|B_MEDIA_FLAGS);
struct vnode *vp = mbp->b_vp;
KASSERT(mbp->b_bcount >= offset + size);
bp->b_vp = vp;
bp->b_dev = mbp->b_dev;
bp->b_objlock = mbp->b_objlock;
bp->b_cflags = BC_BUSY;
bp->b_flags = B_ASYNC | b_pass;
bp->b_iodone = nestiobuf_iodone;
bp->b_data = (char *)mbp->b_data + offset;
bp->b_resid = bp->b_bcount = size;
bp->b_bufsize = bp->b_bcount;
bp->b_private = mbp;
BIO_COPYPRIO(bp, mbp);
if (BUF_ISWRITE(bp) && vp != NULL) {
mutex_enter(vp->v_interlock);
vp->v_numoutput++;
mutex_exit(vp->v_interlock);
}
}
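/*
* Illustrative usage sketch (hypothetical caller, details assumed): a
* driver splits one master request into two halves, each issued as a
* nested buffer. The master's b_resid is assumed to hold the total number
* of outstanding bytes before the split.
*
*     buf_t *nbp;
*
*     mbp->b_resid = mbp->b_bcount;
*     nbp = getiobuf(vp, true);
*     nestiobuf_setup(mbp, nbp, 0, mbp->b_bcount / 2);
*     nbp->b_blkno = blkno_of_first_half;
*     VOP_STRATEGY(vp, nbp);
*     ... likewise for the second half ...
*
* Each sub-buffer completes through nestiobuf_iodone(), which calls
* nestiobuf_done() to decrement mbp->b_resid and finally biodone(mbp)
* when everything has finished.
*/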
/*
* nestiobuf_done: propagate completion to the master buffer.
*
* => 'donebytes' specifies how many bytes in the 'mbp' is completed.
* => 'error' is an errno(2) that 'donebytes' has been completed with.
*/
void
nestiobuf_done(buf_t *mbp, int donebytes, int error)
{
if (donebytes == 0) {
return;
}
mutex_enter(mbp->b_objlock);
KASSERT(mbp->b_resid >= donebytes);
mbp->b_resid -= donebytes;
if (error)
mbp->b_error = error;
if (mbp->b_resid == 0) {
if (mbp->b_error)
mbp->b_resid = mbp->b_bcount;
mutex_exit(mbp->b_objlock);
biodone(mbp);
} else
mutex_exit(mbp->b_objlock);
}
void
buf_init(buf_t *bp)
{
cv_init(&bp->b_busy, "biolock");
cv_init(&bp->b_done, "biowait");
bp->b_dev = NODEV;
bp->b_error = 0;
bp->b_flags = 0;
bp->b_cflags = 0;
bp->b_oflags = 0;
bp->b_objlock = &buffer_lock;
bp->b_iodone = NULL;
bp->b_dev = NODEV;
bp->b_vnbufs.le_next = NOLIST;
BIO_SETPRIO(bp, BPRIO_DEFAULT);
}
void
buf_destroy(buf_t *bp)
{
cv_destroy(&bp->b_done);
cv_destroy(&bp->b_busy);
}
int
bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock)
{
int error;
KASSERT(mutex_owned(&bufcache_lock));
SDT_PROBE4(io, kernel, , bbusy__start, bp, intr, timo, interlock);
if ((bp->b_cflags & BC_BUSY) != 0) {
if (curlwp == uvm.pagedaemon_lwp) {
error = EDEADLK;
goto out;
}
bp->b_cflags |= BC_WANTED;
if (interlock != NULL) mutex_exit(interlock);
if (intr) {
error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock,
timo);
} else {
error = cv_timedwait(&bp->b_busy, &bufcache_lock,
timo);
}
/*
* At this point the buffer may be gone: don't touch it
* again. The caller needs to find it again and retry.
*/
if (interlock != NULL) mutex_enter(interlock);
if (error == 0)
error = EPASSTHROUGH;
} else {
bp->b_cflags |= BC_BUSY;
error = 0;
}
out: SDT_PROBE5(io, kernel, , bbusy__done,
bp, intr, timo, interlock, error);
return error;
}
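/*
 * Illustrative sketch only: the retry pattern expected by bbusy()'s
 * EPASSTHROUGH return, which means the buffer may have changed identity
 * while we slept and must be looked up again.  incore() is assumed to be
 * the usual <sys/buf.h> lookup helper, which requires bufcache_lock held.
 */
static int __unused
example_busy_buffer(struct vnode *vp, daddr_t blkno, buf_t **bpp)
{
	buf_t *bp;
	int error;

	mutex_enter(&bufcache_lock);
	for (;;) {
		bp = incore(vp, blkno);
		if (bp == NULL) {
			error = ENOENT;
			break;
		}
		error = bbusy(bp, false, 0, NULL);
		if (error != EPASSTHROUGH)
			break;
		/* Buffer may be gone: redo the lookup and try again. */
	}
	mutex_exit(&bufcache_lock);
	*bpp = (error == 0) ? bp : NULL;
	return error;
}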
/*
* Nothing outside this file should really need to know about nbuf,
* but a few things still want to read it, so give them a way to do that.
*/
u_int
buf_nbuf(void)
{
return nbuf;
}
/* $NetBSD: socketvar.h,v 1.168 2024/02/03 19:05:14 jdolecek Exp $ */
/*-
* Copyright (c) 2008, 2009 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)socketvar.h 8.3 (Berkeley) 2/19/95
*/
#ifndef _SYS_SOCKETVAR_H_
#define _SYS_SOCKETVAR_H_
#include <sys/select.h>
#include <sys/selinfo.h> /* for struct selinfo */
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#if !defined(_KERNEL)
struct uio;
struct lwp;
struct uidinfo;
#else
#include <sys/atomic.h>
#include <sys/uidinfo.h>
#endif
TAILQ_HEAD(soqhead, socket);
/*
* Variables for socket buffering.
*/
struct sockbuf {
struct selinfo sb_sel; /* process selecting read/write */
struct mowner *sb_mowner; /* who owns data for this sockbuf */
struct socket *sb_so; /* back pointer to socket */
kcondvar_t sb_cv; /* notifier */
/* When re-zeroing this struct, we zero from sb_startzero to the end */
#define sb_startzero sb_cc
u_long sb_cc; /* actual chars in buffer */
u_long sb_hiwat; /* max actual char count */
u_long sb_mbcnt; /* chars of mbufs used */
u_long sb_mbmax; /* max chars of mbufs to use */
u_long sb_lowat; /* low water mark */
struct mbuf *sb_mb; /* the mbuf chain */
struct mbuf *sb_mbtail; /* the last mbuf in the chain */
struct mbuf *sb_lastrecord; /* first mbuf of last record in
socket buffer */
int sb_flags; /* flags, see below */
int sb_timeo; /* timeout for read/write */
u_long sb_overflowed; /* # of drops due to full buffer */
};
#ifndef SB_MAX
#define SB_MAX (256*1024) /* default for max chars in sockbuf */
#endif
#define SB_LOCK 0x01 /* lock on data queue */
#define SB_NOTIFY 0x04 /* someone is waiting for data/space */
#define SB_ASYNC 0x10 /* ASYNC I/O, need signals */
#define SB_UPCALL 0x20 /* someone wants an upcall */
#define SB_NOINTR 0x40 /* operations not interruptible */
#define SB_KNOTE 0x100 /* kernel note attached */
#define SB_AUTOSIZE 0x800 /* automatically size socket buffer */
/*
* Kernel structure per socket.
* Contains send and receive buffer queues,
* handle on protocol and pointer to protocol
* private data and error information.
*/
struct so_accf {
struct accept_filter *so_accept_filter;
void *so_accept_filter_arg; /* saved filter args */
char *so_accept_filter_str; /* saved user args */
};
struct sockaddr;
struct socket {
kmutex_t * volatile so_lock; /* pointer to lock on structure */
kcondvar_t so_cv; /* notifier */
short so_type; /* generic type, see socket.h */
short so_options; /* from socket call, see socket.h */
u_short so_linger; /* time to linger while closing */
short so_state; /* internal state flags SS_*, below */
int so_unused; /* used to be so_nbio */
void *so_pcb; /* protocol control block */
const struct protosw *so_proto; /* protocol handle */
/*
* Variables for connection queueing.
* Socket where accepts occur is so_head in all subsidiary sockets.
* If so_head is 0, socket is not related to an accept.
* For head socket so_q0 queues partially completed connections,
* while so_q is a queue of connections ready to be accepted.
* If a connection is aborted and it has so_head set, then
* it has to be pulled out of either so_q0 or so_q.
* We allow connections to queue up based on current queue lengths
* and limit on number of queued connections for this socket.
*/
struct socket *so_head; /* back pointer to accept socket */
struct soqhead *so_onq; /* queue (q or q0) that we're on */
struct soqhead so_q0; /* queue of partial connections */
struct soqhead so_q; /* queue of incoming connections */
TAILQ_ENTRY(socket) so_qe; /* our queue entry (q or q0) */
short so_q0len; /* partials on so_q0 */
short so_qlen; /* number of connections on so_q */
short so_qlimit; /* max number queued connections */
short so_timeo; /* connection timeout */
u_short so_error; /* error affecting connection */
u_short so_rerror; /* error affecting receiving */
u_short so_aborting; /* references from soabort() */
pid_t so_pgid; /* pgid for signals */
u_long so_oobmark; /* chars to oob mark */
struct sockbuf so_snd; /* send buffer */
struct sockbuf so_rcv; /* receive buffer */
void *so_internal; /* Space for svr4 stream data */
void (*so_upcall) (struct socket *, void *, int, int);
void * so_upcallarg; /* Arg for above */
int (*so_send) (struct socket *, struct sockaddr *,
struct uio *, struct mbuf *,
struct mbuf *, int, struct lwp *);
int (*so_receive) (struct socket *,
struct mbuf **,
struct uio *, struct mbuf **,
struct mbuf **, int *);
struct mowner *so_mowner; /* who owns mbufs for this socket */
struct uidinfo *so_uidinfo; /* who opened the socket */
gid_t so_egid; /* creator effective gid */
pid_t so_cpid; /* creator pid */
struct so_accf *so_accf;
kauth_cred_t so_cred; /* socket credentials */
};
/*
* Socket state bits.
*/
#define SS_NOFDREF 0x001 /* no file table ref any more */
#define SS_ISCONNECTED 0x002 /* socket connected to a peer */
#define SS_ISCONNECTING 0x004 /* in process of connecting to peer */
#define SS_ISDISCONNECTING 0x008 /* in process of disconnecting */
#define SS_CANTSENDMORE 0x010 /* can't send more data to peer */
#define SS_CANTRCVMORE 0x020 /* can't receive more data from peer */
#define SS_RCVATMARK 0x040 /* at mark on input */
#define SS_ISABORTING 0x080 /* aborting fd references - close() */
#define SS_RESTARTSYS 0x100 /* restart blocked system calls */
#define SS_POLLRDBAND 0x200 /* poll should return POLLRDBAND */
#define SS_MORETOCOME 0x400 /*
* hint from sosend to lower layer;
* more data coming
*/
#define SS_ISDISCONNECTED 0x800 /* socket disconnected from peer */
#define SS_ISAPIPE 0x1000 /* socket is implementing a pipe */
#define SS_NBIO 0x2000 /* socket is in non blocking I/O */
#ifdef _KERNEL
struct accept_filter {
char accf_name[16];
void (*accf_callback)
(struct socket *, void *, int, int);
void * (*accf_create)
(struct socket *, char *);
void (*accf_destroy)
(struct socket *);
LIST_ENTRY(accept_filter) accf_next;
u_int accf_refcnt;
};
struct sockopt {
int sopt_level; /* option level */
int sopt_name; /* option name */
size_t sopt_size; /* data length */
size_t sopt_retsize; /* returned data length */
void * sopt_data; /* data pointer */
uint8_t sopt_buf[sizeof(int)]; /* internal storage */
};
#define SB_EMPTY_FIXUP(sb) \
do { \
KASSERT(solocked((sb)->sb_so)); \
if ((sb)->sb_mb == NULL) { \
(sb)->sb_mbtail = NULL; \
(sb)->sb_lastrecord = NULL; \
} \
} while (/*CONSTCOND*/0)
extern u_long sb_max;
extern int somaxkva;
extern int sock_loan_thresh;
extern kmutex_t *softnet_lock;
struct mbuf;
struct lwp;
struct msghdr;
struct stat;
struct knote;
struct sockaddr_big;
enum uio_seg;
/* 0x400 is SO_OTIMESTAMP */
#define SOOPT_TIMESTAMP(o) ((o) & (SO_TIMESTAMP | 0x400))
/*
* File operations on sockets.
*/
int soo_read(file_t *, off_t *, struct uio *, kauth_cred_t, int);
int soo_write(file_t *, off_t *, struct uio *, kauth_cred_t, int);
int soo_fcntl(file_t *, u_int cmd, void *);
int soo_ioctl(file_t *, u_long cmd, void *);
int soo_poll(file_t *, int);
int soo_kqfilter(file_t *, struct knote *);
int soo_close(file_t *);
int soo_stat(file_t *, struct stat *);
void soo_restart(file_t *);
void sbappend(struct sockbuf *, struct mbuf *);
void sbappendstream(struct sockbuf *, struct mbuf *);
int sbappendaddr(struct sockbuf *, const struct sockaddr *, struct mbuf *,
struct mbuf *);
int sbappendaddrchain(struct sockbuf *, const struct sockaddr *,
struct mbuf *, int);
int sbappendcontrol(struct sockbuf *, struct mbuf *, struct mbuf *);
void sbappendrecord(struct sockbuf *, struct mbuf *);
void sbcheck(struct sockbuf *);
void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
struct mbuf *
sbcreatecontrol(void *, int, int, int);
struct mbuf *
sbcreatecontrol1(void **, int, int, int, int);
struct mbuf **
sbsavetimestamp(int, struct mbuf **);
void sbdrop(struct sockbuf *, int);
void sbdroprecord(struct sockbuf *);
void sbflush(struct sockbuf *);
void sbinsertoob(struct sockbuf *, struct mbuf *);
void sbrelease(struct sockbuf *, struct socket *);
int sbreserve(struct sockbuf *, u_long, struct socket *);
int sbwait(struct sockbuf *);
int sb_max_set(u_long);
void soinit(void);
void soinit1(void);
void soinit2(void);
int soabort(struct socket *);
int soaccept(struct socket *, struct sockaddr *);
int sofamily(const struct socket *);
int sobind(struct socket *, struct sockaddr *, struct lwp *);
void socantrcvmore(struct socket *);
void socantsendmore(struct socket *);
void soroverflow(struct socket *);
int soclose(struct socket *);
int soconnect(struct socket *, struct sockaddr *, struct lwp *);
int soconnect2(struct socket *, struct socket *);
int socreate(int, struct socket **, int, int, struct lwp *,
struct socket *);
int fsocreate(int, struct socket **, int, int, int *, file_t **,
struct socket *);
int sodisconnect(struct socket *);
void sofree(struct socket *);
int sogetopt(struct socket *, struct sockopt *);
void sohasoutofband(struct socket *);
void soisconnected(struct socket *);
void soisconnecting(struct socket *);
void soisdisconnected(struct socket *);
void soisdisconnecting(struct socket *);
int solisten(struct socket *, int, struct lwp *);
struct socket *
sonewconn(struct socket *, bool);
void soqinsque(struct socket *, struct socket *, int);
bool soqremque(struct socket *, int);
int soreceive(struct socket *, struct mbuf **, struct uio *,
struct mbuf **, struct mbuf **, int *);
int soreserve(struct socket *, u_long, u_long);
void sorflush(struct socket *);
int sosend(struct socket *, struct sockaddr *, struct uio *,
struct mbuf *, struct mbuf *, int, struct lwp *);
int sosetopt(struct socket *, struct sockopt *);
int so_setsockopt(struct lwp *, struct socket *, int, int, const void *, size_t);
int soshutdown(struct socket *, int);
void sorestart(struct socket *);
void sowakeup(struct socket *, struct sockbuf *, int);
int sockargs(struct mbuf **, const void *, size_t, enum uio_seg, int);
int sopoll(struct socket *, int);
struct socket *soget(bool);
void soput(struct socket *);
bool solocked(const struct socket *);
bool solocked2(const struct socket *, const struct socket *);
int sblock(struct sockbuf *, int);
void sbunlock(struct sockbuf *);
int sowait(struct socket *, bool, int);
void solockretry(struct socket *, kmutex_t *);
void sosetlock(struct socket *);
void solockreset(struct socket *, kmutex_t *);
void sockopt_init(struct sockopt *, int, int, size_t);
void sockopt_destroy(struct sockopt *);
int sockopt_set(struct sockopt *, const void *, size_t);
int sockopt_setint(struct sockopt *, int);
int sockopt_get(const struct sockopt *, void *, size_t);
int sockopt_getint(const struct sockopt *, int *);
int sockopt_setmbuf(struct sockopt *, struct mbuf *);
struct mbuf *sockopt_getmbuf(const struct sockopt *);
int copyout_sockname(struct sockaddr *, unsigned int *, int, struct mbuf *);
int copyout_sockname_sb(struct sockaddr *, unsigned int *,
int , struct sockaddr_big *);
int copyout_msg_control(struct lwp *, struct msghdr *, struct mbuf *);
void free_control_mbuf(struct lwp *, struct mbuf *, struct mbuf *);
int do_sys_getpeername(int, struct sockaddr *);
int do_sys_getsockname(int, struct sockaddr *);
int do_sys_sendmsg(struct lwp *, int, struct msghdr *, int, register_t *);
int do_sys_sendmsg_so(struct lwp *, int, struct socket *, file_t *,
struct msghdr *, int, register_t *);
int do_sys_recvmsg(struct lwp *, int, struct msghdr *,
struct mbuf **, struct mbuf **, register_t *);
int do_sys_recvmsg_so(struct lwp *, int, struct socket *,
struct msghdr *mp, struct mbuf **, struct mbuf **, register_t *);
int do_sys_bind(struct lwp *, int, struct sockaddr *);
int do_sys_connect(struct lwp *, int, struct sockaddr *);
int do_sys_accept(struct lwp *, int, struct sockaddr *, register_t *,
const sigset_t *, int, int);
int do_sys_peeloff(struct socket *, void *);
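/*
 * Illustrative sketch only: how in-kernel code typically drives the
 * sockopt container declared above, here enabling SO_KEEPALIVE on an
 * already-created socket.  SOL_SOCKET and SO_KEEPALIVE are assumed to
 * come from <sys/socket.h>, included by the consumer.
 */
static __inline int
example_set_keepalive(struct socket *so)
{
	struct sockopt sopt;
	int error;

	sockopt_init(&sopt, SOL_SOCKET, SO_KEEPALIVE, sizeof(int));
	error = sockopt_setint(&sopt, 1);
	if (error == 0)
		error = sosetopt(so, &sopt);
	sockopt_destroy(&sopt);
	return error;
}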
/*
* Inline functions for sockets and socket buffering.
*/
#include <sys/protosw.h>
#include <sys/mbuf.h>
/*
* Do we need to notify the other side when I/O is possible?
*/
static __inline int
sb_notify(struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
return sb->sb_flags & (SB_NOTIFY | SB_ASYNC | SB_UPCALL | SB_KNOTE);
}
/*
* How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
* Since the fields are unsigned, detect overflow and return 0.
*/
static __inline u_long
sbspace(const struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
if (sb->sb_hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt)
return 0;
return lmin(sb->sb_hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
}
static __inline u_long
sbspace_oob(const struct sockbuf *sb)
{
u_long hiwat = sb->sb_hiwat;
if (hiwat < ULONG_MAX - 1024)
hiwat += 1024;
KASSERT(solocked(sb->sb_so));
if (hiwat <= sb->sb_cc || sb->sb_mbmax <= sb->sb_mbcnt)
return 0;
return lmin(hiwat - sb->sb_cc, sb->sb_mbmax - sb->sb_mbcnt);
}
/*
* How much socket buffer space has been used?
*/
static __inline u_long
sbused(const struct sockbuf *sb)
{
KASSERT(solocked(sb->sb_so));
return sb->sb_cc;
}
/* do we have to send all at once on a socket? */
static __inline int
sosendallatonce(const struct socket *so)
{
return so->so_proto->pr_flags & PR_ATOMIC;
}
/* can we read something from so? */
static __inline int
soreadable(const struct socket *so)
{
KASSERT(solocked(so));
return so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
(so->so_state & SS_CANTRCVMORE) != 0 ||
so->so_qlen != 0 || so->so_error != 0 || so->so_rerror != 0;
}
/* can we write something to so? */
static __inline int
sowritable(const struct socket *so)
{
KASSERT(solocked(so));
return (sbspace(&so->so_snd) >= so->so_snd.sb_lowat &&
((so->so_state & SS_ISCONNECTED) != 0 ||
(so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
(so->so_state & SS_CANTSENDMORE) != 0 ||
so->so_error != 0;
}
/* adjust counters in sb reflecting allocation of m */
static __inline void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
sb->sb_cc += m->m_len;
sb->sb_mbcnt += MSIZE;
if (m->m_flags & M_EXT) sb->sb_mbcnt += m->m_ext.ext_size;
}
/* adjust counters in sb reflecting freeing of m */
static __inline void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
KASSERT(solocked(sb->sb_so));
sb->sb_cc -= m->m_len;
sb->sb_mbcnt -= MSIZE;
if (m->m_flags & M_EXT) sb->sb_mbcnt -= m->m_ext.ext_size;
}
static __inline void
sorwakeup(struct socket *so)
{
KASSERT(solocked(so));
if (sb_notify(&so->so_rcv))
sowakeup(so, &so->so_rcv, POLL_IN);
}
static __inline void
sowwakeup(struct socket *so)
{
KASSERT(solocked(so));
if (sb_notify(&so->so_snd))
sowakeup(so, &so->so_snd, POLL_OUT);
}
static __inline void
solock(struct socket *so)
{
kmutex_t *lock;
lock = atomic_load_consume(&so->so_lock);
mutex_enter(lock);
if (__predict_false(lock != atomic_load_relaxed(&so->so_lock)))
solockretry(so, lock);
}
static __inline void
sounlock(struct socket *so)
{
mutex_exit(so->so_lock);
}
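/*
 * Illustrative sketch only: the helpers above assert solocked(), so a
 * typical in-kernel caller brackets them with solock()/sounlock(),
 * e.g. to sample readability of a socket.
 */
static __inline bool
example_soreadable(struct socket *so)
{
	bool ready;

	solock(so);
	ready = soreadable(so) != 0;
	sounlock(so);
	return ready;
}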
#ifdef SOCKBUF_DEBUG
/*
* SBLASTRECORDCHK: check sb->sb_lastrecord is maintained correctly.
* SBLASTMBUFCHK: check sb->sb_mbtail is maintained correctly.
*
* => panic if the socket buffer is inconsistent.
* => 'where' is used for a panic message.
*/
void sblastrecordchk(struct sockbuf *, const char *);
#define SBLASTRECORDCHK(sb, where) sblastrecordchk((sb), (where))
void sblastmbufchk(struct sockbuf *, const char *);
#define SBLASTMBUFCHK(sb, where) sblastmbufchk((sb), (where))
#define SBCHECK(sb) sbcheck(sb)
#else
#define SBLASTRECORDCHK(sb, where) /* nothing */
#define SBLASTMBUFCHK(sb, where) /* nothing */
#define SBCHECK(sb) /* nothing */
#endif /* SOCKBUF_DEBUG */
/* sosend loan */
vaddr_t sokvaalloc(vaddr_t, vsize_t, struct socket *);
void sokvafree(vaddr_t, vsize_t);
void soloanfree(struct mbuf *, void *, size_t, void *);
/*
* Values for socket-buffer-append priority argument to sbappendaddrchain().
* The following flags are reserved for future implementation:
*
* SB_PRIO_NONE: honour normal socket-buffer limits.
*
* SB_PRIO_ONESHOT_OVERFLOW: if the socket has any space,
* deliver the entire chain. Intended for large requests
* that should be delivered in their entirety, or not at all.
*
* SB_PRIO_OVERDRAFT: allow a small (2*MLEN) overflow, over and
* above normal socket limits. Intended for messages indicating
* buffer overflow in earlier normal/lower-priority messages.
*
* SB_PRIO_BESTEFFORT: Ignore limits entirely. Intended only for
* kernel-generated messages to specially-marked sockets which
* require "reliable" delivery, and where the source socket/protocol
* message generator enforces some hard limit (but possibly well
* above kern.sbmax). It is entirely up to the in-kernel source to
* avoid complete mbuf exhaustion or DoS scenarios.
*/
#define SB_PRIO_NONE 0
#define SB_PRIO_ONESHOT_OVERFLOW 1
#define SB_PRIO_OVERDRAFT 2
#define SB_PRIO_BESTEFFORT 3
/*
* Accept filter functions (duh).
*/
int accept_filt_getopt(struct socket *, struct sockopt *);
int accept_filt_setopt(struct socket *, const struct sockopt *);
int accept_filt_clear(struct socket *);
int accept_filt_add(struct accept_filter *);
int accept_filt_del(struct accept_filter *);
struct accept_filter *accept_filt_get(char *);
#ifdef ACCEPT_FILTER_MOD
#ifdef SYSCTL_DECL
SYSCTL_DECL(_net_inet_accf);
#endif
void accept_filter_init(void);
#endif
#ifdef DDB
int sofindproc(struct socket *so, int all, void (*pr)(const char *, ...));
void socket_print(const char *modif, void (*pr)(const char *, ...));
#endif
#endif /* _KERNEL */
#endif /* !_SYS_SOCKETVAR_H_ */
/* $NetBSD: ip6_var.h,v 1.94 2024/02/09 22:08:37 andvar Exp $ */
/* $KAME: ip6_var.h,v 1.33 2000/06/11 14:59:20 jinmei Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_var.h 8.1 (Berkeley) 6/10/93
*/
#ifndef _NETINET6_IP6_VAR_H_
#define _NETINET6_IP6_VAR_H_
#include <sys/types.h>
#include <sys/queue.h>
#include <sys/socketvar.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
struct ip6_moptions {
if_index_t im6o_multicast_if_index; /* I/F for outgoing multicasts */
u_char im6o_multicast_hlim; /* hoplimit for outgoing multicasts */
u_char im6o_multicast_loop; /* 1 => hear sends if a member */
LIST_HEAD(, in6_multi_mship) im6o_memberships;
};
/*
* Control options for outgoing packets
*/
/* Routing header related info */
struct ip6po_rhinfo {
struct ip6_rthdr *ip6po_rhi_rthdr; /* Routing header */
struct route ip6po_rhi_route; /* Route to the 1st hop */
};
#define ip6po_rthdr ip6po_rhinfo.ip6po_rhi_rthdr
#define ip6po_route ip6po_rhinfo.ip6po_rhi_route
/* Nexthop related info */
struct ip6po_nhinfo {
struct sockaddr *ip6po_nhi_nexthop;
struct route ip6po_nhi_route; /* Route to the nexthop */
};
#define ip6po_nexthop ip6po_nhinfo.ip6po_nhi_nexthop
#define ip6po_nextroute ip6po_nhinfo.ip6po_nhi_route
struct ip6_pktopts {
int ip6po_hlim; /* Hoplimit for outgoing packets */
struct in6_pktinfo *ip6po_pktinfo; /* Outgoing IF/address information */
struct ip6po_nhinfo ip6po_nhinfo; /* Next-hop address information */
struct ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */
struct ip6_dest *ip6po_dest1; /* Destination options header(1st part) */
struct ip6po_rhinfo ip6po_rhinfo; /* Routing header related info. */
struct ip6_dest *ip6po_dest2; /* Destination options header(2nd part) */
int ip6po_tclass; /* traffic class */
int ip6po_minmtu; /* fragment vs PMTU discovery policy */
#define IP6PO_MINMTU_MCASTONLY -1 /* default; send at min MTU for multicast*/
#define IP6PO_MINMTU_DISABLE 0 /* always perform pmtu disc */
#define IP6PO_MINMTU_ALL 1 /* always send at min MTU */
int ip6po_prefer_tempaddr; /* whether temporary addresses are
* preferred as source address */
#define IP6PO_TEMPADDR_SYSTEM -1 /* follow the system default */
#define IP6PO_TEMPADDR_NOTPREFER 0 /* not prefer temporary address */
#define IP6PO_TEMPADDR_PREFER 1 /* prefer temporary address */
int ip6po_flags;
#if 0 /* parameters in this block is obsolete. do not reuse the values. */
#define IP6PO_REACHCONF 0x01 /* upper-layer reachability confirmation. */
#define IP6PO_MINMTU 0x02 /* use minimum MTU (IPV6_USE_MIN_MTU) */
#endif
#define IP6PO_DONTFRAG 0x04 /* disable fragmentation (IPV6_DONTFRAG) */
};
/*
* IPv6 statistics.
* Each counter is an unsigned 64-bit value.
*/
#define IP6_STAT_TOTAL 0 /* total packets received */
#define IP6_STAT_TOOSHORT 1 /* packet too short */
#define IP6_STAT_TOOSMALL 2 /* not enough data */
#define IP6_STAT_FRAGMENTS 3 /* fragments received */
#define IP6_STAT_FRAGDROPPED 4 /* frags dropped (dups, out of space) */
#define IP6_STAT_FRAGTIMEOUT 5 /* fragments timed out */
#define IP6_STAT_FRAGOVERFLOW 6 /* fragments that exceed limit */
#define IP6_STAT_FORWARD 7 /* packets forwarded */
#define IP6_STAT_CANTFORWARD 8 /* packets rcvd for unreachable dst */
#define IP6_STAT_REDIRECTSENT 9 /* packets forwarded on same net */
#define IP6_STAT_DELIVERED 10 /* datagrams delivered to upper level */
#define IP6_STAT_LOCALOUT 11 /* total IP packets generated here */
#define IP6_STAT_ODROPPED 12 /* lost packets due to nobufs, etc. */
#define IP6_STAT_REASSEMBLED 13 /* total packets reassembled ok */
#define IP6_STAT_FRAGMENTED 14 /* datagrams successfully fragmented */
#define IP6_STAT_OFRAGMENTS 15 /* output fragments created */
#define IP6_STAT_CANTFRAG 16 /* don't fragment flag was set, etc. */
#define IP6_STAT_BADOPTIONS 17 /* error in option processing */
#define IP6_STAT_NOROUTE 18 /* packets discarded due to no route */
#define IP6_STAT_BADVERS 19 /* ip6 version != 6 */
#define IP6_STAT_RAWOUT 20 /* total raw ip packets generated */
#define IP6_STAT_BADSCOPE 21 /* scope error */
#define IP6_STAT_NOTMEMBER 22 /* don't join this multicast group */
#define IP6_STAT_NXTHIST 23 /* next header histogram */
/* space for 256 counters */
#define IP6_STAT_M1 279 /* one mbuf */
#define IP6_STAT_M2M 280 /* two or more mbuf */
/* space for 32 counters */
#define IP6_STAT_MEXT1 312 /* one ext mbuf */
#define IP6_STAT_MEXT2M 313 /* two or more ext mbuf */
#define IP6_STAT_EXTHDRTOOLONG 314 /* ext hdr are not contiguous */
#define IP6_STAT_NOGIF 315 /* no match gif found */
#define IP6_STAT_TOOMANYHDR 316 /* discarded due to too many headers */
/*
* statistics for improvement of the source address selection
* algorithm:
* XXX: hardcoded 16 = # of ip6 multicast scope types + 1
*/
#define IP6_STAT_SOURCES_NONE 317 /* number of times that address
selection fails */
#define IP6_STAT_SOURCES_SAMEIF 318 /* number of times that an address
on the outgoing I/F is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_OTHERIF 334 /* number of times that an address on
a non-outgoing I/F is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_SAMESCOPE 350 /* number of times that an address that
has the same scope from the dest.
is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_OTHERSCOPE 366 /* number of times that an address that
has a different scope from the dest.
is chosen */
/* space for 16 counters */
#define IP6_STAT_SOURCES_DEPRECATED 382 /* number of times that a deprecated
address is chosen */
/* space for 16 counters */
#define IP6_STAT_FORWARD_CACHEHIT 398
#define IP6_STAT_FORWARD_CACHEMISS 399
#define IP6_STAT_FASTFORWARD 400 /* packets fast forwarded */
#define IP6_STAT_FASTFORWARDFLOWS 401 /* number of fast forward flows */
#define IP6_STAT_NOIPSEC 402 /* no match ipsec(4) found */
#define IP6_STAT_PFILDROP_IN 403 /* dropped by pfil (PFIL_IN) */
#define IP6_STAT_PFILDROP_OUT 404 /* dropped by pfil (PFIL_OUT) */
#define IP6_STAT_IPSECDROP_IN 405 /* dropped by IPsec SP check */
#define IP6_STAT_IPSECDROP_OUT 406 /* dropped by IPsec SP check */
#define IP6_STAT_IFDROP 407 /* dropped due to interface state */
#define IP6_STAT_IDROPPED 408 /* lost packets due to nobufs, etc. */
#define IP6_STAT_TIMXCEED 409 /* hop limit exceeded */
#define IP6_STAT_TOOBIG 410 /* packet bigger than MTU */
#define IP6_STAT_RTREJECT 411 /* rejected by route */
#define IP6_NSTATS 412
#define IP6FLOW_HASHBITS 6 /* should not be a multiple of 8 */
/*
* Structure for an IPv6 flow (ip6_fastforward).
*/
struct ip6flow {
TAILQ_ENTRY(ip6flow) ip6f_list; /* next in active list */
TAILQ_ENTRY(ip6flow) ip6f_hash; /* next ip6flow in bucket */
size_t ip6f_hashidx; /* own hash index of ipflowtable[] */
struct in6_addr ip6f_dst; /* destination address */
struct in6_addr ip6f_src; /* source address */
struct route ip6f_ro; /* associated route entry */
u_int32_t ip6f_flow; /* flow (tos) */
u_quad_t ip6f_uses; /* number of uses in this period */
u_quad_t ip6f_last_uses; /* number of uses in last period */
u_quad_t ip6f_dropped; /* ENOBUFS returned by if_output */
u_quad_t ip6f_forwarded; /* packets forwarded */
u_int ip6f_timer; /* lifetime timer */
};
#ifdef _KERNEL
#include <sys/protosw.h>
#include <sys/cprng.h>
/*
* Auxiliary attributes of incoming IPv6 packets, which are initialized when we
* come into ip6_input().
* XXX do not make it a kitchen sink!
*/
struct ip6aux {
/* ip6.ip6_dst */
struct in6_addr ip6a_src;
uint32_t ip6a_scope_id;
int ip6a_flags;
};
/* flags passed to ip6_output as last parameter */
#define IPV6_UNSPECSRC 0x01 /* allow :: as the source address */
#define IPV6_FORWARDING 0x02 /* most of IPv6 header exists */
#define IPV6_MINMTU 0x04 /* use minimum MTU (IPV6_USE_MIN_MTU) */
extern u_int32_t ip6_id; /* fragment identifier */
extern int ip6_defhlim; /* default hop limit */
extern int ip6_defmcasthlim; /* default multicast hop limit */
extern int ip6_forwarding; /* act as router? */
extern int ip6_sendredirect; /* send ICMPv6 redirect? */
extern int ip6_use_deprecated; /* allow deprecated addr as source */
extern int ip6_mcast_pmtu; /* enable pMTU discovery for multicast? */
extern int ip6_v6only;
extern int ip6_neighborgcthresh; /* Threshold # of NDP entries for GC */
extern int ip6_maxdynroutes; /* Max # of routes created via redirect */
extern int ip6_param_rt_msg; /* How to send parameter changing rtm */
extern struct socket *ip6_mrouter; /* multicast routing daemon */
extern int ip6_sendredirects; /* send IP redirects when forwarding? */
extern int ip6_maxfragpackets; /* Maximum packets in reassembly queue */
extern int ip6_maxfrags; /* Maximum fragments in reassembly queue */
extern int ip6_keepfaith; /* Firewall Aided Internet Translator */
extern int ip6_log_interval;
extern time_t ip6_log_time;
extern int ip6_hdrnestlimit; /* upper limit of # of extension headers */
extern int ip6_dad_count; /* DupAddrDetectionTransmits */
extern int ip6_auto_flowlabel;
extern int ip6_auto_linklocal;
extern int ip6_anonportmin; /* minimum ephemeral port */
extern int ip6_anonportmax; /* maximum ephemeral port */
extern int ip6_lowportmin; /* minimum reserved port */
extern int ip6_lowportmax; /* maximum reserved port */
extern int ip6_prefer_tempaddr; /* whether to prefer temporary addresses
in the source address selection */
extern int ip6_use_defzone; /* whether to use the default scope zone
when unspecified */
#ifdef GATEWAY
extern int ip6_maxflows; /* maximum amount of flows for ip6ff */
extern int ip6_hashsize; /* size of hash table */
#endif
struct inpcb;
extern const struct pr_usrreqs rip6_usrreqs;
int icmp6_ctloutput(int, struct socket *, struct sockopt *);
struct mbuf;
void ip6_init(void);
const struct ip6aux *ip6_getdstifaddr(struct mbuf *);
void ip6_freepcbopts(struct ip6_pktopts *);
void ip6_freemoptions(struct ip6_moptions *);
int ip6_unknown_opt(u_int8_t *, struct mbuf *, int);
int ip6_get_prevhdr(struct mbuf *, int);
int ip6_nexthdr(struct mbuf *, int, int, int *);
int ip6_lasthdr(struct mbuf *, int, int, int *);
struct ip6_hdr;
int ip6_mforward(struct ip6_hdr *, struct ifnet *, struct mbuf *);
int ip6_hopopts_input(u_int32_t *, u_int32_t *, struct mbuf **, int *);
void ip6_savecontrol(struct inpcb *, struct mbuf **, struct ip6_hdr *,
struct mbuf *);
void ip6_notify_pmtu(struct inpcb *, const struct sockaddr_in6 *,
u_int32_t *);
int ip6_sysctl(int *, u_int, void *, size_t *, void *, size_t);
void ip6_forward(struct mbuf *, int, struct ifnet *);
void ip6_mloopback(struct ifnet *, struct mbuf *,
const struct sockaddr_in6 *);
int ip6_output(struct mbuf *, struct ip6_pktopts *, struct route *, int,
struct ip6_moptions *, struct inpcb *, struct ifnet **);
int ip6_if_output(struct ifnet * const, struct ifnet * const,
struct mbuf * const,
const struct sockaddr_in6 * const, const struct rtentry *);
int ip6_ctloutput(int, struct socket *, struct sockopt *);
int ip6_raw_ctloutput(int, struct socket *, struct sockopt *);
void ip6_initpktopts(struct ip6_pktopts *);
int ip6_setpktopts(struct mbuf *, struct ip6_pktopts *,
struct ip6_pktopts *, kauth_cred_t, int);
void ip6_clearpktopts(struct ip6_pktopts *, int);
struct ip6_pktopts *ip6_copypktopts(struct ip6_pktopts *, int);
int ip6_optlen(struct inpcb *);
void ip6_statinc(u_int);
int route6_input(struct mbuf **, int *, int);
void frag6_init(void);
int frag6_input(struct mbuf **, int *, int);
int ip6_reass_packet(struct mbuf **, int);
void frag6_slowtimo(void);
void frag6_fasttimo(void);
void frag6_drain(void);
void frag6_drainstub(void);
int ip6flow_init(int);
void ip6flow_poolinit(void);
struct ip6flow *ip6flow_reap(int);
void ip6flow_create(struct route *, struct mbuf *);
void ip6flow_slowtimo(void);
int ip6flow_invalidate_all(int);
void rip6_init(void);
int rip6_input(struct mbuf **, int *, int);
void *rip6_ctlinput(int, const struct sockaddr *, void *);
int rip6_ctloutput(int, struct socket *, struct sockopt *);
int rip6_output(struct mbuf *, struct socket *, struct sockaddr_in6 *,
struct mbuf *);
int rip6_attach(struct socket *, int);
int rip6_usrreq(struct socket *,
int, struct mbuf *, struct mbuf *, struct mbuf *, struct lwp *);
int dest6_input(struct mbuf **, int *, int);
int none_input(struct mbuf **, int *, int);
struct route;
int in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *,
struct ip6_moptions *, struct route *, struct in6_addr *,
struct ifnet **, struct psref *, struct in6_addr *);
int in6_selectroute(struct sockaddr_in6 *, struct ip6_pktopts *,
struct route **, struct rtentry **, bool);
int ip6_get_membership(const struct sockopt *, struct ifnet **,
struct psref *, void *, size_t);
static __inline uint32_t
ip6_randomid(void)
{
return cprng_fast32();
}
static __inline uint32_t
ip6_randomflowlabel(void)
{
return cprng_fast32() & 0xfffff;
}
static __inline bool
ip6_dad_enabled(void)
{
return ip6_dad_count > 0;
}
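/*
 * Illustrative sketch only: the IP6_STAT_SOURCES_* groups above are laid
 * out as a base index plus one slot per address scope (16 slots each), so
 * bumping the "same interface" counter for a given scope is assumed to
 * look roughly like this.
 */
static __inline void
ip6_stat_count_sameif_example(unsigned int scope)
{
	ip6_statinc(IP6_STAT_SOURCES_SAMEIF + (scope & 15));
}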
#endif /* _KERNEL */
#endif /* !_NETINET6_IP6_VAR_H_ */
/* $NetBSD: exec_aout.c,v 1.41 2019/11/20 19:37:53 pgoyette Exp $ */
/*
* Copyright (c) 1993, 1994 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by Christopher G. Demetriou.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: exec_aout.c,v 1.41 2019/11/20 19:37:53 pgoyette Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/exec.h>
#include <sys/exec_aout.h>
#include <sys/resourcevar.h>
#include <sys/module.h>
#include <uvm/uvm_extern.h>
MODULE(MODULE_CLASS_EXEC, exec_aout, NULL);
static struct execsw exec_aout_execsw = {
.es_hdrsz = sizeof(struct exec),
.es_makecmds = exec_aout_makecmds,
.u = {
.elf_probe_func = NULL,
},
.es_emul = &emul_netbsd,
.es_prio = EXECSW_PRIO_ANY,
.es_arglen = 0,
.es_copyargs = copyargs,
.es_setregs = NULL,
.es_coredump = coredump_netbsd,
.es_setup_stack = exec_setup_stack,
};
static int
exec_aout_modcmd(modcmd_t cmd, void *arg)
{
switch (cmd) {
case MODULE_CMD_INIT:
return exec_add(&exec_aout_execsw, 1);
case MODULE_CMD_FINI:
return exec_remove(&exec_aout_execsw, 1);
default:
return ENOTTY;
}
}
/*
* exec_aout_makecmds(): Check if it's an a.out-format executable.
*
* Given a lwp pointer and an exec package pointer, see if the referent
* of the epp is in a.out format. First check 'standard' magic numbers for
* this architecture. If that fails, try a CPU-dependent hook.
*
* This function, in the former case, or the hook, in the latter, is
* responsible for creating a set of vmcmds which can be used to build
* the process's vm space and inserting them into the exec package.
*/
int
exec_aout_makecmds(struct lwp *l, struct exec_package *epp)
{
u_long midmag, magic;
u_short mid;
int error;
struct exec *execp = epp->ep_hdr;
if (epp->ep_hdrvalid < sizeof(struct exec))
return ENOEXEC;
midmag = ntohl(execp->a_midmag);
mid = (midmag >> 16) & 0x3ff;
magic = midmag & 0xffff;
midmag = mid << 16 | magic;
switch (midmag) {
case (MID_MACHINE << 16) | ZMAGIC:
error = exec_aout_prep_zmagic(l, epp);
break;
case (MID_MACHINE << 16) | NMAGIC:
error = exec_aout_prep_nmagic(l, epp);
break;
case (MID_MACHINE << 16) | OMAGIC:
error = exec_aout_prep_omagic(l, epp);
break;
default:
error = cpu_exec_aout_makecmds(l, epp);
}
if (error)
kill_vmcmds(&epp->ep_vmcmds);
else
epp->ep_flags &= ~EXEC_TOPDOWN_VM;
return error;
}
/*
* exec_aout_prep_zmagic(): Prepare a 'native' ZMAGIC binary's exec package
*
* First, set up the various offsets/lengths in the exec package.
*
* Then, mark the text image busy (so it can be demand paged) or error
* out if this is not possible. Finally, set up vmcmds for the
* text, data, bss, and stack segments.
*/
int
exec_aout_prep_zmagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
int error;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
error = vn_marktext(epp->ep_vp);
if (error)
return (error);
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, round_page(execp->a_text),
epp->ep_taddr, epp->ep_vp, 0, VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_pagedvn, round_page(execp->a_data),
epp->ep_daddr, epp->ep_vp, execp->a_text,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
if (execp->a_bss > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, execp->a_bss,
epp->ep_daddr + execp->a_data, NULLVP, 0,
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* exec_aout_prep_nmagic(): Prepare a 'native' NMAGIC binary's exec package
*/
int
exec_aout_prep_nmagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
long bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = roundup(epp->ep_taddr + execp->a_text, AOUT_LDPGSZ);
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_text,
epp->ep_taddr, epp->ep_vp, sizeof(struct exec),
VM_PROT_READ|VM_PROT_EXECUTE);
/* set up command for data segment */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn, execp->a_data,
epp->ep_daddr, epp->ep_vp, execp->a_text + sizeof(struct exec),
VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = round_page(epp->ep_daddr + execp->a_data);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/*
* exec_aout_prep_omagic(): Prepare a 'native' OMAGIC binary's exec package
*/
int
exec_aout_prep_omagic(struct lwp *l, struct exec_package *epp)
{
struct exec *execp = epp->ep_hdr;
long dsize, bsize, baddr;
epp->ep_taddr = AOUT_LDPGSZ;
epp->ep_tsize = execp->a_text;
epp->ep_daddr = epp->ep_taddr + execp->a_text;
epp->ep_dsize = execp->a_data + execp->a_bss;
epp->ep_entry = execp->a_entry;
/* set up command for text and data segments */
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_readvn,
execp->a_text + execp->a_data, epp->ep_taddr, epp->ep_vp,
sizeof(struct exec), VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/* set up command for bss segment */
baddr = round_page(epp->ep_daddr + execp->a_data);
bsize = epp->ep_daddr + epp->ep_dsize - baddr;
if (bsize > 0)
NEW_VMCMD(&epp->ep_vmcmds, vmcmd_map_zero, bsize, baddr,
NULLVP, 0, VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE);
/*
* Make sure (# of pages) mapped above equals (vm_tsize + vm_dsize);
* obreak(2) relies on this fact. Both `vm_tsize' and `vm_dsize' are
* computed (in execve(2)) by rounding *up* `ep_tsize' and `ep_dsize'
* respectively to page boundaries.
* Compensate `ep_dsize' for the amount of data covered by the last
* text page.
*/
dsize = epp->ep_dsize + execp->a_text - round_page(execp->a_text);
epp->ep_dsize = (dsize > 0) ? dsize : 0;
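/*
 * Worked example (illustrative, assuming PAGE_SIZE == 4096): with
 * a_text == 0x1234, round_page(a_text) == 0x2000, so the last text
 * page already maps the first 0xdcc bytes of data and ep_dsize is
 * reduced by that amount, keeping the mapped page count equal to
 * vm_tsize + vm_dsize.
 */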
return (*epp->ep_esch->es_setup_stack)(l, epp);
}
/* $NetBSD: sco_upper.c,v 1.16 2014/08/05 07:55:32 rtr Exp $ */
/*-
* Copyright (c) 2006 Itronix Inc.
* All rights reserved.
*
* Written by Iain Hibbert for Itronix Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Itronix Inc. may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sco_upper.c,v 1.16 2014/08/05 07:55:32 rtr Exp $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/socketvar.h>
#include <sys/systm.h>
#include <netbt/bluetooth.h>
#include <netbt/hci.h>
#include <netbt/sco.h>
/****************************************************************************
*
* SCO - Upper Protocol API
*/
struct sco_pcb_list sco_pcb = LIST_HEAD_INITIALIZER(sco_pcb);
/*
* sco_attach_pcb(handle, proto, upper)
*
* Attach a new instance of SCO pcb to handle
*/
int
sco_attach_pcb(struct sco_pcb **handle,
const struct btproto *proto, void *upper)
{
struct sco_pcb *pcb;
KASSERT(handle != NULL);
KASSERT(proto != NULL);
KASSERT(upper != NULL);
pcb = malloc(sizeof(struct sco_pcb), M_BLUETOOTH,
M_NOWAIT | M_ZERO);
if (pcb == NULL)
return ENOMEM;
pcb->sp_proto = proto;
pcb->sp_upper = upper;
LIST_INSERT_HEAD(&sco_pcb, pcb, sp_next);
*handle = pcb;
return 0;
}
/*
* sco_bind_pcb(pcb, sockaddr)
*
* Bind SCO pcb to local address
*/
int
sco_bind_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
if (pcb->sp_link != NULL || pcb->sp_flags & SP_LISTENING)
return EINVAL;
bdaddr_copy(&pcb->sp_laddr, &addr->bt_bdaddr);
return 0;
}
/*
* sco_sockaddr_pcb(pcb, sockaddr)
*
* Copy local address of PCB to sockaddr
*/
int
sco_sockaddr_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
memset(addr, 0, sizeof(struct sockaddr_bt));
addr->bt_len = sizeof(struct sockaddr_bt);
addr->bt_family = AF_BLUETOOTH;
bdaddr_copy(&addr->bt_bdaddr, &pcb->sp_laddr);
return 0;
}
/*
* sco_connect_pcb(pcb, sockaddr)
*
* Initiate a SCO connection to the destination address.
*/
int
sco_connect_pcb(struct sco_pcb *pcb, struct sockaddr_bt *dest)
{
hci_add_sco_con_cp cp;
struct hci_unit *unit;
struct hci_link *acl, *sco;
int err;
if (pcb->sp_flags & SP_LISTENING)
return EINVAL;
bdaddr_copy(&pcb->sp_raddr, &dest->bt_bdaddr);
if (bdaddr_any(&pcb->sp_raddr))
return EDESTADDRREQ;
if (bdaddr_any(&pcb->sp_laddr)) {
err = hci_route_lookup(&pcb->sp_laddr, &pcb->sp_raddr);
if (err)
return err;
}
unit = hci_unit_lookup(&pcb->sp_laddr);
if (unit == NULL)
return ENETDOWN;
/*
* We must have an already open ACL connection before we open the SCO
* connection, and since SCO connections don't happen on their own we
* will not open one; the application wanting this should have opened
* it previously.
*/
acl = hci_link_lookup_bdaddr(unit, &pcb->sp_raddr, HCI_LINK_ACL);
if (acl == NULL || acl->hl_state != HCI_LINK_OPEN)
return EHOSTUNREACH;
sco = hci_link_alloc(unit, &pcb->sp_raddr, HCI_LINK_SCO);
if (sco == NULL)
return ENOMEM;
sco->hl_link = hci_acl_open(unit, &pcb->sp_raddr);
KASSERT(sco->hl_link == acl);
cp.con_handle = htole16(acl->hl_handle);
cp.pkt_type = htole16(0x00e0); /* HV1, HV2, HV3 */
err = hci_send_cmd(unit, HCI_CMD_ADD_SCO_CON, &cp, sizeof(cp));
if (err) {
hci_link_free(sco, err);
return err;
}
sco->hl_sco = pcb;
pcb->sp_link = sco;
pcb->sp_mtu = unit->hci_max_sco_size;
return 0;
}
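/*
 * Illustrative sketch only (locking and error details elided): the pcb
 * life cycle as driven by the socket layer, using the routines in this
 * file.  'proto' and 'upper' are whatever the caller registered.
 */
static int __unused
example_sco_dial(const struct btproto *proto, void *upper,
    struct sockaddr_bt *laddr, struct sockaddr_bt *raddr)
{
	struct sco_pcb *pcb;
	int err;

	err = sco_attach_pcb(&pcb, proto, upper);
	if (err)
		return err;
	err = sco_bind_pcb(pcb, laddr);
	if (err == 0)
		err = sco_connect_pcb(pcb, raddr);
	if (err)
		sco_detach_pcb(&pcb);	/* tears the pcb back down */
	return err;
}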
/*
* sco_peeraddr_pcb(pcb, sockaddr)
*
* Copy remote address of SCO pcb to sockaddr
*/
int
sco_peeraddr_pcb(struct sco_pcb *pcb, struct sockaddr_bt *addr)
{
memset(addr, 0, sizeof(struct sockaddr_bt));
addr->bt_len = sizeof(struct sockaddr_bt);
addr->bt_family = AF_BLUETOOTH;
bdaddr_copy(&addr->bt_bdaddr, &pcb->sp_raddr);
return 0;
}
/*
* sco_disconnect_pcb(pcb, linger)
*
* Initiate disconnection of connected SCO pcb
*/
int
sco_disconnect_pcb(struct sco_pcb *pcb, int linger)
{
hci_discon_cp cp;
struct hci_link *sco;
int err;
sco = pcb->sp_link;
if (sco == NULL)
return EINVAL;
cp.con_handle = htole16(sco->hl_handle);
cp.reason = 0x13; /* "Remote User Terminated Connection" */
err = hci_send_cmd(sco->hl_unit, HCI_CMD_DISCONNECT, &cp, sizeof(cp));
if (err || linger == 0) {
sco->hl_sco = NULL;
pcb->sp_link = NULL;
hci_link_free(sco, err);
}
return err;
}
/*
* sco_detach_pcb(handle)
*
* Detach SCO pcb from handle and clear up
*/
void
sco_detach_pcb(struct sco_pcb **handle)
{
struct sco_pcb *pcb;
KASSERT(handle != NULL);
pcb = *handle;
*handle = NULL;
if (pcb->sp_link != NULL) {
sco_disconnect_pcb(pcb, 0);
pcb->sp_link = NULL;
}
LIST_REMOVE(pcb, sp_next);
free(pcb, M_BLUETOOTH);
}
/*
* sco_listen_pcb(pcb)
*
* Mark pcb as a listener.
*/
int
sco_listen_pcb(struct sco_pcb *pcb)
{
if (pcb->sp_link != NULL)
return EINVAL;
pcb->sp_flags |= SP_LISTENING;
return 0;
}
/*
* sco_send_pcb(pcb, mbuf)
*
* Send data on SCO pcb.
*
* Gross hackage, we just output the packet directly onto the unit queue.
* This will work fine for one channel per unit, but for more channels it
* really needs fixing. We set the context so that when the packet is sent,
* we can drop a record from the socket buffer.
*/
int
sco_send_pcb(struct sco_pcb *pcb, struct mbuf *m)
{
hci_scodata_hdr_t *hdr;
int plen;
if (pcb->sp_link == NULL) {
m_freem(m);
return EINVAL;
}
plen = m->m_pkthdr.len;
DPRINTFN(10, "%d bytes\n", plen);
/*
* This is a temporary limitation, as USB devices cannot
* handle SCO packet sizes that are not an integer number
* of Isochronous frames. See ubt(4)
*/
if (plen != pcb->sp_mtu) {
m_freem(m);
return EMSGSIZE;
}
M_PREPEND(m, sizeof(hci_scodata_hdr_t), M_DONTWAIT);
if (m == NULL)
return ENOMEM;
hdr = mtod(m, hci_scodata_hdr_t *);
hdr->type = HCI_SCO_DATA_PKT;
hdr->con_handle = htole16(pcb->sp_link->hl_handle);
hdr->length = plen;
pcb->sp_pending++;
M_SETCTX(m, pcb->sp_link);
hci_output_sco(pcb->sp_link->hl_unit, m);
return 0;
}
/*
* sco_setopt(pcb, sopt)
*
* Set SCO pcb options
*/
int
sco_setopt(struct sco_pcb *pcb, const struct sockopt *sopt)
{
int err = 0;
switch (sopt->sopt_name) {
default:
err = ENOPROTOOPT;
break;
}
return err;
}
/*
* sco_getopt(pcb, sopt)
*
* Get SCO pcb options
*/
int
sco_getopt(struct sco_pcb *pcb, struct sockopt *sopt)
{
switch (sopt->sopt_name) {
case SO_SCO_MTU:
return sockopt_set(sopt, &pcb->sp_mtu, sizeof(uint16_t));
case SO_SCO_HANDLE:
if (pcb->sp_link)
return sockopt_set(sopt,
&pcb->sp_link->hl_handle, sizeof(uint16_t));
return ENOTCONN;
default:
break;
}
return ENOPROTOOPT;
}
/* $NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $ */
/*-
* Copyright (c) 2006, 2007, 2008, 2009, 2019, 2020, 2023
* The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Sleep queue implementation, used by turnstiles and general sleep/wakeup
* interfaces.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_sleepq.c,v 1.87 2023/11/02 10:31:55 martin Exp $");
#include <sys/param.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/kernel.h>
#include <sys/ktrace.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sleepq.h>
#include <sys/syncobj.h>
#include <sys/systm.h>
/*
* for sleepq_abort:
* During autoconfiguration or after a panic, a sleep will simply lower the
* priority briefly to allow interrupts, then return. The priority to be
* used (IPL_SAFEPRI) is machine-dependent, thus this value is initialized and
* maintained in the machine-dependent layers. This priority will typically
* be 0, or the lowest priority that is safe for use on the interrupt stack;
* it can be made higher to block network software interrupts after panics.
*/
#ifndef IPL_SAFEPRI
#define IPL_SAFEPRI 0
#endif
static int sleepq_sigtoerror(lwp_t *, int);
/* General purpose sleep table, used by mtsleep() and condition variables. */
sleeptab_t sleeptab __cacheline_aligned;
sleepqlock_t sleepq_locks[SLEEPTAB_HASH_SIZE] __cacheline_aligned;
/*
* sleeptab_init:
*
* Initialize a sleep table.
*/
void
sleeptab_init(sleeptab_t *st)
{
static bool again;
int i;
for (i = 0; i < SLEEPTAB_HASH_SIZE; i++) {
if (!again) {
mutex_init(&sleepq_locks[i].lock, MUTEX_DEFAULT,
IPL_SCHED);
}
sleepq_init(&st->st_queue[i]);
}
again = true;
}
/*
* sleepq_init:
*
* Prepare a sleep queue for use.
*/
void
sleepq_init(sleepq_t *sq)
{
LIST_INIT(sq);
}
/*
* sleepq_remove:
*
* Remove an LWP from a sleep queue and wake it up. Distinguish
* between deliberate wakeups (which are valuable information) and
* "unsleep" (when an out-of-band action must be taken).
*
* For wakeup, convert any interruptible wait into a non-interruptible
* one before waking the LWP. Otherwise, if only one LWP is awoken it
* could fail to do something useful with the wakeup due to an error
* return and the caller of e.g. cv_signal() may not expect this.
*/
void
sleepq_remove(sleepq_t *sq, lwp_t *l, bool wakeup)
{
struct schedstate_percpu *spc;
struct cpu_info *ci;
KASSERT(lwp_locked(l, NULL));
if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_NULL) == 0) {
KASSERT(sq != NULL);
LIST_REMOVE(l, l_sleepchain);
} else {
KASSERT(sq == NULL);
}
l->l_syncobj = &sched_syncobj;
l->l_wchan = NULL;
l->l_sleepq = NULL;
l->l_flag &= wakeup ? ~(LW_SINTR|LW_CATCHINTR|LW_STIMO) : ~LW_SINTR;
ci = l->l_cpu;
spc = &ci->ci_schedstate;
/*
* If not sleeping, the LWP must have been suspended. Let whoever
* holds it stopped set it running again.
*/
if (l->l_stat != LSSLEEP) {
KASSERT(l->l_stat == LSSTOP || l->l_stat == LSSUSPENDED);
lwp_setlock(l, spc->spc_lwplock);
return;
}
/*
* If the LWP is still on the CPU, mark it as LSONPROC. It may be
* about to call mi_switch(), in which case it will yield.
*/
if ((l->l_pflag & LP_RUNNING) != 0) {
l->l_stat = LSONPROC;
l->l_slptime = 0;
lwp_setlock(l, spc->spc_lwplock);
return;
}
/* Update sleep time delta, call the wake-up handler of scheduler */
l->l_slpticksum += (getticks() - l->l_slpticks);
sched_wakeup(l);
/* Look for a CPU to wake up */
l->l_cpu = sched_takecpu(l);
ci = l->l_cpu;
spc = &ci->ci_schedstate;
/*
* Set it running.
*/
spc_lock(ci);
lwp_setlock(l, spc->spc_mutex);
sched_setrunnable(l);
l->l_stat = LSRUN;
l->l_slptime = 0;
sched_enqueue(l);
sched_resched_lwp(l, true);
/* LWP & SPC now unlocked, but we still hold sleep queue lock. */
}
/*
* sleepq_insert:
*
* Insert an LWP into the sleep queue, optionally sorting by priority.
*/
static void
sleepq_insert(sleepq_t *sq, lwp_t *l, syncobj_t *sobj)
{
if ((sobj->sobj_flag & SOBJ_SLEEPQ_NULL) != 0) {
KASSERT(sq == NULL);
return;
}
KASSERT(sq != NULL);
if ((sobj->sobj_flag & SOBJ_SLEEPQ_SORTED) != 0) {
lwp_t *l2, *l_last = NULL;
const pri_t pri = lwp_eprio(l);
LIST_FOREACH(l2, sq, l_sleepchain) {
l_last = l2;
if (lwp_eprio(l2) < pri) {
LIST_INSERT_BEFORE(l2, l, l_sleepchain);
return;
}
}
/*
* Ensure FIFO ordering if no waiters are of lower priority.
*/
if (l_last != NULL) {
LIST_INSERT_AFTER(l_last, l, l_sleepchain);
return;
}
}
LIST_INSERT_HEAD(sq, l, l_sleepchain);
}
/*
* sleepq_enter:
*
* Prepare to block on a sleep queue, after which any interlock can be
* safely released.
*/
int
sleepq_enter(sleepq_t *sq, lwp_t *l, kmutex_t *mp)
{
int nlocks;
KASSERT((sq != NULL) == (mp != NULL));
/*
* Acquire the per-LWP mutex and lend it our sleep queue lock.
* Once interlocked, we can release the kernel lock.
*/
lwp_lock(l);
if (mp != NULL) {
lwp_unlock_to(l, mp);
}
if (__predict_false((nlocks = l->l_blcnt) != 0)) {
KERNEL_UNLOCK_ALL(NULL, NULL);
}
return nlocks;
}
/*
* sleepq_enqueue:
*
* Enter an LWP into the sleep queue and prepare for sleep. The sleep
* queue must already be locked, and any interlock (such as the kernel
* lock) must have been released (see sleeptab_lookup(), sleepq_enter()).
*/
void
sleepq_enqueue(sleepq_t *sq, wchan_t wchan, const char *wmesg, syncobj_t *sobj,
bool catch_p)
{
lwp_t *l = curlwp;
KASSERT(lwp_locked(l, NULL));
KASSERT(l->l_stat == LSONPROC);
KASSERT(l->l_wchan == NULL);
KASSERT(l->l_sleepq == NULL);
KASSERT((l->l_flag & LW_SINTR) == 0);
l->l_syncobj = sobj;
l->l_wchan = wchan;
l->l_sleepq = sq;
l->l_wmesg = wmesg;
l->l_slptime = 0;
l->l_stat = LSSLEEP;
if (catch_p)
l->l_flag |= LW_SINTR;
sleepq_insert(sq, l, sobj);
/* Save the time when the thread went to sleep */
l->l_slpticks = getticks();
sched_slept(l);
}
/*
* sleepq_transfer:
*
* Move an LWP from one sleep queue to another. Both sleep queues
* must already be locked.
*
* The LWP will be updated with the new sleepq, wchan, wmesg,
* sobj, and mutex. The interruptible flag will also be updated.
*/
void
sleepq_transfer(lwp_t *l, sleepq_t *from_sq, sleepq_t *sq, wchan_t wchan,
const char *wmesg, syncobj_t *sobj, kmutex_t *mp, bool catch_p)
{
KASSERT(l->l_sleepq == from_sq);
LIST_REMOVE(l, l_sleepchain);
l->l_syncobj = sobj;
l->l_wchan = wchan;
l->l_sleepq = sq;
l->l_wmesg = wmesg;
if (catch_p)
l->l_flag |= LW_SINTR | LW_CATCHINTR;
else
l->l_flag &= ~(LW_SINTR | LW_CATCHINTR);
/*
* This allows the transfer from one sleepq to another where
* it is known that they're both protected by the same lock.
*/
if (mp != NULL)
lwp_setlock(l, mp);
sleepq_insert(sq, l, sobj);
}
/*
* sleepq_uncatch:
*
* Mark the LWP as no longer sleeping interruptibly.
*/
void
sleepq_uncatch(lwp_t *l)
{
l->l_flag &= ~(LW_SINTR | LW_CATCHINTR | LW_STIMO);
}
/*
* sleepq_block:
*
* After any intermediate step such as releasing an interlock, switch.
* sleepq_block() may return early under exceptional conditions, for
* example if the LWP's containing process is exiting.
*
* timo is a timeout in ticks. timo = 0 specifies an infinite timeout.
*/
int
sleepq_block(int timo, bool catch_p, syncobj_t *syncobj, int nlocks)
{
const int mask = LW_CANCELLED|LW_WEXIT|LW_WCORE|LW_PENDSIG;
int error = 0, sig, flag;
struct proc *p;
lwp_t *l = curlwp;
bool early = false;
ktrcsw(1, 0, syncobj);
/*
* If sleeping interruptibly, check for pending signals, exits or
* core dump events.
*
* Note the usage of LW_CATCHINTR. This expresses our intent
* to catch or not catch sleep interruptions, which might change
* while we are sleeping. It is independent from LW_SINTR because
* we don't want to leave LW_SINTR set when the LWP is not asleep.
*/
if (catch_p) {
if ((l->l_flag & (LW_CANCELLED|LW_WEXIT|LW_WCORE)) != 0) {
l->l_flag &= ~LW_CANCELLED;
error = EINTR;
early = true;
} else if ((l->l_flag & LW_PENDSIG) != 0 && sigispending(l, 0))
early = true;
l->l_flag |= LW_CATCHINTR;
} else
l->l_flag &= ~LW_CATCHINTR;
if (early) {
/* lwp_unsleep() will release the lock */
lwp_unsleep(l, true);
} else {
/*
* The LWP may have already been awoken if the caller
* dropped the sleep queue lock between sleepq_enqueue() and
* sleepq_block(). If that happens l_stat will be LSONPROC
* and mi_switch() will treat this as a preemption. No need
* to do anything special here.
*/
if (timo) {
l->l_flag &= ~LW_STIMO;
callout_schedule(&l->l_timeout_ch, timo);
}
l->l_boostpri = l->l_syncobj->sobj_boostpri;
spc_lock(l->l_cpu);
mi_switch(l);
/* The LWP and sleep queue are now unlocked. */
if (timo) {
/*
* Even if the callout appears to have fired, we
* need to stop it in order to synchronise with
* other CPUs. It's important that we do this in
* this LWP's context, and not during wakeup, in
* order to keep the callout & its cache lines
* co-located on the CPU with the LWP.
*/
(void)callout_halt(&l->l_timeout_ch, NULL);
error = (l->l_flag & LW_STIMO) ? EWOULDBLOCK : 0;
}
}
/*
* LW_CATCHINTR is only modified in this function OR when we
* are asleep (with the sleepq locked). We can therefore safely
* test it unlocked here as it is guaranteed to be stable by
* virtue of us running.
*
* We do not bother clearing it if set; that would require us
* to take the LWP lock, and it doesn't seem worth the hassle
* considering it is only meaningful here inside this function,
* and is set to reflect intent upon entry.
*/
flag = atomic_load_relaxed(&l->l_flag);
if (__predict_false((flag & mask) != 0)) {
if ((flag & LW_CATCHINTR) == 0 || error != 0)
/* nothing */;
else if ((flag & (LW_CANCELLED | LW_WEXIT | LW_WCORE)) != 0)
error = EINTR;
else if ((flag & LW_PENDSIG) != 0) {
/*
* Acquiring p_lock may cause us to recurse
* through the sleep path and back into this
* routine, but is safe because LWPs sleeping
* on locks are non-interruptible and we will
* not recurse again.
*/
p = l->l_proc;
mutex_enter(p->p_lock);
if (((sig = sigispending(l, 0)) != 0 && (sigprop[sig] & SA_STOP) == 0) ||
(sig = issignal(l)) != 0)
error = sleepq_sigtoerror(l, sig);
mutex_exit(p->p_lock);
}
}
ktrcsw(0, 0, syncobj);
if (__predict_false(nlocks != 0)) {
KERNEL_LOCK(nlocks, NULL);
}
return error;
}
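/*
 * Illustrative sketch of the overall sleep sequence (pieced together
 * from the comments above as an assumption, not a verbatim copy of any
 * caller): enter the queue, enqueue under the sleep queue lock, drop
 * any remaining interlock, then block with an optional timeout and
 * signal catching.
 *
 *	nlocks = sleepq_enter(sq, l, mp);
 *	sleepq_enqueue(sq, wchan, wmesg, sobj, catch_p);
 *	... release any interlock other than the sleep queue lock ...
 *	error = sleepq_block(timo, catch_p, sobj, nlocks);
 */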
/*
* sleepq_wake:
*
* Wake zero or more LWPs blocked on a single wait channel.
*/
void
sleepq_wake(sleepq_t *sq, wchan_t wchan, u_int expected, kmutex_t *mp)
{
lwp_t *l, *next;
KASSERT(mutex_owned(mp));
for (l = LIST_FIRST(sq); l != NULL; l = next) {
KASSERT(l->l_sleepq == sq);
KASSERT(l->l_mutex == mp);
next = LIST_NEXT(l, l_sleepchain);
if (l->l_wchan != wchan)
continue;
sleepq_remove(sq, l, true);
if (--expected == 0)
break;
}
mutex_spin_exit(mp);
}
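/*
 * Usage note (assumption, for illustration only): a wake-one primitive
 * such as cv_signal() would pass expected == 1, while a wake-all
 * primitive such as cv_broadcast() would pass a value at least as large
 * as the number of waiters (e.g. (u_int)-1) so the loop drains every
 * LWP sleeping on the wait channel.
 */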
/*
* sleepq_unsleep:
*
* Remove an LWP from its sleep queue and set it runnable again.
* sleepq_unsleep() is called with the LWP's mutex held, and will
* release it if "unlock" is true.
*/
void
sleepq_unsleep(lwp_t *l, bool unlock)
{
sleepq_t *sq = l->l_sleepq;
kmutex_t *mp = l->l_mutex;
KASSERT(lwp_locked(l, mp));
KASSERT(l->l_wchan != NULL);
sleepq_remove(sq, l, false);
if (unlock) {
mutex_spin_exit(mp);
}
}
/*
* sleepq_timeout:
*
* Entered via the callout(9) subsystem to time out an LWP that is on a
* sleep queue.
*/
void
sleepq_timeout(void *arg)
{
lwp_t *l = arg;
/*
* Lock the LWP. Assuming it's still on the sleep queue, its
* current mutex will also be the sleep queue mutex.
*/
lwp_lock(l);
if (l->l_wchan == NULL || l->l_syncobj == &callout_syncobj) {
/*
* Somebody beat us to it, or the LWP is blocked in
* callout_halt() waiting for us to finish here. In
* neither case should the LWP produce EWOULDBLOCK.
*/
lwp_unlock(l);
return;
}
l->l_flag |= LW_STIMO;
lwp_unsleep(l, true);
}
/*
* sleepq_sigtoerror:
*
* Given a signal number, interpret and return an error code.
*/
static int
sleepq_sigtoerror(lwp_t *l, int sig)
{
struct proc *p = l->l_proc;
int error;
KASSERT(mutex_owned(p->p_lock));
/*
* If this sleep was canceled, don't let the syscall restart.
*/
if ((SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
error = EINTR;
else
error = ERESTART;
return error;
}
/*
* sleepq_abort:
*
* After a panic or during autoconfiguration, lower the interrupt
* priority level to give pending interrupts a chance to run, and
* then return. Called if sleepq_dontsleep() returns non-zero, and
* always returns zero.
*/
int
sleepq_abort(kmutex_t *mtx, int unlock)
{
int s;
s = splhigh();
splx(IPL_SAFEPRI);
splx(s);
if (mtx != NULL && unlock != 0)
mutex_exit(mtx);
return 0;
}
/*
* sleepq_reinsert:
*
* Move the position of the lwp in the sleep queue after a possible
* change of the lwp's effective priority.
*/
static void
sleepq_reinsert(sleepq_t *sq, lwp_t *l)
{
KASSERT(l->l_sleepq == sq);
if ((l->l_syncobj->sobj_flag & SOBJ_SLEEPQ_SORTED) == 0) {
return;
}
/*
* Don't let the sleep queue become empty, even briefly.
* cv_signal() and cv_broadcast() inspect it without the
* sleep queue lock held and need to see a non-empty queue
* head if there are waiters.
*/
if (LIST_FIRST(sq) == l && LIST_NEXT(l, l_sleepchain) == NULL) {
return;
}
LIST_REMOVE(l, l_sleepchain);
sleepq_insert(sq, l, l->l_syncobj);
}
/*
* sleepq_changepri:
*
* Adjust the priority of an LWP residing on a sleepq.
*/
void
sleepq_changepri(lwp_t *l, pri_t pri)
{
sleepq_t *sq = l->l_sleepq;
KASSERT(lwp_locked(l, NULL));
l->l_priority = pri;
sleepq_reinsert(sq, l);
}
/*
* sleepq_lendpri:
*
* Adjust the lent priority of an LWP residing on a sleepq.
*/
void
sleepq_lendpri(lwp_t *l, pri_t pri)
{
sleepq_t *sq = l->l_sleepq;
KASSERT(lwp_locked(l, NULL));
l->l_inheritedprio = pri;
l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
sleepq_reinsert(sq, l);
}
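/*
 * Worked example (illustrative, assuming l_protectprio is lower): a
 * thread with l_priority 50 that is lent priority 80 through a
 * turnstile ends up with l_inheritedprio == 80 and l_auxprio == 80,
 * and is re-sorted within its sleep queue ahead of lower-priority
 * waiters.
 */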
/* $NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $ */
/*-
* Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_runq.c,v 1.70 2023/09/19 22:15:32 ad Exp $");
#include "opt_dtrace.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/idle.h>
#include <sys/intr.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/evcnt.h>
#include <sys/atomic.h>
/*
* Bits per map.
*/
#define BITMAP_BITS (32)
#define BITMAP_SHIFT (5)
#define BITMAP_MSB (0x80000000U)
#define BITMAP_MASK (BITMAP_BITS - 1)
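/*
 * Illustrative sketch: each run queue priority maps to one bit in
 * spc_bitmap[], word-indexed by the priority's upper bits and
 * bit-indexed (MSB first) by its lower bits, exactly as done in
 * sched_enqueue()/sched_dequeue() below:
 *
 *	i = prio >> BITMAP_SHIFT;		 which 32-bit word
 *	q = BITMAP_MSB >> (prio & BITMAP_MASK);	 which bit in that word
 *	spc->spc_bitmap[i] |= q;		 mark queue non-empty
 */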
const int schedppq = 1;
static void *sched_getrq(struct schedstate_percpu *, const pri_t);
#ifdef MULTIPROCESSOR
static lwp_t * sched_catchlwp(struct cpu_info *);
#endif
/*
* Preemption control.
*/
#ifdef __HAVE_PREEMPTION
# ifdef DEBUG
int sched_kpreempt_pri = 0;
# else
int sched_kpreempt_pri = PRI_USER_RT;
# endif
#else
int sched_kpreempt_pri = 1000;
#endif
/*
* Migration and balancing.
*/
static u_int cacheht_time; /* Cache hotness time */
static u_int min_catch; /* Minimal LWP count for catching */
static u_int skim_interval; /* Rate limit for stealing LWPs */
#ifdef KDTRACE_HOOKS
struct lwp *curthread;
#endif
void
runq_init(void)
{
/* Pulling from remote packages, LWP must not have run for 10ms. */
cacheht_time = 10;
/* Minimal count of LWPs for catching */
min_catch = 1;
/* Steal from other CPUs at most every 10ms. */
skim_interval = 10;
}
void
sched_cpuattach(struct cpu_info *ci)
{
struct schedstate_percpu *spc;
size_t size;
void *p;
u_int i;
spc = &ci->ci_schedstate;
spc->spc_nextpkg = ci;
if (spc->spc_lwplock == NULL) {
spc->spc_lwplock = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
}
if (ci == lwp0.l_cpu) {
/* Initialize the scheduler structure of the primary LWP */
lwp0.l_mutex = spc->spc_lwplock;
}
if (spc->spc_mutex != NULL) {
/* Already initialized. */
return;
}
/* Allocate the run queue */
size = roundup2(sizeof(spc->spc_queue[0]) * PRI_COUNT, coherency_unit) +
coherency_unit;
p = kmem_alloc(size, KM_SLEEP);
spc->spc_queue = (void *)roundup2((uintptr_t)p, coherency_unit);
/* Initialize run queues */
spc->spc_mutex = mutex_obj_alloc(MUTEX_DEFAULT, IPL_SCHED);
for (i = 0; i < PRI_COUNT; i++)
TAILQ_INIT(&spc->spc_queue[i]);
}
/*
* Control of the runqueue.
*/
static inline void *
sched_getrq(struct schedstate_percpu *spc, const pri_t prio)
{
KASSERT(prio < PRI_COUNT);
return &spc->spc_queue[prio];
}
/*
* Put an LWP onto a run queue. The LWP must be locked by spc_mutex for
* l_cpu.
*/
void
sched_enqueue(struct lwp *l)
{
struct schedstate_percpu *spc;
TAILQ_HEAD(, lwp) *q_head;
const pri_t eprio = lwp_eprio(l);
struct cpu_info *ci;
ci = l->l_cpu;
spc = &ci->ci_schedstate;
KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
/* Enqueue the thread */
q_head = sched_getrq(spc, eprio);
if (TAILQ_EMPTY(q_head)) {
u_int i;
uint32_t q;
/* Mark bit */
i = eprio >> BITMAP_SHIFT;
q = BITMAP_MSB >> (eprio & BITMAP_MASK);
KASSERT((spc->spc_bitmap[i] & q) == 0);
spc->spc_bitmap[i] |= q;
}
/*
* Determine run queue position according to POSIX. XXX Explicitly
* lowering a thread's priority with pthread_setschedparam() is not
* handled.
*/
if ((l->l_pflag & LP_PREEMPTING) != 0) {
switch (l->l_class) {
case SCHED_OTHER:
TAILQ_INSERT_TAIL(q_head, l, l_runq);
break;
case SCHED_FIFO:
TAILQ_INSERT_HEAD(q_head, l, l_runq);
break;
case SCHED_RR:
if (getticks() - l->l_rticks >= sched_rrticks) {
TAILQ_INSERT_TAIL(q_head, l, l_runq);
} else {
TAILQ_INSERT_HEAD(q_head, l, l_runq);
}
break;
default:
panic("sched_enqueue: LWP %p has class %d\n",
l, l->l_class);
}
} else {
TAILQ_INSERT_TAIL(q_head, l, l_runq);
}
spc->spc_flags &= ~SPCF_IDLE;
spc->spc_count++;
if ((l->l_pflag & LP_BOUND) == 0) {
atomic_store_relaxed(&spc->spc_mcount,
atomic_load_relaxed(&spc->spc_mcount) + 1);
}
/*
* Update the value of highest priority in the runqueue,
* if priority of this thread is higher.
*/
if (eprio > spc->spc_maxpriority)
spc->spc_maxpriority = eprio;
sched_newts(l);
}
/*
* Remove an LWP from the run queue it's on. The LWP must be in state
* LSRUN.
*/
void
sched_dequeue(struct lwp *l)
{
TAILQ_HEAD(, lwp) *q_head;
struct schedstate_percpu *spc;
const pri_t eprio = lwp_eprio(l);
spc = &l->l_cpu->ci_schedstate;
KASSERT(lwp_locked(l, spc->spc_mutex));
KASSERT(eprio <= spc->spc_maxpriority);
KASSERT(spc->spc_bitmap[eprio >> BITMAP_SHIFT] != 0);
KASSERT(spc->spc_count > 0);
if (spc->spc_migrating == l)
spc->spc_migrating = NULL;
spc->spc_count--;
if ((l->l_pflag & LP_BOUND) == 0) {
atomic_store_relaxed(&spc->spc_mcount,
atomic_load_relaxed(&spc->spc_mcount) - 1);
}
q_head = sched_getrq(spc, eprio);
TAILQ_REMOVE(q_head, l, l_runq);
if (TAILQ_EMPTY(q_head)) {
u_int i;
uint32_t q;
/* Unmark bit */
i = eprio >> BITMAP_SHIFT;
q = BITMAP_MSB >> (eprio & BITMAP_MASK);
KASSERT((spc->spc_bitmap[i] & q) != 0);
spc->spc_bitmap[i] &= ~q;
/*
* Update the value of the highest priority in the runqueue, in
* case this was the last thread in the queue of that priority.
*/
if (eprio != spc->spc_maxpriority)
return;
do {
if (spc->spc_bitmap[i] != 0) {
q = ffs(spc->spc_bitmap[i]);
spc->spc_maxpriority =
(i << BITMAP_SHIFT) + (BITMAP_BITS - q);
return;
}
} while (i--);
/* If not found - set the lowest value */
spc->spc_maxpriority = 0;
}
}
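/*
 * Worked example of the recomputation above: with i == 1 and
 * spc_bitmap[1] == 0x00000006 (priorities 61 and 62 queued), ffs()
 * returns 2, so spc_maxpriority becomes
 * (1 << BITMAP_SHIFT) + (BITMAP_BITS - 2) == 62, the highest remaining
 * priority in that word.
 */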
/*
* Cause a preemption on the given CPU, if the priority "pri" is higher
* priority than the running LWP. If "unlock" is specified, and ideally it
* will be for concurrency reasons, spc_mutex will be dropped before return.
*/
void
sched_resched_cpu(struct cpu_info *ci, pri_t pri, bool unlock)
{
struct schedstate_percpu *spc;
u_int o, n, f;
lwp_t *l;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(spc->spc_mutex));
/*
* If the priority level we're evaluating wouldn't cause a new LWP
* to be run on the CPU, then we have nothing to do.
*/
if (pri <= spc->spc_curpriority || !mp_online) {
if (__predict_true(unlock)) {
spc_unlock(ci);
}
return;
}
/*
* Figure out what kind of preemption we should do.
*/
l = ci->ci_onproc;
if ((l->l_flag & LW_IDLE) != 0) {
f = RESCHED_IDLE | RESCHED_UPREEMPT;
} else if (pri >= sched_kpreempt_pri && (l->l_pflag & LP_INTR) == 0) {
/* We can't currently preempt softints - should be able to. */
#ifdef __HAVE_PREEMPTION
f = RESCHED_KPREEMPT;
#else
/* Leave door open for test: set kpreempt_pri with sysctl. */
f = RESCHED_UPREEMPT;
#endif
/*
* l_dopreempt must be set with the CPU locked to sync with
* mi_switch(). It must also be set with an atomic to sync
* with kpreempt().
*/
atomic_or_uint(&l->l_dopreempt, DOPREEMPT_ACTIVE);
} else {
f = RESCHED_UPREEMPT;
}
if (ci != curcpu()) {
f |= RESCHED_REMOTE;
}
/*
* Things can start as soon as ci_want_resched is touched: x86 has
* an instruction that monitors the memory cell it's in. Drop the
* schedstate lock in advance, otherwise the remote CPU can awaken
* and immediately block on the lock.
*/
if (__predict_true(unlock)) {
spc_unlock(ci);
}
/*
* The caller almost always has a second scheduler lock held: either
* the running LWP lock (spc_lwplock), or a sleep queue lock. That
* keeps preemption disabled, which among other things ensures all
* LWPs involved won't be freed while we're here (see lwp_dtor()).
*/
KASSERT(kpreempt_disabled());
for (o = 0;; o = n) {
n = atomic_cas_uint(&ci->ci_want_resched, o, o | f);
if (__predict_true(o == n)) {
/*
* We're the first to set a resched on the CPU. Try
* to avoid causing a needless trip through trap()
* to handle an AST fault, if it's known the LWP
* will either block or go through userret() soon.
*/
if (l != curlwp || cpu_intr_p()) {
cpu_need_resched(ci, l, f);
}
break;
}
if (__predict_true(
(n & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)) >=
(f & (RESCHED_KPREEMPT|RESCHED_UPREEMPT)))) {
/* Already in progress, nothing to do. */
break;
}
}
}
/*
* Cause a preemption on the given CPU, if the priority of LWP "l" in state
* LSRUN, is higher priority than the running LWP. If "unlock" is
* specified, and ideally it will be for concurrency reasons, spc_mutex will
* be dropped before return.
*/
void
sched_resched_lwp(struct lwp *l, bool unlock)
{
struct cpu_info *ci = l->l_cpu;
KASSERT(lwp_locked(l, ci->ci_schedstate.spc_mutex));
KASSERT(l->l_stat == LSRUN);
sched_resched_cpu(ci, lwp_eprio(l), unlock);
}
/*
* Migration and balancing.
*/
#ifdef MULTIPROCESSOR
/*
* Estimate if LWP is cache-hot.
*/
static inline bool
lwp_cache_hot(const struct lwp *l)
{
/* Leave new LWPs in peace, determination has already been made. */
if (l->l_stat == LSIDL)
return true;
if (__predict_false(l->l_slptime != 0 || l->l_rticks == 0))
return false;
return (getticks() - l->l_rticks < mstohz(cacheht_time));
}
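/*
 * Worked example: with the default cacheht_time of 10 ms and hz == 100,
 * mstohz(cacheht_time) is 1 tick, so an LWP is treated as cache-hot for
 * roughly one tick after it last ran (hz-dependent; at hz == 1000 the
 * window is 10 ticks).
 */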
/*
* Check if LWP can migrate to the chosen CPU.
*/
static inline bool
sched_migratable(const struct lwp *l, struct cpu_info *ci)
{
const struct schedstate_percpu *spc = &ci->ci_schedstate;
KASSERT(lwp_locked(__UNCONST(l), NULL));
/* Is CPU offline? */
if (__predict_false(spc->spc_flags & SPCF_OFFLINE))
return false;
/* Is affinity set? */
if (__predict_false(l->l_affinity))
return kcpuset_isset(l->l_affinity, cpu_index(ci));
/* Is there a processor-set? */
return (spc->spc_psid == l->l_psid);
}
/*
* A small helper to do round robin through CPU packages.
*/
static struct cpu_info *
sched_nextpkg(void)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
spc->spc_nextpkg =
spc->spc_nextpkg->ci_sibling[CPUREL_PACKAGE1ST];
return spc->spc_nextpkg;
}
/*
* Find a CPU to run LWP "l". Look for the CPU with the lowest priority
* thread. In case of equal priority, prefer first class CPUs, and amongst
* the remainder choose the CPU with the fewest runqueue entries.
*
* Begin the search in the CPU package which "pivot" is a member of.
*/
static struct cpu_info * __noinline
sched_bestcpu(struct lwp *l, struct cpu_info *pivot)
{
struct cpu_info *bestci, *curci, *outer;
struct schedstate_percpu *bestspc, *curspc;
pri_t bestpri, curpri;
/*
* If this fails (it shouldn't), run on the given CPU. This also
* gives us a weak preference for "pivot" to begin with.
*/
bestci = pivot;
bestspc = &bestci->ci_schedstate;
if (sched_migratable(l, bestci)) {
bestpri = MAX(bestspc->spc_curpriority,
bestspc->spc_maxpriority);
} else {
/* Invalidate the priority. */
bestpri = PRI_COUNT;
}
/* In the outer loop scroll through all CPU packages. */
pivot = pivot->ci_package1st;
outer = pivot;
do {
/* In the inner loop scroll through all CPUs in package. */
curci = outer;
do {
if (!sched_migratable(l, curci)) {
continue;
}
curspc = &curci->ci_schedstate;
/* If this CPU is idle and 1st class, we're done. */
if ((curspc->spc_flags & (SPCF_IDLE | SPCF_1STCLASS)) ==
(SPCF_IDLE | SPCF_1STCLASS)) {
return curci;
}
curpri = MAX(curspc->spc_curpriority,
curspc->spc_maxpriority);
if (curpri > bestpri) {
continue;
}
if (curpri == bestpri) {
/* Prefer first class CPUs over others. */
if ((curspc->spc_flags & SPCF_1STCLASS) == 0 &&
(bestspc->spc_flags & SPCF_1STCLASS) != 0) {
continue;
}
/*
* Pick the least busy CPU. Make sure this is not
* <=, otherwise it defeats the above preference.
*/
if (bestspc->spc_count < curspc->spc_count) {
continue;
}
}
bestpri = curpri;
bestci = curci;
bestspc = curspc;
} while (curci = curci->ci_sibling[CPUREL_PACKAGE],
curci != outer);
} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
outer != pivot);
return bestci;
}
/*
* Estimate whether the LWP should migrate to another CPU.
* Pick and return the CPU if migration is needed.
*/
struct cpu_info *
sched_takecpu(struct lwp *l)
{
struct schedstate_percpu *spc, *tspc;
struct cpu_info *ci, *curci, *tci;
pri_t eprio;
int flags;
KASSERT(lwp_locked(l, NULL));
/* If thread is strictly bound, do not estimate other CPUs */
ci = l->l_cpu;
if (l->l_pflag & LP_BOUND)
return ci;
spc = &ci->ci_schedstate;
eprio = lwp_eprio(l);
/*
* Handle new LWPs. For vfork() with a timeshared child, make it
* run on the same CPU as the parent if no other LWPs in queue.
* Otherwise scatter far and wide - try for an even distribution
* across all CPU packages and CPUs.
*/
if (l->l_stat == LSIDL) {
if (curlwp->l_vforkwaiting && l->l_class == SCHED_OTHER) {
if (sched_migratable(l, curlwp->l_cpu) && eprio >
curlwp->l_cpu->ci_schedstate.spc_maxpriority) {
return curlwp->l_cpu;
}
} else {
return sched_bestcpu(l, sched_nextpkg());
}
flags = SPCF_IDLE;
} else {
flags = SPCF_IDLE | SPCF_1STCLASS;
}
/*
* Try to send the LWP back to the first CPU in the same core if
* idle. This keeps LWPs clustered in the run queues of 1st class
* CPUs. This implies stickiness. If we didn't find a home for
* a vfork() child above, try to use any SMT sibling to help out.
*/
tci = ci;
do {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
return tci;
}
tci = tci->ci_sibling[CPUREL_CORE];
} while (tci != ci);
/*
* Otherwise the LWP is "sticky", i.e. generally preferring to stay
* on the same CPU.
*/
if (sched_migratable(l, ci) && (eprio > spc->spc_curpriority || (lwp_cache_hot(l) && l->l_class == SCHED_OTHER))) {
return ci;
}
/*
* If the current CPU core is idle, run there and avoid the
* expensive scan of CPUs below.
*/
curci = curcpu();
tci = curci;
do {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
return tci;
}
tci = tci->ci_sibling[CPUREL_CORE];
} while (tci != curci);
/*
* Didn't find a new home above - happens infrequently. Start the
* search in last CPU package that the LWP ran in, but expand to
* include the whole system if needed.
*/
return sched_bestcpu(l, l->l_cpu);
}
/*
* Try to catch an LWP from the run queue of another CPU.
*/
static struct lwp *
sched_catchlwp(struct cpu_info *ci)
{
struct cpu_info *curci = curcpu();
struct schedstate_percpu *spc, *curspc;
TAILQ_HEAD(, lwp) *q_head;
struct lwp *l;
bool gentle;
curspc = &curci->ci_schedstate;
spc = &ci->ci_schedstate;
/*
* Be more aggressive if this CPU is first class, and the other
* is not.
*/
gentle = ((curspc->spc_flags & SPCF_1STCLASS) == 0 ||
(spc->spc_flags & SPCF_1STCLASS) != 0);
if (atomic_load_relaxed(&spc->spc_mcount) < (gentle ? min_catch : 1) ||
curspc->spc_psid != spc->spc_psid) {
spc_unlock(ci);
return NULL;
}
/* Take the highest priority thread */
q_head = sched_getrq(spc, spc->spc_maxpriority);
l = TAILQ_FIRST(q_head);
for (;;) {
/* Check the first and next result from the queue */
if (l == NULL) {
break;
}
KASSERTMSG(l->l_stat == LSRUN, "%s l %p (%s) l_stat %d",
ci->ci_data.cpu_name,
l, (l->l_name ? l->l_name : l->l_proc->p_comm), l->l_stat);
/* Look for threads that are allowed to migrate */
if ((l->l_pflag & LP_BOUND) ||
(gentle && lwp_cache_hot(l)) ||
!sched_migratable(l, curci)) {
l = TAILQ_NEXT(l, l_runq);
/* XXX Gap: could walk down priority list. */
continue;
}
/* Grab the thread, and move to the local run queue */
sched_dequeue(l);
l->l_cpu = curci;
lwp_unlock_to(l, curspc->spc_mutex);
sched_enqueue(l);
return l;
}
spc_unlock(ci);
return l;
}
/*
* Called from sched_idle() to handle migration. Return the CPU that we
* pushed the LWP to (may be NULL).
*/
static struct cpu_info *
sched_idle_migrate(void)
{
struct cpu_info *ci = curcpu(), *tci = NULL;
struct schedstate_percpu *spc, *tspc;
bool dlock = false;
spc = &ci->ci_schedstate;
spc_lock(ci);
for (;;) {
struct lwp *l;
l = spc->spc_migrating;
if (l == NULL)
break;
/*
* If second attempt, and target CPU has changed,
* drop the old lock.
*/
if (dlock == true && tci != l->l_target_cpu) {
KASSERT(tci != NULL);
spc_unlock(tci);
dlock = false;
}
/*
* Nothing to do if destination has changed to the
* local CPU, or migration was done by another CPU.
*/
tci = l->l_target_cpu;
if (tci == NULL || tci == ci) {
spc->spc_migrating = NULL;
l->l_target_cpu = NULL;
break;
}
tspc = &tci->ci_schedstate;
/*
* Double-lock the runqueues.
* We do that only once.
*/
if (dlock == false) {
dlock = true;
if (ci < tci) {
spc_lock(tci);
} else if (!mutex_tryenter(tspc->spc_mutex)) {
spc_unlock(ci);
spc_lock(tci);
spc_lock(ci);
/* Check the situation again.. */
continue;
}
}
/* Migrate the thread */
KASSERT(l->l_stat == LSRUN);
spc->spc_migrating = NULL;
l->l_target_cpu = NULL;
sched_dequeue(l);
l->l_cpu = tci;
lwp_setlock(l, tspc->spc_mutex);
sched_enqueue(l);
sched_resched_lwp(l, true);
/* tci now unlocked */
spc_unlock(ci);
return tci;
}
if (dlock == true) {
KASSERT(tci != NULL);
spc_unlock(tci);
}
spc_unlock(ci);
return NULL;
}
/*
* Try to steal an LWP from "tci".
*/
static bool
sched_steal(struct cpu_info *ci, struct cpu_info *tci)
{
struct schedstate_percpu *spc, *tspc;
lwp_t *l;
spc = &ci->ci_schedstate;
tspc = &tci->ci_schedstate;
if (atomic_load_relaxed(&tspc->spc_mcount) != 0 &&
spc->spc_psid == tspc->spc_psid) {
spc_dlock(ci, tci);
l = sched_catchlwp(tci);
spc_unlock(ci);
if (l != NULL) {
return true;
}
}
return false;
}
/*
* Called from each CPU's idle loop.
*/
void
sched_idle(void)
{
struct cpu_info *ci, *inner, *outer, *first, *tci, *mci;
struct schedstate_percpu *spc, *tspc;
struct lwp *l;
ci = curcpu();
spc = &ci->ci_schedstate;
tci = NULL;
mci = NULL;
/*
* Handle LWP migrations off this CPU to another. If there is a
* migration to do then remember the CPU the LWP was sent to, and
* don't steal the LWP back from that CPU below.
*/
if (spc->spc_migrating != NULL) {
mci = sched_idle_migrate();
}
/* If this CPU is offline, or we have an LWP to run, we're done. */
if ((spc->spc_flags & SPCF_OFFLINE) != 0 || spc->spc_count != 0) {
return;
}
/* Deal with SMT. */
if (ci->ci_nsibling[CPUREL_CORE] > 1) {
/* Try to help our siblings out. */
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
if (tci != mci && sched_steal(ci, tci)) {
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
/*
* If not the first SMT in the core, and in the default
* processor set, the search ends here.
*/
if ((spc->spc_flags & SPCF_1STCLASS) == 0 &&
spc->spc_psid == PS_NONE) {
return;
}
}
/*
* Find something to run, unless this CPU exceeded the rate limit.
* Start looking on the current package to maximise L2/L3 cache
* locality. Then expand to looking at the rest of the system.
*
* XXX Should probably look at 2nd class CPUs first, but they will
* shed jobs via preempt() anyway.
*/
if (spc->spc_nextskim > getticks()) {
return;
}
spc->spc_nextskim = getticks() + mstohz(skim_interval);
/* In the outer loop scroll through all CPU packages, starting here. */
first = ci->ci_package1st;
outer = first;
do {
/* In the inner loop scroll through all CPUs in package. */
inner = outer;
do {
/* Don't hit the locks unless needed. */
tspc = &inner->ci_schedstate;
if (ci == inner || ci == mci ||
spc->spc_psid != tspc->spc_psid ||
atomic_load_relaxed(&tspc->spc_mcount) < min_catch) {
continue;
}
spc_dlock(ci, inner);
l = sched_catchlwp(inner);
spc_unlock(ci);
if (l != NULL) {
/* Got it! */
return;
}
} while (inner = inner->ci_sibling[CPUREL_PACKAGE],
inner != outer);
} while (outer = outer->ci_sibling[CPUREL_PACKAGE1ST],
outer != first);
}
/*
* Called from mi_switch() when an LWP has been preempted / has yielded.
* The LWP is presently in the CPU's run queue. Here we look for a better
* CPU to teleport the LWP to; there may not be one.
*/
void
sched_preempted(struct lwp *l)
{
const int flags = SPCF_IDLE | SPCF_1STCLASS;
struct schedstate_percpu *tspc;
struct cpu_info *ci, *tci;
ci = l->l_cpu;
tspc = &ci->ci_schedstate;
KASSERT(tspc->spc_count >= 1);
/*
* Try to select another CPU if:
*
* - there is no migration pending already
* - and this LWP is running on a 2nd class CPU
* - or this LWP is a child of vfork() that has just done execve()
*/
if (l->l_target_cpu != NULL || ((tspc->spc_flags & SPCF_1STCLASS) != 0 &&
(l->l_pflag & LP_TELEPORT) == 0)) {
return;
}
/*
* Fast path: if the first SMT in the core is idle, send it back
* there, because the cache is shared (cheap) and we want all LWPs
* to be clustered on 1st class CPUs (either running there or on
* their runqueues).
*/
tci = ci->ci_sibling[CPUREL_CORE];
while (tci != ci) {
tspc = &tci->ci_schedstate;
if ((tspc->spc_flags & flags) == flags && sched_migratable(l, tci)) {
l->l_target_cpu = tci;
l->l_pflag &= ~LP_TELEPORT;
return;
}
tci = tci->ci_sibling[CPUREL_CORE];
}
if ((l->l_pflag & LP_TELEPORT) != 0) {
/*
* A child of vfork(): now that the parent is released,
* scatter far and wide, to match the LSIDL distribution
* done in sched_takecpu().
*/
l->l_pflag &= ~LP_TELEPORT;
tci = sched_bestcpu(l, sched_nextpkg());
if (tci != ci) {
l->l_target_cpu = tci;
}
} else {
/*
* Try to find a better CPU to take it, but don't move to
* another 2nd class CPU, and don't move to a non-idle CPU,
* because that would prevent SMT being used to maximise
* throughput.
*
* Search in the current CPU package in order to try and
* keep L2/L3 cache locality, but expand to include the
* whole system if needed.
*/
tci = sched_bestcpu(l, l->l_cpu);
if (tci != ci &&
(tci->ci_schedstate.spc_flags & flags) == flags) {
l->l_target_cpu = tci;
}
}
}
/*
* Called during execve() by a child of vfork(). Does two things:
*
* - If the parent has been awoken and put back on curcpu then give the
* CPU back to the parent.
*
* - If curlwp is not on a 1st class CPU then find somewhere else to run,
* since it dodged the distribution in sched_takecpu() when first set
* runnable.
*/
void
sched_vforkexec(struct lwp *l, bool samecpu)
{
KASSERT(l == curlwp);
if ((samecpu && ncpu > 1) ||
(l->l_cpu->ci_schedstate.spc_flags & SPCF_1STCLASS) == 0) {
l->l_pflag |= LP_TELEPORT;
preempt();
}
}
#else
/*
* stubs for !MULTIPROCESSOR
*/
struct cpu_info *
sched_takecpu(struct lwp *l)
{
return l->l_cpu;
}
void
sched_idle(void)
{
}
void
sched_preempted(struct lwp *l)
{
}
void
sched_vforkexec(struct lwp *l, bool samecpu)
{
KASSERT(l == curlwp);
}
#endif /* MULTIPROCESSOR */
/*
* Scheduling statistics and balancing.
*/
void
sched_lwp_stats(struct lwp *l)
{
int batch;
KASSERT(lwp_locked(l, NULL));
/* Update sleep time */
if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
l->l_stat == LSSUSPENDED)
l->l_slptime++;
/*
* Mark the thread as CPU-bound if the sum of its run time exceeds
* the sum of its sleep time. Only report it as batch once it has
* been CPU-bound for two consecutive samples.
*/
batch = (l->l_rticksum > l->l_slpticksum);
if (batch != 0) {
if ((l->l_flag & LW_BATCH) == 0)
batch = 0;
l->l_flag |= LW_BATCH;
} else
l->l_flag &= ~LW_BATCH;
/* Reset the time sums */
l->l_slpticksum = 0;
l->l_rticksum = 0;
/* Scheduler-specific hook */
sched_pstats_hook(l, batch);
#ifdef KDTRACE_HOOKS
curthread = l;
#endif
}
/*
* Scheduler mill.
*/
struct lwp *
sched_nextlwp(void)
{
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc;
TAILQ_HEAD(, lwp) *q_head;
struct lwp *l;
/* Update the last run time on switch */
l = curlwp;
l->l_rticksum += (getticks() - l->l_rticks);
/* Return to idle LWP if there is a migrating thread */
spc = &ci->ci_schedstate;
if (__predict_false(spc->spc_migrating != NULL))
return NULL;
/* Return to idle LWP if there is no runnable job */
if (__predict_false(spc->spc_count == 0))
return NULL;
/* Take the highest priority thread */
KASSERT(spc->spc_bitmap[spc->spc_maxpriority >> BITMAP_SHIFT]);
q_head = sched_getrq(spc, spc->spc_maxpriority);
l = TAILQ_FIRST(q_head);
KASSERT(l != NULL);
sched_oncpu(l);
l->l_rticks = getticks();
return l;
}
/*
* sched_curcpu_runnable_p: return if curcpu() should exit the idle loop.
*/
bool
sched_curcpu_runnable_p(void)
{
const struct cpu_info *ci;
const struct schedstate_percpu *spc;
bool rv;
kpreempt_disable();
ci = curcpu();
spc = &ci->ci_schedstate;
rv = (spc->spc_count != 0);
#ifndef __HAVE_FAST_SOFTINTS
rv |= (ci->ci_data.cpu_softints != 0);
#endif
kpreempt_enable();
return rv;
}
/*
* Sysctl nodes and initialization.
*/
SYSCTL_SETUP(sysctl_sched_setup, "sysctl sched setup")
{
const struct sysctlnode *node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "sched",
SYSCTL_DESCR("Scheduler options"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node == NULL)
return;
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "cacheht_time",
SYSCTL_DESCR("Cache hotness time (in ms)"),
NULL, 0, &cacheht_time, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "skim_interval",
SYSCTL_DESCR("Rate limit for stealing from other CPUs (in ms)"),
NULL, 0, &skim_interval, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "min_catch",
SYSCTL_DESCR("Minimal count of threads for catching"),
NULL, 0, &min_catch, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "timesoftints",
SYSCTL_DESCR("Track CPU time for soft interrupts"),
NULL, 0, &softint_timing, 0,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, &node, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "kpreempt_pri",
SYSCTL_DESCR("Minimum priority to trigger kernel preemption"),
NULL, 0, &sched_kpreempt_pri, 0,
CTL_CREATE, CTL_EOL);
}
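/*
 * Note (an assumption about the resulting sysctl paths): since the
 * "sched" node is created under CTL_KERN, the tunables above should
 * surface as kern.sched.cacheht_time, kern.sched.skim_interval,
 * kern.sched.min_catch, kern.sched.timesoftints and
 * kern.sched.kpreempt_pri.
 */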
/*
* Debugging.
*/
#ifdef DDB
void
sched_print_runqueue(void (*pr)(const char *, ...))
{
struct cpu_info *ci, *tci;
struct schedstate_percpu *spc;
struct lwp *l;
struct proc *p;
CPU_INFO_ITERATOR cii;
for (CPU_INFO_FOREACH(cii, ci)) {
int i;
spc = &ci->ci_schedstate;
(*pr)("Run-queue (CPU = %u):\n", ci->ci_index);
(*pr)(" pid.lid = %d.%d, r_count = %u, "
"maxpri = %d, mlwp = %p\n",
#ifdef MULTIPROCESSOR
ci->ci_curlwp->l_proc->p_pid, ci->ci_curlwp->l_lid,
#else
curlwp->l_proc->p_pid, curlwp->l_lid,
#endif
spc->spc_count, spc->spc_maxpriority,
spc->spc_migrating);
i = (PRI_COUNT >> BITMAP_SHIFT) - 1;
do {
uint32_t q;
q = spc->spc_bitmap[i];
(*pr)(" bitmap[%d] => [ %d (0x%x) ]\n", i, ffs(q), q);
} while (i--);
}
(*pr)(" %5s %4s %4s %10s %3s %18s %4s %4s %s\n",
"LID", "PRI", "EPRI", "FL", "ST", "LWP", "CPU", "TCI", "LRTICKS");
PROCLIST_FOREACH(p, &allproc) {
(*pr)(" /- %d (%s)\n", (int)p->p_pid, p->p_comm);
LIST_FOREACH(l, &p->p_lwps, l_sibling) {
ci = l->l_cpu;
tci = l->l_target_cpu;
(*pr)(" | %5d %4u %4u 0x%8.8x %3s %18p %4u %4d %u\n",
(int)l->l_lid, l->l_priority, lwp_eprio(l),
l->l_flag, l->l_stat == LSRUN ? "RQ" :
(l->l_stat == LSSLEEP ? "SQ" : "-"),
l, ci->ci_index, (tci ? tci->ci_index : -1),
(u_int)(getticks() - l->l_rticks));
}
}
}
#endif
/* $NetBSD: kern_softint.c,v 1.76 2024/03/01 04:32:38 mrg Exp $ */
/*-
* Copyright (c) 2007, 2008, 2019, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Generic software interrupt framework.
*
* Overview
*
* The soft interrupt framework provides a mechanism to schedule a
* low priority callback that runs with thread context. It allows
* for dynamic registration of software interrupts, and for fair
* queueing and prioritization of those interrupts. The callbacks
* can be scheduled to run from nearly any point in the kernel: by
* code running with thread context, by code running from a
* hardware interrupt handler, and at any interrupt priority
* level.
*
* Priority levels
*
* Since soft interrupt dispatch can be tied to the underlying
* architecture's interrupt dispatch code, it can be limited
* both by the capabilities of the hardware and the capabilities
* of the interrupt dispatch code itself. The number of priority
* levels is restricted to four. In order of priority (lowest to
* highest) the levels are: clock, bio, net, serial.
*
* The names are symbolic and in isolation do not have any direct
* connection with a particular kind of device activity: they are
* only meant as a guide.
*
* The four priority levels map directly to scheduler priority
* levels, and where the architecture implements 'fast' software
* interrupts, they also map onto interrupt priorities. The
* interrupt priorities are intended to be hidden from machine
* independent code, which should use thread-safe mechanisms to
* synchronize with software interrupts (for example: mutexes).
*
* Capabilities
*
* Software interrupts run with limited machine context. In
* particular, they do not possess any address space context. They
* should not try to operate on user space addresses, or to use
* virtual memory facilities other than those noted as interrupt
* safe.
*
* Unlike hardware interrupts, software interrupts do have thread
* context. They may block on synchronization objects, sleep, and
* resume execution at a later time.
*
* Since software interrupts are a limited resource and run with
* higher priority than most other LWPs in the system, all
* block-and-resume activity by a software interrupt must be kept
* short to allow further processing at that level to continue. By
* extension, code running with process context must take care to
* ensure that any lock that may be taken from a software interrupt
* can not be held for more than a short period of time.
*
* The kernel does not allow software interrupts to use facilities
* or perform actions that may block for a significant amount of
* time. This means that it's not valid for a software interrupt
* to sleep on condition variables or wait for resources to become
* available (for example, memory).
*
* Per-CPU operation
*
* If a soft interrupt is triggered on a CPU, it can only be
* dispatched on the same CPU. Each LWP dedicated to handling a
* soft interrupt is bound to its home CPU, so if the LWP blocks
* and needs to run again, it can only run there. Nearly all data
* structures used to manage software interrupts are per-CPU.
*
* The per-CPU requirement is intended to reduce "ping-pong" of
* cache lines between CPUs: lines occupied by data structures
* used to manage the soft interrupts, and lines occupied by data
* items being passed down to the soft interrupt. As a positive
* side effect, this also means that the soft interrupt dispatch
* code does not need to use spinlocks to synchronize.
*
* Generic implementation
*
* A generic, low performance implementation is provided that
* works across all architectures, with no machine-dependent
* modifications needed. This implementation uses the scheduler,
* and so has a number of restrictions:
*
* 1) The software interrupts are not currently preemptive, so
* must wait for the currently executing LWP to yield the CPU.
* This can introduce latency.
*
* 2) An expensive context switch is required for a software
* interrupt to be handled.
*
* 'Fast' software interrupts
*
* If an architecture defines __HAVE_FAST_SOFTINTS, it implements
* the fast mechanism. Threads running either in the kernel or in
* userspace will be interrupted, but will not be preempted. When
* the soft interrupt completes execution, the interrupted LWP
* is resumed. Interrupt dispatch code must provide the minimum
* level of context necessary for the soft interrupt to block and
* be resumed at a later time. The machine-dependent dispatch
* path looks something like the following:
*
* softintr()
* {
* go to IPL_HIGH if necessary for switch;
* save any necessary registers in a format that can be
* restored by cpu_switchto if the softint blocks;
* arrange for cpu_switchto() to restore into the
* trampoline function;
* identify LWP to handle this interrupt;
* switch to the LWP's stack;
* switch register stacks, if necessary;
* assign new value of curlwp;
* call MI softint_dispatch, passing old curlwp and IPL
* to execute interrupt at;
* switch back to old stack;
* switch back to old register stack, if necessary;
* restore curlwp;
* return to interrupted LWP;
* }
*
* If the soft interrupt blocks, a trampoline function is returned
* to in the context of the interrupted LWP, as arranged for by
* softint():
*
* softint_ret()
* {
* unlock soft interrupt LWP;
* resume interrupt processing, likely returning to
* interrupted LWP or dispatching another, different
* interrupt;
* }
*
* Once the soft interrupt has fired (and even if it has blocked),
* no further soft interrupts at that level will be triggered by
* MI code until the soft interrupt handler has ceased execution.
* If a soft interrupt handler blocks and is resumed, it resumes
* execution as a normal LWP (kthread) and gains VM context. Only
* when it has completed and is ready to fire again will it
* interrupt other threads.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_softint.c,v 1.76 2024/03/01 04:32:38 mrg Exp $");
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/intr.h>
#include <sys/ipi.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/evcnt.h>
#include <sys/cpu.h>
#include <sys/xcall.h>
#include <sys/psref.h>
#include <sys/sdt.h>
#include <uvm/uvm_extern.h>
/* This could overlap with signal info in struct lwp. */
typedef struct softint {
SIMPLEQ_HEAD(, softhand) si_q;
struct lwp *si_lwp;
struct cpu_info *si_cpu;
uintptr_t si_machdep;
struct evcnt si_evcnt;
struct evcnt si_evcnt_block;
volatile int si_active;
int si_ipl;
char si_name[8];
char si_name_block[8+6];
} softint_t;
typedef struct softhand {
SIMPLEQ_ENTRY(softhand) sh_q;
void (*sh_func)(void *);
void *sh_arg;
softint_t *sh_isr;
u_int sh_flags;
u_int sh_ipi_id;
} softhand_t;
typedef struct softcpu {
struct cpu_info *sc_cpu;
softint_t sc_int[SOFTINT_COUNT];
softhand_t sc_hand[1];
} softcpu_t;
static void softint_thread(void *);
u_int softint_bytes = 32768;
u_int softint_timing;
static u_int softint_max;
static kmutex_t softint_lock;
SDT_PROBE_DEFINE4(sdt, kernel, softint, establish,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE1(sdt, kernel, softint, disestablish,
"void *"/*sih*/);
SDT_PROBE_DEFINE2(sdt, kernel, softint, schedule,
"void *"/*sih*/,
"struct cpu_info *"/*ci*/);
SDT_PROBE_DEFINE4(sdt, kernel, softint, entry,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
SDT_PROBE_DEFINE4(sdt, kernel, softint, return,
"void *"/*sih*/,
"void (*)(void *)"/*func*/,
"void *"/*arg*/,
"unsigned"/*flags*/);
/*
* softint_init_isr:
*
* Initialize a single interrupt level for a single CPU.
*/
static void
softint_init_isr(softcpu_t *sc, const char *desc, pri_t pri, u_int level,
int ipl)
{
struct cpu_info *ci;
softint_t *si;
int error;
si = &sc->sc_int[level];
ci = sc->sc_cpu;
si->si_cpu = ci;
SIMPLEQ_INIT(&si->si_q);
error = kthread_create(pri, KTHREAD_MPSAFE | KTHREAD_INTR |
KTHREAD_IDLE, ci, softint_thread, si, &si->si_lwp,
"soft%s/%u", desc, ci->ci_index);
if (error != 0)
panic("softint_init_isr: error %d", error);
snprintf(si->si_name, sizeof(si->si_name), "%s/%u", desc,
ci->ci_index);
evcnt_attach_dynamic(&si->si_evcnt, EVCNT_TYPE_MISC, NULL,
"softint", si->si_name);
snprintf(si->si_name_block, sizeof(si->si_name_block), "%s block/%u",
desc, ci->ci_index);
evcnt_attach_dynamic(&si->si_evcnt_block, EVCNT_TYPE_MISC, NULL,
"softint", si->si_name_block);
si->si_ipl = ipl;
si->si_lwp->l_private = si;
softint_init_md(si->si_lwp, level, &si->si_machdep);
}
/*
* softint_init:
*
* Initialize per-CPU data structures. Called from mi_cpu_attach().
*/
void
softint_init(struct cpu_info *ci)
{
static struct cpu_info *first;
softcpu_t *sc, *scfirst;
softhand_t *sh, *shmax;
if (first == NULL) {
/* Boot CPU. */
first = ci;
mutex_init(&softint_lock, MUTEX_DEFAULT, IPL_NONE);
softint_bytes = round_page(softint_bytes);
softint_max = (softint_bytes - sizeof(softcpu_t)) /
sizeof(softhand_t);
}
/* Use uvm_km(9) for persistent, page-aligned allocation. */
sc = (softcpu_t *)uvm_km_alloc(kernel_map, softint_bytes, 0,
UVM_KMF_WIRED | UVM_KMF_ZERO);
if (sc == NULL)
panic("softint_init_cpu: cannot allocate memory");
ci->ci_data.cpu_softcpu = sc;
ci->ci_data.cpu_softints = 0;
sc->sc_cpu = ci;
softint_init_isr(sc, "net", PRI_SOFTNET, SOFTINT_NET,
IPL_SOFTNET);
softint_init_isr(sc, "bio", PRI_SOFTBIO, SOFTINT_BIO,
IPL_SOFTBIO);
softint_init_isr(sc, "clk", PRI_SOFTCLOCK, SOFTINT_CLOCK,
IPL_SOFTCLOCK);
softint_init_isr(sc, "ser", PRI_SOFTSERIAL, SOFTINT_SERIAL,
IPL_SOFTSERIAL);
if (first != ci) {
mutex_enter(&softint_lock);
scfirst = first->ci_data.cpu_softcpu;
sh = sc->sc_hand;
memcpy(sh, scfirst->sc_hand, sizeof(*sh) * softint_max);
/* Update pointers for this CPU. */
for (shmax = sh + softint_max; sh < shmax; sh++) {
if (sh->sh_func == NULL)
continue;
sh->sh_isr =
&sc->sc_int[sh->sh_flags & SOFTINT_LVLMASK];
}
mutex_exit(&softint_lock);
}
}
/*
* softint_establish:
*
* Register a software interrupt handler.
*/
void *
softint_establish(u_int flags, void (*func)(void *), void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
softcpu_t *sc;
softhand_t *sh;
u_int level, index;
u_int ipi_id = 0;
void *sih;
level = (flags & SOFTINT_LVLMASK);
KASSERT(level < SOFTINT_COUNT);
KASSERT((flags & SOFTINT_IMPMASK) == 0);
mutex_enter(&softint_lock);
/* Find a free slot. */
sc = curcpu()->ci_data.cpu_softcpu;
for (index = 1; index < softint_max; index++) {
if (sc->sc_hand[index].sh_func == NULL)
break;
}
if (index == softint_max) {
mutex_exit(&softint_lock);
printf("WARNING: softint_establish: table full, "
"increase softint_bytes\n");
return NULL;
}
sih = (void *)((uint8_t *)&sc->sc_hand[index] - (uint8_t *)sc);
if (flags & SOFTINT_RCPU) {
if ((ipi_id = ipi_register(softint_schedule, sih)) == 0) {
mutex_exit(&softint_lock);
return NULL;
}
}
/* Set up the handler on each CPU. */
if (ncpu < 2) {
/* XXX hack for machines with no CPU_INFO_FOREACH() early on */
sc = curcpu()->ci_data.cpu_softcpu;
sh = &sc->sc_hand[index];
sh->sh_isr = &sc->sc_int[level];
sh->sh_func = func;
sh->sh_arg = arg;
sh->sh_flags = flags;
sh->sh_ipi_id = ipi_id;
} else for (CPU_INFO_FOREACH(cii, ci)) {
sc = ci->ci_data.cpu_softcpu;
sh = &sc->sc_hand[index];
sh->sh_isr = &sc->sc_int[level];
sh->sh_func = func;
sh->sh_arg = arg;
sh->sh_flags = flags;
sh->sh_ipi_id = ipi_id;
}
mutex_exit(&softint_lock);
SDT_PROBE4(sdt, kernel, softint, establish, sih, func, arg, flags);
return sih;
}
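/*
 * Illustrative usage sketch (the handler and argument names are
 * hypothetical): register a handler at the network soft interrupt
 * level, trigger it later with preemption disabled, and tear it down
 * when no longer needed.
 *
 *	void *sih;
 *
 *	sih = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE,
 *	    example_handler, example_arg);
 *	...
 *	softint_schedule(sih);
 *	...
 *	softint_disestablish(sih);
 */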
/*
* softint_disestablish:
*
* Unregister a software interrupt handler. The soft interrupt could
* still be active at this point, but the caller commits not to try
* and trigger it again once this call is made. The caller must not
* hold any locks that could be taken from soft interrupt context,
* because we will wait for the softint to complete if it's still
* running.
*/
void
softint_disestablish(void *arg)
{
CPU_INFO_ITERATOR cii;
struct cpu_info *ci;
softcpu_t *sc;
softhand_t *sh;
uintptr_t offset;
offset = (uintptr_t)arg;
KASSERT(offset != 0);
KASSERTMSG(offset < softint_bytes, "%"PRIuPTR" %u",
offset, softint_bytes);
/*
* Unregister IPI handler if there is any. Note: there is no need
* to disable preemption here - ID is stable.
*/
sc = curcpu()->ci_data.cpu_softcpu;
sh = (softhand_t *)((uint8_t *)sc + offset);
if (sh->sh_ipi_id) {
ipi_unregister(sh->sh_ipi_id);
}
/*
* Run a dummy softint at the same level on all CPUs and wait for
* completion, to make sure this softint is no longer running
* anywhere.
*/
xc_barrier(XC_HIGHPRI_IPL(sh->sh_isr->si_ipl));
/*
* Notify dtrace probe when the old softint can't be running
* any more, but before it can be recycled for a new softint.
*/
SDT_PROBE1(sdt, kernel, softint, disestablish, arg);
/* Clear the handler on each CPU. */
mutex_enter(&softint_lock);
for (CPU_INFO_FOREACH(cii, ci)) {
sc = ci->ci_data.cpu_softcpu;
sh = (softhand_t *)((uint8_t *)sc + offset);
KASSERT(sh->sh_func != NULL);
sh->sh_func = NULL;
}
mutex_exit(&softint_lock);
}
/*
* softint_schedule:
*
* Trigger a software interrupt. Must be called from a hardware
* interrupt handler, or with preemption disabled (since we are
* using the value of curcpu()).
*/
void
softint_schedule(void *arg)
{
softhand_t *sh;
softint_t *si;
uintptr_t offset;
int s;
SDT_PROBE2(sdt, kernel, softint, schedule, arg, /*ci*/NULL);
/*
* If this assert fires, rather than disabling preemption explicitly
* to make it stop, consider that you are probably using a softint
* when you don't need to.
*/
KASSERT(kpreempt_disabled());
/* Find the handler record for this CPU. */
offset = (uintptr_t)arg;
KASSERT(offset != 0);
KASSERTMSG(offset < softint_bytes, "%"PRIuPTR" %u",
offset, softint_bytes);
sh = (softhand_t *)((uint8_t *)curcpu()->ci_data.cpu_softcpu + offset);
/* If it's already pending there's nothing to do. */
if ((sh->sh_flags & SOFTINT_PENDING) != 0) {
return;
}
/*
* Enqueue the handler into the LWP's pending list.
* If the LWP is completely idle, then make it run.
*/
s = splhigh();
if ((sh->sh_flags & SOFTINT_PENDING) == 0) {
si = sh->sh_isr;
sh->sh_flags |= SOFTINT_PENDING;
SIMPLEQ_INSERT_TAIL(&si->si_q, sh, sh_q);
if (si->si_active == 0) {
si->si_active = 1;
softint_trigger(si->si_machdep);
}
}
splx(s);
}
/*
* softint_schedule_cpu:
*
* Trigger a software interrupt on a target CPU. This invokes
* softint_schedule() for the local CPU or sends an IPI to invoke
* this routine on the remote CPU. Preemption must be disabled.
*/
void
softint_schedule_cpu(void *arg, struct cpu_info *ci)
{
KASSERT(kpreempt_disabled());
if (curcpu() != ci) {
const softcpu_t *sc = ci->ci_data.cpu_softcpu;
const uintptr_t offset = (uintptr_t)arg;
const softhand_t *sh;
SDT_PROBE2(sdt, kernel, softint, schedule, arg, ci);
sh = (const softhand_t *)((const uint8_t *)sc + offset);
KASSERT((sh->sh_flags & SOFTINT_RCPU) != 0);
ipi_trigger(sh->sh_ipi_id, ci);
return;
}
/* Just a local CPU. */
softint_schedule(arg);
}
/*
* softint_execute:
*
* Invoke handlers for the specified soft interrupt.
* Must be entered at splhigh. Will drop the priority
* to the level specified, but returns back at splhigh.
*/
static inline void
softint_execute(lwp_t *l, int s)
{
softint_t *si = l->l_private;
softhand_t *sh;
KASSERT(si->si_lwp == curlwp);
KASSERT(si->si_cpu == curcpu());
KASSERT(si->si_lwp->l_wchan == NULL);
KASSERT(si->si_active);
KASSERTMSG(l->l_nopreempt == 0, "lwp %p nopreempt %d",
l, l->l_nopreempt);
/*
* Note: due to priority inheritance we may have interrupted a
* higher priority LWP. Since the soft interrupt must be quick
* and is non-preemptable, we don't bother yielding.
*/
while (!SIMPLEQ_EMPTY(&si->si_q)) {
/*
* Pick the longest waiting handler to run. We block
* interrupts but do not lock in order to do this, as
* we are protecting against the local CPU only.
*/
sh = SIMPLEQ_FIRST(&si->si_q);
SIMPLEQ_REMOVE_HEAD(&si->si_q, sh_q);
KASSERT((sh->sh_flags & SOFTINT_PENDING) != 0);
sh->sh_flags ^= SOFTINT_PENDING;
splx(s);
/* Run the handler. */
SDT_PROBE4(sdt, kernel, softint, entry,
((const char *)sh -
(const char *)curcpu()->ci_data.cpu_softcpu),
sh->sh_func, sh->sh_arg, sh->sh_flags);
if (__predict_true((sh->sh_flags & SOFTINT_MPSAFE) != 0)) {
(*sh->sh_func)(sh->sh_arg);
} else {
KERNEL_LOCK(1, l);
(*sh->sh_func)(sh->sh_arg);
KERNEL_UNLOCK_ONE(l);
}
SDT_PROBE4(sdt, kernel, softint, return,
((const char *)sh -
(const char *)curcpu()->ci_data.cpu_softcpu),
sh->sh_func, sh->sh_arg, sh->sh_flags);
/* Diagnostic: check that spin-locks have not leaked. */
KASSERTMSG(curcpu()->ci_mtx_count == 0,
"%s: ci_mtx_count (%d) != 0, sh_func %p\n",
__func__, curcpu()->ci_mtx_count, sh->sh_func);
/* Diagnostic: check that psrefs have not leaked. */
KASSERTMSG(l->l_psrefs == 0, "%s: l_psrefs=%d, sh_func=%p\n",
__func__, l->l_psrefs, sh->sh_func);
/* Diagnostic: check that biglocks have not leaked. */
KASSERTMSG(l->l_blcnt == 0,
"%s: sh_func=%p leaked %d biglocks",
__func__, sh->sh_func, curlwp->l_blcnt);
/* Diagnostic: check that LWP nopreempt remains zero. */
KASSERTMSG(l->l_nopreempt == 0,
"%s: lwp %p nopreempt %d func %p",
__func__, l, l->l_nopreempt, sh->sh_func);
(void)splhigh();
}
PSREF_DEBUG_BARRIER();
CPU_COUNT(CPU_COUNT_NSOFT, 1);
KASSERT(si->si_cpu == curcpu());
KASSERT(si->si_lwp->l_wchan == NULL);
KASSERT(si->si_active);
si->si_evcnt.ev_count++;
si->si_active = 0;
}
/*
* softint_block:
*
* Update statistics when the soft interrupt blocks.
*/
void
softint_block(lwp_t *l)
{
softint_t *si = l->l_private;
KASSERT((l->l_pflag & LP_INTR) != 0);
si->si_evcnt_block.ev_count++;
}
#ifndef __HAVE_FAST_SOFTINTS
#ifdef __HAVE_PREEMPTION
#error __HAVE_PREEMPTION requires __HAVE_FAST_SOFTINTS
#endif
/*
* softint_init_md:
*
* Slow path: perform machine-dependent initialization.
*/
void
softint_init_md(lwp_t *l, u_int level, uintptr_t *machdep)
{
struct proc *p;
softint_t *si;
*machdep = (1 << level);
si = l->l_private;
p = l->l_proc;
mutex_enter(p->p_lock);
lwp_lock(l);
/* Cheat and make the KASSERT in softint_thread() happy. */
si->si_active = 1;
setrunnable(l);
/* LWP now unlocked */
mutex_exit(p->p_lock);
}
/*
* softint_trigger:
*
* Slow path: cause a soft interrupt handler to begin executing.
* Called at IPL_HIGH.
*/
void
softint_trigger(uintptr_t machdep)
{
struct cpu_info *ci;
lwp_t *l;
ci = curcpu();
ci->ci_data.cpu_softints |= machdep;
l = ci->ci_onproc;
/*
* Arrange for mi_switch() to be called. If called from interrupt
* mode, we don't know if curlwp is executing in kernel or user, so
* post an AST and have it take a trip through userret(). If not in
* interrupt mode, curlwp is running in kernel and will notice the
* resched soon enough; avoid the AST.
*/
if (l == ci->ci_data.cpu_idlelwp) {
atomic_or_uint(&ci->ci_want_resched,
RESCHED_IDLE | RESCHED_UPREEMPT);
} else {
atomic_or_uint(&ci->ci_want_resched, RESCHED_UPREEMPT);
if (cpu_intr_p()) {
cpu_signotify(l);
}
}
}
/*
* softint_thread:
*
* Slow path: MI software interrupt dispatch.
*/
void
softint_thread(void *cookie)
{
softint_t *si;
lwp_t *l;
int s;
l = curlwp;
si = l->l_private;
for (;;) {
/* Clear pending status and run it. */
s = splhigh();
l->l_cpu->ci_data.cpu_softints &= ~si->si_machdep;
softint_execute(l, s);
splx(s);
/* Interrupts allowed to run again before switching. */
lwp_lock(l);
l->l_stat = LSIDL;
spc_lock(l->l_cpu);
mi_switch(l);
}
}
/*
* softint_picklwp:
*
* Slow path: called from mi_switch() to pick the highest priority
* soft interrupt LWP that needs to run.
*/
lwp_t *
softint_picklwp(void)
{
struct cpu_info *ci;
u_int mask;
softint_t *si;
lwp_t *l;
ci = curcpu();
si = ((softcpu_t *)ci->ci_data.cpu_softcpu)->sc_int;
mask = ci->ci_data.cpu_softints;
if ((mask & (1 << SOFTINT_SERIAL)) != 0) {
l = si[SOFTINT_SERIAL].si_lwp;
} else if ((mask & (1 << SOFTINT_NET)) != 0) {
l = si[SOFTINT_NET].si_lwp;
} else if ((mask & (1 << SOFTINT_BIO)) != 0) {
l = si[SOFTINT_BIO].si_lwp;
} else if ((mask & (1 << SOFTINT_CLOCK)) != 0) {
l = si[SOFTINT_CLOCK].si_lwp;
} else {
panic("softint_picklwp");
}
return l;
}
#else /* !__HAVE_FAST_SOFTINTS */
/*
* softint_thread:
*
* Fast path: the LWP is switched to without restoring any state,
* so we should not arrive here - there is a direct handoff between
* the interrupt stub and softint_dispatch().
*/
void
softint_thread(void *cookie)
{
panic("softint_thread");
}
/*
* softint_dispatch:
*
* Fast path: entry point from machine-dependent code.
*/
void
softint_dispatch(lwp_t *pinned, int s)
{
struct bintime now;
u_int timing;
lwp_t *l;
#ifdef DIAGNOSTIC
if ((pinned->l_pflag & LP_RUNNING) == 0 || curlwp->l_stat != LSIDL) {
struct lwp *onproc = curcpu()->ci_onproc;
int s2 = splhigh();
printf("curcpu=%d, spl=%d curspl=%d\n"
"onproc=%p => l_stat=%d l_flag=%08x l_cpu=%d\n"
"curlwp=%p => l_stat=%d l_flag=%08x l_cpu=%d\n"
"pinned=%p => l_stat=%d l_flag=%08x l_cpu=%d\n",
cpu_index(curcpu()), s, s2, onproc, onproc->l_stat,
onproc->l_flag, cpu_index(onproc->l_cpu), curlwp,
curlwp->l_stat, curlwp->l_flag,
cpu_index(curlwp->l_cpu), pinned, pinned->l_stat,
pinned->l_flag, cpu_index(pinned->l_cpu));
splx(s2);
panic("softint screwup");
}
#endif
/*
* Note the interrupted LWP, and mark the current LWP as running
* before proceeding. Although this must as a rule be done with
* the LWP locked, at this point no external agents will want to
* modify the interrupt LWP's state.
*/
timing = softint_timing;
l = curlwp;
l->l_switchto = pinned;
l->l_stat = LSONPROC;
/*
* Dispatch the interrupt. If softints are being timed, charge
* for it.
*/
if (timing) {
binuptime(&l->l_stime);
membar_producer(); /* for calcru */
l->l_pflag |= LP_TIMEINTR;
}
l->l_pflag |= LP_RUNNING;
softint_execute(l, s);
if (timing) {
binuptime(&now);
updatertime(l, &now);
l->l_pflag &= ~LP_TIMEINTR;
}
/*
* If we blocked while handling the interrupt, the pinned LWP is
* gone and we are now running as a kthread, so find another LWP to
* run. softint_dispatch() won't be reentered until the priority is
* finally dropped to IPL_NONE on entry to the next LWP on this CPU.
*/
l->l_stat = LSIDL;
if (l->l_switchto == NULL) {
lwp_lock(l);
spc_lock(l->l_cpu);
mi_switch(l);
/* NOTREACHED */
}
l->l_switchto = NULL;
l->l_pflag &= ~LP_RUNNING;
}
#endif /* !__HAVE_FAST_SOFTINTS */
/* $NetBSD: kern_cpu.c,v 1.97 2023/09/02 17:44:59 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008, 2009, 2010, 2012, 2019 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c)2007 YAMAMOTO Takashi,
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* CPU related routines not shared with rump.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_cpu.c,v 1.97 2023/09/02 17:44:59 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_cpu_ucode.h"
#include "opt_heartbeat.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/idle.h>
#include <sys/sched.h>
#include <sys/intr.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/cpuio.h>
#include <sys/proc.h>
#include <sys/percpu.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/xcall.h>
#include <sys/pool.h>
#include <sys/kmem.h>
#include <sys/select.h>
#include <sys/namei.h>
#include <sys/callout.h>
#include <sys/pcu.h>
#include <sys/heartbeat.h>
#include <uvm/uvm_extern.h>
#include "ioconf.h"
/*
* If the port has stated that cpu_data is the first thing in cpu_info,
* verify that the claim is true. This will prevent them from getting out
* of sync.
*/
#ifdef __HAVE_CPU_DATA_FIRST
CTASSERT(offsetof(struct cpu_info, ci_data) == 0);
#else
CTASSERT(offsetof(struct cpu_info, ci_data) != 0);
#endif
int (*compat_cpuctl_ioctl)(struct lwp *, u_long, void *) = (void *)enosys;
static void cpu_xc_online(struct cpu_info *, void *);
static void cpu_xc_offline(struct cpu_info *, void *);
dev_type_ioctl(cpuctl_ioctl);
const struct cdevsw cpuctl_cdevsw = {
.d_open = nullopen,
.d_close = nullclose,
.d_read = nullread,
.d_write = nullwrite,
.d_ioctl = cpuctl_ioctl,
.d_stop = nullstop,
.d_tty = notty,
.d_poll = nopoll,
.d_mmap = nommap,
.d_kqfilter = nokqfilter,
.d_discard = nodiscard,
.d_flag = D_OTHER | D_MPSAFE
};
int
mi_cpu_attach(struct cpu_info *ci)
{
int error;
KASSERT(maxcpus > 0);
if ((ci->ci_index = ncpu) >= maxcpus)
panic("Too many CPUs. Increase MAXCPUS?");
kcpuset_set(kcpuset_attached, cpu_index(ci));
/*
* Create a convenience cpuset of just ourselves.
*/
kcpuset_create(&ci->ci_kcpuset, true);
kcpuset_set(ci->ci_kcpuset, cpu_index(ci));
TAILQ_INIT(&ci->ci_data.cpu_ld_locks);
__cpu_simple_lock_init(&ci->ci_data.cpu_ld_lock);
/* This is useful for eg, per-cpu evcnt */
snprintf(ci->ci_data.cpu_name, sizeof(ci->ci_data.cpu_name), "cpu%d",
cpu_index(ci));
if (__predict_false(cpu_infos == NULL)) {
size_t ci_bufsize = (maxcpus + 1) * sizeof(struct cpu_info *);
cpu_infos = kmem_zalloc(ci_bufsize, KM_SLEEP);
}
cpu_infos[cpu_index(ci)] = ci;
sched_cpuattach(ci);
error = create_idle_lwp(ci);
if (error != 0) {
/* XXX revert sched_cpuattach */
return error;
}
if (ci == curcpu())
ci->ci_onproc = curlwp;
else
ci->ci_onproc = ci->ci_data.cpu_idlelwp;
percpu_init_cpu(ci);
softint_init(ci);
callout_init_cpu(ci);
xc_init_cpu(ci);
pool_cache_cpu_init(ci);
selsysinit(ci);
cache_cpu_init(ci);
TAILQ_INIT(&ci->ci_data.cpu_biodone);
ncpu++;
ncpuonline++;
return 0;
}
void
cpuctlattach(int dummy __unused)
{
KASSERT(cpu_infos != NULL);
}
int
cpuctl_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l)
{
CPU_INFO_ITERATOR cii;
cpustate_t *cs;
struct cpu_info *ci;
int error, i;
u_int id;
error = 0;
mutex_enter(&cpu_lock);
switch (cmd) {
case IOC_CPU_SETSTATE:
cs = data;
error = kauth_authorize_system(l->l_cred,
KAUTH_SYSTEM_CPU, KAUTH_REQ_SYSTEM_CPU_SETSTATE, cs, NULL,
NULL);
if (error != 0)
break;
if (cs->cs_id >= maxcpus ||
(ci = cpu_lookup(cs->cs_id)) == NULL) {
error = ESRCH;
break;
}
cpu_setintr(ci, cs->cs_intr); /* XXX neglect errors */
error = cpu_setstate(ci, cs->cs_online);
break;
case IOC_CPU_GETSTATE:
cs = data;
id = cs->cs_id;
memset(cs, 0, sizeof(*cs));
cs->cs_id = id;
if (cs->cs_id >= maxcpus ||
(ci = cpu_lookup(id)) == NULL) {
error = ESRCH;
break;
}
if ((ci->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0)
cs->cs_online = false;
else
cs->cs_online = true;
if ((ci->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
cs->cs_intr = false;
else
cs->cs_intr = true;
cs->cs_lastmod = (int32_t)ci->ci_schedstate.spc_lastmod;
cs->cs_lastmodhi = (int32_t)
(ci->ci_schedstate.spc_lastmod >> 32);
cs->cs_intrcnt = cpu_intr_count(ci) + 1;
cs->cs_hwid = ci->ci_cpuid;
break;
case IOC_CPU_MAPID:
i = 0;
for (CPU_INFO_FOREACH(cii, ci)) {
if (i++ == *(int *)data)
break;
}
if (ci == NULL)
error = ESRCH;
else
*(int *)data = cpu_index(ci);
break;
case IOC_CPU_GETCOUNT:
*(int *)data = ncpu;
break;
#ifdef CPU_UCODE
case IOC_CPU_UCODE_GET_VERSION:
error = cpu_ucode_get_version((struct cpu_ucode_version *)data);
break;
case IOC_CPU_UCODE_APPLY:
error = kauth_authorize_machdep(l->l_cred,
KAUTH_MACHDEP_CPU_UCODE_APPLY,
NULL, NULL, NULL, NULL);
if (error != 0)
break;
error = cpu_ucode_apply((const struct cpu_ucode *)data);
break;
#endif
default:
error = (*compat_cpuctl_ioctl)(l, cmd, data);
break;
}
mutex_exit(&cpu_lock);
return error;
}
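/*
 * For illustration (editorial note): these ioctls back the cpuctl(8)
 * utility, e.g. "cpuctl offline 1" results in an IOC_CPU_SETSTATE
 * call with cs_online set to false for CPU index 1, and "cpuctl list"
 * is built from IOC_CPU_GETCOUNT/IOC_CPU_GETSTATE.
 */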
struct cpu_info *
cpu_lookup(u_int idx)
{
struct cpu_info *ci;
/*
* cpu_infos is a NULL terminated array of MAXCPUS + 1 entries,
* so an index of MAXCPUS here is ok. See mi_cpu_attach.
*/
KASSERT(idx <= maxcpus);
if (__predict_false(cpu_infos == NULL)) {
KASSERT(idx == 0);
return curcpu();
}
ci = cpu_infos[idx];
KASSERT(ci == NULL || cpu_index(ci) == idx);
KASSERTMSG(idx < maxcpus || ci == NULL, "idx %d ci %p", idx, ci);
return ci;
}
static void
cpu_xc_offline(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc, *mspc = NULL;
struct cpu_info *target_ci;
struct lwp *l;
CPU_INFO_ITERATOR cii;
int s;
/*
* Thread that made the cross call (separate context) holds
* cpu_lock on our behalf.
*/
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags |= SPCF_OFFLINE;
splx(s);
/* Take the first available CPU for the migration. */
for (CPU_INFO_FOREACH(cii, target_ci)) {
mspc = &target_ci->ci_schedstate;
if ((mspc->spc_flags & SPCF_OFFLINE) == 0)
break;
}
KASSERT(target_ci != NULL);
/*
* Migrate all non-bound threads to the other CPU. Note that this
* runs from the xcall thread, thus handling of LSONPROC is not needed.
*/
mutex_enter(&proc_lock);
LIST_FOREACH(l, &alllwp, l_list) {
struct cpu_info *mci;
lwp_lock(l);
if (l->l_cpu != ci || (l->l_pflag & (LP_BOUND | LP_INTR))) {
lwp_unlock(l);
continue;
}
/* Regular case - no affinity. */
if (l->l_affinity == NULL) {
lwp_migrate(l, target_ci);
continue;
}
/* Affinity is set, find an online CPU in the set. */
for (CPU_INFO_FOREACH(cii, mci)) {
mspc = &mci->ci_schedstate;
if ((mspc->spc_flags & SPCF_OFFLINE) == 0 &&
kcpuset_isset(l->l_affinity, cpu_index(mci)))
break;
}
if (mci == NULL) {
lwp_unlock(l);
mutex_exit(&proc_lock);
goto fail;
}
lwp_migrate(l, mci);
}
mutex_exit(&proc_lock);
#if PCU_UNIT_COUNT > 0
pcu_save_all_on_cpu();
#endif
heartbeat_suspend();
#ifdef __HAVE_MD_CPU_OFFLINE
cpu_offline_md();
#endif
return;
fail:
/* Just unset the SPCF_OFFLINE flag, caller will check */
s = splsched();
spc->spc_flags &= ~SPCF_OFFLINE;
splx(s);
}
static void
cpu_xc_online(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
heartbeat_resume();
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags &= ~SPCF_OFFLINE;
splx(s);
}
int
cpu_setstate(struct cpu_info *ci, bool online)
{
struct schedstate_percpu *spc;
CPU_INFO_ITERATOR cii;
struct cpu_info *ci2;
uint64_t where;
xcfunc_t func;
int nonline;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(&cpu_lock));
if (online) {
if ((spc->spc_flags & SPCF_OFFLINE) == 0)
return 0;
func = (xcfunc_t)cpu_xc_online;
} else {
if ((spc->spc_flags & SPCF_OFFLINE) != 0)
return 0;
nonline = 0;
/*
* Ensure that at least one CPU within the processor set
* stays online. Revisit this later.
*/
for (CPU_INFO_FOREACH(cii, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_OFFLINE) != 0)
continue;
if (ci2->ci_schedstate.spc_psid != spc->spc_psid)
continue;
nonline++;
}
if (nonline == 1)
return EBUSY;
func = (xcfunc_t)cpu_xc_offline;
}
where = xc_unicast(0, func, ci, NULL, ci);
xc_wait(where);
if (online) {
KASSERT((spc->spc_flags & SPCF_OFFLINE) == 0);
ncpuonline++;
} else {
if ((spc->spc_flags & SPCF_OFFLINE) == 0) {
/* If it was not set offline, then it is busy */
return EBUSY;
}
ncpuonline--;
}
spc->spc_lastmod = time_second;
return 0;
}
#if defined(__HAVE_INTR_CONTROL)
static void
cpu_xc_intr(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags &= ~SPCF_NOINTR;
splx(s);
}
static void
cpu_xc_nointr(struct cpu_info *ci, void *unused)
{
struct schedstate_percpu *spc;
int s;
spc = &ci->ci_schedstate;
s = splsched();
spc->spc_flags |= SPCF_NOINTR;
splx(s);
}
int
cpu_setintr(struct cpu_info *ci, bool intr)
{
struct schedstate_percpu *spc;
CPU_INFO_ITERATOR cii;
struct cpu_info *ci2;
uint64_t where;
xcfunc_t func;
int nintr;
spc = &ci->ci_schedstate;
KASSERT(mutex_owned(&cpu_lock));
if (intr) {
if ((spc->spc_flags & SPCF_NOINTR) == 0)
return 0;
func = (xcfunc_t)cpu_xc_intr;
} else {
if (CPU_IS_PRIMARY(ci)) /* XXX kern/45117 */
return EINVAL;
if ((spc->spc_flags & SPCF_NOINTR) != 0)
return 0;
/*
* Ensure that at least one CPU within the system
* is handling device interrupts.
*/
nintr = 0;
for (CPU_INFO_FOREACH(cii, ci2)) {
if ((ci2->ci_schedstate.spc_flags & SPCF_NOINTR) != 0)
continue;
if (ci2 == ci)
continue;
nintr++;
}
if (nintr == 0)
return EBUSY;
func = (xcfunc_t)cpu_xc_nointr;
}
where = xc_unicast(0, func, ci, NULL, ci);
xc_wait(where);
if (intr) {
KASSERT((spc->spc_flags & SPCF_NOINTR) == 0);
} else if ((spc->spc_flags & SPCF_NOINTR) == 0) {
/* If SPCF_NOINTR was not set, then the CPU is busy */
return EBUSY;
}
/* Direct interrupts away from the CPU and record the change. */
cpu_intr_redistribute();
spc->spc_lastmod = time_second;
return 0;
}
#else /* __HAVE_INTR_CONTROL */
int
cpu_setintr(struct cpu_info *ci, bool intr)
{
return EOPNOTSUPP;
}
u_int
cpu_intr_count(struct cpu_info *ci)
{
return 0; /* 0 == "don't know" */
}
#endif /* __HAVE_INTR_CONTROL */
#ifdef CPU_UCODE
int
cpu_ucode_load(struct cpu_ucode_softc *sc, const char *fwname)
{
firmware_handle_t fwh;
int error;
if (sc->sc_blob != NULL) {
firmware_free(sc->sc_blob, sc->sc_blobsize);
sc->sc_blob = NULL;
sc->sc_blobsize = 0;
}
error = cpu_ucode_md_open(&fwh, sc->loader_version, fwname);
if (error != 0) {
#ifdef DEBUG
printf("ucode: firmware_open(%s) failed: %i\n", fwname, error);
#endif
goto err0;
}
sc->sc_blobsize = firmware_get_size(fwh);
if (sc->sc_blobsize == 0) {
error = EFTYPE;
firmware_close(fwh);
goto err0;
}
sc->sc_blob = firmware_malloc(sc->sc_blobsize);
if (sc->sc_blob == NULL) {
error = ENOMEM;
firmware_close(fwh);
goto err0;
}
error = firmware_read(fwh, 0, sc->sc_blob, sc->sc_blobsize);
firmware_close(fwh);
if (error != 0)
goto err1;
return 0;
err1:
firmware_free(sc->sc_blob, sc->sc_blobsize);
sc->sc_blob = NULL;
sc->sc_blobsize = 0;
err0:
return error;
}
#endif
/* $NetBSD: tty.c,v 1.312 2023/12/07 09:00:32 pgoyette Exp $ */
/*-
* Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1982, 1986, 1990, 1991, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tty.c 8.13 (Berkeley) 1/9/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tty.c,v 1.312 2023/12/07 09:00:32 pgoyette Exp $");
#ifdef _KERNEL_OPT
#include "opt_compat_netbsd.h"
#endif
#define TTY_ALLOW_PRIVATE
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/ioctl.h>
#include <sys/proc.h>
#define TTYDEFCHARS
#include <sys/tty.h>
#undef TTYDEFCHARS
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/cpu.h>
#include <sys/dkstat.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
#include <sys/kmem.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/poll.h>
#include <sys/kprintf.h>
#include <sys/namei.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/intr.h>
#include <sys/ioctl_compat.h>
#include <sys/module.h>
#include <sys/bitops.h>
#include <sys/compat_stub.h>
#include <sys/atomic.h>
#include <sys/condvar.h>
#include <sys/pserialize.h>
static int ttnread(struct tty *);
static void ttyblock(struct tty *);
static void ttyecho(int, struct tty *);
static void ttyrubo(struct tty *, int);
static void ttyprintf_nolock(struct tty *, const char *fmt, ...)
__printflike(2, 3);
static int proc_compare_wrapper(struct proc *, struct proc *);
static void ttysigintr(void *);
/* Symbolic sleep message strings. */
const char ttclos[] = "ttycls";
const char ttopen[] = "ttyopn";
const char ttybg[] = "ttybg";
const char ttyin[] = "ttyin";
const char ttyout[] = "ttyout";
/*
* Used to determine whether we still have a connection. This is true in
* one of 3 cases:
* 1) We have carrier.
* 2) It's a locally attached terminal, and we are therefore ignoring carrier.
* 3) We're using a flow control mechanism that overloads the carrier signal.
*/
#define CONNECTED(tp) (ISSET(tp->t_state, TS_CARR_ON) || \
ISSET(tp->t_cflag, CLOCAL | MDMBUF))
/*
* Table with character classes and parity. The 8th bit indicates parity,
* the 7th bit indicates the character is an alphameric or underscore (for
* ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits
* are 0 then the character needs no special processing on output; classes
* other than 0 might be translated or (not currently) require delays.
*/
#define E 0x00 /* Even parity. */
#define O 0x80 /* Odd parity. */
#define PARITY(c) (char_type[c] & O)
#define ALPHA 0x40 /* Alpha or underscore. */
#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA)
#define CCLASSMASK 0x3f
#define CCLASS(c) (char_type[c] & CCLASSMASK)
#define BS BACKSPACE
#define CC CONTROL
#define CR RETURN
#define NA ORDINARY | ALPHA
#define NL NEWLINE
#define NO ORDINARY
#define TB TAB
#define VT VTAB
unsigned char const char_type[] = {
E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */
O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */
O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */
E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */
O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */
E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */
O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */
O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */
E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */
O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */
E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */
O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */
O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */
E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */
/*
* Meta chars; should be settable per character set;
* for now, treat them all as normal characters.
*/
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA,
};
#undef BS
#undef CC
#undef CR
#undef NA
#undef NL
#undef NO
#undef TB
#undef VT
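/*
 * Worked example (editorial note): 'A' is 0x41 = 0b1000001, which has
 * an even number of set bits, so its table entry above is E|NA (even
 * parity, alphanumeric); '\t' (0x09) likewise has an even number of
 * set bits and is classed E|TB, i.e. the TAB delay class.
 */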
static struct ttylist_head tty_sigqueue = TAILQ_HEAD_INITIALIZER(tty_sigqueue);
static void *tty_sigsih;
struct ttylist_head ttylist = TAILQ_HEAD_INITIALIZER(ttylist);
int tty_count;
kmutex_t tty_lock;
kmutex_t constty_lock;
static struct pserialize *constty_psz;
static kcondvar_t ttyref_cv;
struct ptm_pty *ptm = NULL;
uint64_t tk_cancc;
uint64_t tk_nin;
uint64_t tk_nout;
uint64_t tk_rawcc;
static kauth_listener_t tty_listener;
#define TTY_MINQSIZE 0x00400
#define TTY_MAXQSIZE 0x10000
int tty_qsize = TTY_MINQSIZE;
static int
tty_get_qsize(int *qsize, int newsize)
{
if (newsize <= 0)
return EINVAL;
newsize = 1 << ilog2(newsize); /* Make it a power of two */
if (newsize < TTY_MINQSIZE || newsize > TTY_MAXQSIZE)
return EINVAL;
*qsize = newsize;
return 0;
}
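/*
 * Example (editorial note): a request of 3000 is rounded down to
 * 1 << ilog2(3000) == 2048; anything that rounds below TTY_MINQSIZE
 * (1024) or lies above TTY_MAXQSIZE (65536) is rejected with EINVAL.
 */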
static int
tty_set_qsize(struct tty *tp, int newsize)
{
struct clist rawq, canq, outq;
struct clist orawq, ocanq, ooutq;
clalloc(&rawq, newsize, 1);
clalloc(&canq, newsize, 1);
clalloc(&outq, newsize, 0);
mutex_spin_enter(&tty_lock);
if (tp->t_outq.c_cc != 0) {
mutex_spin_exit(&tty_lock);
clfree(&rawq);
clfree(&canq);
clfree(&outq);
return EBUSY;
}
orawq = tp->t_rawq;
ocanq = tp->t_canq;
ooutq = tp->t_outq;
tp->t_qsize = newsize;
tp->t_rawq = rawq;
tp->t_canq = canq;
tp->t_outq = outq;
ttsetwater(tp);
mutex_spin_exit(&tty_lock);
clfree(&orawq);
clfree(&ocanq);
clfree(&ooutq);
return 0;
}
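/*
 * Editorial note on the pattern above: the new clists are allocated
 * before taking tty_lock, swapped with the old ones under the lock,
 * and the old clists are freed only after the lock is dropped, so no
 * allocation or free is done while holding the spin mutex.
 */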
static int
sysctl_kern_tty_qsize(SYSCTLFN_ARGS)
{
int newsize;
int error;
struct sysctlnode node;
node = *rnode;
node.sysctl_data = &newsize;
newsize = tty_qsize;
error = sysctl_lookup(SYSCTLFN_CALL(&node));
if (error || newp == NULL)
return error;
return tty_get_qsize(&tty_qsize, newsize);
}
static void
sysctl_kern_tty_setup(void)
{
const struct sysctlnode *rnode, *cnode;
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "tkstat",
SYSCTL_DESCR("Number of characters sent and received "
"on ttys"),
NULL, 0, NULL, 0,
CTL_KERN, KERN_TKSTAT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "nin",
SYSCTL_DESCR("Total number of tty input characters"),
NULL, 0, &tk_nin, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_NIN, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "nout",
SYSCTL_DESCR("Total number of tty output characters"),
NULL, 0, &tk_nout, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_NOUT, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "cancc",
SYSCTL_DESCR("Number of canonical tty input characters"),
NULL, 0, &tk_cancc, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_CANCC, CTL_EOL);
sysctl_createv(NULL, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_QUAD, "rawcc",
SYSCTL_DESCR("Number of raw tty input characters"),
NULL, 0, &tk_rawcc, 0,
CTL_KERN, KERN_TKSTAT, KERN_TKSTAT_RAWCC, CTL_EOL);
sysctl_createv(NULL, 0, NULL, &rnode,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "tty", NULL,
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
sysctl_createv(NULL, 0, &rnode, &cnode,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_INT, "qsize",
SYSCTL_DESCR("TTY input and output queue size"),
sysctl_kern_tty_qsize, 0, &tty_qsize, 0,
CTL_CREATE, CTL_EOL);
}
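/*
 * For illustration (editorial note): the node created above should be
 * reachable from userland as kern.tty.qsize, e.g.
 *
 *	sysctl -w kern.tty.qsize=8192
 *
 * with the new value passed through tty_get_qsize() and therefore
 * rounded to a power of two.
 */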
/*
* ttylock(tp), ttyunlock(tp), ttylocked(tp)
*
* Exclusive lock on tty. Currently a single global lock.
*
* ttylocked is for positive DIAGNOSTIC assertions only.
*/
void
ttylock(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
}
void
ttyunlock(struct tty *tp)
{
mutex_spin_exit(&tty_lock);
}
bool
ttylocked(struct tty *tp)
{
return mutex_owned(&tty_lock);
}
int
ttyopen(struct tty *tp, int dialout, int nonblock)
{
int error;
error = 0;
mutex_spin_enter(&tty_lock);
if (dialout) {
/*
* If the device is already open for non-dialout, fail.
* Otherwise, set TS_DIALOUT to block any pending non-dialout
* opens.
*/
if (ISSET(tp->t_state, TS_ISOPEN) &&
!ISSET(tp->t_state, TS_DIALOUT)) {
error = EBUSY;
goto out;
}
SET(tp->t_state, TS_DIALOUT);
} else {
if (!nonblock) {
/*
* Wait for carrier. Also wait for any dialout
* processes to close the tty first.
*/
while (ISSET(tp->t_state, TS_DIALOUT) || !CONNECTED(tp)) {
tp->t_wopen++;
error = ttysleep(tp, &tp->t_rawcv, true, 0);
tp->t_wopen--;
if (error)
goto out;
}
} else {
/*
* Don't allow a non-blocking non-dialout open if the
* device is already open for dialout.
*/
if (ISSET(tp->t_state, TS_DIALOUT)) {
error = EBUSY;
goto out;
}
}
}
out:
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Initial open of tty, or (re)entry to standard tty line discipline.
*/
int
ttylopen(dev_t device, struct tty *tp)
{
mutex_spin_enter(&tty_lock);
tp->t_dev = device;
if (!ISSET(tp->t_state, TS_ISOPEN)) {
SET(tp->t_state, TS_ISOPEN);
memset(&tp->t_winsize, 0, sizeof(tp->t_winsize));
tp->t_flags = 0;
}
mutex_spin_exit(&tty_lock);
if (tp->t_qsize != tty_qsize)
tty_set_qsize(tp, tty_qsize);
return (0);
}
/*
* Interrupt any pending I/O and make it fail. Used before close to
* interrupt pending open/read/write/&c. and make it fail promptly.
*/
void
ttycancel(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
tp->t_state |= TS_CANCEL;
cv_broadcast(&tp->t_outcv);
cv_broadcast(&tp->t_rawcv);
mutex_spin_exit(&tty_lock);
}
/*
* Handle close() on a tty line: flush and set to initial state,
* bumping generation number so that pending read/write calls
* can detect recycling of the tty.
*/
int
ttyclose(struct tty *tp)
{
struct session *sess;
/*
* Make sure this is not the constty. Without constty_lock it
* is always allowed to transition from nonnull to null.
*/
(void)atomic_cas_ptr(&constty, tp, NULL);
/*
* We don't know if this has _ever_ been the constty: another
* thread may have kicked it out as constty before we started
* to close.
*
* So we wait for all users that might be acquiring references
* to finish doing so -- after that, no more references can be
* made, at which point we can safely flush the tty, wait for
* the existing references to drain, and finally free or reuse
* the tty.
*/
pserialize_perform(constty_psz);
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD | FWRITE);
tp->t_gen++;
tp->t_pgrp = NULL;
tp->t_state = 0;
sess = tp->t_session;
tp->t_session = NULL;
while (tp->t_refcnt)
cv_wait(&ttyref_cv, &tty_lock);
mutex_spin_exit(&tty_lock);
if (sess != NULL) {
mutex_enter(&proc_lock);
/* Releases proc_lock. */
proc_sessrele(sess);
}
return (0);
}
#define FLUSHQ(q) { \
if ((q)->c_cc) \
ndflush(q, (q)->c_cc); \
}
/*
* tty_acquire(tp), tty_release(tp)
*
* Acquire a reference to tp that prevents it from being closed
* until released. Caller must guarantee tp has not yet been
* closed, e.g. by obtaining tp from constty during a pserialize
* read section. Caller must not hold tty_lock.
*/
void
tty_acquire(struct tty *tp)
{
unsigned refcnt __diagused;
refcnt = atomic_inc_uint_nv(&tp->t_refcnt);
KASSERT(refcnt < UINT_MAX);
}
void
tty_release(struct tty *tp)
{
unsigned old, new;
KDASSERT(mutex_ownable(&tty_lock));
do {
old = atomic_load_relaxed(&tp->t_refcnt);
if (old == 1) {
mutex_spin_enter(&tty_lock);
if (atomic_dec_uint_nv(&tp->t_refcnt) == 0)
cv_broadcast(&ttyref_cv);
mutex_spin_exit(&tty_lock);
return;
}
KASSERT(old != 0);
new = old - 1;
} while (atomic_cas_uint(&tp->t_refcnt, old, new) != old);
}
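/*
 * Illustrative sketch (editorial note, hypothetical caller): a reader
 * that picks up constty inside a pserialize read section would pair
 * these as follows:
 *
 *	s = pserialize_read_enter();
 *	tp = atomic_load_consume(&constty);
 *	if (tp != NULL)
 *		tty_acquire(tp);
 *	pserialize_read_exit(s);
 *	if (tp != NULL) {
 *		...use tp...
 *		tty_release(tp);
 *	}
 */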
/*
* This macro is used in canonical mode input processing, where a read
* request shall not return unless a 'line delimiter' ('\n') or 'break'
* (EOF, EOL, EOL2) character (or a signal) has been received. As EOL2
* is an extension to the POSIX.1 defined set of special characters,
* recognize it only if IEXTEN is set in the set of local flags.
*/
#define TTBREAKC(c, lflg) \
((c) == '\n' || (((c) == cc[VEOF] || (c) == cc[VEOL] || \
((c) == cc[VEOL2] && ISSET(lflg, IEXTEN))) && (c) != _POSIX_VDISABLE))
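/*
 * Example (editorial note): with ICANON set, a '\n' always completes a
 * line; a character matching VEOL2 completes one only when IEXTEN is
 * also set; and a slot set to _POSIX_VDISABLE never matches.
 */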
/*
* ttyinput() helper.
* Call with the tty lock held.
*/
/* XXX static */ int
ttyinput_wlock(int c, struct tty *tp)
{
int iflag, lflag, i, error;
u_char *cc;
KASSERT(mutex_owned(&tty_lock));
/*
* If input is pending take it first.
*/
lflag = tp->t_lflag;
if (ISSET(lflag, PENDIN))
ttypend(tp);
/*
* Gather stats.
*/
if (ISSET(lflag, ICANON)) {
++tk_cancc;
++tp->t_cancc;
} else {
++tk_rawcc;
++tp->t_rawcc;
}
++tk_nin;
cc = tp->t_cc;
/*
* Handle exceptional conditions (break, parity, framing).
*/
iflag = tp->t_iflag;
if ((error = (ISSET(c, TTY_ERRORMASK))) != 0) {
CLR(c, TTY_ERRORMASK);
if (ISSET(error, TTY_FE) && c == 0) { /* Break. */
if (ISSET(iflag, IGNBRK))
return (0);
else if (ISSET(iflag, BRKINT)) {
ttyflush(tp, FREAD | FWRITE);
ttysig(tp, TTYSIG_PG1, SIGINT);
return (0);
} else if (ISSET(iflag, PARMRK))
goto parmrk;
} else if ((ISSET(error, TTY_PE) && ISSET(iflag, INPCK)) ||
ISSET(error, TTY_FE)) {
if (ISSET(iflag, IGNPAR))
return (0);
else if (ISSET(iflag, PARMRK)) {
parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
(void)putc(0 | TTY_QUOTE, &tp->t_rawq);
(void)putc(c | TTY_QUOTE, &tp->t_rawq);
return (0);
} else
c = 0;
}
} else if (c == 0377 &&
ISSET(iflag, ISTRIP|IGNPAR|INPCK|PARMRK) == (INPCK|PARMRK)) {
/* "Escape" a valid character of '\377'. */
(void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
(void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
goto endcase;
}
/*
* In tandem mode, check high water mark.
*/
if (ISSET(iflag, IXOFF) || ISSET(tp->t_cflag, CHWFLOW))
ttyblock(tp);
if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP))
CLR(c, 0x80);
if (!ISSET(lflag, EXTPROC)) {
/*
* Check for literal nexting very first
*/
if (ISSET(tp->t_state, TS_LNCH)) {
SET(c, TTY_QUOTE);
CLR(tp->t_state, TS_LNCH);
}
/*
* Scan for special characters. This code
* is really just a big case statement with
* non-constant cases. The bottom of the
* case statement is labeled ``endcase'', so goto
* it after a case match, or similar.
*/
/*
* Control chars which aren't controlled
* by ICANON, ISIG, or IXON.
*/
if (ISSET(lflag, IEXTEN)) {
if (CCEQ(cc[VLNEXT], c)) {
if (ISSET(lflag, ECHO)) {
if (ISSET(lflag, ECHOE)) {
(void)ttyoutput('^', tp);
(void)ttyoutput('\b', tp);
} else
ttyecho(c, tp);
}
SET(tp->t_state, TS_LNCH);
goto endcase;
}
if (CCEQ(cc[VDISCARD], c)) {
if (ISSET(lflag, FLUSHO))
CLR(tp->t_lflag, FLUSHO);
else {
ttyflush(tp, FWRITE);
ttyecho(c, tp);
if (tp->t_rawq.c_cc + tp->t_canq.c_cc)
ttyretype(tp);
SET(tp->t_lflag, FLUSHO);
}
goto startoutput;
}
}
/*
* Signals.
*/
if (ISSET(lflag, ISIG)) {
if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) {
if (!ISSET(lflag, NOFLSH))
ttyflush(tp, FREAD | FWRITE);
ttyecho(c, tp);
ttysig(tp, TTYSIG_PG1, CCEQ(cc[VINTR], c) ?
SIGINT : SIGQUIT);
goto endcase;
}
if (CCEQ(cc[VSUSP], c)) {
if (!ISSET(lflag, NOFLSH))
ttyflush(tp, FREAD);
ttyecho(c, tp);
ttysig(tp, TTYSIG_PG1, SIGTSTP);
goto endcase;
}
}
/*
* Handle start/stop characters.
*/
if (ISSET(iflag, IXON)) {
if (CCEQ(cc[VSTOP], c)) {
if (!ISSET(tp->t_state, TS_TTSTOP)) {
SET(tp->t_state, TS_TTSTOP);
cdev_stop(tp, 0);
return (0);
}
if (!CCEQ(cc[VSTART], c))
return (0);
/*
* if VSTART == VSTOP then toggle
*/
goto endcase;
}
if (CCEQ(cc[VSTART], c))
goto restartoutput;
}
/*
* IGNCR, ICRNL, & INLCR
*/
if (c == '\r') {
if (ISSET(iflag, IGNCR))
goto endcase;
else if (ISSET(iflag, ICRNL))
c = '\n';
} else if (c == '\n' && ISSET(iflag, INLCR))
c = '\r';
}
if (!ISSET(lflag, EXTPROC) && ISSET(lflag, ICANON)) {
/*
* From here on down canonical mode character
* processing takes place.
*/
/*
* erase (^H / ^?)
*/
if (CCEQ(cc[VERASE], c)) {
if (tp->t_rawq.c_cc)
ttyrub(unputc(&tp->t_rawq), tp);
goto endcase;
}
/*
* kill (^U)
*/
if (CCEQ(cc[VKILL], c)) {
if (ISSET(lflag, ECHOKE) && tp->t_rawq.c_cc == tp->t_rocount &&
!ISSET(lflag, ECHOPRT))
while (tp->t_rawq.c_cc)
ttyrub(unputc(&tp->t_rawq), tp);
else {
ttyecho(c, tp);
if (ISSET(lflag, ECHOK) ||
ISSET(lflag, ECHOKE))
ttyecho('\n', tp);
FLUSHQ(&tp->t_rawq);
tp->t_rocount = 0;
}
CLR(tp->t_state, TS_LOCAL);
goto endcase;
}
/*
* Extensions to the POSIX.1 GTI set of functions.
*/
if (ISSET(lflag, IEXTEN)) {
/*
* word erase (^W)
*/
if (CCEQ(cc[VWERASE], c)) {
int alt = ISSET(lflag, ALTWERASE);
int ctype;
/*
* erase whitespace
*/
while ((c = unputc(&tp->t_rawq)) == ' ' ||
c == '\t')
ttyrub(c, tp);
if (c == -1)
goto endcase;
/*
* erase last char of word and remember the
* next chars type (for ALTWERASE)
*/
ttyrub(c, tp);
c = unputc(&tp->t_rawq);
if (c == -1)
goto endcase;
if (c == ' ' || c == '\t') {
(void)putc(c, &tp->t_rawq);
goto endcase;
}
ctype = ISALPHA(c);
/*
* erase rest of word
*/
do {
ttyrub(c, tp);
c = unputc(&tp->t_rawq);
if (c == -1)
goto endcase;
} while (c != ' ' && c != '\t' && (alt == 0 || ISALPHA(c) == ctype));
(void)putc(c, &tp->t_rawq);
goto endcase;
}
/*
* reprint line (^R)
*/
if (CCEQ(cc[VREPRINT], c)) {
ttyretype(tp);
goto endcase;
}
/*
* ^T - kernel info and generate SIGINFO
*/
if (CCEQ(cc[VSTATUS], c)) {
ttysig(tp, TTYSIG_PG1, SIGINFO);
goto endcase;
}
}
}
/*
* Check for input buffer overflow
*/
if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) {
if (ISSET(iflag, IMAXBEL)) {
if (tp->t_outq.c_cc < tp->t_hiwat)
(void)ttyoutput(CTRL('g'), tp);
} else
ttyflush(tp, FREAD | FWRITE);
goto endcase;
}
/*
* Put data char in q for user and
* wakeup on seeing a line delimiter.
*/
if (putc(c, &tp->t_rawq) >= 0) {
if (!ISSET(lflag, ICANON)) {
ttwakeup(tp);
ttyecho(c, tp);
goto endcase;
}
if (TTBREAKC(c, lflag)) {
tp->t_rocount = 0;
catq(&tp->t_rawq, &tp->t_canq);
ttwakeup(tp);
} else if (tp->t_rocount++ == 0)
tp->t_rocol = tp->t_column;
if (ISSET(tp->t_state, TS_ERASE)) {
/*
* end of prterase \.../
*/
CLR(tp->t_state, TS_ERASE);
(void)ttyoutput('/', tp);
}
i = tp->t_column;
ttyecho(c, tp);
if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) {
/*
* Place the cursor over the '^' of the ^D.
*/
i = uimin(2, tp->t_column - i);
while (i > 0) {
(void)ttyoutput('\b', tp);
i--;
}
}
}
endcase:
/*
* IXANY means allow any character to restart output.
*/
if (ISSET(tp->t_state, TS_TTSTOP) &&
!ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) {
return (0);
}
restartoutput:
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_TTSTOP);
startoutput:
return (ttstart(tp));
}
/*
* Process input of a single character received on a tty.
*
* XXX - this is a hack; all drivers must be changed to acquire the
* lock before calling linesw->l_rint()
*/
int
ttyinput(int c, struct tty *tp)
{
int error;
/*
* Unless the receiver is enabled, drop incoming data.
*/
if (!ISSET(tp->t_cflag, CREAD))
return (0);
mutex_spin_enter(&tty_lock);
error = ttyinput_wlock(c, tp);
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Output a single character on a tty, doing output processing
* as needed (expanding tabs, newline processing, etc.).
* Returns < 0 if it succeeds, otherwise returns the char to resend.
* Must be recursive.
*
* Call with tty lock held.
*/
int
ttyoutput(int c, struct tty *tp)
{
long oflag;
int col, notout;
KASSERT(mutex_owned(&tty_lock));
oflag = tp->t_oflag;
if (!ISSET(oflag, OPOST)) {
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
return (c);
return (-1);
}
/*
* Do tab expansion if OXTABS is set. Special case: if we do external
* processing, we don't do the tab expansion because we'll probably
* get it wrong. If tab expansion needs to be done, let it happen
* externally.
*/
CLR(c, ~TTY_CHARMASK);
if (c == '\t' &&
ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) {
c = 8 - (tp->t_column & 7);
if (ISSET(tp->t_lflag, FLUSHO)) {
notout = 0;
} else {
notout = b_to_q(" ", c, &tp->t_outq);
c -= notout;
tk_nout += c;
tp->t_outcc += c;
}
tp->t_column += c;
return (notout ? '\t' : -1);
}
if (c == CEOT && ISSET(oflag, ONOEOT))
return (-1);
/*
* Newline translation: if ONLCR is set,
* translate newline into "\r\n".
*/
if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) {
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc('\r', &tp->t_outq))
return (c);
}
/* If OCRNL is set, translate "\r" into "\n". */
else if (c == '\r' && ISSET(tp->t_oflag, OCRNL))
c = '\n';
/* If ONOCR is set, don't transmit CRs when on column 0. */
else if (c == '\r' && ISSET(tp->t_oflag, ONOCR) && tp->t_column == 0)
return (-1);
tk_nout++;
tp->t_outcc++;
if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq))
return (c);
col = tp->t_column;
switch (CCLASS(c)) {
case BACKSPACE:
if (col > 0)
--col;
break;
case CONTROL:
break;
case NEWLINE:
if (ISSET(tp->t_oflag, ONLCR | ONLRET))
col = 0;
break;
case RETURN:
col = 0;
break;
case ORDINARY:
++col;
break;
case TAB:
col = (col + 8) & ~7;
break;
}
tp->t_column = col;
return (-1);
}
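/*
 * Example (editorial note): with OXTABS set and the cursor at column
 * 3, a '\t' expands to 8 - (3 & 7) == 5 spaces, leaving t_column at 8;
 * with ONLCR set, a '\n' is preceded by a '\r' and resets the column
 * to 0.
 */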
/*
* Ioctls for all tty devices. Called after line-discipline specific ioctl
* has been called to do discipline-specific functions and/or reject any
* of these ioctl commands.
*/
/* ARGSUSED */
int
ttioctl(struct tty *tp, u_long cmd, void *data, int flag, struct lwp *l)
{
struct proc *p;
struct linesw *lp;
int s, error;
struct pathbuf *pb;
struct nameidata nd;
char infobuf[200];
KASSERT(l != NULL);
p = l->l_proc;
/* If the ioctl involves modification, hang if in the background. */
switch (cmd) {
case TIOCFLUSH:
case TIOCDRAIN:
case TIOCSBRK:
case TIOCCBRK:
case TIOCSTART:
case TIOCSETA:
case TIOCSETD:
case TIOCSLINED:
case TIOCSETAF:
case TIOCSETAW:
#ifdef notdef
case TIOCSPGRP:
case FIOSETOWN:
#endif
case TIOCSTAT:
case TIOCSTI:
case TIOCSWINSZ:
case TIOCSQSIZE:
case TIOCLBIC:
case TIOCLBIS:
case TIOCLSET:
case TIOCSETC:
case OTIOCSETD:
case TIOCSETN:
case TIOCSETP:
case TIOCSLTC:
mutex_spin_enter(&tty_lock);
while (isbackground(curproc, tp) &&
    p->p_pgrp->pg_jobc && (p->p_lflag & PL_PPWAIT) == 0 &&
!sigismasked(l, SIGTTOU)) {
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTOU, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
error = ttypause(tp, hz);
if (error) {
mutex_spin_exit(&tty_lock);
return (error);
}
}
mutex_spin_exit(&tty_lock);
break;
}
switch (cmd) { /* Process the ioctl. */
case FIOASYNC: /* set/clear async i/o */
mutex_spin_enter(&tty_lock);
if (*(int *)data)
SET(tp->t_state, TS_ASYNC);
else
CLR(tp->t_state, TS_ASYNC);
mutex_spin_exit(&tty_lock);
break;
case FIONBIO: /* set/clear non-blocking i/o */
break; /* XXX: delete. */
case FIONREAD: /* get # bytes to read */
mutex_spin_enter(&tty_lock);
*(int *)data = ttnread(tp);
mutex_spin_exit(&tty_lock);
break;
case FIONWRITE: /* get # bytes written & unsent */
mutex_spin_enter(&tty_lock);
*(int *)data = tp->t_outq.c_cc;
mutex_spin_exit(&tty_lock);
break;
case FIONSPACE: /* get # bytes of free space in output queue */
mutex_spin_enter(&tty_lock);
*(int *)data = tp->t_outq.c_cn - tp->t_outq.c_cc;
mutex_spin_exit(&tty_lock);
break;
case TIOCEXCL: /* set exclusive use of tty */
mutex_spin_enter(&tty_lock);
SET(tp->t_state, TS_XCLUDE);
mutex_spin_exit(&tty_lock);
break;
case TIOCFLUSH: { /* flush buffers */
int flags = *(int *)data;
if (flags == 0)
flags = FREAD | FWRITE;
else
flags &= FREAD | FWRITE;
mutex_spin_enter(&tty_lock);
ttyflush(tp, flags);
mutex_spin_exit(&tty_lock);
break;
}
case TIOCCONS: { /* become virtual console */
struct tty *ctp;
mutex_enter(&constty_lock);
error = 0;
ctp = atomic_load_relaxed(&constty);
if (*(int *)data) {
if (ctp != NULL && ctp != tp &&
ISSET(ctp->t_state, TS_CARR_ON | TS_ISOPEN) ==
(TS_CARR_ON | TS_ISOPEN)) {
error = EBUSY;
goto unlock_constty;
}
pb = pathbuf_create("/dev/console");
if (pb == NULL) {
error = ENOMEM;
goto unlock_constty;
}
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pb);
if ((error = namei(&nd)) != 0) {
pathbuf_destroy(pb);
goto unlock_constty;
}
error = VOP_ACCESS(nd.ni_vp, VREAD, l->l_cred);
vput(nd.ni_vp);
pathbuf_destroy(pb);
if (error)
goto unlock_constty;
KASSERT(atomic_load_relaxed(&constty) == ctp ||
atomic_load_relaxed(&constty) == NULL);
atomic_store_release(&constty, tp);
} else if (tp == ctp) {
atomic_store_relaxed(&constty, NULL);
}
unlock_constty: mutex_exit(&constty_lock);
if (error)
return error;
break;
}
case TIOCDRAIN: /* wait till output drained */
if ((error = ttywait(tp)) != 0)
return (error);
break;
case TIOCGETA: { /* get termios struct */
struct termios *t = (struct termios *)data;
memcpy(t, &tp->t_termios, sizeof(struct termios));
break;
}
case TIOCGETD: /* get line discipline (old) */
*(int *)data = tp->t_linesw->l_no;
break;
case TIOCGLINED: /* get line discipline (new) */
(void)strncpy((char *)data, tp->t_linesw->l_name,
TTLINEDNAMELEN - 1);
break;
case TIOCGWINSZ: /* get window size */
*(struct winsize *)data = tp->t_winsize;
break;
case TIOCGQSIZE:
*(int *)data = tp->t_qsize;
break;
case FIOGETOWN:
mutex_enter(&proc_lock);
if (tp->t_session != NULL && !isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_pgrp ? -tp->t_pgrp->pg_id : 0;
mutex_exit(&proc_lock);
break;
case TIOCGPGRP: /* get pgrp of tty */
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PGID;
mutex_exit(&proc_lock);
break;
case TIOCGSID: /* get sid of tty */
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
*(int *)data = tp->t_session->s_sid;
mutex_exit(&proc_lock);
break;
#ifdef TIOCHPCL
case TIOCHPCL: /* hang up on last close */
mutex_spin_enter(&tty_lock);
SET(tp->t_cflag, HUPCL);
mutex_spin_exit(&tty_lock);
break;
#endif
case TIOCNXCL: /* reset exclusive use of tty */
mutex_spin_enter(&tty_lock);
CLR(tp->t_state, TS_XCLUDE);
mutex_spin_exit(&tty_lock);
break;
case TIOCOUTQ: /* output queue size */
*(int *)data = tp->t_outq.c_cc;
break;
case TIOCSETA: /* set termios struct */
case TIOCSETAW: /* drain output, set */
case TIOCSETAF: { /* drn out, fls in, set */
struct termios *t = (struct termios *)data;
if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
if ((error = ttywait(tp)) != 0)
return (error);
if (cmd == TIOCSETAF) {
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD);
mutex_spin_exit(&tty_lock);
}
}
s = spltty();
/*
* XXXSMP - some drivers call back on us from t_param(), so
* don't take the tty spin lock here.
* require t_param() to unlock upon callback?
*/
/* wanted here: mutex_spin_enter(&tty_lock); */
if (!ISSET(t->c_cflag, CIGNORE)) {
/*
* Set device hardware.
*/
if (tp->t_param && (error = (*tp->t_param)(tp, t))) {
/* wanted here: mutex_spin_exit(&tty_lock); */
splx(s);
return (error);
} else {
tp->t_cflag = t->c_cflag;
tp->t_ispeed = t->c_ispeed;
tp->t_ospeed = t->c_ospeed;
if (t->c_ospeed == 0)
ttysig(tp, TTYSIG_LEADER, SIGHUP);
}
ttsetwater(tp);
}
/* delayed lock acquiring */
mutex_spin_enter(&tty_lock);
if (cmd != TIOCSETAF) {
if (ISSET(t->c_lflag, ICANON) !=
ISSET(tp->t_lflag, ICANON)) {
if (ISSET(t->c_lflag, ICANON)) {
SET(tp->t_lflag, PENDIN);
ttwakeup(tp);
} else {
struct clist tq;
catq(&tp->t_rawq, &tp->t_canq);
tq = tp->t_rawq;
tp->t_rawq = tp->t_canq;
tp->t_canq = tq;
CLR(tp->t_lflag, PENDIN);
}
}
}
tp->t_iflag = t->c_iflag;
tp->t_oflag = t->c_oflag;
/*
* Make the EXTPROC bit read only.
*/
if (ISSET(tp->t_lflag, EXTPROC))
SET(t->c_lflag, EXTPROC);
else
CLR(t->c_lflag, EXTPROC);
tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN);
memcpy(tp->t_cc, t->c_cc, sizeof(t->c_cc));
mutex_spin_exit(&tty_lock);
splx(s);
break;
}
case TIOCSETD: /* set line discipline (old) */
lp = ttyldisc_lookup_bynum(*(int *)data);
goto setldisc;
case TIOCSLINED: { /* set line discipline (new) */
char *name = (char *)data;
dev_t device;
/* Null terminate to prevent buffer overflow */
name[TTLINEDNAMELEN - 1] = '\0';
lp = ttyldisc_lookup(name);
setldisc:
if (lp == NULL)
return (ENXIO);
if (lp != tp->t_linesw) {
device = tp->t_dev;
s = spltty();
(*tp->t_linesw->l_close)(tp, flag);
error = (*lp->l_open)(device, tp);
if (error) {
(void)(*tp->t_linesw->l_open)(device, tp);
splx(s);
ttyldisc_release(lp);
return (error);
}
ttyldisc_release(tp->t_linesw);
tp->t_linesw = lp;
splx(s);
} else {
/* Drop extra reference. */
ttyldisc_release(lp);
}
break;
}
case TIOCSTART: /* start output, like ^Q */
mutex_spin_enter(&tty_lock);
if (ISSET(tp->t_state, TS_TTSTOP) ||
ISSET(tp->t_lflag, FLUSHO)) {
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_TTSTOP);
ttstart(tp);
}
mutex_spin_exit(&tty_lock);
break;
case TIOCSTI: /* simulate terminal input */
if ((error = kauth_authorize_device_tty(l->l_cred,
KAUTH_DEVICE_TTY_STI, tp)) != 0) {
if (!ISSET(flag, FREAD))
return EPERM;
if (!isctty(p, tp))
return EACCES;
if (tp->t_session->s_leader->p_cred != p->p_cred)
return error;
}
(*tp->t_linesw->l_rint)(*(u_char *)data, tp);
break;
case TIOCSTOP: /* stop output, like ^S */
{
mutex_spin_enter(&tty_lock);
if (!ISSET(tp->t_state, TS_TTSTOP)) {
SET(tp->t_state, TS_TTSTOP);
cdev_stop(tp, 0);
}
mutex_spin_exit(&tty_lock);
break;
}
case TIOCSCTTY: /* become controlling tty */
mutex_enter(&proc_lock);
mutex_spin_enter(&tty_lock);
/* Session ctty vnode pointer set in vnode layer. */
if (!SESS_LEADER(p) ||
((p->p_session->s_ttyvp || tp->t_session) &&
(tp->t_session != p->p_session))) {
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
return (EPERM);
}
/*
* `p_session' acquires a reference.
* But note that if `t_session' is set at this point,
* it must equal `p_session', in which case the session
* already has the correct reference count.
*/
if (tp->t_session == NULL) {
proc_sesshold(p->p_session);
}
tp->t_session = p->p_session;
tp->t_pgrp = p->p_pgrp;
p->p_session->s_ttyp = tp;
p->p_lflag |= PL_CONTROLT;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
case FIOSETOWN: { /* set pgrp of tty */
pid_t pgid = *(pid_t *)data;
struct pgrp *pgrp;
mutex_enter(&proc_lock);
if (tp->t_session != NULL && !isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
if (pgid < 0) {
if (pgid == INT_MIN) {
mutex_exit(&proc_lock);
return (EINVAL);
}
pgrp = pgrp_find(-pgid);
if (pgrp == NULL) {
mutex_exit(&proc_lock);
return (EINVAL);
}
} else {
struct proc *p1;
p1 = proc_find(pgid);
if (!p1) {
mutex_exit(&proc_lock);
return (ESRCH);
}
pgrp = p1->p_pgrp;
}
if (pgrp->pg_session != p->p_session) {
mutex_exit(&proc_lock);
return (EPERM);
}
mutex_spin_enter(&tty_lock);
tp->t_pgrp = pgrp;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
}
case TIOCSPGRP: { /* set pgrp of tty */
struct pgrp *pgrp;
pid_t pgid = *(pid_t *)data;
if (pgid == NO_PGID)
return EINVAL;
mutex_enter(&proc_lock);
if (!isctty(p, tp)) {
mutex_exit(&proc_lock);
return (ENOTTY);
}
pgrp = pgrp_find(pgid);
if (pgrp == NULL || pgrp->pg_session != p->p_session) {
mutex_exit(&proc_lock);
return (EPERM);
}
mutex_spin_enter(&tty_lock);
tp->t_pgrp = pgrp;
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
break;
}
case TIOCSTAT: /* get load avg stats */
mutex_enter(&proc_lock);
ttygetinfo(tp, 0, infobuf, sizeof(infobuf));
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
ttyputinfo(tp, infobuf);
mutex_spin_exit(&tty_lock);
break;
case TIOCSWINSZ: /* set window size */
mutex_spin_enter(&tty_lock);
if (memcmp((void *)&tp->t_winsize, data,
sizeof(struct winsize))) {
tp->t_winsize = *(struct winsize *)data;
ttysig(tp, TTYSIG_PG1, SIGWINCH);
}
mutex_spin_exit(&tty_lock);
break;
case TIOCSQSIZE:
if ((error = tty_get_qsize(&s, *(int *)data)) == 0 &&
s != tp->t_qsize)
error = tty_set_qsize(tp, s);
return error;
case TIOCSBRK:
case TIOCCBRK:
case TIOCSDTR:
case TIOCCDTR:
case TIOCSFLAGS:
case TIOCGFLAGS:
case TIOCMSET:
case TIOCMGET:
case TIOCMBIS:
case TIOCMBIC:
/* Handled by the driver layer */
return EPASSTHROUGH;
case TIOCEXT:
case TIOCPTSNAME:
case TIOCGRANTPT:
case TIOCPKT:
case TIOCUCNTL:
case TIOCREMOTE:
case TIOCSIG:
/* for ptys */
return EPASSTHROUGH;
default:
/* Pass through various console ioctls */
switch (IOCGROUP(cmd)) {
case 'c': /* syscons console */
case 'v': /* usl console, video - where one letter */
case 'K': /* usl console, keyboard - aint enough */
case 'V': /* pcvt compat */
case 'W': /* wscons console */
return EPASSTHROUGH;
default:
break;
}
/* We may have to load the compat_60 module for this. */
(void)module_autoload("compat_60", MODULE_CLASS_EXEC);
MODULE_HOOK_CALL(tty_ttioctl_60_hook,
(tp, cmd, data, flag, l), enosys(), error);
if (error != EPASSTHROUGH)
return error;
/* We may have to load the compat_43 module for this. */
(void)module_autoload("compat_43", MODULE_CLASS_EXEC);
MODULE_HOOK_CALL(tty_ttioctl_43_hook,
(tp, cmd, data, flag, l), enosys(), error);
return error;
}
return (0);
}
int
ttpoll(struct tty *tp, int events, struct lwp *l)
{
int revents;
revents = 0;
mutex_spin_enter(&tty_lock);
if (events & (POLLIN | POLLRDNORM))
if (ttnread(tp) > 0)
revents |= events & (POLLIN | POLLRDNORM);
if (events & (POLLOUT | POLLWRNORM))
if (tp->t_outq.c_cc <= tp->t_lowat)
revents |= events & (POLLOUT | POLLWRNORM);
if (events & POLLHUP)
if (!CONNECTED(tp))
revents |= POLLHUP;
if (revents == 0) {
if (events & (POLLIN | POLLHUP | POLLRDNORM))
selrecord(l, &tp->t_rsel);
if (events & (POLLOUT | POLLWRNORM))
selrecord(l, &tp->t_wsel);
}
mutex_spin_exit(&tty_lock);
return (revents);
}
static void
filt_ttyrdetach(struct knote *kn)
{
struct tty *tp;
tp = kn->kn_hook;
mutex_spin_enter(&tty_lock);
selremove_knote(&tp->t_rsel, kn);
mutex_spin_exit(&tty_lock);
}
static int
filt_ttyread(struct knote *kn, long hint)
{
struct tty *tp;
int rv;
tp = kn->kn_hook;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&tty_lock);
kn->kn_data = ttnread(tp);
rv = kn->kn_data > 0;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&tty_lock);
return rv;
}
static void
filt_ttywdetach(struct knote *kn)
{
struct tty *tp;
tp = kn->kn_hook;
mutex_spin_enter(&tty_lock);
selremove_knote(&tp->t_wsel, kn);
mutex_spin_exit(&tty_lock);
}
static int
filt_ttywrite(struct knote *kn, long hint)
{
struct tty *tp;
int canwrite;
tp = kn->kn_hook;
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_enter(&tty_lock);
kn->kn_data = tp->t_outq.c_cn - tp->t_outq.c_cc;
canwrite = (tp->t_outq.c_cc <= tp->t_lowat) && CONNECTED(tp);
if ((hint & NOTE_SUBMIT) == 0)
mutex_spin_exit(&tty_lock);
return (canwrite);
}
static const struct filterops ttyread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_ttyrdetach,
.f_event = filt_ttyread,
};
static const struct filterops ttywrite_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_ttywdetach,
.f_event = filt_ttywrite,
};
int
ttykqfilter(dev_t dev, struct knote *kn)
{
struct tty *tp;
struct selinfo *sip;
if ((tp = cdev_tty(dev)) == NULL)
return (ENXIO);
switch (kn->kn_filter) {
case EVFILT_READ:
sip = &tp->t_rsel;
kn->kn_fop = &ttyread_filtops;
break;
case EVFILT_WRITE:
sip = &tp->t_wsel;
kn->kn_fop = &ttywrite_filtops;
break;
default:
return EINVAL;
}
kn->kn_hook = tp;
mutex_spin_enter(&tty_lock);
selrecord_knote(sip, kn);
mutex_spin_exit(&tty_lock);
return (0);
}
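/*
 * Illustrative sketch (not part of the kernel source): a userland
 * program reaches the filters registered above via kqueue(2).  A
 * minimal, hedged example, assuming a tty is already open on stdin;
 * error handling is reduced to err(3).
 */
#if 0	/* example only, never compiled */
#include <sys/event.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	struct kevent kev;
	int kq;

	if ((kq = kqueue()) == -1)
		err(1, "kqueue");
	/* EVFILT_READ on a tty fd ends up in ttykqfilter() above. */
	EV_SET(&kev, STDIN_FILENO, EVFILT_READ, EV_ADD, 0, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		err(1, "kevent: register");
	/* Block until input arrives; kev.data is ttnread()'s count. */
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)
		err(1, "kevent: wait");
	return 0;
}
#endif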
/*
* Find the number of chars ready to be read from this tty.
* Call with the tty lock held.
*/
static int
ttnread(struct tty *tp)
{
int nread;
KASSERT(mutex_owned(&tty_lock));
if (ISSET(tp->t_lflag, PENDIN))
ttypend(tp);
nread = tp->t_canq.c_cc;
if (!ISSET(tp->t_lflag, ICANON)) {
nread += tp->t_rawq.c_cc;
if (nread < tp->t_cc[VMIN] && !tp->t_cc[VTIME])
nread = 0;
}
return (nread);
}
/*
* Wait for output to drain, or if this times out, flush it.
*/
static int
ttywait_timo(struct tty *tp, int timo)
{
int error;
error = 0;
mutex_spin_enter(&tty_lock);
while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && CONNECTED(tp) && tp->t_oproc) {
(*tp->t_oproc)(tp);
error = ttysleep(tp, &tp->t_outcv, true, timo);
if (error == EWOULDBLOCK)
ttyflush(tp, FWRITE);
if (error)
break;
}
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Wait for output to drain.
*/
int
ttywait(struct tty *tp)
{
return ttywait_timo(tp, 0);
}
/*
* Wait for output to drain (or time out), then flush the input queues.
*/
int
ttywflush(struct tty *tp)
{
int error;
error = ttywait_timo(tp, 5 * hz);
if (error == 0 || error == EWOULDBLOCK) {
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD);
mutex_spin_exit(&tty_lock);
}
return (error);
}
/*
* Flush tty read and/or write queues, notifying anyone waiting.
* Call with the tty lock held.
*/
void
ttyflush(struct tty *tp, int rw)
{
KASSERT(mutex_owned(&tty_lock));
if (rw & FREAD) {
FLUSHQ(&tp->t_canq);
FLUSHQ(&tp->t_rawq);
tp->t_rocount = 0;
tp->t_rocol = 0;
CLR(tp->t_state, TS_LOCAL);
ttwakeup(tp);
}
if (rw & FWRITE) {
CLR(tp->t_state, TS_TTSTOP);
cdev_stop(tp, rw);
FLUSHQ(&tp->t_outq);
cv_broadcast(&tp->t_outcv);
selnotify(&tp->t_wsel, 0, NOTE_SUBMIT);
}
}
/*
* Copy in the default termios characters.
*/
void
ttychars(struct tty *tp)
{
memcpy(tp->t_cc, ttydefchars, sizeof(ttydefchars));
}
/*
* Send stop character on input overflow.
* Call with the tty lock held.
*/
static void
ttyblock(struct tty *tp)
{
int total;
KASSERT(mutex_owned(&tty_lock));
total = tp->t_rawq.c_cc + tp->t_canq.c_cc;
if (tp->t_rawq.c_cc > TTYHOG) {
ttyflush(tp, FREAD | FWRITE);
CLR(tp->t_state, TS_TBLOCK);
}
/*
* Block further input iff: current input > threshold
* AND input is available to user program.
*/
if (total >= TTYHOG / 2 && !ISSET(tp->t_state, TS_TBLOCK) &&
(!ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0)) {
if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE &&
putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) {
SET(tp->t_state, TS_TBLOCK);
ttstart(tp);
}
/* Try to block remote output via hardware flow control. */
if (ISSET(tp->t_cflag, CHWFLOW) && tp->t_hwiflow &&
(*tp->t_hwiflow)(tp, 1) != 0)
SET(tp->t_state, TS_TBLOCK);
}
}
/*
* Delayed line discipline output
*/
void
ttrstrt(void *tp_arg)
{
struct tty *tp;
#ifdef DIAGNOSTIC
if (tp_arg == NULL)
panic("ttrstrt");
#endif
tp = tp_arg;
mutex_spin_enter(&tty_lock);
CLR(tp->t_state, TS_TIMEOUT);
ttstart(tp); /* XXX - Shouldn't this be tp->l_start(tp)? */
mutex_spin_exit(&tty_lock);
}
/*
* start a line discipline
* Always call with tty lock held?
*/
int
ttstart(struct tty *tp)
{
if (tp->t_oproc != NULL) /* XXX: Kludge for pty. */
(*tp->t_oproc)(tp);
return (0);
}
/*
* "close" a line discipline
*/
int
ttylclose(struct tty *tp, int flag)
{
if (flag & FNONBLOCK) {
mutex_spin_enter(&tty_lock);
ttyflush(tp, FREAD | FWRITE);
mutex_spin_exit(&tty_lock);
} else
ttywflush(tp);
return (0);
}
/*
* Handle modem control transition on a tty.
* Flag indicates new state of carrier.
* Returns 0 if the line should be turned off, otherwise 1.
*/
int
ttymodem(struct tty *tp, int flag)
{
mutex_spin_enter(&tty_lock);
if (flag == 0) {
if (ISSET(tp->t_state, TS_CARR_ON)) {
/*
* Lost carrier.
*/
CLR(tp->t_state, TS_CARR_ON);
if (ISSET(tp->t_state, TS_ISOPEN) && !CONNECTED(tp)) {
ttysig(tp, TTYSIG_LEADER, SIGHUP);
ttyflush(tp, FREAD | FWRITE);
mutex_spin_exit(&tty_lock);
return (0);
}
}
} else {
if (!ISSET(tp->t_state, TS_CARR_ON)) {
/*
* Carrier now on.
*/
SET(tp->t_state, TS_CARR_ON);
ttwakeup(tp);
}
}
mutex_spin_exit(&tty_lock);
return (1);
}
/*
* Default modem control routine (for other line disciplines).
* Return argument flag, to turn off device on carrier drop.
*/
int
nullmodem(struct tty *tp, int flag)
{
mutex_spin_enter(&tty_lock);
if (flag)
SET(tp->t_state, TS_CARR_ON);
else {
CLR(tp->t_state, TS_CARR_ON);
if (!CONNECTED(tp)) {
ttysig(tp, TTYSIG_LEADER, SIGHUP);
mutex_spin_exit(&tty_lock);
return (0);
}
}
mutex_spin_exit(&tty_lock);
return (1);
}
/*
* Reinput pending characters after state switch.
*/
void
ttypend(struct tty *tp)
{
struct clist tq;
int c;
KASSERT(mutex_owned(&tty_lock));
CLR(tp->t_lflag, PENDIN);
SET(tp->t_state, TS_TYPEN);
tq = tp->t_rawq;
tp->t_rawq.c_cc = 0;
tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0;
while ((c = getc(&tq)) >= 0)
ttyinput_wlock(c, tp);
CLR(tp->t_state, TS_TYPEN);
}
/*
* Process a read call on a tty device.
*/
int
ttread(struct tty *tp, struct uio *uio, int flag)
{
struct clist *qp;
u_char *cc;
struct proc *p;
int c, first, error, has_stime, last_cc;
long lflag, slp;
struct timeval now, stime;
if (uio->uio_resid == 0)
return 0;
stime.tv_usec = 0; /* XXX gcc */
stime.tv_sec = 0; /* XXX gcc */
cc = tp->t_cc;
p = curproc;
error = 0;
has_stime = 0;
last_cc = 0;
slp = 0;
loop:
mutex_spin_enter(&tty_lock);
lflag = tp->t_lflag;
/*
* take pending input first
*/
if (ISSET(lflag, PENDIN))
ttypend(tp);
/*
* Hang process if it's in the background.
*/
if (isbackground(p, tp)) {
if (sigismasked(curlwp, SIGTTIN) ||
p->p_lflag & PL_PPWAIT || p->p_pgrp->pg_jobc == 0) {
mutex_spin_exit(&tty_lock);
return (EIO);
}
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTIN, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
error = ttypause(tp, hz);
mutex_spin_exit(&tty_lock);
if (error)
return (error);
goto loop;
}
if (!ISSET(lflag, ICANON)) {
int m = cc[VMIN];
long t = cc[VTIME];
qp = &tp->t_rawq;
/*
* Check each of the four combinations.
* (m > 0 && t == 0) is the normal read case.
* It should be fairly efficient, so we check that and its
* companion case (m == 0 && t == 0) first.
* For the other two cases, we compute the target sleep time
* into slp.
*/
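/*
 * In other words, summarizing the standard VMIN/VTIME semantics
 * implemented below:
 *   VMIN > 0, VTIME = 0: block until at least VMIN bytes are queued.
 *   VMIN = 0, VTIME = 0: return immediately with whatever is queued.
 *   VMIN > 0, VTIME > 0: interbyte timer of VTIME deciseconds,
 *                        (re)started once the first byte arrives.
 *   VMIN = 0, VTIME > 0: overall timer of VTIME deciseconds started at
 *                        the read; return on the first byte or timeout.
 */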
if (t == 0) {
if (qp->c_cc < m)
goto sleep;
goto read;
}
t *= hz; /* time in deca-ticks */
/*
* Time difference in deca-ticks, split division to avoid numeric overflow.
* Ok for hz < ~200kHz
*/
#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 10 * hz + \
((t1).tv_usec - (t2).tv_usec) / 100 * hz / 1000)
if (m > 0) {
if (qp->c_cc <= 0)
goto sleep;
if (qp->c_cc >= m)
goto read;
if (!has_stime) {
/* first character, start timer */
has_stime = 1;
getmicrotime(&stime);
slp = t;
} else if (qp->c_cc > last_cc) {
/* got a character, restart timer */
getmicrotime(&stime);
slp = t;
} else {
/* nothing, check expiration */
getmicrotime(&now);
slp = t - diff(now, stime);
}
} else { /* m == 0 */
if (qp->c_cc > 0)
goto read;
if (!has_stime) {
has_stime = 1;
getmicrotime(&stime);
slp = t;
} else {
getmicrotime(&now);
slp = t - diff(now, stime);
}
}
last_cc = qp->c_cc;
#undef diff
if (slp > 0) {
/*
* Convert deca-ticks back to ticks.
* Rounding down may make us wake up just short
* of the target, so we round up.
* Maybe we should do 'slp/10 + 1' because the
* first tick may be almost immediate.
* However it is more useful for a program that sets
* VTIME=10 to wakeup every second not every 1.01
* seconds (if hz=100).
*/
slp = (slp + 9)/ 10;
goto sleep;
}
} else if ((qp = &tp->t_canq)->c_cc <= 0) {
int carrier;
sleep:
/*
* If there is no input, sleep on rawq
* awaiting hardware receipt and notification.
* If we have data, we don't need to check for carrier.
*/
carrier = CONNECTED(tp);
if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) {
mutex_spin_exit(&tty_lock);
return (0); /* EOF */
}
if (!has_stime || slp <= 0) {
if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
return (EWOULDBLOCK);
}
}
error = ttysleep(tp, &tp->t_rawcv, true, slp);
mutex_spin_exit(&tty_lock);
/* VMIN == 0: any quantity read satisfies */
if (cc[VMIN] == 0 && error == EWOULDBLOCK)
return (0);
if (error && error != EWOULDBLOCK)
return (error);
goto loop;
}
read:
/*
* Input present, check for input mapping and processing.
*/
first = 1;
while ((c = getc(qp)) >= 0) {
/*
* delayed suspend (^Y)
*/
if (CCEQ(cc[VDSUSP], c) &&
ISSET(lflag, IEXTEN|ISIG) == (IEXTEN|ISIG)) {
ttysig(tp, TTYSIG_PG1, SIGTSTP);
if (first) {
error = ttypause(tp, hz);
if (error)
break;
mutex_spin_exit(&tty_lock);
goto loop;
}
break;
}
/*
* Interpret EOF only in canonical mode.
*/
if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON))
break;
/*
* Give user character.
*/
mutex_spin_exit(&tty_lock);
error = ureadc(c, uio);
mutex_spin_enter(&tty_lock);
if (error)
break;
if (uio->uio_resid == 0)
break;
/*
* In canonical mode check for a "break character"
* marking the end of a "line of input".
*/
if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag))
break;
first = 0;
}
/*
* Look to unblock output now that (presumably)
* the input queue has gone down.
*/
if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG / 5) {
if (ISSET(tp->t_iflag, IXOFF) && cc[VSTART] != _POSIX_VDISABLE &&
putc(cc[VSTART], &tp->t_outq) == 0) {
CLR(tp->t_state, TS_TBLOCK);
ttstart(tp);
}
/* Try to unblock remote output via hardware flow control. */
if (ISSET(tp->t_cflag, CHWFLOW) && tp->t_hwiflow &&
(*tp->t_hwiflow)(tp, 0) != 0)
CLR(tp->t_state, TS_TBLOCK);
}
mutex_spin_exit(&tty_lock);
return (error);
}
/*
* Check the output queue on tp for space for a kernel message (from uprintf
* or tprintf). Allow some space over the normal hiwater mark so we don't
* lose messages due to normal flow control, but don't let the tty run amok.
* Sleeps here are not interruptible, but we return prematurely if new signals
* arrive.
* Call with tty lock held.
*/
static int
ttycheckoutq_wlock(struct tty *tp)
{
int hiwat;
KASSERT(mutex_owned(&tty_lock));
hiwat = tp->t_hiwat;
if (tp->t_outq.c_cc > hiwat + 200)
if (tp->t_outq.c_cc > hiwat) {
ttstart(tp);
return (0);
}
return (1);
}
int
ttycheckoutq(struct tty *tp)
{
int r;
mutex_spin_enter(&tty_lock);
r = ttycheckoutq_wlock(tp);
mutex_spin_exit(&tty_lock);
return (r);
}
/*
* Process a write call on a tty device.
*/
int
ttwrite(struct tty *tp, struct uio *uio, int flag)
{
u_char *cp;
struct proc *p;
int cc, cc0, ce, i, hiwat, error;
u_char obuf[OBUFSIZ];
cp = NULL;
hiwat = tp->t_hiwat;
error = 0;
cc0 = cc = 0;
loop:
mutex_spin_enter(&tty_lock);
if (!CONNECTED(tp)) {
if (ISSET(tp->t_state, TS_ISOPEN)) {
mutex_spin_exit(&tty_lock);
return (EIO);
} else if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
error = EWOULDBLOCK;
goto out;
} else {
/* Sleep awaiting carrier. */
error = ttysleep(tp, &tp->t_rawcv, true, 0);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
}
/*
* Hang the process if it's in the background.
*/
p = curproc;
if (isbackground(p, tp) && ISSET(tp->t_lflag, TOSTOP) && (p->p_lflag & PL_PPWAIT) == 0 &&
!sigismasked(curlwp, SIGTTOU)) {
if (p->p_pgrp->pg_jobc == 0) {
error = EIO;
mutex_spin_exit(&tty_lock);
goto out;
}
mutex_spin_exit(&tty_lock);
mutex_enter(&proc_lock);
pgsignal(p->p_pgrp, SIGTTOU, 1);
mutex_exit(&proc_lock);
mutex_spin_enter(&tty_lock);
error = ttypause(tp, hz);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
mutex_spin_exit(&tty_lock);
/*
* Process the user's data in at most OBUFSIZ chunks. Perform any
* output translation. Keep track of high water mark, sleep on
* overflow awaiting device aid in acquiring new space.
*/
while (uio->uio_resid > 0 || cc > 0) {
if (ISSET(tp->t_lflag, FLUSHO)) {
uio->uio_resid = 0;
return (0);
}
if (tp->t_outq.c_cc > hiwat)
goto ovhiwat;
/*
* Grab a hunk of data from the user, unless we have some
* leftover from last time.
*/
if (cc == 0) {
uioskip(cc0, uio);
cc0 = cc = uimin(uio->uio_resid, OBUFSIZ);
cp = obuf;
error = uiopeek(cp, cc, uio);
if (error) {
cc = 0;
goto out;
}
}
/*
* If nothing fancy need be done, grab those characters we
* can handle without any of ttyoutput's processing and
* just transfer them to the output q. For those chars
* which require special processing (as indicated by the
* bits in char_type), call ttyoutput. After processing
* a hunk of data, look for FLUSHO so ^O's will take effect
* immediately.
*/
mutex_spin_enter(&tty_lock);
while (cc > 0) {
if (!ISSET(tp->t_oflag, OPOST))
ce = cc;
else {
ce = cc - scanc((u_int)cc, cp, char_type,
CCLASSMASK);
/*
* If ce is zero, then we're processing
* a special character through ttyoutput.
*/
if (ce == 0) {
tp->t_rocount = 0;
if (ttyoutput(*cp, tp) >= 0) {
/* out of space */
mutex_spin_exit(&tty_lock);
goto overfull;
}
cp++;
cc--;
if (ISSET(tp->t_lflag, FLUSHO) ||
tp->t_outq.c_cc > hiwat) {
mutex_spin_exit(&tty_lock);
goto ovhiwat;
}
continue;
}
}
/*
* A bunch of normal characters have been found.
* Transfer them en masse to the output queue and
* continue processing at the top of the loop.
* If there are any further characters in this
* <= OBUFSIZ chunk, the first should be a character
* requiring special handling by ttyoutput.
*/
tp->t_rocount = 0;
i = b_to_q(cp, ce, &tp->t_outq);
ce -= i;
tp->t_column += ce;
cp += ce, cc -= ce, tk_nout += ce;
tp->t_outcc += ce;
if (i > 0) {
/* out of space */
mutex_spin_exit(&tty_lock);
goto overfull;
}
if (ISSET(tp->t_lflag, FLUSHO) ||
tp->t_outq.c_cc > hiwat)
break;
}
ttstart(tp);
mutex_spin_exit(&tty_lock);
}
out:
KASSERTMSG(error || cc == 0, "error=%d cc=%d", error, cc);
KASSERTMSG(cc0 >= cc, "cc0=%d cc=%d", cc0, cc);
uioskip(cc0 - cc, uio);
return (error);
overfull:
/*
* Since we are using ring buffers, if we can't insert any more into
* the output queue, we can assume the ring is full and that someone
* forgot to set the high water mark correctly. We set it and then
* proceed as normal.
*/
hiwat = tp->t_outq.c_cc - 1;
ovhiwat:
mutex_spin_enter(&tty_lock);
ttstart(tp);
/*
* This can only occur if FLUSHO is set in t_lflag,
* or if ttstart/oproc is synchronous (or very fast).
*/
if (tp->t_outq.c_cc <= hiwat) {
mutex_spin_exit(&tty_lock);
goto loop;
}
if (flag & IO_NDELAY) {
mutex_spin_exit(&tty_lock);
error = EWOULDBLOCK;
goto out;
}
error = ttysleep(tp, &tp->t_outcv, true, 0);
mutex_spin_exit(&tty_lock);
if (error)
goto out;
goto loop;
}
/*
* Try to pull more output from the producer. Return non-zero if
* there is output ready to be sent.
*/
bool
ttypull(struct tty *tp)
{
/* XXXSMP not yet KASSERT(mutex_owned(&tty_lock)); */
if (tp->t_outq.c_cc <= tp->t_lowat) {
cv_broadcast(&tp->t_outcv);
selnotify(&tp->t_wsel, 0, NOTE_SUBMIT);
}
return tp->t_outq.c_cc != 0;
}
/*
* Rubout one character from the rawq of tp
* as cleanly as possible.
* Called with tty lock held.
*/
void
ttyrub(int c, struct tty *tp)
{
u_char *cp;
int savecol, tabc;
KASSERT(mutex_owned(&tty_lock));
if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC))
return;
CLR(tp->t_lflag, FLUSHO);
if (ISSET(tp->t_lflag, ECHOE)) {
if (tp->t_rocount == 0) {
/*
* Screwed by ttwrite; retype
*/
ttyretype(tp);
return;
}
if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE))
ttyrubo(tp, 2);
else {
CLR(c, ~TTY_CHARMASK);
switch (CCLASS(c)) {
case ORDINARY:
ttyrubo(tp, 1);
break;
case BACKSPACE:
case CONTROL:
case NEWLINE:
case RETURN:
case VTAB:
if (ISSET(tp->t_lflag, ECHOCTL))
ttyrubo(tp, 2);
break;
case TAB:
if (tp->t_rocount < tp->t_rawq.c_cc) {
ttyretype(tp);
return;
}
savecol = tp->t_column;
SET(tp->t_state, TS_CNTTB);
SET(tp->t_lflag, FLUSHO);
tp->t_column = tp->t_rocol;
for (cp = firstc(&tp->t_rawq, &tabc); cp;
cp = nextc(&tp->t_rawq, cp, &tabc))
ttyecho(tabc, tp);
CLR(tp->t_lflag, FLUSHO);
CLR(tp->t_state, TS_CNTTB);
/* savecol will now be length of the tab. */
savecol -= tp->t_column;
tp->t_column += savecol;
if (savecol > 8)
savecol = 8; /* overflow screw */
while (--savecol >= 0)
(void)ttyoutput('\b', tp);
break;
default: /* XXX */
(void)printf("ttyrub: would panic c = %d, "
"val = %d\n", c, CCLASS(c));
}
}
} else if (ISSET(tp->t_lflag, ECHOPRT)) {
if (!ISSET(tp->t_state, TS_ERASE)) {
SET(tp->t_state, TS_ERASE);
(void)ttyoutput('\\', tp);
}
ttyecho(c, tp);
} else
ttyecho(tp->t_cc[VERASE], tp);
--tp->t_rocount;
}
/*
* Back over cnt characters, erasing them.
* Called with tty lock held.
*/
static void
ttyrubo(struct tty *tp, int cnt)
{
KASSERT(mutex_owned(&tty_lock));
while (cnt-- > 0) {
(void)ttyoutput('\b', tp);
(void)ttyoutput(' ', tp);
(void)ttyoutput('\b', tp);
}
}
/*
* ttyretype --
* Reprint the rawq line. Note, it is assumed that c_cc has already
* been checked.
*
* Called with tty lock held.
*/
void
ttyretype(struct tty *tp)
{
u_char *cp;
int c;
KASSERT(mutex_owned(&tty_lock));
/* Echo the reprint character. */
if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE)
ttyecho(tp->t_cc[VREPRINT], tp);
(void)ttyoutput('\n', tp);
for (cp = firstc(&tp->t_canq, &c); cp; cp = nextc(&tp->t_canq, cp, &c))
ttyecho(c, tp);
for (cp = firstc(&tp->t_rawq, &c); cp; cp = nextc(&tp->t_rawq, cp, &c))
ttyecho(c, tp);
CLR(tp->t_state, TS_ERASE);
tp->t_rocount = tp->t_rawq.c_cc;
tp->t_rocol = 0;
}
/*
* Echo a typed character to the terminal.
* Called with tty lock held.
*/
static void
ttyecho(int c, struct tty *tp)
{
KASSERT(mutex_owned(&tty_lock));
if (!ISSET(tp->t_state, TS_CNTTB))
CLR(tp->t_lflag, FLUSHO);
if ((!ISSET(tp->t_lflag, ECHO) &&
(!ISSET(tp->t_lflag, ECHONL) || c != '\n')) ||
ISSET(tp->t_lflag, EXTPROC))
return;
if ((ISSET(tp->t_lflag, ECHOCTL) &&
(ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n')) ||
ISSET(c, TTY_CHARMASK) == 0177) {
(void)ttyoutput('^', tp);
CLR(c, ~TTY_CHARMASK);
if (c == 0177)
c = '?';
else
c += 'A' - 1;
}
(void)ttyoutput(c, tp);
}
/*
* Wake up any readers on a tty.
* Called with tty lock held.
*/
void
ttwakeup(struct tty *tp)
{
KASSERT(mutex_owned(&tty_lock));
selnotify(&tp->t_rsel, 0, NOTE_SUBMIT);
if (ISSET(tp->t_state, TS_ASYNC))
ttysig(tp, TTYSIG_PG2, SIGIO);
cv_broadcast(&tp->t_rawcv);
}
/*
* Look up a code for a specified speed in a conversion table;
* used by drivers to map software speed values to hardware parameters.
*/
int
ttspeedtab(int speed, const struct speedtab *table)
{
for (; table->sp_speed != -1; table++)
if (table->sp_speed == speed)
return (table->sp_code);
return (-1);
}
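/*
 * A minimal usage sketch (hypothetical table and divisor values, for
 * illustration only): a serial driver maps a termios speed to its
 * hardware parameter with ttspeedtab().
 */
#if 0	/* example only, never compiled */
static const struct speedtab mydev_speeds[] = {
	{ 9600,		12 },	/* hypothetical divisor values */
	{ 19200,	6 },
	{ 38400,	3 },
	{ -1,		-1 }	/* terminator expected by ttspeedtab() */
};

/* ... in the driver's parameter routine ... */
int div = ttspeedtab(t->c_ospeed, mydev_speeds);
if (div < 0)
	return EINVAL;		/* unsupported speed */
#endif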
/*
* Set tty hi and low water marks.
*
* Try to arrange the dynamics so there's about one second
* from hi to low water.
*/
void
ttsetwater(struct tty *tp)
{
int cps, x;
/* XXX not yet KASSERT(mutex_owned(&tty_lock)); */
#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x))
cps = tp->t_ospeed / 10;
tp->t_lowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT);
x += cps;
x = CLAMP(x, TTMAXHIWAT, TTMINHIWAT);
tp->t_hiwat = roundup(x, TTROUND);
#undef CLAMP
}
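/*
 * Worked example (illustrative; the exact clamp bounds come from the
 * TTMINLOWAT/TTMAXLOWAT/TTMINHIWAT/TTMAXHIWAT/TTROUND definitions in
 * <sys/tty.h>): for a 9600 bps line, cps = 960, so
 *   t_lowat = CLAMP(480, TTMAXLOWAT, TTMINLOWAT)
 *   t_hiwat = roundup(CLAMP(t_lowat + 960, TTMAXHIWAT, TTMINHIWAT),
 *                     TTROUND)
 * i.e. roughly one second's worth of output between low and high water,
 * subject to clamping.
 */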
/*
* Prepare report on state of foreground process group.
* Call with &proc_lock held.
*/
void
ttygetinfo(struct tty *tp, int fromsig, char *buf, size_t bufsz)
{
struct lwp *l;
struct proc *p, *pick = NULL;
struct timeval utime, stime;
int tmp;
fixpt_t pctcpu = 0;
const char *msg = NULL;
char lmsg[100];
long rss;
bool again = false;
KASSERT(mutex_owned(&proc_lock));
*buf = '\0';
retry:
if (tp->t_session == NULL)
msg = "not a controlling terminal\n";
else if (tp->t_pgrp == NULL)
msg = "no foreground process group\n";
else if ((p = LIST_FIRST(&tp->t_pgrp->pg_members)) == NULL)
msg = "empty foreground process group\n";
else {
/* Pick interesting process. */
for (; p != NULL; p = LIST_NEXT(p, p_pglist)) {
struct proc *oldpick;
if (pick == NULL) {
pick = p;
continue;
}
if (pick->p_lock < p->p_lock) {
mutex_enter(pick->p_lock);
mutex_enter(p->p_lock);
} else if (pick->p_lock > p->p_lock) {
mutex_enter(p->p_lock);
mutex_enter(pick->p_lock);
} else
mutex_enter(p->p_lock);
oldpick = pick;
if (proc_compare_wrapper(pick, p))
pick = p;
mutex_exit(p->p_lock);
if (p->p_lock != oldpick->p_lock)
mutex_exit(oldpick->p_lock);
}
if (pick != NULL) {
mutex_enter(pick->p_lock);
if (P_ZOMBIE(pick)) {
mutex_exit(pick->p_lock);
pick = NULL;
if (!again) {
again = true;
goto retry;
}
msg = "found only zombie processes\n";
}
if (pick && fromsig &&
(SIGACTION_PS(pick->p_sigacts, SIGINFO).sa_flags &
SA_NOKERNINFO)) {
mutex_exit(pick->p_lock);
return;
}
}
}
/* Print load average. */
tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT;
snprintf(lmsg, sizeof(lmsg), "load: %d.%02d ", tmp / 100, tmp % 100);
strlcat(buf, lmsg, bufsz);
if (pick == NULL) {
strlcat(buf, msg, bufsz);
return;
}
snprintf(lmsg, sizeof(lmsg), " cmd: %s %d [", pick->p_comm,
pick->p_pid);
strlcat(buf, lmsg, bufsz);
KASSERT(mutex_owned(pick->p_lock));
LIST_FOREACH(l, &pick->p_lwps, l_sibling) {
const char *lp;
lwp_lock(l);
#ifdef LWP_PC
#define FMT_RUN "%#"PRIxVADDR
#define VAL_RUNNING (vaddr_t)LWP_PC(l)
#define VAL_RUNNABLE (vaddr_t)LWP_PC(l)
#else
#define FMT_RUN "%s"
#define VAL_RUNNING "running"
#define VAL_RUNNABLE "runnable"
#endif
switch (l->l_stat) {
case LSONPROC:
snprintf(lmsg, sizeof(lmsg), FMT_RUN"/%d", VAL_RUNNING,
cpu_index(l->l_cpu));
lp = lmsg;
break;
case LSRUN:
snprintf(lmsg, sizeof(lmsg), FMT_RUN, VAL_RUNNABLE);
lp = lmsg;
break;
default:
lp = l->l_wchan ? l->l_wmesg : "iowait";
break;
}
strlcat(buf, lp, bufsz);
strlcat(buf, LIST_NEXT(l, l_sibling) != NULL ? " " : "] ",
bufsz);
pctcpu += l->l_pctcpu;
lwp_unlock(l);
}
pctcpu += pick->p_pctcpu;
calcru(pick, &utime, &stime, NULL, NULL);
mutex_exit(pick->p_lock);
/* Round up and print user+system time, %CPU and RSS. */
utime.tv_usec += 5000;
if (utime.tv_usec >= 1000000) {
utime.tv_sec += 1;
utime.tv_usec -= 1000000;
}
stime.tv_usec += 5000;
if (stime.tv_usec >= 1000000) {
stime.tv_sec += 1;
stime.tv_usec -= 1000000;
}
#define pgtok(a) (((u_long) ((a) * PAGE_SIZE) / 1024))
tmp = (pctcpu * 10000 + FSCALE / 2) >> FSHIFT;
if (pick->p_stat == SIDL || P_ZOMBIE(pick))
rss = 0;
else
rss = pgtok(vm_resident_count(pick->p_vmspace));
snprintf(lmsg, sizeof(lmsg), "%ld.%02ldu %ld.%02lds %d%% %ldk",
(long)utime.tv_sec, (long)utime.tv_usec / 10000,
(long)stime.tv_sec, (long)stime.tv_usec / 10000,
tmp / 100, rss);
strlcat(buf, lmsg, bufsz);
}
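/*
 * The resulting status line (delivered via ttyputinfo() below when the
 * user types the SIGINFO character, typically ^T) looks roughly like
 * the following; the values and wait channel are illustrative only:
 *
 *	load: 0.42  cmd: sleep 1234 [nanoslp] 0.00u 0.00s 0% 1024k
 */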
/*
* Print report on state of foreground process group.
* Call with tty_lock held.
*/
void
ttyputinfo(struct tty *tp, char *buf)
{
KASSERT(mutex_owned(&tty_lock));
if (ttycheckoutq_wlock(tp) == 0)
return;
ttyprintf_nolock(tp, "%s\n", buf);
tp->t_rocount = 0; /* so pending input will be retyped if BS */
}
/*
* Returns 1 if p2 has a better chance being the active foreground process
* in a terminal instead of p1.
*/
static int
proc_compare_wrapper(struct proc *p1, struct proc *p2)
{
lwp_t *l1, *l2;
KASSERT(mutex_owned(p1->p_lock));
KASSERT(mutex_owned(p2->p_lock));
l1 = LIST_FIRST(&p1->p_lwps);
l2 = LIST_FIRST(&p2->p_lwps);
return proc_compare(p1, l1, p2, l2);
}
/*
* Output char to tty; console putchar style.
* Can be called with the tty lock held through the kprintf() machinery.
*/
int
tputchar(int c, int flags, struct tty *tp)
{
int r = 0;
if ((flags & NOLOCK) == 0)
mutex_spin_enter(&tty_lock);
if (!CONNECTED(tp)) {
r = -1;
goto out;
}
if (c == '\n')
(void)ttyoutput('\r', tp);
(void)ttyoutput(c, tp);
ttstart(tp);
out:
if ((flags & NOLOCK) == 0)
mutex_spin_exit(&tty_lock);
return (r);
}
/*
* Sleep on chan, returning ERESTART if tty changed while we napped and
* returning any errors (e.g. EINTR/EWOULDBLOCK) reported by
* cv_timedwait(_sig).
* If the tty is revoked, restarting a pending call will redo validation done
* at the start of the call.
*
* Must be called with the tty lock held.
*/
int
ttysleep(struct tty *tp, kcondvar_t *cv, bool catch_p, int timo)
{
int error;
short gen;
KASSERT(mutex_owned(&tty_lock));
gen = tp->t_gen;
if (ISSET(tp->t_state, TS_CANCEL))
error = ERESTART;
else if (cv == NULL)
error = kpause("ttypause", catch_p, timo, &tty_lock);
else if (catch_p)
error = cv_timedwait_sig(cv, &tty_lock, timo);
else
error = cv_timedwait(cv, &tty_lock, timo);
if (error != 0)
return (error);
return (tp->t_gen == gen ? 0 : ERESTART);
}
int
ttypause(struct tty *tp, int timo)
{
int error;
error = ttysleep(tp, NULL, true, timo);
if (error == EWOULDBLOCK)
error = 0;
return error;
}
/*
* Attach a tty to the tty list.
*
* This should be called ONLY once per real tty (including pty's).
* E.g., on the sparc, the keyboard and mouse have struct tty's that are
* distinctly NOT usable as tty's, and thus should not be attached to
* the ttylist. This is why this call is not done from tty_alloc().
*
* Device drivers should attach tty's at about the same time that they
* are allocated or, for statically allocated struct tty's, in either
* the attach or the (first) open routine.
*/
void
tty_attach(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
TAILQ_INSERT_TAIL(&ttylist, tp, tty_link);
++tty_count;
mutex_spin_exit(&tty_lock);
}
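/*
 * A minimal sketch of the intended driver usage (hypothetical driver
 * routine names; error handling omitted): allocate, fill in the
 * callbacks, then attach.
 */
#if 0	/* example only, never compiled */
struct tty *tp;

tp = tty_alloc();
tp->t_oproc = mydev_start;	/* hypothetical "start output" routine */
tp->t_param = mydev_param;	/* hypothetical "set parameters" routine */
tp->t_dev = dev;		/* device number chosen by the driver */
tty_attach(tp);
/* ... and on detach: tty_detach(tp); tty_free(tp); */
#endif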
/*
* Remove a tty from the tty list.
*/
void
tty_detach(struct tty *tp)
{
mutex_spin_enter(&tty_lock);
--tty_count;
#ifdef DIAGNOSTIC
if (tty_count < 0)
panic("tty_detach: tty_count < 0");
#endif
TAILQ_REMOVE(&ttylist, tp, tty_link);
mutex_spin_exit(&tty_lock);
}
/*
* Allocate a tty structure and its associated buffers.
*/
struct tty *
tty_alloc(void)
{
struct tty *tp;
int i;
tp = kmem_zalloc(sizeof(*tp), KM_SLEEP);
callout_init(&tp->t_rstrt_ch, 0);
callout_setfunc(&tp->t_rstrt_ch, ttrstrt, tp);
tp->t_qsize = tty_qsize;
clalloc(&tp->t_rawq, tp->t_qsize, 1);
cv_init(&tp->t_rawcv, "ttyraw");
cv_init(&tp->t_rawcvf, "ttyrawf");
clalloc(&tp->t_canq, tp->t_qsize, 1);
cv_init(&tp->t_cancv, "ttycan");
cv_init(&tp->t_cancvf, "ttycanf");
/* output queue doesn't need quoting */
clalloc(&tp->t_outq, tp->t_qsize, 0);
cv_init(&tp->t_outcv, "ttyout");
cv_init(&tp->t_outcvf, "ttyoutf");
/* Set default line discipline. */
tp->t_linesw = ttyldisc_default();
tp->t_dev = NODEV;
selinit(&tp->t_rsel);
selinit(&tp->t_wsel);
for (i = 0; i < TTYSIG_COUNT; i++) {
sigemptyset(&tp->t_sigs[i]);
}
return tp;
}
/*
* Free a tty structure and its buffers.
*
* Be sure to call tty_detach() for any tty that has been
* tty_attach()ed.
*/
void
tty_free(struct tty *tp)
{
int i;
mutex_enter(&proc_lock);
mutex_enter(&tty_lock);
for (i = 0; i < TTYSIG_COUNT; i++)
sigemptyset(&tp->t_sigs[i]);
if (tp->t_sigcount != 0)
TAILQ_REMOVE(&tty_sigqueue, tp, t_sigqueue);
mutex_exit(&tty_lock);
mutex_exit(&proc_lock);
callout_halt(&tp->t_rstrt_ch, NULL);
callout_destroy(&tp->t_rstrt_ch);
ttyldisc_release(tp->t_linesw);
clfree(&tp->t_rawq);
clfree(&tp->t_canq);
clfree(&tp->t_outq);
cv_destroy(&tp->t_rawcv);
cv_destroy(&tp->t_rawcvf);
cv_destroy(&tp->t_cancv);
cv_destroy(&tp->t_cancvf);
cv_destroy(&tp->t_outcv);
cv_destroy(&tp->t_outcvf);
seldestroy(&tp->t_rsel);
seldestroy(&tp->t_wsel);
kmem_free(tp, sizeof(*tp));
}
/*
* tty_unit: map dev_t to tty unit number, as with TTUNIT
*
* => defined as function for use with struct cdevsw::d_devtounit
* => not for drivers with different unit numbering, e.g. TTUNIT(d) >> 4
*/
int
tty_unit(dev_t dev)
{
return TTUNIT(dev);
}
/*
* ttyprintf_nolock: send a message to a specific tty, without locking.
*
* => should be used only by tty driver or anything that knows the
* underlying tty will not be revoked(2)'d away. [otherwise,
* use tprintf]
*/
static void
ttyprintf_nolock(struct tty *tp, const char *fmt, ...)
{
va_list ap;
/* No mutex needed; going to process TTY. */
va_start(ap, fmt);
kprintf(fmt, TOTTY|NOLOCK, tp, NULL, ap);
va_end(ap);
}
static int
tty_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct tty *tty;
int result;
result = KAUTH_RESULT_DEFER;
if (action != KAUTH_DEVICE_TTY_OPEN)
return result;
tty = arg0;
/* If it's not opened, we allow. */
if ((tty->t_state & TS_ISOPEN) == 0)
result = KAUTH_RESULT_ALLOW;
else {
/*
* If it's opened, we can only allow if it's not exclusively
* opened; otherwise, that's a privileged operation and we
* let the secmodel handle it.
*/
if ((tty->t_state & TS_XCLUDE) == 0)
result = KAUTH_RESULT_ALLOW;
}
return result;
}
/*
* Initialize the tty subsystem.
*/
void
tty_init(void)
{
mutex_init(&tty_lock, MUTEX_DEFAULT, IPL_VM);
mutex_init(&constty_lock, MUTEX_DEFAULT, IPL_NONE);
constty_psz = pserialize_create();
cv_init(&ttyref_cv, "ttyref");
tty_sigsih = softint_establish(SOFTINT_CLOCK, ttysigintr, NULL);
KASSERT(tty_sigsih != NULL);
tty_listener = kauth_listen_scope(KAUTH_SCOPE_DEVICE,
tty_listener_cb, NULL);
sysctl_kern_tty_setup();
}
/*
* Send a signal from a tty to its process group or session leader.
* Handoff to the target is deferred to a soft interrupt.
*/
void
ttysig(struct tty *tp, enum ttysigtype st, int sig)
{
sigset_t *sp;
/* XXXSMP not yet KASSERT(mutex_owned(&tty_lock)); */
sp = &tp->t_sigs[st];
if (sigismember(sp, sig))
return;
sigaddset(sp, sig);
if (tp->t_sigcount++ == 0)
TAILQ_INSERT_TAIL(&tty_sigqueue, tp, t_sigqueue);
softint_schedule(tty_sigsih);
}
/*
* Deliver deferred signals from ttys. Note that the process groups
* and sessions associated with the ttys may have changed from when
* the signal was originally sent, but in practice it should not matter.
* For signals produced as a result of a syscall, the soft interrupt
* will fire before the syscall returns to the user.
*/
static void
ttysigintr(void *cookie)
{
struct tty *tp;
enum ttysigtype st;
struct pgrp *pgrp;
struct session *sess;
int sig, lflag;
char infobuf[200];
mutex_enter(&proc_lock);
mutex_spin_enter(&tty_lock);
while ((tp = TAILQ_FIRST(&tty_sigqueue)) != NULL) {
KASSERT(tp->t_sigcount > 0);
for (st = TTYSIG_PG1; st < TTYSIG_COUNT; st++) {
if ((sig = firstsig(&tp->t_sigs[st])) != 0)
break;
}
KASSERT(st < TTYSIG_COUNT);
sigdelset(&tp->t_sigs[st], sig);
if (--tp->t_sigcount == 0)
TAILQ_REMOVE(&tty_sigqueue, tp, t_sigqueue);
pgrp = tp->t_pgrp;
sess = tp->t_session;
lflag = tp->t_lflag;
if (sig == SIGINFO) {
if (ISSET(tp->t_state, TS_SIGINFO)) {
/* Via ioctl: ignore tty option. */
tp->t_state &= ~TS_SIGINFO;
lflag |= ISIG;
}
if (!ISSET(lflag, NOKERNINFO)) {
mutex_spin_exit(&tty_lock);
ttygetinfo(tp, 1, infobuf, sizeof(infobuf));
mutex_spin_enter(&tty_lock);
ttyputinfo(tp, infobuf);
}
if (!ISSET(lflag, ISIG))
continue;
}
mutex_spin_exit(&tty_lock);
KASSERT(sig != 0);
switch (st) {
case TTYSIG_PG1:
if (pgrp != NULL)
pgsignal(pgrp, sig, 1);
break;
case TTYSIG_PG2:
if (pgrp != NULL)
pgsignal(pgrp, sig, sess != NULL);
break;
case TTYSIG_LEADER:
if (sess != NULL && sess->s_leader != NULL)
psignal(sess->s_leader, sig);
break;
default:
/* NOTREACHED */
break;
}
mutex_spin_enter(&tty_lock);
}
mutex_spin_exit(&tty_lock);
mutex_exit(&proc_lock);
}
unsigned char
tty_getctrlchar(struct tty *tp, unsigned which)
{
KASSERT(which < NCCS);
return tp->t_cc[which];
}
void
tty_setctrlchar(struct tty *tp, unsigned which, unsigned char val)
{
KASSERT(which < NCCS);
tp->t_cc[which] = val;
}
int
tty_try_xonxoff(struct tty *tp, unsigned char c)
{
const struct cdevsw *cdev;
if (tp->t_iflag & IXON) {
if (c == tp->t_cc[VSTOP] && tp->t_cc[VSTOP] != _POSIX_VDISABLE) {
if ((tp->t_state & TS_TTSTOP) == 0) {
tp->t_state |= TS_TTSTOP;
cdev = cdevsw_lookup(tp->t_dev);
if (cdev != NULL)
(*cdev->d_stop)(tp, 0);
}
return 0;
}
if (c == tp->t_cc[VSTART] && tp->t_cc[VSTART] != _POSIX_VDISABLE) {
tp->t_state &= ~TS_TTSTOP;
if (tp->t_oproc != NULL) {
mutex_spin_enter(&tty_lock); /* XXX */
(*tp->t_oproc)(tp);
mutex_spin_exit(&tty_lock); /* XXX */
}
return 0;
}
}
return EAGAIN;
}
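/*
 * Sketch of the intended caller (hypothetical driver receive path,
 * illustration only): let the tty layer consume XON/XOFF first, and
 * hand the character to the line discipline only if it declines.
 */
#if 0	/* example only, never compiled */
if (tty_try_xonxoff(tp, c) == EAGAIN)
	(*tp->t_linesw->l_rint)(c, tp);
#endif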
/* $NetBSD: pmap_private.h,v 1.5 2023/10/04 20:28:06 ad Exp $ */
/*
* Copyright (c) 1997 Charles D. Cranor and Washington University.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 2001 Wasabi Systems, Inc.
* All rights reserved.
*
* Written by Frank van der Linden for Wasabi Systems, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the NetBSD Project by
* Wasabi Systems, Inc.
* 4. The name of Wasabi Systems, Inc. may not be used to endorse
* or promote products derived from this software without specific prior
* written permission.
*
* THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _X86_PMAP_PRIVATE_H_
#define _X86_PMAP_PRIVATE_H_
#ifndef _MACHINE_PMAP_PRIVATE_H_X86
#error Include machine/pmap_private.h, not x86/pmap_private.h.
#endif
#ifdef _KERNEL_OPT
#include "opt_svs.h"
#endif
#include <sys/param.h>
#include <sys/types.h>
#include <sys/kcpuset.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <machine/cpufunc.h>
#include <machine/pte.h>
#include <machine/vmparam.h>
#include <uvm/uvm_object.h>
#include <uvm/uvm_pmap.h>
struct pmap;
#define SLAREA_USER 0
#define SLAREA_PTE 1
#define SLAREA_MAIN 2
#define SLAREA_PCPU 3
#define SLAREA_DMAP 4
#define SLAREA_HYPV 5
#define SLAREA_ASAN 6
#define SLAREA_MSAN 7
#define SLAREA_KERN 8
#define SLSPACE_NAREAS 9
struct slotspace {
struct {
size_t sslot; /* start slot */
size_t nslot; /* # of slots */
bool active; /* area is active */
} area[SLSPACE_NAREAS];
};
extern struct slotspace slotspace;
#include <x86/gdt.h>
struct pcpu_entry {
uint8_t gdt[MAXGDTSIZ];
uint8_t ldt[MAX_USERLDT_SIZE];
uint8_t idt[PAGE_SIZE];
uint8_t tss[PAGE_SIZE];
uint8_t ist0[PAGE_SIZE];
uint8_t ist1[PAGE_SIZE];
uint8_t ist2[PAGE_SIZE];
uint8_t ist3[PAGE_SIZE];
uint8_t rsp0[2 * PAGE_SIZE];
} __packed;
struct pcpu_area {
#ifdef SVS
uint8_t utls[PAGE_SIZE];
#endif
uint8_t ldt[PAGE_SIZE];
struct pcpu_entry ent[MAXCPUS];
} __packed;
extern struct pcpu_area *pcpuarea;
#define PMAP_PCID_KERN 0
#define PMAP_PCID_USER 1
/*
* pmap data structures: see pmap.c for details of locking.
*/
/*
* we maintain a list of all non-kernel pmaps
*/
LIST_HEAD(pmap_head, pmap); /* struct pmap_head: head of a pmap list */
/*
* linked list of all non-kernel pmaps
*/
extern struct pmap_head pmaps;
extern kmutex_t pmaps_lock; /* protects pmaps */
/*
* pool_cache(9) that pmaps are allocated from
*/
extern struct pool_cache pmap_cache;
/*
* the pmap structure
*
* note that the pm_obj contains the lock pointer, the reference count,
* page list, and number of PTPs within the pmap.
*
* pm_lock is the same as the lock for vm object 0. Changes to
* the other objects may only be made if that lock has been taken
* (the other object locks are only used when uvm_pagealloc is called)
*/
struct pv_page;
struct pmap {
struct uvm_object pm_obj[PTP_LEVELS-1]; /* objects for lvl >= 1 */
LIST_ENTRY(pmap) pm_list; /* list of all pmaps */
pd_entry_t *pm_pdir; /* VA of PD */
paddr_t pm_pdirpa[PDP_SIZE]; /* PA of PDs (read-only after create) */
struct vm_page *pm_ptphint[PTP_LEVELS-1];
/* pointer to a PTP in our pmap */
struct pmap_statistics pm_stats; /* pmap stats */
struct pv_entry *pm_pve; /* spare pv_entry */
LIST_HEAD(, pv_page) pm_pvp_part;
LIST_HEAD(, pv_page) pm_pvp_empty;
LIST_HEAD(, pv_page) pm_pvp_full;
#if !defined(__x86_64__)
vaddr_t pm_hiexec; /* highest executable mapping */
#endif /* !defined(__x86_64__) */
union descriptor *pm_ldt; /* user-set LDT */
size_t pm_ldt_len; /* XXX unused, remove */
int pm_ldt_sel; /* LDT selector */
kcpuset_t *pm_cpus; /* mask of CPUs using pmap */
kcpuset_t *pm_kernel_cpus; /* mask of CPUs using kernel part
of pmap */
kcpuset_t *pm_xen_ptp_cpus; /* mask of CPUs which have this pmap's
ptp mapped */
long pm_pctr; /* for assertions */
LIST_HEAD(,vm_page) pm_gc_ptp; /* PTPs queued for free */
/* Used by NVMM and Xen */
int (*pm_enter)(struct pmap *, vaddr_t, paddr_t, vm_prot_t, u_int);
bool (*pm_extract)(struct pmap *, vaddr_t, paddr_t *);
void (*pm_remove)(struct pmap *, vaddr_t, vaddr_t);
int (*pm_sync_pv)(struct vm_page *, vaddr_t, paddr_t, int, uint8_t *,
pt_entry_t *);
void (*pm_pp_remove_ent)(struct pmap *, struct vm_page *, pt_entry_t,
vaddr_t);
void (*pm_write_protect)(struct pmap *, vaddr_t, vaddr_t, vm_prot_t);
void (*pm_unwire)(struct pmap *, vaddr_t);
void (*pm_tlb_flush)(struct pmap *);
void *pm_data;
kmutex_t pm_lock /* locks for pm_objs */
__aligned(64); /* give lock own cache line */
krwlock_t pm_dummy_lock; /* ugly hack for abusing uvm_object */
};
/* macro to access pm_pdirpa slots */
#ifdef PAE
#define pmap_pdirpa(pmap, index) \
((pmap)->pm_pdirpa[l2tol3(index)] + l2tol2(index) * sizeof(pd_entry_t))
#else
#define pmap_pdirpa(pmap, index) \
((pmap)->pm_pdirpa[0] + (index) * sizeof(pd_entry_t))
#endif
/*
* global kernel variables
*/
/*
* PDPpaddr is the physical address of the kernel's PDP.
* - i386 non-PAE and amd64: PDPpaddr corresponds directly to the %cr3
* value associated to the kernel process, proc0.
* - i386 PAE: it still represents the PA of the kernel's PDP (L2). Due to
* the L3 PD, it cannot be considered as the equivalent of a %cr3 any more.
* - Xen: it corresponds to the PFN of the kernel's PDP.
*/
extern u_long PDPpaddr;
extern pd_entry_t pmap_pg_g; /* do we support PTE_G? */
extern pd_entry_t pmap_pg_nx; /* do we support PTE_NX? */
extern int pmap_largepages;
extern long nkptp[PTP_LEVELS];
#define pmap_valid_entry(E) ((E) & PTE_P) /* is PDE or PTE valid? */
void pmap_map_ptes(struct pmap *, struct pmap **, pd_entry_t **,
pd_entry_t * const **);
void pmap_unmap_ptes(struct pmap *, struct pmap *);
bool pmap_pdes_valid(vaddr_t, pd_entry_t * const *, pd_entry_t *,
int *lastlvl);
bool pmap_is_curpmap(struct pmap *);
void pmap_ept_transform(struct pmap *);
#ifndef __HAVE_DIRECT_MAP
void pmap_vpage_cpu_init(struct cpu_info *);
#endif
vaddr_t slotspace_rand(int, size_t, size_t, size_t, vaddr_t);
vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
typedef enum tlbwhy {
TLBSHOOT_REMOVE_ALL,
TLBSHOOT_KENTER,
TLBSHOOT_KREMOVE,
TLBSHOOT_FREE_PTP,
TLBSHOOT_REMOVE_PTE,
TLBSHOOT_SYNC_PV,
TLBSHOOT_WRITE_PROTECT,
TLBSHOOT_ENTER,
TLBSHOOT_NVMM,
TLBSHOOT_BUS_DMA,
TLBSHOOT_BUS_SPACE,
TLBSHOOT__MAX,
} tlbwhy_t;
void pmap_tlb_init(void);
void pmap_tlb_cpu_init(struct cpu_info *);
void pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, tlbwhy_t);
void pmap_tlb_shootnow(void);
void pmap_tlb_intr(void);
/*
* inline functions
*/
/*
* pmap_update_pg: flush one page from the TLB (or flush the whole thing
* if hardware doesn't support one-page flushing)
*/
__inline static void __unused
pmap_update_pg(vaddr_t va)
{
invlpg(va);
}
/*
* various address inlines
*
* vtopte: return a pointer to the PTE mapping a VA, works only for
* user and PT addresses
*
* kvtopte: return a pointer to the PTE mapping a kernel VA
*/
#include <lib/libkern/libkern.h>
static __inline pt_entry_t * __unused
vtopte(vaddr_t va)
{
KASSERT(va < VM_MIN_KERNEL_ADDRESS);
return (PTE_BASE + pl1_i(va));
}
static __inline pt_entry_t * __unused
kvtopte(vaddr_t va)
{
pd_entry_t *pde;
KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
pde = L2_BASE + pl2_i(va);
if (*pde & PTE_PS)
return ((pt_entry_t *)pde);
return (PTE_BASE + pl1_i(va));
}
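/*
 * A small usage sketch (illustrative only): check whether a kernel
 * virtual address is currently mapped writable.  Note that kvtopte()
 * returns the PDE itself for large (PTE_PS) pages, as above.
 */
#if 0	/* example only, never compiled */
pt_entry_t *pte = kvtopte(va);

if (pmap_valid_entry(*pte) && (*pte & PTE_W) != 0) {
	/* va is mapped writable */
}
#endif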
#ifdef XENPV
#include <sys/bitops.h>
#define XPTE_MASK L1_FRAME
/* Selects the index of a PTE in (A)PTE_BASE */
#define XPTE_SHIFT (L1_SHIFT - ilog2(sizeof(pt_entry_t)))
/* PTE access inline functions */
/*
* Get the machine address of the PTE pointed to.
* We use the hardware MMU to read the value, so this works only for
* levels 1-3.
*/
static __inline paddr_t
xpmap_ptetomach(pt_entry_t *pte)
{
pt_entry_t *up_pte;
vaddr_t va = (vaddr_t) pte;
va = ((va & XPTE_MASK) >> XPTE_SHIFT) | (vaddr_t) PTE_BASE;
up_pte = (pt_entry_t *) va;
return (paddr_t) (((*up_pte) & PTE_FRAME) + (((vaddr_t) pte) & (~PTE_FRAME & ~VA_SIGN_MASK)));
}
/* Xen helpers to change bits of a pte */
#define XPMAP_UPDATE_DIRECT 1 /* Update direct map entry flags too */
paddr_t vtomach(vaddr_t);
#define vtomfn(va) (vtomach(va) >> PAGE_SHIFT)
#endif /* XENPV */
#ifdef __HAVE_PCPU_AREA
extern struct pcpu_area *pcpuarea;
#define PDIR_SLOT_PCPU 510
#define PMAP_PCPU_BASE (VA_SIGN_NEG((PDIR_SLOT_PCPU * NBPD_L4)))
#endif
void svs_quad_copy(void *, void *, long);
#ifdef _KERNEL_OPT
#include "opt_efi.h"
#endif
#ifdef EFI_RUNTIME
void * pmap_activate_sync(struct pmap *);
void pmap_deactivate_sync(struct pmap *, void *);
bool pmap_is_user(struct pmap *);
#else
static inline bool
pmap_is_user(struct pmap *pmap)
{
KASSERT(pmap != pmap_kernel());
return true;
}
#endif
#endif /* _X86_PMAP_PRIVATE_H_ */
/* $NetBSD: bufq_fcfs.c,v 1.13 2017/05/04 11:03:27 kamil Exp $ */
/* NetBSD: subr_disk.c,v 1.61 2004/09/25 03:30:44 thorpej Exp */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: bufq_fcfs.c,v 1.13 2017/05/04 11:03:27 kamil Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/module.h>
/*
* First-come first-served sort for disks.
*
* Requests are appended to the queue without any reordering.
*/
struct bufq_fcfs {
TAILQ_HEAD(, buf) bq_head; /* actual list of buffers */
};
static void bufq_fcfs_init(struct bufq_state *);
static void bufq_fcfs_put(struct bufq_state *, struct buf *);
static struct buf *bufq_fcfs_get(struct bufq_state *, int);
BUFQ_DEFINE(fcfs, 10, bufq_fcfs_init);
static void
bufq_fcfs_put(struct bufq_state *bufq, struct buf *bp)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
TAILQ_INSERT_TAIL(&fcfs->bq_head, bp, b_actq);
}
static struct buf *
bufq_fcfs_get(struct bufq_state *bufq, int remove)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
struct buf *bp;
bp = TAILQ_FIRST(&fcfs->bq_head);
if (bp != NULL && remove)
TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);
return (bp);
}
static struct buf *
bufq_fcfs_cancel(struct bufq_state *bufq, struct buf *buf)
{
struct bufq_fcfs *fcfs = bufq_private(bufq);
struct buf *bp;
TAILQ_FOREACH(bp, &fcfs->bq_head, b_actq) {
if (bp == buf) {
TAILQ_REMOVE(&fcfs->bq_head, bp, b_actq);
return buf;
}
}
return NULL;
}
static void
bufq_fcfs_fini(struct bufq_state *bufq)
{
KASSERT(bufq->bq_private != NULL);
kmem_free(bufq->bq_private, sizeof(struct bufq_fcfs));
}
static void
bufq_fcfs_init(struct bufq_state *bufq)
{
struct bufq_fcfs *fcfs;
bufq->bq_get = bufq_fcfs_get;
bufq->bq_put = bufq_fcfs_put;
bufq->bq_cancel = bufq_fcfs_cancel;
bufq->bq_fini = bufq_fcfs_fini;
bufq->bq_private = kmem_zalloc(sizeof(struct bufq_fcfs), KM_SLEEP);
fcfs = (struct bufq_fcfs *)bufq->bq_private;
TAILQ_INIT(&fcfs->bq_head);
}
MODULE(MODULE_CLASS_BUFQ, bufq_fcfs, NULL);
static int
bufq_fcfs_modcmd(modcmd_t cmd, void *opaque)
{
switch (cmd) {
case MODULE_CMD_INIT:
return bufq_register(&bufq_strat_fcfs);
case MODULE_CMD_FINI:
return bufq_unregister(&bufq_strat_fcfs);
default:
return ENOTTY;
}
}
/* $NetBSD: subr_bufq.c,v 1.27 2019/02/17 23:17:41 bad Exp $ */
/* NetBSD: subr_disk.c,v 1.70 2005/08/20 12:00:01 yamt Exp $ */
/*-
* Copyright (c) 1996, 1997, 1999, 2000 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
* NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
* All or some portions of this file are derived from material licensed
* to the University of California by American Telephone and Telegraph
* Co. or Unix System Laboratories, Inc. and are reproduced herein with
* the permission of UNIX System Laboratories, Inc.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_bufq.c,v 1.27 2019/02/17 23:17:41 bad Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/bufq_impl.h>
#include <sys/kmem.h>
#include <sys/sysctl.h>
#include <sys/module.h>
#define STRAT_MATCH(id, bs) (strcmp((id), (bs)->bs_name) == 0)
static void sysctl_kern_bufq_strategies_setup(struct sysctllog **);
static SLIST_HEAD(, bufq_strat) bufq_strat_list =
SLIST_HEAD_INITIALIZER(bufq_strat_list);
static kmutex_t bufq_mutex;
static struct sysctllog *sysctllog;
void
bufq_init(void)
{
mutex_init(&bufq_mutex, MUTEX_DEFAULT, IPL_NONE);
sysctl_kern_bufq_strategies_setup(&sysctllog);
}
int
bufq_register(struct bufq_strat *bs)
{
mutex_enter(&bufq_mutex);
SLIST_INSERT_HEAD(&bufq_strat_list, bs, bs_next);
bs->bs_refcnt = 0;
mutex_exit(&bufq_mutex);
return 0;
}
int
bufq_unregister(struct bufq_strat *bs)
{
mutex_enter(&bufq_mutex);
if (bs->bs_refcnt != 0) {
mutex_exit(&bufq_mutex);
return EBUSY;
}
SLIST_REMOVE(&bufq_strat_list, bs, bufq_strat, bs_next);
mutex_exit(&bufq_mutex);
return 0;
}
/*
* Create a device buffer queue.
*/
int
bufq_alloc(struct bufq_state **bufqp, const char *strategy, int flags)
{
struct bufq_strat *bsp, *it;
struct bufq_state *bufq;
int error = 0;
u_int gen;
bool found_exact;
char strategy_module_name[MAXPATHLEN];
KASSERT((flags & BUFQ_EXACT) == 0 || strategy != BUFQ_STRAT_ANY);
switch (flags & BUFQ_SORT_MASK) {
case BUFQ_SORT_RAWBLOCK:
case BUFQ_SORT_CYLINDER:
break;
case 0:
/*
* for strategies which don't care about block numbers.
* e.g. fcfs
*/
flags |= BUFQ_SORT_RAWBLOCK;
break;
default:
panic("bufq_alloc: sort out of range");
}
/*
* select strategy.
* if a strategy specified by flags is found, use it.
* otherwise, select one with the largest bs_prio.
*/
mutex_enter(&bufq_mutex);
do {
gen = module_gen;
bsp = NULL;
found_exact = false;
SLIST_FOREACH(it, &bufq_strat_list, bs_next) {
if (strategy != BUFQ_STRAT_ANY &&
STRAT_MATCH(strategy, (it))) {
bsp = it;
found_exact = true;
break;
}
if (bsp == NULL || (it)->bs_prio > bsp->bs_prio)
bsp = it;
}
if (strategy == BUFQ_STRAT_ANY || found_exact)
break;
/* Try to autoload the bufq strategy module */
strlcpy(strategy_module_name, "bufq_",
sizeof(strategy_module_name));
strlcat(strategy_module_name, strategy,
sizeof(strategy_module_name));
mutex_exit(&bufq_mutex);
(void) module_autoload(strategy_module_name, MODULE_CLASS_BUFQ);
mutex_enter(&bufq_mutex);
} while (gen != module_gen);
if (bsp == NULL) {
panic("bufq_alloc: no strategy");
}
if (strategy != BUFQ_STRAT_ANY && !found_exact) {
if ((flags & BUFQ_EXACT)) {
error = ENOENT;
mutex_exit(&bufq_mutex);
goto out;
}
#if defined(DEBUG)
printf("bufq_alloc: '%s' is not available. using '%s'.\n",
strategy, bsp->bs_name);
#endif
}
#if defined(BUFQ_DEBUG)
/* XXX aprint? */
printf("bufq_alloc: using '%s'\n", bsp->bs_name);
#endif
bsp->bs_refcnt++;
mutex_exit(&bufq_mutex);
*bufqp = bufq = kmem_zalloc(sizeof(*bufq), KM_SLEEP);
bufq->bq_flags = flags;
bufq->bq_strat = bsp;
(*bsp->bs_initfn)(bufq);
out:
return error;
}
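/*
 * A minimal sketch of typical driver usage (hypothetical softc field
 * sc_bufq, illustration only): create a sorted queue at attach time,
 * feed it from the strategy routine, drain and free it on detach.
 */
#if 0	/* example only, never compiled */
/* attach: */
error = bufq_alloc(&sc->sc_bufq, "fcfs", BUFQ_SORT_RAWBLOCK);

/* strategy: */
bufq_put(sc->sc_bufq, bp);

/* start routine: */
while ((bp = bufq_get(sc->sc_bufq)) != NULL)
	/* issue the I/O described by bp */;

/* detach: */
bufq_drain(sc->sc_bufq);
bufq_free(sc->sc_bufq);
#endif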
void
bufq_put(struct bufq_state *bufq, struct buf *bp)
{
(*bufq->bq_put)(bufq, bp);
}
struct buf *
bufq_get(struct bufq_state *bufq)
{
return (*bufq->bq_get)(bufq, 1);
}
struct buf *
bufq_peek(struct bufq_state *bufq)
{
return (*bufq->bq_get)(bufq, 0);
}
struct buf *
bufq_cancel(struct bufq_state *bufq, struct buf *bp)
{
return (*bufq->bq_cancel)(bufq, bp);
}
/*
* Drain a device buffer queue.
*/
void
bufq_drain(struct bufq_state *bufq)
{
struct buf *bp;
while ((bp = bufq_get(bufq)) != NULL) {
bp->b_error = EIO;
bp->b_resid = bp->b_bcount;
biodone(bp);
}
}
/*
* Destroy a device buffer queue.
*/
void
bufq_free(struct bufq_state *bufq)
{
KASSERT(bufq_peek(bufq) == NULL);
bufq->bq_fini(bufq);
mutex_enter(&bufq_mutex);
bufq->bq_strat->bs_refcnt--;
mutex_exit(&bufq_mutex);
kmem_free(bufq, sizeof(*bufq));
}
/*
* get a strategy identifier of a buffer queue.
*/
const char *
bufq_getstrategyname(struct bufq_state *bufq)
{
return bufq->bq_strat->bs_name;
}
/*
* Move all requests from one buffer queue to another.
*/
void
bufq_move(struct bufq_state *dst, struct bufq_state *src)
{
struct buf *bp;
while ((bp = bufq_get(src)) != NULL) {
bufq_put(dst, bp);
}
}
static int
docopy(char *buf, size_t *bufoffp, size_t buflen,
const char *datap, size_t datalen)
{
int error = 0;
if (buf != NULL && datalen > 0) {
if (*bufoffp + datalen > buflen) {
goto out;
}
error = copyout(datap, buf + *bufoffp, datalen);
if (error) {
goto out;
}
}
out:
if (error == 0) {
*bufoffp += datalen;
}
return error;
}
static int
docopystr(char *buf, size_t *bufoffp, size_t buflen, const char *datap)
{
return docopy(buf, bufoffp, buflen, datap, strlen(datap));
}
static int
docopynul(char *buf, size_t *bufoffp, size_t buflen)
{
return docopy(buf, bufoffp, buflen, "", 1);
}
/*
* sysctl function that will print all bufq strategies
* currently available to the kernel.
*/
static int
sysctl_kern_bufq_strategies(SYSCTLFN_ARGS)
{
const struct bufq_strat *bq_strat;
const char *delim = "";
size_t off = 0;
size_t buflen = *oldlenp;
int error;
SLIST_FOREACH(bq_strat, &bufq_strat_list, bs_next) {
error = docopystr(oldp, &off, buflen, delim);
if (error) {
goto out;
}
error = docopystr(oldp, &off, buflen, (bq_strat)->bs_name);
if (error) {
goto out;
}
delim = " ";
}
/* In case there are no registered strategies ... */
if (off == 0) {
error = docopystr(oldp, &off, buflen, "NULL");
if (error) {
goto out;
}
}
/* NUL terminate */
error = docopynul(oldp, &off, buflen);
out:
*oldlenp = off;
return error;
}
static void
sysctl_kern_bufq_strategies_setup(struct sysctllog **clog)
{
const struct sysctlnode *node;
node = NULL;
sysctl_createv(clog, 0, NULL, &node,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "bufq",
SYSCTL_DESCR("buffer queue subtree"),
NULL, 0, NULL, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
if (node != NULL) {
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRING, "strategies",
SYSCTL_DESCR("List of bufq strategies present"),
sysctl_kern_bufq_strategies, 0, NULL, 0,
CTL_KERN, node->sysctl_num, CTL_CREATE, CTL_EOL);
}
}
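/*
 * The node created above shows up as kern.bufq.strategies; a minimal
 * userland sketch of reading it with sysctlbyname(3), error handling
 * omitted:
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	char buf[128];
 *	size_t len = sizeof(buf);
 *
 *	if (sysctlbyname("kern.bufq.strategies", buf, &len, NULL, 0) == 0)
 *		printf("bufq strategies: %s\n", buf);
 */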
/* $NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $ */
/*-
* Copyright (c) 2008, 2009, 2021 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Andrew Doran.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*-
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon@FreeBSD.org>
* Copyright (c) 2009 Apple, Inc
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* FreeBSD: src/sys/kern/kern_event.c,v 1.27 2001/07/05 17:10:44 rwatson Exp
*/
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#endif /* _KERNEL_OPT */
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_event.c,v 1.150 2023/09/21 09:31:50 msaitoh Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/wait.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/select.h>
#include <sys/queue.h>
#include <sys/event.h>
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/kmem.h>
#include <sys/stat.h>
#include <sys/filedesc.h>
#include <sys/syscallargs.h>
#include <sys/kauth.h>
#include <sys/conf.h>
#include <sys/atomic.h>
static int kqueue_scan(file_t *, size_t, struct kevent *,
const struct timespec *, register_t *,
const struct kevent_ops *, struct kevent *,
size_t);
static int kqueue_ioctl(file_t *, u_long, void *);
static int kqueue_fcntl(file_t *, u_int, void *);
static int kqueue_poll(file_t *, int);
static int kqueue_kqfilter(file_t *, struct knote *);
static int kqueue_stat(file_t *, struct stat *);
static int kqueue_close(file_t *);
static void kqueue_restart(file_t *);
static int kqueue_fpathconf(file_t *, int, register_t *);
static int kqueue_register(struct kqueue *, struct kevent *);
static void kqueue_doclose(struct kqueue *, struct klist *, int);
static void knote_detach(struct knote *, filedesc_t *fdp, bool);
static void knote_enqueue(struct knote *);
static void knote_activate(struct knote *);
static void knote_activate_locked(struct knote *);
static void knote_deactivate_locked(struct knote *);
static void filt_kqdetach(struct knote *);
static int filt_kqueue(struct knote *, long hint);
static int filt_procattach(struct knote *);
static void filt_procdetach(struct knote *);
static int filt_proc(struct knote *, long hint);
static int filt_fileattach(struct knote *);
static void filt_timerexpire(void *x);
static int filt_timerattach(struct knote *);
static void filt_timerdetach(struct knote *);
static int filt_timer(struct knote *, long hint);
static int filt_timertouch(struct knote *, struct kevent *, long type);
static int filt_userattach(struct knote *);
static void filt_userdetach(struct knote *);
static int filt_user(struct knote *, long hint);
static int filt_usertouch(struct knote *, struct kevent *, long type);
/*
* Private knote state that should never be exposed outside
* of kern_event.c
*
* Field locking:
*
* q kn_kq->kq_lock
*/
struct knote_impl {
struct knote ki_knote;
unsigned int ki_influx; /* q: in-flux counter */
kmutex_t ki_foplock; /* for kn_filterops */
};
#define KIMPL_TO_KNOTE(kip) (&(kip)->ki_knote)
#define KNOTE_TO_KIMPL(knp) container_of((knp), struct knote_impl, ki_knote)
static inline struct knote *
knote_alloc(bool sleepok)
{
struct knote_impl *ki;
ki = kmem_zalloc(sizeof(*ki), sleepok ? KM_SLEEP : KM_NOSLEEP);
mutex_init(&ki->ki_foplock, MUTEX_DEFAULT, IPL_NONE);
return KIMPL_TO_KNOTE(ki);
}
static inline void
knote_free(struct knote *kn)
{
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
mutex_destroy(&ki->ki_foplock);
kmem_free(ki, sizeof(*ki));
}
static inline void
knote_foplock_enter(struct knote *kn)
{
mutex_enter(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static inline void
knote_foplock_exit(struct knote *kn)
{
mutex_exit(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static inline bool __diagused
knote_foplock_owned(struct knote *kn)
{
return mutex_owned(&KNOTE_TO_KIMPL(kn)->ki_foplock);
}
static const struct fileops kqueueops = {
.fo_name = "kqueue",
.fo_read = (void *)enxio,
.fo_write = (void *)enxio,
.fo_ioctl = kqueue_ioctl,
.fo_fcntl = kqueue_fcntl,
.fo_poll = kqueue_poll,
.fo_stat = kqueue_stat,
.fo_close = kqueue_close,
.fo_kqfilter = kqueue_kqfilter,
.fo_restart = kqueue_restart,
.fo_fpathconf = kqueue_fpathconf,
};
static void
filt_nopdetach(struct knote *kn __unused)
{
}
static int
filt_nopevent(struct knote *kn __unused, long hint __unused)
{
return 0;
}
static const struct filterops nop_fd_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_nopdetach,
.f_event = filt_nopevent,
};
static const struct filterops nop_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_nopdetach,
.f_event = filt_nopevent,
};
static const struct filterops kqread_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_kqdetach,
.f_event = filt_kqueue,
};
static const struct filterops proc_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_procattach,
.f_detach = filt_procdetach,
.f_event = filt_proc,
};
/*
* file_filtops is not marked MPSAFE because it's going to call
* fileops::fo_kqfilter(), which might not be. That function,
* however, will override the knote's filterops, and thus will
* inherit the MPSAFE-ness of the back-end at that time.
*/
static const struct filterops file_filtops = {
.f_flags = FILTEROP_ISFD,
.f_attach = filt_fileattach,
.f_detach = NULL,
.f_event = NULL,
};
static const struct filterops timer_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_timerattach,
.f_detach = filt_timerdetach,
.f_event = filt_timer,
.f_touch = filt_timertouch,
};
static const struct filterops user_filtops = {
.f_flags = FILTEROP_MPSAFE,
.f_attach = filt_userattach,
.f_detach = filt_userdetach,
.f_event = filt_user,
.f_touch = filt_usertouch,
};
static u_int kq_ncallouts = 0;
static int kq_calloutmax = (4 * 1024);
#define KN_HASHSIZE 64 /* XXX should be tunable */
#define KN_HASH(val, mask) (((val) ^ (val >> 8)) & (mask))
extern const struct filterops fs_filtops; /* vfs_syscalls.c */
extern const struct filterops sig_filtops; /* kern_sig.c */
/*
* Table for all system-defined filters.
* These should be listed in the numeric order of the EVFILT_* defines.
* If filtops is NULL, the filter isn't implemented in NetBSD.
* End of list is when name is NULL.
*
* Note that 'refcnt' is meaningless for built-in filters.
*/
struct kfilter {
const char *name; /* name of filter */
uint32_t filter; /* id of filter */
unsigned refcnt; /* reference count */
const struct filterops *filtops;/* operations for filter */
size_t namelen; /* length of name string */
};
/* System defined filters */
static struct kfilter sys_kfilters[] = {
{ "EVFILT_READ", EVFILT_READ, 0, &file_filtops, 0 },
{ "EVFILT_WRITE", EVFILT_WRITE, 0, &file_filtops, 0, },
{ "EVFILT_AIO", EVFILT_AIO, 0, NULL, 0 },
{ "EVFILT_VNODE", EVFILT_VNODE, 0, &file_filtops, 0 },
{ "EVFILT_PROC", EVFILT_PROC, 0, &proc_filtops, 0 },
{ "EVFILT_SIGNAL", EVFILT_SIGNAL, 0, &sig_filtops, 0 },
{ "EVFILT_TIMER", EVFILT_TIMER, 0, &timer_filtops, 0 },
{ "EVFILT_FS", EVFILT_FS, 0, &fs_filtops, 0 },
{ "EVFILT_USER", EVFILT_USER, 0, &user_filtops, 0 },
{ "EVFILT_EMPTY", EVFILT_EMPTY, 0, &file_filtops, 0 },
{ NULL, 0, 0, NULL, 0 },
};
/* User defined kfilters */
static struct kfilter *user_kfilters; /* array */
static int user_kfilterc; /* current offset */
static int user_kfiltermaxc; /* max size so far */
static size_t user_kfiltersz; /* size of allocated memory */
/*
* Global Locks.
*
* Lock order:
*
* kqueue_filter_lock
* -> kn_kq->kq_fdp->fd_lock
* -> knote foplock (if taken)
* -> object lock (e.g., device driver lock, &c.)
* -> kn_kq->kq_lock
*
* Locking rules. ==> indicates the lock is acquired by the backing
* object; the locks listed before it are acquired before calling filter ops:
*
* f_attach: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
*
* f_detach: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
*
* f_event via kevent: fdp->fd_lock -> knote foplock ->
* (maybe) KERNEL_LOCK ==> backing object lock
* N.B. NOTE_SUBMIT will never be set in the "hint" argument
* in this case.
*
* f_event via knote (via backing object): Whatever caller guarantees.
* Typically:
* f_event(NOTE_SUBMIT): caller has already acquired backing
* object lock.
* f_event(!NOTE_SUBMIT): caller has not acquired the backing object
* lock, or has possibly acquired KERNEL_LOCK. Backing object
* lock may or may not be acquired as-needed.
* N.B. the knote foplock will **not** be acquired in this case. The
* caller guarantees that klist_fini() will not be called concurrently
* with knote().
*
* f_touch: fdp->fd_lock -> kn_kq->kq_lock (spin lock)
* N.B. knote foplock is **not** acquired in this case and
* the caller must guarantee that klist_fini() will never
* be called. kevent_register() restricts filters that
* provide f_touch to known-safe cases.
*
* klist_fini(): Caller must guarantee that no more knotes can
* be attached to the klist, and must **not** hold the backing
* object's lock; klist_fini() itself will acquire the foplock
* of each knote on the klist.
*
* Locking rules when detaching knotes:
*
* There are some situations where knote submission may require dropping
* locks (see knote_proc_fork()). In order to support this, it's possible
* to mark a knote as being 'in-flux'. Such a knote is guaranteed not to
* be detached while it remains in-flux. Because it will not be detached,
* locks can be dropped so e.g. memory can be allocated, locks on other
* data structures can be acquired, etc. During this time, any attempt to
* detach an in-flux knote must wait until the knote is no longer in-flux.
* When this happens, the knote is marked for death (KN_WILLDETACH) and the
* LWP who gets to finish the detach operation is recorded in the knote's
* 'udata' field (which is no longer required for its original purpose once
* a knote is so marked). Code paths that lead to knote_detach() must ensure
* that their LWP is the one tasked with its final demise after waiting for
* the in-flux status of the knote to clear. Note that once a knote is
* marked KN_WILLDETACH, no code paths may put it into an in-flux state.
*
* Once the special circumstances have been handled, the locks are re-
* acquired in the proper order (object lock -> kq_lock), the knote taken
* out of flux, and any waiters are notified. Because waiters must have
* also dropped *their* locks in order to safely block, they must re-
* validate all of their assumptions; see knote_detach_quiesce(). See also
* the kqueue_register() (EV_ADD, EV_DELETE) and kqueue_scan() (EV_ONESHOT)
* cases.
*
* When kqueue_scan() encounters an in-flux knote, the situation is
* treated like another LWP's list marker.
*
* LISTEN WELL: It is important to not hold knotes in flux for an
* extended period of time! In-flux knotes effectively block any
* progress of the kqueue_scan() operation. Any code paths that place
* knotes in-flux should be careful to not block for indefinite periods
* of time, such as for memory allocation (i.e. KM_NOSLEEP is OK, but
* KM_SLEEP is not).
*/
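/*
 * A condensed sketch of the in-flux protocol described above; see
 * knote_proc_fork_track() below for a real user. "do_blocking_work()"
 * stands in for whatever requires dropping the locks:
 *
 *	mutex_spin_enter(&kq->kq_lock);
 *	if (!kn_enter_flux(kn))
 *		return;			// knote is being detached; skip it
 *	mutex_spin_exit(&kq->kq_lock);
 *
 *	do_blocking_work();		// allocate with KM_NOSLEEP only, etc.
 *
 *	mutex_spin_enter(&kq->kq_lock);
 *	if (kn_leave_flux(kn))
 *		KQ_FLUX_WAKEUP(kq);	// wake anyone in kn_wait_flux()
 *	mutex_spin_exit(&kq->kq_lock);
 */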
static krwlock_t kqueue_filter_lock; /* lock on filter lists */
#define KQ_FLUX_WAIT(kq) (void)cv_wait(&kq->kq_cv, &kq->kq_lock)
#define KQ_FLUX_WAKEUP(kq) cv_broadcast(&kq->kq_cv)
static inline bool
kn_in_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
return KNOTE_TO_KIMPL(kn)->ki_influx != 0;
}
static inline bool
kn_enter_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
if (kn->kn_status & KN_WILLDETACH) {
return false;
}
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
KASSERT(ki->ki_influx < UINT_MAX);
ki->ki_influx++;
return true;
}
static inline bool
kn_leave_flux(struct knote *kn)
{
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
KASSERT(ki->ki_influx > 0);
ki->ki_influx--;
return ki->ki_influx == 0;
}
static void
kn_wait_flux(struct knote *kn, bool can_loop)
{
struct knote_impl *ki = KNOTE_TO_KIMPL(kn);
bool loop;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
/*
* It may not be safe for us to touch the knote again after
* dropping the kq_lock. The caller has let us know in
* 'can_loop'.
*/
for (loop = true; loop && ki->ki_influx != 0; loop = can_loop) {
KQ_FLUX_WAIT(kn->kn_kq);
}
}
#define KNOTE_WILLDETACH(kn) \
do { \
(kn)->kn_status |= KN_WILLDETACH; \
(kn)->kn_kevent.udata = curlwp; \
} while (/*CONSTCOND*/0)
/*
* Wait until the specified knote is in a quiescent state and
* safe to detach. Returns true if we potentially blocked (and
* thus dropped our locks).
*/
static bool
knote_detach_quiesce(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
filedesc_t *fdp = kq->kq_fdp;
KASSERT(mutex_owned(&fdp->fd_lock));
mutex_spin_enter(&kq->kq_lock);
/*
* There are two cases where we might see KN_WILLDETACH here:
*
* 1. Someone else has already started detaching the knote but
* had to wait for it to settle first.
*
* 2. We had to wait for it to settle, and had to come back
* around after re-acquiring the locks.
*
* When KN_WILLDETACH is set, we also set the LWP that claimed
* the prize of finishing the detach in the 'udata' field of the
* knote (which will never be used again for its usual purpose
* once the note is in this state). If it doesn't point to us,
* we must drop the locks and let them in to finish the job.
*
* Otherwise, once we have claimed the knote for ourselves, we
* can finish waiting for it to settle. This is the only scenario
* where touching a detaching knote is safe after dropping the
* locks.
*/
if ((kn->kn_status & KN_WILLDETACH) != 0 &&
kn->kn_kevent.udata != curlwp) {
/*
* N.B. it is NOT safe for us to touch the knote again
* after dropping the locks here. The caller must go
* back around and re-validate everything. However, if
* the knote is in-flux, we want to block to minimize
* busy-looping.
*/
mutex_exit(&fdp->fd_lock);
if (kn_in_flux(kn)) {
kn_wait_flux(kn, false);
mutex_spin_exit(&kq->kq_lock);
return true;
}
mutex_spin_exit(&kq->kq_lock);
preempt_point();
return true;
}
/*
* If we get here, we know that we will be claiming the
* detach responsibilities, or that we already have and
* this is the second attempt after re-validation.
*/
KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
kn->kn_kevent.udata == curlwp);
/*
* Similarly, if we get here, either we are just claiming it
* and may have to wait for it to settle, or if this is the
* second attempt after re-validation that no other code paths
* have put it in-flux.
*/
KASSERT((kn->kn_status & KN_WILLDETACH) == 0 ||
kn_in_flux(kn) == false);
KNOTE_WILLDETACH(kn);
if (kn_in_flux(kn)) {
mutex_exit(&fdp->fd_lock);
kn_wait_flux(kn, true);
/*
* It is safe for us to touch the knote again after
* dropping the locks, but the caller must still
* re-validate everything because other aspects of
* the environment may have changed while we blocked.
*/
KASSERT(kn_in_flux(kn) == false);
mutex_spin_exit(&kq->kq_lock);
return true;
}
mutex_spin_exit(&kq->kq_lock);
return false;
}
/*
* Calls into the filterops need to be resilient against things which
* destroy a klist, e.g. device detach, freeing a vnode, etc., to avoid
* chasing garbage pointers (to data, or even potentially code in a
* module about to be unloaded). To that end, we acquire the
* knote foplock before calling into the filter ops. When a driver
* (or anything else) is tearing down its klist, klist_fini() enumerates
* each knote, acquires its foplock, and replaces the filterops with a
* nop stub, allowing knote detach (when descriptors are closed) to safely
* proceed.
*/
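/*
 * A hedged sketch of the klist side of this contract for a backing
 * object (e.g. a driver softc, here hypothetical) that owns its own
 * struct klist:
 *
 *	// filter attach/detach, under the object's own lock
 *	klist_insert(&sc->sc_klist, kn);
 *	klist_remove(&sc->sc_klist, kn);
 *
 *	// object teardown: no new knotes can arrive and the object lock
 *	// is not held; surviving knotes get nop filterops so that a later
 *	// knote detach is safe
 *	klist_fini(&sc->sc_klist);
 */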
static int
filter_attach(struct knote *kn)
{
int rv;
KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_attach != NULL);
/*
* N.B. that kn->kn_fop may change as the result of calling
* f_attach(). After f_attach() returns, kn->kn_fop may not
* be modified by code outside of klist_fini().
*/
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
rv = kn->kn_fop->f_attach(kn);
} else {
KERNEL_LOCK(1, NULL);
rv = kn->kn_fop->f_attach(kn);
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
static void
filter_detach(struct knote *kn)
{
KASSERT(knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_detach != NULL);
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
kn->kn_fop->f_detach(kn);
} else {
KERNEL_LOCK(1, NULL);
kn->kn_fop->f_detach(kn);
KERNEL_UNLOCK_ONE(NULL);
}
}
static int
filter_event(struct knote *kn, long hint, bool submitting)
{
int rv;
/* See knote(). */
KASSERT(submitting || knote_foplock_owned(kn));
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_event != NULL);
if (kn->kn_fop->f_flags & FILTEROP_MPSAFE) {
rv = kn->kn_fop->f_event(kn, hint);
} else {
KERNEL_LOCK(1, NULL);
rv = kn->kn_fop->f_event(kn, hint);
KERNEL_UNLOCK_ONE(NULL);
}
return rv;
}
static int
filter_touch(struct knote *kn, struct kevent *kev, long type)
{
/*
* XXX We cannot assert that the knote foplock is held here
* XXX because we cannot safely acquire it in all cases
* XXX where "touch" will be used in kqueue_scan(). We just
* XXX have to assume that f_touch will always be safe to call,
* XXX and kqueue_register() allows only the two known-safe
* XXX users of that op.
*/
KASSERT(kn->kn_fop != NULL);
KASSERT(kn->kn_fop->f_touch != NULL);
return kn->kn_fop->f_touch(kn, kev, type);
}
static kauth_listener_t kqueue_listener;
static int
kqueue_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
void *arg0, void *arg1, void *arg2, void *arg3)
{
struct proc *p;
int result;
result = KAUTH_RESULT_DEFER;
p = arg0;
if (action != KAUTH_PROCESS_KEVENT_FILTER)
return result;
if ((kauth_cred_getuid(p->p_cred) != kauth_cred_getuid(cred) ||
    ISSET(p->p_flag, PK_SUGID)))
return result;
result = KAUTH_RESULT_ALLOW;
return result;
}
/*
* Initialize the kqueue subsystem.
*/
void
kqueue_init(void)
{
rw_init(&kqueue_filter_lock);
kqueue_listener = kauth_listen_scope(KAUTH_SCOPE_PROCESS,
kqueue_listener_cb, NULL);
}
/*
* Find kfilter entry by name, or NULL if not found.
*/
static struct kfilter *
kfilter_byname_sys(const char *name)
{
int i;
KASSERT(rw_lock_held(&kqueue_filter_lock));
for (i = 0; sys_kfilters[i].name != NULL; i++) {
if (strcmp(name, sys_kfilters[i].name) == 0)
return &sys_kfilters[i];
}
return NULL;
}
static struct kfilter *
kfilter_byname_user(const char *name)
{
int i;
KASSERT(rw_lock_held(&kqueue_filter_lock));
/* user filter slots have a NULL name if previously deregistered */
for (i = 0; i < user_kfilterc ; i++) {
if (user_kfilters[i].name != NULL &&
strcmp(name, user_kfilters[i].name) == 0)
return &user_kfilters[i];
}
return NULL;
}
static struct kfilter *
kfilter_byname(const char *name)
{
struct kfilter *kfilter;
KASSERT(rw_lock_held(&kqueue_filter_lock));
if ((kfilter = kfilter_byname_sys(name)) != NULL)
return kfilter;
return kfilter_byname_user(name);
}
/*
* Find kfilter entry by filter id, or NULL if not found.
* Assumes entries are indexed in filter id order, for speed.
*/
static struct kfilter *
kfilter_byfilter(uint32_t filter)
{
struct kfilter *kfilter;
KASSERT(rw_lock_held(&kqueue_filter_lock));
if (filter < EVFILT_SYSCOUNT) /* it's a system filter */
kfilter = &sys_kfilters[filter];
else if (user_kfilters != NULL &&
filter < EVFILT_SYSCOUNT + user_kfilterc)
/* it's a user filter */
kfilter = &user_kfilters[filter - EVFILT_SYSCOUNT];
else
return (NULL); /* out of range */
KASSERT(kfilter->filter == filter); /* sanity check! */
return (kfilter);
}
/*
* Register a new kfilter. Stores the entry in user_kfilters.
* Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
* If retfilter != NULL, the new filterid is returned in it.
*/
int
kfilter_register(const char *name, const struct filterops *filtops,
int *retfilter)
{
struct kfilter *kfilter;
size_t len;
int i;
if (name == NULL || name[0] == '\0' || filtops == NULL)
return (EINVAL); /* invalid args */
rw_enter(&kqueue_filter_lock, RW_WRITER);
if (kfilter_byname(name) != NULL) {
rw_exit(&kqueue_filter_lock);
return (EEXIST); /* already exists */
}
if (user_kfilterc > 0xffffffff - EVFILT_SYSCOUNT) {
rw_exit(&kqueue_filter_lock);
return (EINVAL); /* too many */
}
for (i = 0; i < user_kfilterc; i++) {
kfilter = &user_kfilters[i];
if (kfilter->name == NULL) {
/* Previously deregistered slot. Reuse. */
goto reuse;
}
}
/* check if need to grow user_kfilters */
if (user_kfilterc + 1 > user_kfiltermaxc) {
/* Grow in KFILTER_EXTENT chunks. */
user_kfiltermaxc += KFILTER_EXTENT;
len = user_kfiltermaxc * sizeof(*kfilter);
kfilter = kmem_alloc(len, KM_SLEEP);
memset((char *)kfilter + user_kfiltersz, 0, len - user_kfiltersz);
if (user_kfilters != NULL) {
memcpy(kfilter, user_kfilters, user_kfiltersz);
kmem_free(user_kfilters, user_kfiltersz);
}
user_kfiltersz = len;
user_kfilters = kfilter;
}
/* Adding new slot */
kfilter = &user_kfilters[user_kfilterc++];
reuse:
kfilter->name = kmem_strdupsize(name, &kfilter->namelen, KM_SLEEP);
kfilter->filter = (kfilter - user_kfilters) + EVFILT_SYSCOUNT;
kfilter->filtops = kmem_alloc(sizeof(*filtops), KM_SLEEP);
memcpy(__UNCONST(kfilter->filtops), filtops, sizeof(*filtops));
if (retfilter != NULL)
*retfilter = kfilter->filter;
rw_exit(&kqueue_filter_lock);
return (0);
}
/*
* Unregister a kfilter previously registered with kfilter_register.
* This retains the filter id, but clears the name and frees filtops (filter
* operations), so that the number isn't reused during a boot.
* Returns 0 if operation succeeded, or an appropriate errno(2) otherwise.
*/
int
kfilter_unregister(const char *name)
{
struct kfilter *kfilter;
if (name == NULL || name[0] == '\0')
return (EINVAL); /* invalid name */
rw_enter(&kqueue_filter_lock, RW_WRITER);
if (kfilter_byname_sys(name) != NULL) {
rw_exit(&kqueue_filter_lock);
return (EINVAL); /* can't detach system filters */
}
kfilter = kfilter_byname_user(name);
if (kfilter == NULL) {
rw_exit(&kqueue_filter_lock);
return (ENOENT);
}
if (kfilter->refcnt != 0) {
rw_exit(&kqueue_filter_lock);
return (EBUSY);
}
/* Cast away const (but we know it's safe). */
kmem_free(__UNCONST(kfilter->name), kfilter->namelen);
kfilter->name = NULL; /* mark as `not implemented' */
if (kfilter->filtops != NULL) {
/* Cast away const (but we know it's safe). */
kmem_free(__UNCONST(kfilter->filtops),
sizeof(*kfilter->filtops));
kfilter->filtops = NULL; /* mark as `not implemented' */
}
rw_exit(&kqueue_filter_lock);
return (0);
}
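/*
 * A hedged sketch of how a kernel module might use the two functions
 * above; the "example_*" names and the filter name are hypothetical:
 *
 *	static const struct filterops example_filtops = {
 *		.f_flags = FILTEROP_MPSAFE,
 *		.f_attach = example_filt_attach,
 *		.f_detach = example_filt_detach,
 *		.f_event = example_filt_event,
 *	};
 *	static int example_filter;
 *
 *	// MODULE_CMD_INIT
 *	error = kfilter_register("EVFILT_EXAMPLE", &example_filtops,
 *	    &example_filter);
 *
 *	// MODULE_CMD_FINI; fails with EBUSY while knotes still reference it
 *	error = kfilter_unregister("EVFILT_EXAMPLE");
 */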
/*
* Filter attach method for EVFILT_READ and EVFILT_WRITE on normal file
* descriptors. Calls fileops kqfilter method for given file descriptor.
*/
static int
filt_fileattach(struct knote *kn)
{
file_t *fp;
fp = kn->kn_obj;
return (*fp->f_ops->fo_kqfilter)(fp, kn);
}
/*
* Filter detach method for EVFILT_READ on kqueue descriptor.
*/
static void
filt_kqdetach(struct knote *kn)
{
struct kqueue *kq;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
mutex_spin_enter(&kq->kq_lock);
selremove_knote(&kq->kq_sel, kn);
mutex_spin_exit(&kq->kq_lock);
}
/*
* Filter event method for EVFILT_READ on kqueue descriptor.
*/
/*ARGSUSED*/
static int
filt_kqueue(struct knote *kn, long hint)
{
struct kqueue *kq;
int rv;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
if (hint != NOTE_SUBMIT)
mutex_spin_enter(&kq->kq_lock);
kn->kn_data = KQ_COUNT(kq);
rv = (kn->kn_data > 0);
if (hint != NOTE_SUBMIT)
mutex_spin_exit(&kq->kq_lock);
return rv;
}
/*
* Filter attach method for EVFILT_PROC.
*/
static int
filt_procattach(struct knote *kn)
{
struct proc *p;
mutex_enter(&proc_lock);
p = proc_find(kn->kn_id);
if (p == NULL) {
mutex_exit(&proc_lock);
return ESRCH;
}
/*
* Fail if it's not owned by you, or the last exec gave us
* setuid/setgid privs (unless you're root).
*/
mutex_enter(p->p_lock);
mutex_exit(&proc_lock);
if (kauth_authorize_process(curlwp->l_cred,
KAUTH_PROCESS_KEVENT_FILTER, p, NULL, NULL, NULL) != 0) {
mutex_exit(p->p_lock);
return EACCES;
}
kn->kn_obj = p;
kn->kn_flags |= EV_CLEAR; /* automatically set */
/*
* NOTE_CHILD is only ever generated internally; don't let it
* leak in from user-space. See knote_proc_fork_track().
*/
kn->kn_sfflags &= ~NOTE_CHILD;
klist_insert(&p->p_klist, kn);
mutex_exit(p->p_lock);
return 0;
}
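/*
 * Userland view of the attach path above; a minimal sketch, assuming
 * "pid" names a process owned by the caller and "kq" came from
 * kqueue(2), with error handling omitted:
 *
 *	struct kevent ev;
 *
 *	EV_SET(&ev, pid, EVFILT_PROC, EV_ADD,
 *	    NOTE_EXIT | NOTE_FORK | NOTE_EXEC, 0, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);	// register only
 *
 *	kevent(kq, NULL, 0, &ev, 1, NULL);	// wait; ev.fflags says why
 */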
/*
* Filter detach method for EVFILT_PROC.
*
* The knote may be attached to a different process, which may exit,
* leaving nothing for the knote to be attached to. So when the process
* exits, the knote is marked as DETACHED and also flagged as ONESHOT so
* it will be deleted when read out. However, as part of the knote deletion,
* this routine is called, so a check is needed to avoid actually performing
* a detach, because the original process might not exist any more.
*/
static void
filt_procdetach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
struct proc *p;
/*
* We have to synchronize with knote_proc_exit(), but we
* are forced to acquire the locks in the wrong order here
* because we can't be sure kn->kn_obj is valid unless
* KN_DETACHED is not set.
*/
again:
mutex_spin_enter(&kq->kq_lock);
if ((kn->kn_status & KN_DETACHED) == 0) {
p = kn->kn_obj;
if (!mutex_tryenter(p->p_lock)) {
mutex_spin_exit(&kq->kq_lock);
preempt_point();
goto again;
}
kn->kn_status |= KN_DETACHED;
klist_remove(&p->p_klist, kn);
mutex_exit(p->p_lock);
}
mutex_spin_exit(&kq->kq_lock);
}
/*
* Filter event method for EVFILT_PROC.
*
* Due to some of the complexities of process locking, we have special
* entry points for delivering knote submissions. filt_proc() is used
* only to check for activation from kqueue_register() and kqueue_scan().
*/
static int
filt_proc(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
uint32_t fflags;
/*
* Because we share the same klist with signal knotes, just
* ensure that we're not being invoked for the proc-related
* submissions.
*/
KASSERT((hint & (NOTE_EXEC | NOTE_EXIT | NOTE_FORK)) == 0);
mutex_spin_enter(&kq->kq_lock);
fflags = kn->kn_fflags;
mutex_spin_exit(&kq->kq_lock);
return fflags != 0;
}
void
knote_proc_exec(struct proc *p)
{
struct knote *kn, *tmpkn;
struct kqueue *kq;
uint32_t fflags;
mutex_enter(p->p_lock);
SLIST_FOREACH_SAFE(kn, &p->p_klist, kn_selnext, tmpkn) {
/* N.B. EVFILT_SIGNAL knotes are on this same list. */
if (kn->kn_fop == &sig_filtops) {
continue;
}
KASSERT(kn->kn_fop == &proc_filtops);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
fflags = (kn->kn_fflags |= (kn->kn_sfflags & NOTE_EXEC));
if (fflags) {
knote_activate_locked(kn);
}
mutex_spin_exit(&kq->kq_lock);
}
mutex_exit(p->p_lock);
}
static int __noinline
knote_proc_fork_track(struct proc *p1, struct proc *p2, struct knote *okn)
{
struct kqueue *kq = okn->kn_kq;
KASSERT(mutex_owned(&kq->kq_lock));
KASSERT(mutex_owned(p1->p_lock));
/*
* We're going to put this knote into flux while we drop
* the locks and create and attach a new knote to track the
* child. If we are not able to enter flux, then this knote
* is about to go away, so skip the notification.
*/
if (!kn_enter_flux(okn)) {
return 0;
}
mutex_spin_exit(&kq->kq_lock);
mutex_exit(p1->p_lock);
/*
* We actually have to register *two* new knotes:
*
* ==> One for the NOTE_CHILD notification. This is a forced
* ONESHOT note.
*
* ==> One to actually track the child process as it subsequently
* forks, execs, and, ultimately, exits.
*
* If we only register a single knote, then it's possible
* for the NOTE_CHILD and NOTE_EXIT to be collapsed into a single
* notification if the child exits before the tracking process
* has received the NOTE_CHILD notification, which applications
* aren't expecting (the event's 'data' field would be clobbered,
* for example).
*
* To do this, what we have here is an **extremely** stripped-down
* version of kqueue_register() that has the following properties:
*
* ==> Does not block to allocate memory. If we are unable
* to allocate memory, we return ENOMEM.
*
* ==> Does not search for existing knotes; we know there
* are not any because this is a new process that isn't
* even visible to other processes yet.
*
* ==> Assumes that the knhash for our kq's descriptor table
* already exists (after all, we're already tracking
* processes with knotes if we got here).
*
* ==> Directly attaches the new tracking knote to the child
* process.
*
* The whole point is to do the minimum amount of work while the
* knote is held in-flux, and to avoid doing extra work in general
* (we already have the new child process; why bother looking it
* up again?).
*/
filedesc_t *fdp = kq->kq_fdp;
struct knote *knchild, *kntrack;
int error = 0;
knchild = knote_alloc(false);
kntrack = knote_alloc(false);
if (__predict_false(knchild == NULL || kntrack == NULL)) {
error = ENOMEM;
goto out;
}
kntrack->kn_obj = p2;
kntrack->kn_id = p2->p_pid;
kntrack->kn_kq = kq;
kntrack->kn_fop = okn->kn_fop;
kntrack->kn_kfilter = okn->kn_kfilter;
kntrack->kn_sfflags = okn->kn_sfflags;
kntrack->kn_sdata = p1->p_pid;
kntrack->kn_kevent.ident = p2->p_pid;
kntrack->kn_kevent.filter = okn->kn_filter;
kntrack->kn_kevent.flags =
okn->kn_flags | EV_ADD | EV_ENABLE | EV_CLEAR;
kntrack->kn_kevent.fflags = 0;
kntrack->kn_kevent.data = 0;
kntrack->kn_kevent.udata = okn->kn_kevent.udata; /* preserve udata */
/*
* The child note does not need to be attached to the
* new proc's klist at all.
*/
*knchild = *kntrack;
knchild->kn_status = KN_DETACHED;
knchild->kn_sfflags = 0;
knchild->kn_kevent.flags |= EV_ONESHOT;
knchild->kn_kevent.fflags = NOTE_CHILD;
knchild->kn_kevent.data = p1->p_pid; /* parent */
mutex_enter(&fdp->fd_lock);
/*
* We need to check to see if the kq is closing, and skip
* attaching the knote if so. Normally, this isn't necessary
* when coming in the front door because the file descriptor
* layer will synchronize this.
*
* It's safe to test KQ_CLOSING without taking the kq_lock
* here because that flag is only ever set when the fd_lock
* is also held.
*/
if (__predict_false(kq->kq_count & KQ_CLOSING)) {
mutex_exit(&fdp->fd_lock);
goto out;
}
/*
* We do the "insert into FD table" and "attach to klist" steps
* in the opposite order of kqueue_register() here to avoid
* having to take p2->p_lock twice. But this is OK because we
* hold fd_lock across the entire operation.
*/
mutex_enter(p2->p_lock);
error = kauth_authorize_process(curlwp->l_cred,
KAUTH_PROCESS_KEVENT_FILTER, p2, NULL, NULL, NULL);
if (__predict_false(error != 0)) {
mutex_exit(p2->p_lock);
mutex_exit(&fdp->fd_lock);
error = EACCES;
goto out;
}
klist_insert(&p2->p_klist, kntrack);
mutex_exit(p2->p_lock);
KASSERT(fdp->fd_knhashmask != 0);
KASSERT(fdp->fd_knhash != NULL);
struct klist *list = &fdp->fd_knhash[KN_HASH(kntrack->kn_id,
fdp->fd_knhashmask)];
SLIST_INSERT_HEAD(list, kntrack, kn_link);
SLIST_INSERT_HEAD(list, knchild, kn_link);
/* This adds references for knchild *and* kntrack. */
atomic_add_int(&kntrack->kn_kfilter->refcnt, 2);
knote_activate(knchild);
kntrack = NULL;
knchild = NULL;
mutex_exit(&fdp->fd_lock);
out:
if (__predict_false(knchild != NULL)) {
knote_free(knchild);
}
if (__predict_false(kntrack != NULL)) {
knote_free(kntrack);
}
mutex_enter(p1->p_lock);
mutex_spin_enter(&kq->kq_lock);
if (kn_leave_flux(okn)) {
KQ_FLUX_WAKEUP(kq);
}
return error;
}
void
knote_proc_fork(struct proc *p1, struct proc *p2)
{
struct knote *kn;
struct kqueue *kq;
uint32_t fflags;
mutex_enter(p1->p_lock);
/*
* N.B. We DO NOT use SLIST_FOREACH_SAFE() here because we
* don't want to pre-fetch the next knote; in the event we
* have to drop p_lock, we will have put the knote in-flux,
* meaning that no one will be able to detach it until we
* have taken the knote out of flux. However, that does
* NOT stop someone else from detaching the next note in the
* list while we have it unlocked. Thus, we want to fetch
* the next note in the list only after we have re-acquired
* the lock, and using SLIST_FOREACH() will satisfy that.
*/
SLIST_FOREACH(kn, &p1->p_klist, kn_selnext) {
/* N.B. EVFILT_SIGNAL knotes are on this same list. */
if (kn->kn_fop == &sig_filtops) {
continue;
}
KASSERT(kn->kn_fop == &proc_filtops);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_fflags |= (kn->kn_sfflags & NOTE_FORK);
if (__predict_false(kn->kn_sfflags & NOTE_TRACK)) {
/*
* This will drop kq_lock and p_lock and
* re-acquire them before it returns.
*/
if (knote_proc_fork_track(p1, p2, kn)) {
kn->kn_fflags |= NOTE_TRACKERR;
}
KASSERT(mutex_owned(p1->p_lock));
KASSERT(mutex_owned(&kq->kq_lock));
}
fflags = kn->kn_fflags;
if (fflags) {
knote_activate_locked(kn);
}
mutex_spin_exit(&kq->kq_lock);
}
mutex_exit(p1->p_lock);
}
void
knote_proc_exit(struct proc *p)
{
struct knote *kn;
struct kqueue *kq;
KASSERT(mutex_owned(p->p_lock));
while (!SLIST_EMPTY(&p->p_klist)) {
kn = SLIST_FIRST(&p->p_klist);
kq = kn->kn_kq;
KASSERT(kn->kn_obj == p);
mutex_spin_enter(&kq->kq_lock);
kn->kn_data = P_WAITSTATUS(p);
/*
* Mark as ONESHOT, so that the knote is g/c'ed
* when read.
*/
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_fflags |= kn->kn_sfflags & NOTE_EXIT;
/*
* Detach the knote from the process and mark it as such.
* N.B. EVFILT_SIGNAL are also on p_klist, but by the
* time we get here, all open file descriptors for this
* process have been released, meaning that signal knotes
* will have already been detached.
*
* We need to synchronize this with filt_procdetach().
*/
KASSERT(kn->kn_fop == &proc_filtops);
if ((kn->kn_status & KN_DETACHED) == 0) {
kn->kn_status |= KN_DETACHED;
SLIST_REMOVE_HEAD(&p->p_klist, kn_selnext);
}
/*
* Always activate the knote for NOTE_EXIT regardless
* of whether or not the listener cares about it.
* This matches historical behavior.
*/
knote_activate_locked(kn);
mutex_spin_exit(&kq->kq_lock);
}
}
#define FILT_TIMER_NOSCHED ((uintptr_t)-1)
static int
filt_timercompute(struct kevent *kev, uintptr_t *tticksp)
{
struct timespec ts;
uintptr_t tticks;
if (kev->fflags & ~(NOTE_TIMER_UNITMASK | NOTE_ABSTIME)) {
return EINVAL;
}
/*
* Convert the event 'data' to a timespec, then convert the
* timespec to callout ticks.
*/
switch (kev->fflags & NOTE_TIMER_UNITMASK) {
case NOTE_SECONDS:
ts.tv_sec = kev->data;
ts.tv_nsec = 0;
break;
case NOTE_MSECONDS: /* == historical value 0 */
ts.tv_sec = kev->data / 1000;
ts.tv_nsec = (kev->data % 1000) * 1000000;
break;
case NOTE_USECONDS:
ts.tv_sec = kev->data / 1000000;
ts.tv_nsec = (kev->data % 1000000) * 1000;
break;
case NOTE_NSECONDS:
ts.tv_sec = kev->data / 1000000000;
ts.tv_nsec = kev->data % 1000000000;
break;
default:
return EINVAL;
}
if (kev->fflags & NOTE_ABSTIME) {
struct timespec deadline = ts;
/*
* Get current time.
*
* XXX This is CLOCK_REALTIME. There is no way to
* XXX specify CLOCK_MONOTONIC.
*/
nanotime(&ts);
/* Absolute timers do not repeat. */
kev->data = FILT_TIMER_NOSCHED;
/* If we're past the deadline, then the event will fire. */
if (timespeccmp(&deadline, &ts, <=)) {
tticks = FILT_TIMER_NOSCHED;
goto out;
}
/* Calculate how much time is left. */
timespecsub(&deadline, &ts, &ts);
} else {
/* EV_CLEAR automatically set for relative timers. */
kev->flags |= EV_CLEAR;
}
tticks = tstohz(&ts);
/* if the supplied value is under our resolution, use 1 tick */
if (tticks == 0) {
if (kev->data == 0)
return EINVAL;
tticks = 1;
} else if (tticks > INT_MAX) {
return EINVAL;
}
if ((kev->flags & EV_ONESHOT) != 0) {
/* Timer does not repeat. */
kev->data = FILT_TIMER_NOSCHED;
} else {
KASSERT((uintptr_t)tticks != FILT_TIMER_NOSCHED);
kev->data = tticks;
}
out:
*tticksp = tticks;
return 0;
}
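/*
 * The unit and NOTE_ABSTIME handling above corresponds to userland
 * usage along these lines; a minimal sketch where "kq" and
 * "deadline_sec" are assumed, error handling omitted:
 *
 *	struct kevent ev[2];
 *
 *	// periodic timer, fires every 500ms (NOTE_MSECONDS is the
 *	// historical default of 0; EV_CLEAR is implied for relative timers)
 *	EV_SET(&ev[0], 1, EVFILT_TIMER, EV_ADD, NOTE_MSECONDS, 500, 0);
 *
 *	// one-shot absolute deadline, in seconds since the epoch
 *	EV_SET(&ev[1], 2, EVFILT_TIMER, EV_ADD | EV_ONESHOT,
 *	    NOTE_SECONDS | NOTE_ABSTIME, deadline_sec, 0);
 *
 *	kevent(kq, ev, 2, NULL, 0, NULL);
 */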
static void
filt_timerexpire(void *knx)
{
struct knote *kn = knx;
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_data++;
knote_activate_locked(kn);
if (kn->kn_sdata != FILT_TIMER_NOSCHED) {
KASSERT(kn->kn_sdata > 0);
KASSERT(kn->kn_sdata <= INT_MAX);
callout_schedule((callout_t *)kn->kn_hook,
(int)kn->kn_sdata);
}
mutex_spin_exit(&kq->kq_lock);
}
static inline void
filt_timerstart(struct knote *kn, uintptr_t tticks)
{
callout_t *calloutp = kn->kn_hook;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
KASSERT(!callout_pending(calloutp));
if (__predict_false(tticks == FILT_TIMER_NOSCHED)) {
kn->kn_data = 1;
} else {
KASSERT(tticks <= INT_MAX);
callout_reset(calloutp, (int)tticks, filt_timerexpire, kn);
}
}
static int
filt_timerattach(struct knote *kn)
{
callout_t *calloutp;
struct kqueue *kq;
uintptr_t tticks;
int error;
struct kevent kev = {
.flags = kn->kn_flags,
.fflags = kn->kn_sfflags,
.data = kn->kn_sdata,
};
error = filt_timercompute(&kev, &tticks);
if (error) {
return error;
}
if (atomic_inc_uint_nv(&kq_ncallouts) >= kq_calloutmax ||
(calloutp = kmem_alloc(sizeof(*calloutp), KM_NOSLEEP)) == NULL) {
atomic_dec_uint(&kq_ncallouts);
return ENOMEM;
}
callout_init(calloutp, CALLOUT_MPSAFE);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_sdata = kev.data;
kn->kn_flags = kev.flags;
KASSERT(kn->kn_sfflags == kev.fflags);
kn->kn_hook = calloutp;
filt_timerstart(kn, tticks);
mutex_spin_exit(&kq->kq_lock);
return (0);
}
static void
filt_timerdetach(struct knote *kn)
{
callout_t *calloutp;
struct kqueue *kq = kn->kn_kq;
/* prevent rescheduling when we expire */
mutex_spin_enter(&kq->kq_lock);
kn->kn_sdata = FILT_TIMER_NOSCHED;
mutex_spin_exit(&kq->kq_lock);
calloutp = (callout_t *)kn->kn_hook;
/*
* Attempt to stop the callout. This will block if it's
* already running.
*/
callout_halt(calloutp, NULL);
callout_destroy(calloutp);
kmem_free(calloutp, sizeof(*calloutp));
atomic_dec_uint(&kq_ncallouts);
}
static int
filt_timertouch(struct knote *kn, struct kevent *kev, long type)
{
struct kqueue *kq = kn->kn_kq;
callout_t *calloutp;
uintptr_t tticks;
int error;
KASSERT(mutex_owned(&kq->kq_lock));
switch (type) {
case EVENT_REGISTER:
/* Only relevant for EV_ADD. */
if ((kev->flags & EV_ADD) == 0) {
return 0;
}
/*
* Stop the timer, under the assumption that if
* an application is re-configuring the timer,
* they no longer care about the old one. We
* can safely drop the kq_lock while we wait
* because fdp->fd_lock will be held throughout,
* ensuring that no one can sneak in with an
* EV_DELETE or close the kq.
*/
KASSERT(mutex_owned(&kq->kq_fdp->fd_lock));
calloutp = kn->kn_hook;
callout_halt(calloutp, &kq->kq_lock);
KASSERT(mutex_owned(&kq->kq_lock));
knote_deactivate_locked(kn);
kn->kn_data = 0;
error = filt_timercompute(kev, &tticks);
if (error) {
return error;
}
kn->kn_sdata = kev->data;
kn->kn_flags = kev->flags;
kn->kn_sfflags = kev->fflags;
filt_timerstart(kn, tticks);
break;
case EVENT_PROCESS:
*kev = kn->kn_kevent;
break;
default:
panic("%s: invalid type (%ld)", __func__, type);
}
return 0;
}
static int
filt_timer(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
int rv;
mutex_spin_enter(&kq->kq_lock);
rv = (kn->kn_data != 0);
mutex_spin_exit(&kq->kq_lock);
return rv;
}
static int
filt_userattach(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
mutex_spin_enter(&kq->kq_lock);
kn->kn_hook = NULL;
if (kn->kn_fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
else
kn->kn_hookid = 0;
mutex_spin_exit(&kq->kq_lock);
return (0);
}
static void
filt_userdetach(struct knote *kn)
{
/*
* EVFILT_USER knotes are not attached to anything in the kernel.
*/
}
static int
filt_user(struct knote *kn, long hint)
{
struct kqueue *kq = kn->kn_kq;
int hookid;
mutex_spin_enter(&kq->kq_lock);
hookid = kn->kn_hookid;
mutex_spin_exit(&kq->kq_lock);
return hookid;
}
static int
filt_usertouch(struct knote *kn, struct kevent *kev, long type)
{
int ffctrl;
KASSERT(mutex_owned(&kn->kn_kq->kq_lock));
switch (type) {
case EVENT_REGISTER:
if (kev->fflags & NOTE_TRIGGER)
kn->kn_hookid = 1;
ffctrl = kev->fflags & NOTE_FFCTRLMASK;
kev->fflags &= NOTE_FFLAGSMASK;
switch (ffctrl) {
case NOTE_FFNOP:
break;
case NOTE_FFAND:
kn->kn_sfflags &= kev->fflags;
break;
case NOTE_FFOR:
kn->kn_sfflags |= kev->fflags;
break;
case NOTE_FFCOPY:
kn->kn_sfflags = kev->fflags;
break;
default:
/* XXX Return error? */
break;
}
kn->kn_sdata = kev->data;
if (kev->flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
case EVENT_PROCESS:
*kev = kn->kn_kevent;
kev->fflags = kn->kn_sfflags;
kev->data = kn->kn_sdata;
if (kn->kn_flags & EV_CLEAR) {
kn->kn_hookid = 0;
kn->kn_data = 0;
kn->kn_fflags = 0;
}
break;
default:
panic("filt_usertouch() - invalid type (%ld)", type);
break;
}
return 0;
}
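/*
 * A minimal userland sketch of EVFILT_USER matching the register and
 * trigger handling above ("kq" is assumed, error handling omitted):
 *
 *	struct kevent ev;
 *
 *	// create the user event; EV_CLEAR resets state on each retrieval
 *	EV_SET(&ev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 *
 *	// later, e.g. from another thread: fire it, OR-ing in a user flag
 *	EV_SET(&ev, 1, EVFILT_USER, 0, NOTE_TRIGGER | NOTE_FFOR | 0x1, 0, 0);
 *	kevent(kq, &ev, 1, NULL, 0, NULL);
 */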
/*
* filt_seltrue:
*
* This filter "event" routine simulates seltrue().
*/
int
filt_seltrue(struct knote *kn, long hint)
{
/*
* We don't know how much data can be read/written,
* but we know that it *can* be. This is about as
* good as select/poll does as well.
*/
kn->kn_data = 0;
return (1);
}
/*
* This provides a full kqfilter entry for device switch tables, which
* has the same effect as a filter using filt_seltrue() as its event method.
*/
static void
filt_seltruedetach(struct knote *kn)
{
/* Nothing to do */
}
const struct filterops seltrue_filtops = {
.f_flags = FILTEROP_ISFD | FILTEROP_MPSAFE,
.f_attach = NULL,
.f_detach = filt_seltruedetach,
.f_event = filt_seltrue,
};
int
seltrue_kqfilter(dev_t dev, struct knote *kn)
{
switch (kn->kn_filter) {
case EVFILT_READ:
case EVFILT_WRITE:
kn->kn_fop = &seltrue_filtops;
break;
default:
return (EINVAL);
}
/* Nothing more to do */
return (0);
}
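/*
 * A hedged sketch of a character device that is always ready for I/O
 * plugging the helper above into its cdevsw (the "example_*" names are
 * hypothetical and other members are elided):
 *
 *	const struct cdevsw example_cdevsw = {
 *		.d_open = example_open,
 *		.d_read = example_read,
 *		.d_write = example_write,
 *		.d_kqfilter = seltrue_kqfilter,	// EVFILT_READ/WRITE always fire
 *		.d_flag = D_OTHER | D_MPSAFE,
 *	};
 */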
/*
* kqueue(2) system call.
*/
static int
kqueue1(struct lwp *l, int flags, register_t *retval)
{
struct kqueue *kq;
file_t *fp;
int fd, error;
if ((error = fd_allocfile(&fp, &fd)) != 0)
return error;
fp->f_flag = FREAD | FWRITE | (flags & (FNONBLOCK|FNOSIGPIPE));
fp->f_type = DTYPE_KQUEUE;
fp->f_ops = &kqueueops;
kq = kmem_zalloc(sizeof(*kq), KM_SLEEP);
mutex_init(&kq->kq_lock, MUTEX_DEFAULT, IPL_SCHED);
cv_init(&kq->kq_cv, "kqueue");
selinit(&kq->kq_sel);
TAILQ_INIT(&kq->kq_head);
fp->f_kqueue = kq;
*retval = fd;
kq->kq_fdp = curlwp->l_fd;
fd_set_exclose(l, fd, (flags & O_CLOEXEC) != 0);
fd_affix(curproc, fp, fd);
return error;
}
/*
* kqueue(2) system call.
*/
int
sys_kqueue(struct lwp *l, const void *v, register_t *retval)
{
return kqueue1(l, 0, retval);
}
int
sys_kqueue1(struct lwp *l, const struct sys_kqueue1_args *uap,
register_t *retval)
{
/* {
syscallarg(int) flags;
} */
return kqueue1(l, SCARG(uap, flags), retval);
}
/*
* kevent(2) system call.
*/
int
kevent_fetch_changes(void *ctx, const struct kevent *changelist,
struct kevent *changes, size_t index, int n)
{
return copyin(changelist + index, changes, n * sizeof(*changes));
}
int
kevent_put_events(void *ctx, struct kevent *events,
struct kevent *eventlist, size_t index, int n)
{
return copyout(events, eventlist + index, n * sizeof(*events));
}
static const struct kevent_ops kevent_native_ops = {
.keo_private = NULL,
.keo_fetch_timeout = copyin,
.keo_fetch_changes = kevent_fetch_changes,
.keo_put_events = kevent_put_events,
};
int
sys___kevent100(struct lwp *l, const struct sys___kevent100_args *uap,
register_t *retval)
{
/* {
syscallarg(int) fd;
syscallarg(const struct kevent *) changelist;
syscallarg(size_t) nchanges;
syscallarg(struct kevent *) eventlist;
syscallarg(size_t) nevents;
syscallarg(const struct timespec *) timeout;
} */
return kevent1(retval, SCARG(uap, fd), SCARG(uap, changelist),
SCARG(uap, nchanges), SCARG(uap, eventlist), SCARG(uap, nevents),
SCARG(uap, timeout), &kevent_native_ops);
}
int
kevent1(register_t *retval, int fd,
const struct kevent *changelist, size_t nchanges,
struct kevent *eventlist, size_t nevents,
const struct timespec *timeout,
const struct kevent_ops *keops)
{
struct kevent *kevp;
struct kqueue *kq;
struct timespec ts;
size_t i, n, ichange;
int nerrors, error;
struct kevent kevbuf[KQ_NEVENTS]; /* approx 300 bytes on 64-bit */
file_t *fp;
/* check that we're dealing with a kq */
fp = fd_getfile(fd);
if (fp == NULL)
return (EBADF);
if (fp->f_type != DTYPE_KQUEUE) {
fd_putfile(fd);
return (EBADF);
}
if (timeout != NULL) {
error = (*keops->keo_fetch_timeout)(timeout, &ts, sizeof(ts));
if (error)
goto done;
timeout = &ts;
}
kq = fp->f_kqueue;
nerrors = 0;
ichange = 0;
/* traverse list of events to register */
while (nchanges > 0) {
n = MIN(nchanges, __arraycount(kevbuf));
error = (*keops->keo_fetch_changes)(keops->keo_private,
changelist, kevbuf, ichange, n);
if (error)
goto done;
for (i = 0; i < n; i++) {
kevp = &kevbuf[i];
kevp->flags &= ~EV_SYSFLAGS;
/* register each knote */
error = kqueue_register(kq, kevp);
if (!error && !(kevp->flags & EV_RECEIPT))
continue;
if (nevents == 0)
goto done;
kevp->flags = EV_ERROR;
kevp->data = error;
error = (*keops->keo_put_events)
(keops->keo_private, kevp,
eventlist, nerrors, 1);
if (error)
goto done;
nevents--;
nerrors++;
}
nchanges -= n; /* update the results */
ichange += n;
}
if (nerrors) {
*retval = nerrors;
error = 0;
goto done;
}
/* actually scan through the events */
error = kqueue_scan(fp, nevents, eventlist, timeout, retval, keops,
kevbuf, __arraycount(kevbuf));
done:
fd_putfile(fd);
return (error);
}
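/*
 * The register-then-scan flow above maps onto the usual userland
 * pattern; a minimal sketch watching one descriptor for readability,
 * where "fd" and handle_readable() are assumed and error handling is
 * omitted:
 *
 *	struct kevent chg, ev;
 *	int kq = kqueue();
 *
 *	EV_SET(&chg, fd, EVFILT_READ, EV_ADD, 0, 0, 0);
 *	kevent(kq, &chg, 1, NULL, 0, NULL);		// register
 *
 *	for (;;) {
 *		int n = kevent(kq, NULL, 0, &ev, 1, NULL);	// scan, block
 *		if (n > 0)
 *			handle_readable(fd, ev.data);	// ev.data: bytes ready
 *	}
 */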
/*
* Register a given kevent kev onto the kqueue
*/
static int
kqueue_register(struct kqueue *kq, struct kevent *kev)
{
struct kfilter *kfilter;
filedesc_t *fdp;
file_t *fp;
fdfile_t *ff;
struct knote *kn, *newkn;
struct klist *list;
int error, fd, rv;
fdp = kq->kq_fdp;
fp = NULL;
kn = NULL;
error = 0;
fd = 0;
newkn = knote_alloc(true);
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byfilter(kev->filter);
if (kfilter == NULL || kfilter->filtops == NULL) {
/* filter not found nor implemented */
rw_exit(&kqueue_filter_lock);
knote_free(newkn);
return (EINVAL);
}
/* search if knote already exists */
if (kfilter->filtops->f_flags & FILTEROP_ISFD) {
/* monitoring a file descriptor */
/* validate descriptor */
if (kev->ident > INT_MAX
|| (fp = fd_getfile(fd = kev->ident)) == NULL) {
rw_exit(&kqueue_filter_lock);
knote_free(newkn);
return EBADF;
}
mutex_enter(&fdp->fd_lock);
ff = fdp->fd_dt->dt_ff[fd];
if (ff->ff_refcnt & FR_CLOSING) {
error = EBADF;
goto doneunlock;
}
if (fd <= fdp->fd_lastkqfile) {
SLIST_FOREACH(kn, &ff->ff_knlist, kn_link) {
if (kq == kn->kn_kq &&
kev->filter == kn->kn_filter)
break;
}
}
} else {
/*
* not monitoring a file descriptor, so
* lookup knotes in internal hash table
*/
mutex_enter(&fdp->fd_lock);
if (fdp->fd_knhashmask != 0) {
list = &fdp->fd_knhash[
KN_HASH((u_long)kev->ident, fdp->fd_knhashmask)];
SLIST_FOREACH(kn, list, kn_link) {
if (kev->ident == kn->kn_id &&
kq == kn->kn_kq &&
kev->filter == kn->kn_filter)
break;
}
}
}
/* It's safe to test KQ_CLOSING while holding only the fd_lock. */
KASSERT(mutex_owned(&fdp->fd_lock));
KASSERT((kq->kq_count & KQ_CLOSING) == 0);
/*
* kn now contains the matching knote, or NULL if no match
*/
if (kn == NULL) {
if (kev->flags & EV_ADD) {
/* create new knote */
kn = newkn;
newkn = NULL;
kn->kn_obj = fp;
kn->kn_id = kev->ident;
kn->kn_kq = kq;
kn->kn_fop = kfilter->filtops;
kn->kn_kfilter = kfilter;
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
kev->fflags = 0;
kev->data = 0;
kn->kn_kevent = *kev;
KASSERT(kn->kn_fop != NULL);
/*
* XXX Allow only known-safe users of f_touch.
* XXX See filter_touch() for details.
*/
if (kn->kn_fop->f_touch != NULL &&
kn->kn_fop != &timer_filtops &&
kn->kn_fop != &user_filtops) {
error = ENOTSUP;
goto fail_ev_add;
}
/*
* apply reference count to knote structure, and
* do not release it at the end of this routine.
*/
fp = NULL;
if (!(kn->kn_fop->f_flags & FILTEROP_ISFD)) {
/*
* If knote is not on an fd, store on
* internal hash table.
*/
if (fdp->fd_knhashmask == 0) {
/* XXXAD can block with fd_lock held */
fdp->fd_knhash = hashinit(KN_HASHSIZE,
HASH_LIST, true,
&fdp->fd_knhashmask);
}
list = &fdp->fd_knhash[KN_HASH(kn->kn_id,
fdp->fd_knhashmask)];
} else {
/* Otherwise, knote is on an fd. */
list = (struct klist *)
&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
if ((int)kn->kn_id > fdp->fd_lastkqfile)
fdp->fd_lastkqfile = kn->kn_id;
}
SLIST_INSERT_HEAD(list, kn, kn_link);
/*
* N.B. kn->kn_fop may change as the result
* of filter_attach()!
*/
knote_foplock_enter(kn);
error = filter_attach(kn);
if (error != 0) {
#ifdef DEBUG
struct proc *p = curlwp->l_proc;
const file_t *ft = kn->kn_obj;
printf("%s: %s[%d]: event type %d not "
"supported for file type %d/%s "
"(error %d)\n", __func__,
p->p_comm, p->p_pid,
kn->kn_filter, ft ? ft->f_type : -1,
ft ? ft->f_ops->fo_name : "?", error);
#endif
fail_ev_add:
/*
* N.B. no need to check for this note to
* be in-flux, since it was never visible
* to the monitored object.
*
* knote_detach() drops fdp->fd_lock
*/
knote_foplock_exit(kn);
mutex_enter(&kq->kq_lock);
KNOTE_WILLDETACH(kn);
KASSERT(kn_in_flux(kn) == false);
mutex_exit(&kq->kq_lock);
knote_detach(kn, fdp, false);
goto done;
}
atomic_inc_uint(&kfilter->refcnt);
goto done_ev_add;
} else {
/* No matching knote and the EV_ADD flag is not set. */
error = ENOENT;
goto doneunlock;
}
}
if (kev->flags & EV_DELETE) {
/*
* Let the world know that this knote is about to go
* away, and wait for it to settle if it's currently
* in-flux.
*/
mutex_spin_enter(&kq->kq_lock);
if (kn->kn_status & KN_WILLDETACH) {
/*
* This knote is already on its way out,
* so just be done.
*/
mutex_spin_exit(&kq->kq_lock);
goto doneunlock;
}
KNOTE_WILLDETACH(kn);
if (kn_in_flux(kn)) {
mutex_exit(&fdp->fd_lock);
/*
* It's safe for us to conclusively wait for
* this knote to settle because we know we'll
* be completing the detach.
*/
kn_wait_flux(kn, true);
KASSERT(kn_in_flux(kn) == false);
mutex_spin_exit(&kq->kq_lock);
mutex_enter(&fdp->fd_lock);
} else {
mutex_spin_exit(&kq->kq_lock);
}
/* knote_detach() drops fdp->fd_lock */
knote_detach(kn, fdp, true);
goto done;
}
/*
* The user may change some filter values after the
* initial EV_ADD, but doing so will not reset any
* filters which have already been triggered.
*/
knote_foplock_enter(kn);
kn->kn_kevent.udata = kev->udata;
KASSERT(kn->kn_fop != NULL);
if (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
kn->kn_fop->f_touch != NULL) {
mutex_spin_enter(&kq->kq_lock);
error = filter_touch(kn, kev, EVENT_REGISTER);
mutex_spin_exit(&kq->kq_lock);
if (__predict_false(error != 0)) {
/* Never a new knote (which would consume newkn). */
KASSERT(newkn != NULL);
knote_foplock_exit(kn);
goto doneunlock;
}
} else {
kn->kn_sfflags = kev->fflags;
kn->kn_sdata = kev->data;
}
/*
* We can get here if we are trying to attach
* an event to a file descriptor that does not
* support events, and the attach routine is
* broken and does not return an error.
*/
done_ev_add:
rv = filter_event(kn, 0, false);
if (rv)
knote_activate(kn);
knote_foplock_exit(kn);
/* disable knote */
if ((kev->flags & EV_DISABLE)) {
mutex_spin_enter(&kq->kq_lock);
if ((kn->kn_status & KN_DISABLED) == 0)
kn->kn_status |= KN_DISABLED;
mutex_spin_exit(&kq->kq_lock);
}
/* enable knote */
if ((kev->flags & EV_ENABLE)) {
knote_enqueue(kn);
}
doneunlock:
mutex_exit(&fdp->fd_lock);
done:
rw_exit(&kqueue_filter_lock);
if (newkn != NULL)
knote_free(newkn);
if (fp != NULL)
fd_putfile(fd);
return (error);
}
#define KN_FMT(buf, kn) \
(snprintb((buf), sizeof(buf), __KN_FLAG_BITS, (kn)->kn_status), buf)
#if defined(DDB)
void
kqueue_printit(struct kqueue *kq, bool full, void (*pr)(const char *, ...))
{
const struct knote *kn;
u_int count;
int nmarker;
char buf[128];
count = 0;
nmarker = 0;
(*pr)("kqueue %p (restart=%d count=%u):\n", kq,
!!(kq->kq_count & KQ_RESTART), KQ_COUNT(kq));
(*pr)(" Queued knotes:\n");
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if (kn->kn_status & KN_MARKER) {
nmarker++;
} else {
count++;
}
(*pr)(" knote %p: kq=%p status=%s\n",
kn, kn->kn_kq, KN_FMT(buf, kn));
(*pr)(" id=0x%lx (%lu) filter=%d\n",
(u_long)kn->kn_id, (u_long)kn->kn_id, kn->kn_filter);
if (kn->kn_kq != kq) {
(*pr)(" !!! kn->kn_kq != kq\n");
}
}
if (count != KQ_COUNT(kq)) {
(*pr)(" !!! count(%u) != KQ_COUNT(%u)\n",
count, KQ_COUNT(kq));
}
}
#endif /* DDB */
#if defined(DEBUG)
static void
kqueue_check(const char *func, size_t line, const struct kqueue *kq)
{
const struct knote *kn;
u_int count;
int nmarker;
char buf[128];
KASSERT(mutex_owned(&kq->kq_lock));
count = 0;
nmarker = 0;
TAILQ_FOREACH(kn, &kq->kq_head, kn_tqe) {
if ((kn->kn_status & (KN_MARKER | KN_QUEUED)) == 0) {
panic("%s,%zu: kq=%p kn=%p !(MARKER|QUEUED) %s",
func, line, kq, kn, KN_FMT(buf, kn));
}
if ((kn->kn_status & KN_MARKER) == 0) {
if (kn->kn_kq != kq) {
panic("%s,%zu: kq=%p kn(%p) != kn->kq(%p): %s",
func, line, kq, kn, kn->kn_kq,
KN_FMT(buf, kn));
}
if ((kn->kn_status & KN_ACTIVE) == 0) {
panic("%s,%zu: kq=%p kn=%p: !ACTIVE %s",
func, line, kq, kn, KN_FMT(buf, kn));
}
count++;
if (count > KQ_COUNT(kq)) {
panic("%s,%zu: kq=%p kq->kq_count(%u) != "
"count(%d), nmarker=%d",
func, line, kq, KQ_COUNT(kq), count,
nmarker);
}
} else {
nmarker++;
}
}
}
#define kq_check(a) kqueue_check(__func__, __LINE__, (a))
#else /* defined(DEBUG) */
#define kq_check(a) /* nothing */
#endif /* defined(DEBUG) */
static void
kqueue_restart(file_t *fp)
{
struct kqueue *kq = fp->f_kqueue;
KASSERT(kq != NULL);
mutex_spin_enter(&kq->kq_lock);
kq->kq_count |= KQ_RESTART;
cv_broadcast(&kq->kq_cv);
mutex_spin_exit(&kq->kq_lock);
}
static int
kqueue_fpathconf(struct file *fp, int name, register_t *retval)
{
return EINVAL;
}
/*
* Scan through the list of events on fp (for a maximum of maxevents),
* returning the results into ulistp. The timeout is determined by tsp; if
* NULL, wait indefinitely; if zero-valued, perform a poll; otherwise wait
* as appropriate.
*/
static int
kqueue_scan(file_t *fp, size_t maxevents, struct kevent *ulistp,
const struct timespec *tsp, register_t *retval,
const struct kevent_ops *keops, struct kevent *kevbuf,
size_t kevcnt)
{
struct kqueue *kq;
struct kevent *kevp;
struct timespec ats, sleepts;
struct knote *kn, *marker;
struct knote_impl morker;
size_t count, nkev, nevents;
int timeout, error, touch, rv, influx;
filedesc_t *fdp;
fdp = curlwp->l_fd;
kq = fp->f_kqueue;
count = maxevents;
nkev = nevents = error = 0;
if (count == 0) {
*retval = 0;
return 0;
}
if (tsp) { /* timeout supplied */
ats = *tsp;
if (inittimeleft(&ats, &sleepts) == -1) {
*retval = maxevents;
return EINVAL;
}
timeout = tstohz(&ats);
if (timeout <= 0)
timeout = -1; /* do poll */
} else {
/* no timeout, wait forever */
timeout = 0;
}
memset(&morker, 0, sizeof(morker));
marker = &morker.ki_knote;
marker->kn_kq = kq;
marker->kn_status = KN_MARKER;
mutex_spin_enter(&kq->kq_lock);
retry:
kevp = kevbuf;
if (KQ_COUNT(kq) == 0) {
if (timeout >= 0) {
error = cv_timedwait_sig(&kq->kq_cv,
&kq->kq_lock, timeout);
if (error == 0) {
if (KQ_COUNT(kq) == 0 &&
(kq->kq_count & KQ_RESTART)) {
/* return to clear file reference */
error = ERESTART;
} else if (tsp == NULL || (timeout =
gettimeleft(&ats, &sleepts)) > 0) {
goto retry;
}
} else {
/* don't restart after signals... */
if (error == ERESTART)
error = EINTR;
if (error == EWOULDBLOCK)
error = 0;
}
}
mutex_spin_exit(&kq->kq_lock);
goto done;
}
/* mark end of knote list */
TAILQ_INSERT_TAIL(&kq->kq_head, marker, kn_tqe);
influx = 0;
/*
* Acquire the fdp->fd_lock interlock to avoid races with
* file creation/destruction from other threads.
*/
mutex_spin_exit(&kq->kq_lock);
relock:
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
while (count != 0) {
/*
* Get next knote. We are guaranteed this will never
* be NULL because of the marker we inserted above.
*/
kn = TAILQ_FIRST(&kq->kq_head);
bool kn_is_other_marker =
(kn->kn_status & KN_MARKER) != 0 && kn != marker;
bool kn_is_detaching = (kn->kn_status & KN_WILLDETACH) != 0;
bool kn_is_in_flux = kn_in_flux(kn);
/*
* If we found a marker that's not ours, or this knote
* is in a state of flux, then wait for everything to
* settle down and go around again.
*/
if (kn_is_other_marker || kn_is_detaching || kn_is_in_flux) {
if (influx) {
influx = 0;
KQ_FLUX_WAKEUP(kq);
}
mutex_exit(&fdp->fd_lock);
if (kn_is_other_marker || kn_is_in_flux) {
KQ_FLUX_WAIT(kq);
mutex_spin_exit(&kq->kq_lock);
} else {
/*
* Detaching but not in-flux? Someone is
* actively trying to finish the job; just
* go around and try again.
*/
KASSERT(kn_is_detaching);
mutex_spin_exit(&kq->kq_lock);
preempt_point();
}
goto relock;
}
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
if (kn == marker) {
/* it's our marker, stop */
KQ_FLUX_WAKEUP(kq);
if (count == maxevents) {
mutex_exit(&fdp->fd_lock);
goto retry;
}
break;
}
KASSERT((kn->kn_status & KN_BUSY) == 0);
kq_check(kq);
kn->kn_status &= ~KN_QUEUED;
kn->kn_status |= KN_BUSY;
kq_check(kq);
if (kn->kn_status & KN_DISABLED) {
kn->kn_status &= ~KN_BUSY;
kq->kq_count--;
/* don't want disabled events */
continue;
}
if ((kn->kn_flags & EV_ONESHOT) == 0) {
mutex_spin_exit(&kq->kq_lock);
KASSERT(mutex_owned(&fdp->fd_lock));
knote_foplock_enter(kn);
rv = filter_event(kn, 0, false);
knote_foplock_exit(kn);
mutex_spin_enter(&kq->kq_lock);
/* Re-poll if note was re-enqueued. */
if ((kn->kn_status & KN_QUEUED) != 0) {
kn->kn_status &= ~KN_BUSY;
/* Re-enqueue raised kq_count, lower it again */
kq->kq_count--;
influx = 1;
continue;
}
if (rv == 0) {
/*
* non-ONESHOT event that hasn't triggered
* again, so it will remain de-queued.
*/
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
influx = 1;
continue;
}
} else {
/*
* Must NOT drop kq_lock until we can do
* the KNOTE_WILLDETACH() below.
*/
}
KASSERT(kn->kn_fop != NULL);
touch = (!(kn->kn_fop->f_flags & FILTEROP_ISFD) &&
kn->kn_fop->f_touch != NULL);
/* XXXAD should be got from f_event if !oneshot. */
KASSERT((kn->kn_status & KN_WILLDETACH) == 0);
if (touch) {
(void)filter_touch(kn, kevp, EVENT_PROCESS);
} else {
*kevp = kn->kn_kevent;
}
kevp++;
nkev++;
influx = 1;
if (kn->kn_flags & EV_ONESHOT) {
/* delete ONESHOT events after retrieval */
KNOTE_WILLDETACH(kn);
kn->kn_status &= ~KN_BUSY;
kq->kq_count--;
KASSERT(kn_in_flux(kn) == false);
KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
KASSERT(kn->kn_kevent.udata == curlwp);
mutex_spin_exit(&kq->kq_lock);
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
} else if (kn->kn_flags & EV_CLEAR) {
/* clear state after retrieval */
kn->kn_data = 0;
kn->kn_fflags = 0;
/*
* Manually clear knotes who weren't
* 'touch'ed.
*/
if (touch == 0) {
kn->kn_data = 0;
kn->kn_fflags = 0;
}
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
} else if (kn->kn_flags & EV_DISPATCH) {
kn->kn_status |= KN_DISABLED;
kn->kn_status &= ~(KN_ACTIVE|KN_BUSY);
kq->kq_count--;
} else {
/* add event back on list */
kq_check(kq);
kn->kn_status |= KN_QUEUED;
kn->kn_status &= ~KN_BUSY;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
kq_check(kq);
}
if (nkev == kevcnt) {
/* do copyouts in kevcnt chunks */
influx = 0;
KQ_FLUX_WAKEUP(kq);
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
error = (*keops->keo_put_events)
(keops->keo_private,
kevbuf, ulistp, nevents, nkev);
mutex_enter(&fdp->fd_lock);
mutex_spin_enter(&kq->kq_lock);
nevents += nkev;
nkev = 0;
kevp = kevbuf;
}
count--;
if (error != 0 || count == 0) {
/* remove marker */
TAILQ_REMOVE(&kq->kq_head, marker, kn_tqe);
break;
}
}
KQ_FLUX_WAKEUP(kq);
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
done:
if (nkev != 0) {
/* copyout remaining events */
error = (*keops->keo_put_events)(keops->keo_private,
kevbuf, ulistp, nevents, nkev);
}
*retval = maxevents - count;
return error;
}
/*
* fileops ioctl method for a kqueue descriptor.
*
* Two ioctls are currently supported. They both use struct kfilter_mapping:
* KFILTER_BYNAME find filter for name. len is ignored.
* KFILTER_BYFILTER find name for filter, and return result in
* name, which is of size len.
*/
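/*
* Illustrative userland usage (a sketch only; "kqfd" is assumed to be
* an open kqueue descriptor):
*
*	struct kfilter_mapping km;
*	char buf[32];
*
*	km.filter = EVFILT_READ;
*	km.name = buf;
*	km.len = sizeof(buf);
*	if (ioctl(kqfd, KFILTER_BYFILTER, &km) == 0)
*		printf("%s\n", buf);
*/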
/*ARGSUSED*/
static int
kqueue_ioctl(file_t *fp, u_long com, void *data)
{
struct kfilter_mapping *km;
const struct kfilter *kfilter;
char *name;
int error;
km = data;
error = 0;
name = kmem_alloc(KFILTER_MAXNAME, KM_SLEEP);
switch (com) {
case KFILTER_BYFILTER: /* convert filter -> name */
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byfilter(km->filter);
if (kfilter != NULL) {
strlcpy(name, kfilter->name, KFILTER_MAXNAME);
rw_exit(&kqueue_filter_lock);
error = copyoutstr(name, km->name, km->len, NULL);
} else {
rw_exit(&kqueue_filter_lock);
error = ENOENT;
}
break;
case KFILTER_BYNAME: /* convert name -> filter */
error = copyinstr(km->name, name, KFILTER_MAXNAME, NULL);
if (error) {
break;
}
rw_enter(&kqueue_filter_lock, RW_READER);
kfilter = kfilter_byname(name);
if (kfilter != NULL)
km->filter = kfilter->filter;
else
error = ENOENT;
rw_exit(&kqueue_filter_lock);
break;
default:
error = ENOTTY;
break;
}
kmem_free(name, KFILTER_MAXNAME);
return (error);
}
/*
* fileops fcntl method for a kqueue descriptor.
*/
static int
kqueue_fcntl(file_t *fp, u_int com, void *data)
{
return (ENOTTY);
}
/*
* fileops poll method for a kqueue descriptor.
* Determine if kqueue has events pending.
*/
static int
kqueue_poll(file_t *fp, int events)
{
struct kqueue *kq;
int revents;
kq = fp->f_kqueue;
revents = 0;
if (events & (POLLIN | POLLRDNORM)) {
mutex_spin_enter(&kq->kq_lock);
if (KQ_COUNT(kq) != 0) {
revents |= events & (POLLIN | POLLRDNORM);
} else {
selrecord(curlwp, &kq->kq_sel);
}
kq_check(kq);
mutex_spin_exit(&kq->kq_lock);
}
return revents;
}
/*
* fileops stat method for a kqueue descriptor.
* Returns dummy info, with st_size being number of events pending.
*/
static int
kqueue_stat(file_t *fp, struct stat *st)
{
struct kqueue *kq;
kq = fp->f_kqueue;
memset(st, 0, sizeof(*st));
st->st_size = KQ_COUNT(kq);
st->st_blksize = sizeof(struct kevent);
st->st_mode = S_IFIFO | S_IRUSR | S_IWUSR;
st->st_blocks = 1;
st->st_uid = kauth_cred_geteuid(fp->f_cred);
st->st_gid = kauth_cred_getegid(fp->f_cred);
return 0;
}
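/*
* Detach every knote on the given list that belongs to kq. If a knote
* first has to quiesce (knote_detach_quiesce() dropped the locks), the
* walk is restarted from the head, since the list may have changed in
* the meantime.
*/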
static void
kqueue_doclose(struct kqueue *kq, struct klist *list, int fd)
{
struct knote *kn;
filedesc_t *fdp;
fdp = kq->kq_fdp;
KASSERT(mutex_owned(&fdp->fd_lock));
again:
for (kn = SLIST_FIRST(list); kn != NULL;) {
if (kq != kn->kn_kq) {
kn = SLIST_NEXT(kn, kn_link);
continue;
}
if (knote_detach_quiesce(kn)) {
mutex_enter(&fdp->fd_lock);
goto again;
}
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
kn = SLIST_FIRST(list);
}
}
/*
* fileops close method for a kqueue descriptor.
*/
static int
kqueue_close(file_t *fp)
{
struct kqueue *kq;
filedesc_t *fdp;
fdfile_t *ff;
int i;
kq = fp->f_kqueue;
fp->f_kqueue = NULL;
fp->f_type = 0;
fdp = curlwp->l_fd;
KASSERT(kq->kq_fdp == fdp);
mutex_enter(&fdp->fd_lock);
/*
* We're going to drop the fd_lock multiple times while
* we detach knotes. During this time, attempts to register
* knotes via the back door (e.g. knote_proc_fork_track())
* need to fail, lest they sneak in to attach a knote after
* we've already drained the list it's destined for.
*
* We must acquire kq_lock here to set KQ_CLOSING (to serialize
* with other code paths that modify kq_count without holding
* the fd_lock), but once this bit is set, it's only safe to
* test it while holding the fd_lock, and holding kq_lock while
* doing so is not necessary.
*/
mutex_enter(&kq->kq_lock);
kq->kq_count |= KQ_CLOSING;
mutex_exit(&kq->kq_lock);
for (i = 0; i <= fdp->fd_lastkqfile; i++) {
if ((ff = fdp->fd_dt->dt_ff[i]) == NULL)
continue;
kqueue_doclose(kq, (struct klist *)&ff->ff_knlist, i);
}
if (fdp->fd_knhashmask != 0) {
for (i = 0; i < fdp->fd_knhashmask + 1; i++) {
kqueue_doclose(kq, &fdp->fd_knhash[i], -1);
}
}
mutex_exit(&fdp->fd_lock);
#if defined(DEBUG)
mutex_enter(&kq->kq_lock);
kq_check(kq);
mutex_exit(&kq->kq_lock);
#endif /* DEBUG */
KASSERT(TAILQ_EMPTY(&kq->kq_head));
KASSERT(KQ_COUNT(kq) == 0);
mutex_destroy(&kq->kq_lock);
cv_destroy(&kq->kq_cv);
seldestroy(&kq->kq_sel);
kmem_free(kq, sizeof(*kq));
return (0);
}
/*
* struct fileops kqfilter method for a kqueue descriptor.
* Event triggered when monitored kqueue changes.
*/
static int
kqueue_kqfilter(file_t *fp, struct knote *kn)
{
struct kqueue *kq;
kq = ((file_t *)kn->kn_obj)->f_kqueue;
KASSERT(fp == kn->kn_obj);
if (kn->kn_filter != EVFILT_READ)
return EINVAL;
kn->kn_fop = &kqread_filtops;
mutex_enter(&kq->kq_lock);
selrecord_knote(&kq->kq_sel, kn);
mutex_exit(&kq->kq_lock);
return 0;
}
/*
* Walk down a list of knotes, activating them if their event has
* triggered. The caller's object lock (e.g. device driver lock)
* must be held.
*/
void
knote(struct klist *list, long hint)
{
struct knote *kn, *tmpkn;
SLIST_FOREACH_SAFE(kn, list, kn_selnext, tmpkn) {
/*
* We assume here that the backing object's lock is
* already held if we're traversing the klist, and
* so acquiring the knote foplock would create a
* deadlock scenario. But we also know that the klist
* won't disappear on us while we're here, so not
* acquiring it is safe.
*/
if (filter_event(kn, hint, true)) {
knote_activate(kn);
}
}
}
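/*
* Typical use by an event source (a sketch under the assumption of a
* driver-style object, not code from this file): the filter's f_attach
* routine calls klist_insert() on the object's klist, the object calls
* knote(&list, hint) with its own lock held whenever its state changes,
* and f_detach calls klist_remove().
*/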
/*
* Remove all knotes referencing a specified fd
*/
void
knote_fdclose(int fd)
{
struct klist *list;
struct knote *kn;
filedesc_t *fdp;
again:
fdp = curlwp->l_fd;
mutex_enter(&fdp->fd_lock);
list = (struct klist *)&fdp->fd_dt->dt_ff[fd]->ff_knlist;
while ((kn = SLIST_FIRST(list)) != NULL) {
if (knote_detach_quiesce(kn)) {
goto again;
}
knote_detach(kn, fdp, true);
mutex_enter(&fdp->fd_lock);
}
mutex_exit(&fdp->fd_lock);
}
/*
* Drop knote. Called with fdp->fd_lock held, and will drop before
* returning.
*/
static void
knote_detach(struct knote *kn, filedesc_t *fdp, bool dofop)
{
struct klist *list;
struct kqueue *kq;
kq = kn->kn_kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
KASSERT((kn->kn_status & KN_WILLDETACH) != 0);
KASSERT(kn->kn_fop != NULL);
KASSERT(mutex_owned(&fdp->fd_lock));
/* Remove from monitored object. */
if (dofop) {
knote_foplock_enter(kn);
filter_detach(kn);
knote_foplock_exit(kn);
}
/* Remove from descriptor table. */
if (kn->kn_fop->f_flags & FILTEROP_ISFD)
list = (struct klist *)&fdp->fd_dt->dt_ff[kn->kn_id]->ff_knlist;
else
list = &fdp->fd_knhash[KN_HASH(kn->kn_id, fdp->fd_knhashmask)];
SLIST_REMOVE(list, kn, knote, kn_link);
/* Remove from kqueue. */
again:
mutex_spin_enter(&kq->kq_lock);
KASSERT(kn_in_flux(kn) == false);
if ((kn->kn_status & KN_QUEUED) != 0) {
kq_check(kq);
KASSERT(KQ_COUNT(kq) != 0);
kq->kq_count--;
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
kn->kn_status &= ~KN_QUEUED;
kq_check(kq);
} else if (kn->kn_status & KN_BUSY) {
mutex_spin_exit(&kq->kq_lock);
goto again;
}
mutex_spin_exit(&kq->kq_lock);
mutex_exit(&fdp->fd_lock);
if (kn->kn_fop->f_flags & FILTEROP_ISFD)
fd_putfile(kn->kn_id);
atomic_dec_uint(&kn->kn_kfilter->refcnt);
knote_free(kn);
}
/*
* Queue new event for knote.
*/
static void
knote_enqueue(struct knote *kn)
{
struct kqueue *kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
/* Don't bother enqueueing a dying knote. */
goto out;
}
if ((kn->kn_status & KN_DISABLED) != 0) {
kn->kn_status &= ~KN_DISABLED;
}
if ((kn->kn_status & (KN_ACTIVE | KN_QUEUED)) == KN_ACTIVE) {
kq_check(kq);
kn->kn_status |= KN_QUEUED;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
kq->kq_count++;
kq_check(kq);
cv_broadcast(&kq->kq_cv);
selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
}
out:
mutex_spin_exit(&kq->kq_lock);
}
/*
* Activate a knote, queueing it on its kqueue if it is not already
* queued or disabled.
*/
static void
knote_activate_locked(struct knote *kn)
{
struct kqueue *kq;
KASSERT((kn->kn_status & KN_MARKER) == 0);
kq = kn->kn_kq;
if (__predict_false(kn->kn_status & KN_WILLDETACH)) {
/* Don't bother enqueueing a dying knote. */
return;
}
kn->kn_status |= KN_ACTIVE;
if ((kn->kn_status & (KN_QUEUED | KN_DISABLED)) == 0) {
kq_check(kq);
kn->kn_status |= KN_QUEUED;
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) < KQ_MAXCOUNT);
kq->kq_count++;
kq_check(kq);
cv_broadcast(&kq->kq_cv);
selnotify(&kq->kq_sel, 0, NOTE_SUBMIT);
}
}
static void
knote_activate(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
knote_activate_locked(kn);
mutex_spin_exit(&kq->kq_lock);
}
static void
knote_deactivate_locked(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
if (kn->kn_status & KN_QUEUED) {
kq_check(kq);
kn->kn_status &= ~KN_QUEUED;
TAILQ_REMOVE(&kq->kq_head, kn, kn_tqe);
KASSERT(KQ_COUNT(kq) > 0);
kq->kq_count--;
kq_check(kq);
}
kn->kn_status &= ~KN_ACTIVE;
}
/*
* Set EV_EOF on the specified knote. Also allows additional
* EV_* flags to be set (e.g. EV_ONESHOT).
*/
void
knote_set_eof(struct knote *kn, uint32_t flags)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_flags |= EV_EOF | flags;
mutex_spin_exit(&kq->kq_lock);
}
/*
* Clear EV_EOF on the specified knote.
*/
void
knote_clear_eof(struct knote *kn)
{
struct kqueue *kq = kn->kn_kq;
mutex_spin_enter(&kq->kq_lock);
kn->kn_flags &= ~EV_EOF;
mutex_spin_exit(&kq->kq_lock);
}
/*
* Initialize a klist.
*/
void
klist_init(struct klist *list)
{
SLIST_INIT(list);
}
/*
* Finalize a klist.
*/
void
klist_fini(struct klist *list)
{
struct knote *kn;
/*
* Neuter all existing knotes on the klist because the list is
* being destroyed. The caller has guaranteed that no additional
* knotes will be added to the list, that the backing object's
* locks are not held (otherwise there is a locking order issue
* with acquiring the knote foplock), and that we can traverse
* the list safely in this state.
*/
SLIST_FOREACH(kn, list, kn_selnext) {
knote_foplock_enter(kn);
KASSERT(kn->kn_fop != NULL);
if (kn->kn_fop->f_flags & FILTEROP_ISFD) {
kn->kn_fop = &nop_fd_filtops;
} else {
kn->kn_fop = &nop_filtops;
}
knote_foplock_exit(kn);
}
}
/*
* Insert a knote into a klist.
*/
void
klist_insert(struct klist *list, struct knote *kn)
{
SLIST_INSERT_HEAD(list, kn, kn_selnext);
}
/*
* Remove a knote from a klist. Returns true if the last
* knote was removed and the list is now empty.
*/
bool
klist_remove(struct klist *list, struct knote *kn)
{
SLIST_REMOVE(list, kn, knote, kn_selnext);
return SLIST_EMPTY(list);
}
/* $NetBSD: bluetooth.h,v 1.12 2014/05/18 14:46:16 rmind Exp $ */
/*-
* Copyright (c) 2005 Iain Hibbert.
* Copyright (c) 2006 Itronix Inc.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of Itronix Inc. may not be used to endorse
* or promote products derived from this software without specific
* prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY ITRONIX INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL ITRONIX INC. BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _NETBT_BLUETOOTH_H_
#define _NETBT_BLUETOOTH_H_
#include <sys/socket.h>
#include <sys/types.h>
/*
* Bluetooth Address Family Protocol Numbers
*/
#define BTPROTO_HCI 1
#define BTPROTO_L2CAP 2
#define BTPROTO_RFCOMM 3
#define BTPROTO_SCO 4
/* All sizes are in bytes */
#define BLUETOOTH_BDADDR_SIZE 6
/*
* Bluetooth device address
*/
typedef struct {
uint8_t b[BLUETOOTH_BDADDR_SIZE];
} __packed bdaddr_t;
/*
* bdaddr utility functions
*/
static __inline int
bdaddr_same(const bdaddr_t *a, const bdaddr_t *b)
{
return (a->b[0] == b->b[0] && a->b[1] == b->b[1]
&& a->b[2] == b->b[2] && a->b[3] == b->b[3]
&& a->b[4] == b->b[4] && a->b[5] == b->b[5]);
}
static __inline int
bdaddr_any(const bdaddr_t *a)
{
return (a->b[0] == 0 && a->b[1] == 0
&& a->b[2] == 0 && a->b[3] == 0
&& a->b[4] == 0 && a->b[5] == 0);
}
static __inline void
bdaddr_copy(bdaddr_t *d, const bdaddr_t *s)
{
d->b[0] = s->b[0];
d->b[1] = s->b[1];
d->b[2] = s->b[2];
d->b[3] = s->b[3];
d->b[4] = s->b[4];
d->b[5] = s->b[5];
}
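/*
* Illustrative use of the helpers above (a sketch, not code from this
* file):
*
*	bdaddr_t local;
*
*	bdaddr_copy(&local, BDADDR_ANY);
*	if (bdaddr_any(&local))
*		treat it as a wildcard address
*/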
/*
* Socket address used by Bluetooth protocols
*/
struct sockaddr_bt {
uint8_t bt_len;
sa_family_t bt_family;
bdaddr_t bt_bdaddr;
uint16_t bt_psm;
uint8_t bt_channel;
uint8_t bt_zero[5];
};
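/*
* A minimal sketch of filling this in for an L2CAP socket; illustrative
* only, and the PSM value below is just an example, not something
* defined in this header:
*
*	struct sockaddr_bt sa;
*
*	memset(&sa, 0, sizeof(sa));
*	sa.bt_len = sizeof(sa);
*	sa.bt_family = AF_BLUETOOTH;
*	sa.bt_psm = 1;
*	bdaddr_copy(&sa.bt_bdaddr, BDADDR_ANY);
*/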
/* Five explicit NULs plus the string terminator yield the six zero bytes of a bdaddr_t */
#define BDADDR_ANY ((const bdaddr_t *) "\000\000\000\000\000")
#ifdef _KERNEL
#include <sys/protosw.h>
#include <sys/mallocvar.h>
MALLOC_DECLARE(M_BLUETOOTH);
/*
* Bluetooth Protocol API callback methods
*/
struct mbuf;
struct btproto {
void (*connecting)(void *);
void (*connected)(void *);
void (*disconnected)(void *, int);
void *(*newconn)(void *, struct sockaddr_bt *, struct sockaddr_bt *);
void (*complete)(void *, int);
void (*linkmode)(void *, int);
void (*input)(void *, struct mbuf *);
};
extern const struct pr_usrreqs hci_usrreqs;
extern const struct pr_usrreqs sco_usrreqs;
extern const struct pr_usrreqs l2cap_usrreqs;
extern const struct pr_usrreqs rfcomm_usrreqs;
extern kmutex_t *bt_lock;
/*
* Debugging stuff
*/
#ifdef BLUETOOTH_DEBUG
extern int bluetooth_debug;
# define DPRINTF(...) do { \
if (bluetooth_debug) { \
printf("%s: ", __func__); \
printf(__VA_ARGS__); \
} \
} while (/* CONSTCOND */0)
# define DPRINTFN(n, ...) do { \
if (bluetooth_debug > (n)) { \
printf("%s: ", __func__); \
printf(__VA_ARGS__); \
} \
} while (/* CONSTCOND */0)
# define UNKNOWN(value) \
printf("%s: %s = %d unknown!\n", __func__, #value, (value));
#else
# define DPRINTF(...) ((void)0)
# define DPRINTFN(...) ((void)0)
# define UNKNOWN(x) ((void)0)
#endif /* BLUETOOTH_DEBUG */
#endif /* _KERNEL */
#endif /* _NETBT_BLUETOOTH_H_ */
/* $NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
*
* NRL grants permission for redistribution and use in source and binary
* forms, with or without modification, of the software and documentation
* created at NRL provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgements:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* This product includes software developed at the Information
* Technology Division, US Naval Research Laboratory.
* 4. Neither the name of the NRL nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation
* are those of the authors and should not be interpreted as representing
* official policies, either expressed or implied, of the US Naval
* Research Laboratory (NRL).
*/
/*-
* Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
* This code is derived from software contributed to The NetBSD Foundation
* by Charles M. Hannum.
* This code is derived from software contributed to The NetBSD Foundation
* by Rui Paulo.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.219 2023/09/13 15:54:28 bouyer Exp $");
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_tcp_debug.h"
#endif
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#ifdef TCP_SIGNATURE
#include <sys/md5.h>
#endif
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/nd6.h>
#endif
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/key.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#endif
#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_congctl.h>
#include <netinet/tcp_debug.h>
#include <netinet/in_offload.h>
#include <netinet6/in6_offload.h>
/*
* Knob to enable Congestion Window Monitoring, and control
* the burst size it allows. Default burst is 4 packets, per
* the Internet draft.
*/
int tcp_cwm = 0;
int tcp_cwm_burstsize = 4;
int tcp_do_autosndbuf = 1;
int tcp_autosndbuf_inc = 8 * 1024;
int tcp_autosndbuf_max = 256 * 1024;
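/*
* These defaults are normally tuned at run time via sysctl rather than
* edited here; e.g. (assuming the usual net.inet.tcp sysctl names):
*
*	sysctl -w net.inet.tcp.cwm=1
*	sysctl -w net.inet.tcp.cwm_burstsize=4
*/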
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>
extern struct evcnt tcp_output_bigheader;
extern struct evcnt tcp_output_predict_hit;
extern struct evcnt tcp_output_predict_miss;
extern struct evcnt tcp_output_copysmall;
extern struct evcnt tcp_output_copybig;
extern struct evcnt tcp_output_refbig;
#define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++
#else
#define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */
#endif /* TCP_OUTPUT_COUNTERS */
static int
tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep,
bool *alwaysfragp)
{
struct inpcb *inp = tp->t_inpcb;
struct socket *so = NULL;
struct rtentry *rt;
struct ifnet *ifp;
int size;
int hdrlen;
int optlen;
*alwaysfragp = false;
size = tcp_mssdflt;
switch (tp->t_family) {
case AF_INET:
hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
break;
#ifdef INET6
case AF_INET6:
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
break;
#endif
default:
hdrlen = 1; /* prevent zero sized segments */
goto out;
}
rt = inpcb_rtentry(inp);
so = inp->inp_socket;
if (rt == NULL) {
goto out;
}
ifp = rt->rt_ifp;
if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
#ifdef INET6
if (inp->inp_af == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
/*
* RFC2460 section 5, last paragraph: if path MTU is
* smaller than 1280, use 1280 as packet size and
* attach fragment header.
*/
size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
*alwaysfragp = true;
} else
size = rt->rt_rmx.rmx_mtu - hdrlen;
#else
size = rt->rt_rmx.rmx_mtu - hdrlen;
#endif
} else if (ifp->if_flags & IFF_LOOPBACK)
size = ifp->if_mtu - hdrlen;
else if (inp->inp_af == AF_INET && tp->t_mtudisc)
size = ifp->if_mtu - hdrlen;
else if (inp->inp_af == AF_INET && in_localaddr(in4p_faddr(inp)))
size = ifp->if_mtu - hdrlen;
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
if (IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp))) {
/* mapped addr case */
struct in_addr d;
memcpy(&d, &in6p_faddr(inp).s6_addr32[3], sizeof(d));
if (tp->t_mtudisc || in_localaddr(d))
size = ifp->if_mtu - hdrlen;
} else {
/*
* for IPv6, path MTU discovery is always turned on,
* or the node must use packet size <= 1280.
*/
size = tp->t_mtudisc ? ifp->if_mtu : IPV6_MMTU;
size -= hdrlen;
}
}
#endif
inpcb_rtentry_unref(rt, inp);
out:
/*
* Now we must make room for whatever extra TCP/IP options are in
* the packet.
*/
optlen = tcp_optlen(tp);
/*
* XXX tp->t_ourmss should have the right size, but without this code
* fragmentation will occur... need more investigation
*/
if (inp->inp_af == AF_INET) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec4_hdrsiz_tcp(tp);
#endif
optlen += ip_optlen(inp);
}
#ifdef INET6
if (inp->inp_af == AF_INET6 && tp->t_family == AF_INET) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec4_hdrsiz_tcp(tp);
#endif
/* XXX size -= ip_optlen(in6p); */
} else if (inp->inp_af == AF_INET6) {
#if defined(IPSEC)
if (ipsec_used && !ipsec_pcb_skip_ipsec(inp->inp_sp, IPSEC_DIR_OUTBOUND))
optlen += ipsec6_hdrsiz_tcp(tp);
#endif
optlen += ip6_optlen(inp);
}
#endif
size -= optlen;
/*
* There may not be any room for data if mtu is too small. This
* includes zero-sized.
*/
if (size <= 0) {
return EMSGSIZE;
}
/*
* *rxsegsizep holds *estimated* inbound segment size (estimation
* assumes that path MTU is the same for both ways). this is only
* for silly window avoidance, do not use the value for other purposes.
*
* ipseclen is subtracted from both sides, this may not be right.
* I'm not quite sure about this (could someone comment).
*/
*txsegsizep = uimin(tp->t_peermss - optlen, size);
*rxsegsizep = uimin(tp->t_ourmss - optlen, size);
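/*
* Worked example with illustrative numbers: for IPv4 over an interface
* with a 1500-byte MTU, size starts at 1500 - 40 = 1460; a 12-byte
* timestamp option then leaves 1448, and *txsegsizep becomes
* min(peer MSS - 12, 1448).
*/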
/*
* Never send more than half a buffer full. This ensures that we can
* always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
* therefore acks will never be delayed unless we run out of data to
* transmit.
*/
if (so) {
*txsegsizep = uimin(so->so_snd.sb_hiwat >> 1, *txsegsizep);
}
/*
* A segment must at least store header + options
*/
if (*txsegsizep < hdrlen + optlen) {
return EMSGSIZE;
}
if (*txsegsizep != tp->t_segsz) {
/*
* If the new segment size is larger, we don't want to
* mess up the congestion window, but if it is smaller
* we'll have to reduce the congestion window to ensure
* that we don't get into trouble with initial windows
* and the rest. In any case, if the segment size
* has changed, chances are the path has, too, and
* our congestion window will be different.
*/
if (*txsegsizep < tp->t_segsz) {
tp->snd_cwnd = uimax((tp->snd_cwnd / tp->t_segsz)
* *txsegsizep, *txsegsizep);
tp->snd_ssthresh = uimax((tp->snd_ssthresh / tp->t_segsz)
* *txsegsizep, *txsegsizep);
}
tp->t_segsz = *txsegsizep;
}
return 0;
}
static int
tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
long len, int hdrlen, struct mbuf **mp)
{
struct mbuf *m, *m0;
uint64_t *tcps;
tcps = TCP_STAT_GETREF();
if (tp->t_force && len == 1)
tcps[TCP_STAT_SNDPROBE]++;
else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
tp->t_sndrexmitpack++;
tcps[TCP_STAT_SNDREXMITPACK]++;
tcps[TCP_STAT_SNDREXMITBYTE] += len;
} else {
tcps[TCP_STAT_SNDPACK]++;
tcps[TCP_STAT_SNDBYTE] += len;
}
TCP_STAT_PUTREF();
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (__predict_false(m == NULL))
return ENOBUFS;
MCLAIM(m, &tcp_tx_mowner);
/*
* XXX Because other code assumes headers will fit in
* XXX one header mbuf.
*
* (This code should almost *never* be run.)
*/
if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
return ENOBUFS;
}
}
m->m_data += max_linkhdr;
m->m_len = hdrlen;
/*
* To avoid traversing the whole sb_mb chain for correct
* data to send, remember last sent mbuf, its offset and
* the sent size. When called the next time, see if the
* data to send is directly following the previous transfer.
* This is important for large TCP windows.
*/
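/*
* For example (illustrative only): a burst of calls with off = 0, 1448,
* 2896, ... keeps taking the fast path below, because each offset picks
* up exactly where the cached (t_lastm, t_lastoff, t_lastlen) state
* left off, instead of rescanning the chain from sb_mb.
*/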
if (off == 0 || tp->t_lastm == NULL ||
(tp->t_lastoff + tp->t_lastlen) != off) {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
/*
* Either a new packet or a retransmit.
* Start from the beginning.
*/
tp->t_lastm = so->so_snd.sb_mb;
tp->t_inoff = off;
} else {
TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
tp->t_inoff += tp->t_lastlen;
}
/* Traverse forward to next packet */
while (tp->t_inoff > 0) {
if (tp->t_lastm == NULL)
panic("tp->t_lastm == NULL"); if (tp->t_inoff < tp->t_lastm->m_len)
break;
tp->t_inoff -= tp->t_lastm->m_len;
tp->t_lastm = tp->t_lastm->m_next;
}
tp->t_lastoff = off;
tp->t_lastlen = len;
m0 = tp->t_lastm;
off = tp->t_inoff;
if (len <= M_TRAILINGSPACE(m)) {
m_copydata(m0, off, (int)len, mtod(m, char *) + hdrlen);
m->m_len += len;
TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
} else {
m->m_next = m_copym(m0, off, (int)len, M_DONTWAIT);
if (m->m_next == NULL) {
m_freem(m);
return ENOBUFS;
}
#ifdef TCP_OUTPUT_COUNTERS
if (m->m_next->m_flags & M_EXT)
TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
else
TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
#endif
}
*mp = m;
return 0;
}
/*
* Tcp output routine: figure out what should be sent and send it.
*/
int
tcp_output(struct tcpcb *tp)
{
struct rtentry *rt = NULL;
struct socket *so;
struct route *ro;
long len, win;
int off, flags, error;
struct mbuf *m;
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
#endif
struct tcphdr *th;
u_char opt[MAX_TCPOPTLEN], *optp;
#define OPT_FITS(more) ((optlen + (more)) <= sizeof(opt))
unsigned optlen, hdrlen, packetlen;
unsigned int sack_numblks;
int idle, sendalot, txsegsize, rxsegsize;
int txsegsize_nosack;
int maxburst = TCP_MAXBURST;
int af; /* address family on the wire */
int iphdrlen;
int has_tso4, has_tso6;
int has_tso, use_tso;
bool alwaysfrag;
int sack_rxmit;
int sack_bytes_rxmt;
int ecn_tos;
struct sackhole *p;
#ifdef TCP_SIGNATURE
int sigoff = 0;
#endif
uint64_t *tcps;
so = tp->t_inpcb->inp_socket;
ro = &tp->t_inpcb->inp_route;
switch (af = tp->t_family) {
case AF_INET:
case AF_INET6:
if (tp->t_inpcb)
break;
return EINVAL;
default:
return EAFNOSUPPORT;
}
if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag))
return EMSGSIZE;
idle = (tp->snd_max == tp->snd_una);
/*
* Determine if we can use TCP segmentation offload:
* - If we're using IPv4
* - If there is not an IPsec policy that prevents it
* - If the interface can do it
*/
has_tso4 = has_tso6 = false;
has_tso4 = tp->t_inpcb->inp_af == AF_INET &&
#if defined(IPSEC)
(!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) &&
#endif
(rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
(rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0;
if (rt != NULL) {
rtcache_unref(rt, &tp->t_inpcb->inp_route);
rt = NULL;
}
#if defined(INET6)
has_tso6 = tp->t_inpcb->inp_af == AF_INET6 &&
#if defined(IPSEC)
(!ipsec_used || ipsec_pcb_skip_ipsec(tp->t_inpcb->inp_sp, IPSEC_DIR_OUTBOUND)) &&
#endif
(rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
(rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0;
if (rt != NULL)
rtcache_unref(rt, &tp->t_inpcb->inp_route);
#endif /* defined(INET6) */
has_tso = (has_tso4 || has_tso6) && !alwaysfrag;
/*
* Restart Window computation. From draft-floyd-incr-init-win-03:
*
* Optionally, a TCP MAY set the restart window to the
* minimum of the value used for the initial window and
* the current value of cwnd (in other words, using a
* larger value for the restart window should never increase
* the size of cwnd).
*/
if (tcp_cwm) {
/*
* Hughes/Touch/Heidemann Congestion Window Monitoring.
* Count the number of packets currently pending
* acknowledgement, and limit our congestion window
* to a pre-determined allowed burst size plus that count.
* This prevents bursting once all pending packets have
* been acknowledged (i.e. transmission is idle).
*
* XXX Link this to Initial Window?
*/
tp->snd_cwnd = uimin(tp->snd_cwnd,
(tcp_cwm_burstsize * txsegsize) +
(tp->snd_nxt - tp->snd_una));
} else {
if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
/*
* We have been idle for "a while" and no acks are
* expected to clock out any data we send --
* slow start to get ack "clock" running again.
*/
int ss = tcp_init_win;
if (tp->t_inpcb->inp_af == AF_INET &&
in_localaddr(in4p_faddr(tp->t_inpcb)))
ss = tcp_init_win_local;
#ifdef INET6
else if (tp->t_inpcb->inp_af == AF_INET6 && in6_localaddr(&in6p_faddr(tp->t_inpcb)))
ss = tcp_init_win_local;
#endif
tp->snd_cwnd = uimin(tp->snd_cwnd,
TCP_INITIAL_WINDOW(ss, txsegsize));
}
}
txsegsize_nosack = txsegsize;
again:
ecn_tos = 0;
use_tso = has_tso;
if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) {
/* don't duplicate CWR/ECE. */
use_tso = 0;
}
TCP_REASS_LOCK(tp);
sack_numblks = tcp_sack_numblks(tp);
if (sack_numblks) {
int sackoptlen;
sackoptlen = TCP_SACK_OPTLEN(sack_numblks);
if (sackoptlen > txsegsize_nosack) {
sack_numblks = 0; /* give up SACK */
txsegsize = txsegsize_nosack;
} else {
if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
/* don't duplicate D-SACK. */
use_tso = 0;
}
txsegsize = txsegsize_nosack - sackoptlen;
}
} else {
txsegsize = txsegsize_nosack;
}
/*
* Determine length of data that should be transmitted, and
* flags that should be used. If there is some data or critical
* controls (SYN, RST) to send, then transmit; otherwise,
* investigate further.
*
* Readjust SACK information to avoid resending duplicate data.
*/
if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
win = uimin(tp->snd_wnd, tp->snd_cwnd);
flags = tcp_outflags[tp->t_state];
/*
* Send any SACK-generated retransmissions. If we're explicitly trying
* to send out new data (when sendalot is 1), bypass this function.
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
* we're replacing a (future) new transmission with a retransmission
* now, and we previously incremented snd_cwnd in tcp_input().
*/
/*
* Still in sack recovery, reset rxmit flag to zero.
*/
sack_rxmit = 0;
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
do {
long cwin;
if (!TCP_SACK_ENABLED(tp))
break;
if (tp->t_partialacks < 0)
break;
p = tcp_sack_output(tp, &sack_bytes_rxmt);
if (p == NULL)
break;
cwin = uimin(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
/* Do not retransmit SACK segments beyond snd_recover */
if (SEQ_GT(p->end, tp->snd_recover)) {
/*
* (At least) part of sack hole extends beyond
* snd_recover. Check to see if we can rexmit data
* for this hole.
*/
if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
/*
* Can't rexmit any more data for this hole.
* That data will be rexmitted in the next
* sack recovery episode, when snd_recover
* moves past p->rxmit.
*/
p = NULL;
break;
}
/* Can rexmit part of the current hole */
len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
} else
len = ((long)ulmin(cwin, p->end - p->rxmit));
off = p->rxmit - tp->snd_una;
if (off + len > so->so_snd.sb_cc) {
/* 1 for TH_FIN */
KASSERT(off + len == so->so_snd.sb_cc + 1);
KASSERT(p->rxmit + len == tp->snd_max);
len = so->so_snd.sb_cc - off;
}
if (len > 0) {
sack_rxmit = 1;
sendalot = 1;
}
} while (/*CONSTCOND*/0);
/*
* If in persist timeout with window of 0, send 1 byte.
* Otherwise, if window is small but nonzero
* and timer expired, we will send what we can
* and go to transmit state.
*/
if (tp->t_force) {
if (win == 0) {
/*
* If we still have some data to send, then
* clear the FIN bit. Usually this would
* happen below when it realizes that we
* aren't sending all the data. However,
* if we have exactly 1 byte of unsent data,
* then it won't clear the FIN bit below,
* and if we are in persist state, we wind
* up sending the packet without recording
* that we sent the FIN bit.
*
* We can't just blindly clear the FIN bit,
* because if we don't have any more data
* to send then the probe will be the FIN
* itself.
*/
if (off < so->so_snd.sb_cc)
flags &= ~TH_FIN;
win = 1;
} else {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
}
if (sack_rxmit == 0) {
if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) {
long cwin;
/*
* We are inside of a SACK recovery episode and are
* sending new data, having retransmitted all the
* data possible in the scoreboard.
*/
if (tp->snd_wnd < so->so_snd.sb_cc) {
len = tp->snd_wnd - off;
flags &= ~TH_FIN;
} else {
len = so->so_snd.sb_cc - off;
}
/*
* From FreeBSD:
* Don't remove this (len > 0) check !
* We explicitly check for len > 0 here (although it
* isn't really necessary), to work around a gcc
* optimization issue - to force gcc to compute
* len above. Without this check, the computation
* of len is bungled by the optimizer.
*/
if (len > 0) {
cwin = tp->snd_cwnd -
(tp->snd_nxt - tp->sack_newdata) -
sack_bytes_rxmt;
if (cwin < 0)
cwin = 0;
if (cwin < len) {
len = cwin;
flags &= ~TH_FIN;
}
}
} else if (win < so->so_snd.sb_cc) {
len = win - off;
flags &= ~TH_FIN;
} else {
len = so->so_snd.sb_cc - off;
}
}
if (len < 0) {
/*
* If FIN has been sent but not acked,
* but we haven't been called to retransmit,
* len will be -1. Otherwise, window shrank
* after we sent into it. If window shrank to 0,
* cancel pending retransmit, pull snd_nxt back
* to (closed) window, and set the persist timer
* if it isn't already going. If the window didn't
* close completely, just wait for an ACK.
*
* If we have a pending FIN, either it has already been
* transmitted or it is outside the window, so drop it.
* If the FIN has been transmitted, but this is not a
* retransmission, then len must be -1. Therefore we also
* prevent here the sending of `gratuitous FINs'. This
* eliminates the need to check for that case below (e.g.
* to back up snd_nxt before the FIN so that the sequence
* number is correct).
*/
len = 0;
flags &= ~TH_FIN;
if (win == 0) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rxtshift = 0;
tp->snd_nxt = tp->snd_una;
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
tcp_setpersist(tp);
}
}
/*
* Automatic sizing enables the performance of large buffers
* and most of the efficiency of small ones by only allocating
* space when it is needed.
*
* The criteria to step up the send buffer one notch are:
* 1. receive window of remote host is larger than send buffer
* (with a fudge factor of 5/4th);
* 2. send buffer is filled to 7/8th with data (so we actually
* have data to make use of it);
* 3. send buffer fill has not hit maximal automatic size;
* 4. our send window (slow start and congestion controlled) is
* larger than sent but unacknowledged data in send buffer.
*
* The remote host receive window scaling factor may limit the
* growing of the send buffer before it reaches its allowed
* maximum.
*
* It scales directly with slow start or congestion window
* and does at most one step per received ACK. This fast
* scaling has the drawback of growing the send buffer beyond
* what is strictly necessary to make full use of a given
* delay*bandwidth product. However testing has shown this not
* to be much of a problem. At worst we are trading wasting
* of available bandwidth (the non-use of it) for wasting some
* socket buffer memory.
*
* TODO: Shrink send buffer during idle periods together
* with congestion window. Requires another timer.
*/
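/*
* Concrete example with illustrative numbers: with sb_hiwat at 32k and
* the peer advertising a 64k window, criterion 1 holds (80k >= 32k);
* once about 28k is buffered and the send window covers the
* unacknowledged data, sbreserve() below grows sb_hiwat by
* tcp_autosndbuf_inc (8k by default) up to tcp_autosndbuf_max (256k).
*/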
if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
so->so_snd.sb_cc < tcp_autosndbuf_max &&
win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
if (!sbreserve(&so->so_snd,
uimin(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
tcp_autosndbuf_max), so))
so->so_snd.sb_flags &= ~SB_AUTOSIZE;
}
}
if (len > txsegsize) {
if (use_tso) {
/*
* Truncate TSO transfers to IP_MAXPACKET, and make
* sure that we send equal size transfers down the
* stack (rather than big-small-big-small-...).
*/
#ifdef INET6
CTASSERT(IPV6_MAXPACKET == IP_MAXPACKET);
#endif
len = (uimin(len, IP_MAXPACKET) / txsegsize) * txsegsize;
if (len <= txsegsize) {
use_tso = 0;
}
} else
len = txsegsize;
flags &= ~TH_FIN;
sendalot = 1;
} else
use_tso = 0;
if (sack_rxmit) {
if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
flags &= ~TH_FIN;
}
win = sbspace(&so->so_rcv);
/*
* Sender silly window avoidance. Send if the connection is idle
* and we can send all of the data, if we can send at least a
* full-sized segment, or if we are being forced to send;
* otherwise don't bother.
* If peer's buffer is tiny, then send
* when window is at least half open.
* If retransmitting (possibly after persist timer forced us
* to send into a small window), then must resend.
*/
if (len) {
if (len >= txsegsize)
goto send;
if ((so->so_state & SS_MORETOCOME) == 0 &&
((idle || tp->t_flags & TF_NODELAY) &&
len + off >= so->so_snd.sb_cc))
goto send;
if (tp->t_force)
goto send;
if (len >= tp->max_sndwnd / 2)
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max))
goto send;
if (sack_rxmit)
goto send;
}
/*
* Compare available window to amount of window known to peer
* (as advertised window less next expected input). If the
* difference is at least twice the size of the largest segment
* we expect to receive (i.e. two segments) or at least 50% of
* the maximum possible window, then want to send a window update
* to peer.
*/
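/*
* For example (illustrative): with rxsegsize of 1448 bytes, a window
* opening of 2896 bytes or more, or one amounting to at least half of
* sb_hiwat, is advertised immediately rather than waiting.
*/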
if (win > 0) {
/*
* "adv" is the amount we can increase the window,
* taking into account that we are limited by
* TCP_MAXWIN << tp->rcv_scale.
*/
long recwin = uimin(win, (long)TCP_MAXWIN << tp->rcv_scale);
long oldwin, adv;
/*
* rcv_nxt may overtake rcv_adv when we accept a
* zero-window probe.
*/
if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt))
oldwin = tp->rcv_adv - tp->rcv_nxt;
else
oldwin = 0;
/*
* If the new window size ends up being the same as or
* less than the old size when it is scaled, then
* don't force a window update.
*/
if (recwin >> tp->rcv_scale <= oldwin >> tp->rcv_scale)
goto dontupdate;
adv = recwin - oldwin;
if (adv >= (long) (2 * rxsegsize))
goto send;
if (2 * adv >= (long) so->so_rcv.sb_hiwat)
goto send;
}
dontupdate:
/*
* Send if we owe peer an ACK.
*/
if (tp->t_flags & TF_ACKNOW)
goto send;
if (flags & (TH_SYN|TH_FIN|TH_RST))
goto send;
if (SEQ_GT(tp->snd_up, tp->snd_una))
goto send;
/*
* In SACK, it is possible for tcp_output to fail to send a segment
* after the retransmission timer has been turned off. Make sure
* that the retransmission timer is set.
*/
if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
!TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
!TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
goto just_return;
}
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to ensure receipt of window
* updates. The three ``states'' for the output side are:
* idle not doing retransmits or persists
* persisting to move a small or zero window
* (re)transmitting and thereby not persisting
*
* tp->t_timer[TCPT_PERSIST]
* is set when we are in persist state.
* tp->t_force
* is set when we are called to send a persist packet.
* tp->t_timer[TCPT_REXMT]
* is set when we are retransmitting
* The output side is idle when both timers are zero.
*
* If send window is too small, there is data to transmit, and no
* retransmit or persist is pending, then go to persist state.
* If nothing happens soon, send when timer expires:
* if window is nonzero, transmit what we can,
* otherwise force out a byte.
*/
if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
/*
* No reason to send a segment, just return.
*/
just_return:
TCP_REASS_UNLOCK(tp);
return 0;
send:
/*
* Before ESTABLISHED, force sending of initial options unless TCP set
* not to do any options.
*
* Note: we assume that the IP/TCP header plus TCP options always fit
* in a single mbuf, leaving room for a maximum link header, i.e.:
* max_linkhdr + IP_header + TCP_header + optlen <= MCLBYTES
*/
optlen = 0;
optp = opt;
switch (af) {
case AF_INET:
iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
break;
#ifdef INET6
case AF_INET6:
iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
break;
#endif
default: /*pacify gcc*/
iphdrlen = 0;
break;
}
hdrlen = iphdrlen;
if (flags & TH_SYN) {
struct rtentry *synrt;
synrt = inpcb_rtentry(tp->t_inpcb);
tp->snd_nxt = tp->iss;
tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ? synrt->rt_ifp : NULL, af);
inpcb_rtentry_unref(synrt, tp->t_inpcb);
if ((tp->t_flags & TF_NOOPT) == 0 && OPT_FITS(TCPOLEN_MAXSEG)) {
*optp++ = TCPOPT_MAXSEG;
*optp++ = TCPOLEN_MAXSEG;
*optp++ = (tp->t_ourmss >> 8) & 0xff;
*optp++ = tp->t_ourmss & 0xff;
optlen += TCPOLEN_MAXSEG;
if ((tp->t_flags & TF_REQ_SCALE) && ((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE)) &&
OPT_FITS(TCPOLEN_WINDOW + TCPOLEN_NOP)) {
*((uint32_t *)optp) = htonl(
TCPOPT_NOP << 24 |
TCPOPT_WINDOW << 16 |
TCPOLEN_WINDOW << 8 |
tp->request_r_scale);
optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
}
if (tcp_do_sack && OPT_FITS(TCPOLEN_SACK_PERMITTED)) {
*optp++ = TCPOPT_SACK_PERMITTED;
*optp++ = TCPOLEN_SACK_PERMITTED;
optlen += TCPOLEN_SACK_PERMITTED;
}
}
}
/*
* Send a timestamp and echo-reply if this is a SYN and our side
* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
* and our peer have sent timestamps in our SYN's.
*/
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
(flags & TH_RST) == 0 &&
((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
(tp->t_flags & TF_RCVD_TSTMP))) {
int alen = 0;
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
alen++;
}
if (OPT_FITS(TCPOLEN_TIMESTAMP)) {
*optp++ = TCPOPT_TIMESTAMP;
*optp++ = TCPOLEN_TIMESTAMP;
uint32_t *lp = (uint32_t *)optp;
/* Form timestamp option (appendix A of RFC 1323) */
*lp++ = htonl(TCP_TIMESTAMP(tp));
*lp = htonl(tp->ts_recent);
optp += TCPOLEN_TIMESTAMP - 2;
optlen += TCPOLEN_TIMESTAMP;
/* Set receive buffer autosizing timestamp. */
if (tp->rfbuf_ts == 0 &&
(so->so_rcv.sb_flags & SB_AUTOSIZE))
tp->rfbuf_ts = TCP_TIMESTAMP(tp);
} else {
optp -= alen;
optlen -= alen;
}
}
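/*
* The "optlen % 4 != 2" padding above places the 10-byte timestamp
* option so that, together with the leading NOPs, it fills a 12-byte,
* 4-byte-aligned chunk, matching the layout from appendix A of
* RFC 1323.
*/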
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE) {
/*
* Initialize TCP-MD5 option (RFC2385)
*/
if (!OPT_FITS(TCPOLEN_SIGNATURE))
goto reset;
*optp++ = TCPOPT_SIGNATURE;
*optp++ = TCPOLEN_SIGNATURE;
sigoff = optlen + 2;
memset(optp, 0, TCP_SIGLEN);
optlen += TCPOLEN_SIGNATURE;
optp += TCP_SIGLEN;
}
#endif
/*
* Tack on the SACK block if it is necessary.
*/
if (sack_numblks) {
int alen = 0;
int sack_len = sack_numblks * 8;
while (optlen % 4 != 2) {
optlen += TCPOLEN_NOP;
*optp++ = TCPOPT_NOP;
alen++;
}
if (OPT_FITS(sack_len + 2)) {
struct ipqent *tiqe;
*optp++ = TCPOPT_SACK;
*optp++ = sack_len + 2;
uint32_t *lp = (uint32_t *)optp;
if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
sack_numblks--;
*lp++ = htonl(tp->rcv_dsack_block.left);
*lp++ = htonl(tp->rcv_dsack_block.right);
tp->rcv_sack_flags &= ~TCPSACK_HAVED;
}
for (tiqe = TAILQ_FIRST(&tp->timeq);
sack_numblks > 0;
tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
KASSERT(tiqe != NULL);
sack_numblks--;
*lp++ = htonl(tiqe->ipqe_seq);
*lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
}
optlen += sack_len + 2;
optp += sack_len;
} else {
optp -= alen;
optlen -= alen;
}
}
/* Terminate and pad TCP options to a 4 byte boundary. */
if (optlen % 4) {
if (!OPT_FITS(TCPOLEN_EOL)) {
reset:
TCP_REASS_UNLOCK(tp);
error = ECONNABORTED;
goto out;
}
optlen += TCPOLEN_EOL;
*optp++ = TCPOPT_EOL;
}
/*
* According to RFC 793 (STD0007):
* "The content of the header beyond the End-of-Option option
* must be header padding (i.e., zero)."
* and later: "The padding is composed of zeros."
*/
while (optlen % 4) {
if (!OPT_FITS(TCPOLEN_PAD))
goto reset;
optlen += TCPOLEN_PAD;
*optp++ = TCPOPT_PAD;
}
TCP_REASS_UNLOCK(tp);
hdrlen += optlen;
#ifdef DIAGNOSTIC
if (!use_tso && len > txsegsize)
panic("tcp data to be sent is larger than segment");
else if (use_tso && len > IP_MAXPACKET)
panic("tcp data to be sent is larger than max TSO size");
if (max_linkhdr + hdrlen > MCLBYTES)
panic("tcphdr too big");
#endif
/*
* Grab a header mbuf, attaching a copy of data to
* be transmitted, and initialize the header from
* the template for sends on this connection.
*/
if (len) {
error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
if (error)
goto out;
/*
* If we're sending everything we've got, set PUSH.
* (This will keep happy those implementations which only
* give data to the user when a buffer fills or
* a PUSH comes in.)
*/
if (off + len == so->so_snd.sb_cc)
flags |= TH_PUSH;
} else {
tcps = TCP_STAT_GETREF();
if (tp->t_flags & TF_ACKNOW)
tcps[TCP_STAT_SNDACKS]++;
else if (flags & (TH_SYN|TH_FIN|TH_RST))
tcps[TCP_STAT_SNDCTRL]++;
else if (SEQ_GT(tp->snd_up, tp->snd_una))
tcps[TCP_STAT_SNDURG]++;
else
tcps[TCP_STAT_SNDWINUP]++;
TCP_STAT_PUTREF();
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_freem(m);
m = NULL;
}
}
if (m == NULL) {
error = ENOBUFS;
goto out;
}
MCLAIM(m, &tcp_tx_mowner);
m->m_data += max_linkhdr;
m->m_len = hdrlen;
}
m_reset_rcvif(m);
switch (af) {
case AF_INET:
ip = mtod(m, struct ip *);
#ifdef INET6
ip6 = NULL;
#endif
th = (struct tcphdr *)(ip + 1);
break;
#ifdef INET6
case AF_INET6:
ip = NULL;
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)(ip6 + 1);
break;
#endif
default: /*pacify gcc*/
ip = NULL;
#ifdef INET6
ip6 = NULL;
#endif
th = NULL;
break;
}
if (tp->t_template == NULL)
panic("%s: no template", __func__);
if (tp->t_template->m_len < iphdrlen)
panic("%s: %d < %d", __func__, tp->t_template->m_len, iphdrlen);
bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen);
/*
* If we are starting a connection, send ECN setup
* SYN packet. If we are on a retransmit, we may
* resend those bits a number of times as per
* RFC 3168.
*/
if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
if (tp->t_flags & TF_SYN_REXMT) {
if (tp->t_ecn_retries--)
flags |= TH_ECE|TH_CWR;
} else {
flags |= TH_ECE|TH_CWR;
tp->t_ecn_retries = tcp_ecn_maxretries;
}
}
if (TCP_ECN_ALLOWED(tp)) {
/*
* If the peer has ECN, mark data packets
* ECN capable. Ignore pure ack packets, retransmissions
* and window probes.
*/
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
!(tp->t_force && len == 1)) {
ecn_tos = IPTOS_ECN_ECT0;
TCP_STATINC(TCP_STAT_ECN_ECT);
}
/*
* Reply with proper ECN notifications.
*/
if (tp->t_flags & TF_ECN_SND_CWR) {
flags |= TH_CWR;
tp->t_flags &= ~TF_ECN_SND_CWR;
}
if (tp->t_flags & TF_ECN_SND_ECE) {
flags |= TH_ECE;
}
}
/*
* If we are doing retransmissions, then snd_nxt will
* not reflect the first unsent octet. For ACK only
* packets, we do not want the sequence number of the
* retransmitted packet, we want the sequence number
* of the next unsent octet. So, if there is no data
* (and no SYN or FIN), use snd_max instead of snd_nxt
* when filling in ti_seq. But if we are in persist
* state, snd_max might reflect one byte beyond the
* right edge of the window, so use snd_nxt in that
* case, since we know we aren't doing a retransmission.
* (retransmit and persist are mutually exclusive...)
*/
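/*
* Illustrative example (hypothetical sequence numbers): if snd_una ==
* snd_nxt == 100 during a retransmission and snd_max is 300, a pure
* ACK built here carries sequence number 300 (snd_max) rather than
* 100, so the peer does not mistake it for old data.
*/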
if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
th->th_seq = htonl(p->rxmit);
p->rxmit += len;
} else {
if (len || (flags & (TH_SYN|TH_FIN)) ||
TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
}
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
memcpy(th + 1, opt, optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
th->th_flags = flags;
/*
* Calculate receive window. Don't shrink window,
* but avoid silly window syndrome.
*/
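/*
* Worked example (hypothetical values): with sb_hiwat of 32768, any
* window below 8192 (one quarter) that is also below rxsegsize is
* advertised as 0; with rcv_scale of 2, a usable window of 40000
* bytes is advertised as 40000 >> 2 == 10000.
*/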
if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
win = 0;
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
if (th->th_win == 0) {
tp->t_sndzerowin++;
}
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
u_int32_t urp = tp->snd_up - tp->snd_nxt;
if (urp > IP_MAXPACKET)
urp = IP_MAXPACKET;
th->th_urp = htons((u_int16_t)urp);
th->th_flags |= TH_URG;
} else
/*
* If no urgent pointer to send, then we pull
* the urgent pointer to the left edge of the send window
* so that it doesn't drift into the send window on sequence
* number wraparound.
*/
tp->snd_up = tp->snd_una; /* drag it along */
#ifdef TCP_SIGNATURE
if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
struct secasvar *sav;
u_int8_t *sigp;
sav = tcp_signature_getsav(m);
if (sav == NULL) {
if (m)
m_freem(m);
return EPERM;
}
m->m_pkthdr.len = hdrlen + len;
sigp = (char *)th + sizeof(*th) + sigoff;
tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp);
key_sa_recordxfer(sav, m);
KEY_SA_UNREF(&sav);
}
#endif
/*
* Set ourselves up to be checksummed just before the packet
* hits the wire.
*/
switch (af) {
case AF_INET:
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
if (use_tso) {
m->m_pkthdr.segsz = txsegsize;
m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
} else {
m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
if (len + optlen) {
/* Fixup the pseudo-header checksum. */
/* XXXJRT Not IP Jumbogram safe. */
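/*
* Example (hypothetical lengths): for 1000 bytes of payload
* and 12 bytes of options, htons(1012) is folded into the
* partial pseudo-header sum inherited from the template.
*/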
th->th_sum = in_cksum_addword(th->th_sum,
htons((u_int16_t) (len + optlen)));
}
}
break;
#ifdef INET6
case AF_INET6:
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
if (use_tso) {
m->m_pkthdr.segsz = txsegsize;
m->m_pkthdr.csum_flags = M_CSUM_TSOv6;
} else {
m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
if (len + optlen) {
/* Fixup the pseudo-header checksum. */
/* XXXJRT: Not IPv6 Jumbogram safe. */
th->th_sum = in_cksum_addword(th->th_sum,
htons((u_int16_t) (len + optlen)));
}
}
break;
#endif
}
/*
* In transmit state, time the transmission and arrange for
* the retransmit. In persist state, just set snd_max.
*/
if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
tcp_seq startseq = tp->snd_nxt;
/*
* Advance snd_nxt over sequence space of this segment.
* There are no states in which we send both a SYN and a FIN,
* so we collapse the tests for these flags.
*/
if (flags & (TH_SYN|TH_FIN))
tp->snd_nxt++;
if (sack_rxmit)
goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
/*
* Time this transmission if not a retransmission and
* not currently timing anything.
*/
if (tp->t_rtttime == 0) {
tp->t_rtttime = tcp_now;
tp->t_rtseq = startseq;
TCP_STATINC(TCP_STAT_SEGSTIMED);
}
}
/*
* Set retransmit timer if not currently set,
* and not doing an ack or a keep-alive probe.
* Initial value for retransmit timer is smoothed
* round-trip time + 2 * round-trip time variance.
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
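/*
* For illustration (hypothetical timings, in timer ticks): with a
* smoothed RTT estimate of 4 ticks and an RTT variance of 2 ticks,
* the initial retransmit timeout is roughly 4 + 2*2 = 8 ticks; each
* backoff step increases t_rxtshift and scales the timeout up via
* tcp_backoff[], subject to the usual clamping.
*/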
timer:
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) {
if ((sack_rxmit && tp->snd_nxt != tp->snd_max) ||
tp->snd_nxt != tp->snd_una) {
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
tp->t_rxtshift = 0;
}
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
} else if (len == 0 && so->so_snd.sb_cc > 0 &&
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
/*
* If we are sending a window probe and there's
* unacked data in the socket, make sure at
* least the persist timer is running.
*/
tp->t_rxtshift = 0;
tcp_setpersist(tp);
}
}
} else
if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
tp->snd_max = tp->snd_nxt + len;
#ifdef TCP_DEBUG
/*
* Trace.
*/
if (so->so_options & SO_DEBUG)
tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
#endif
/*
* Fill in IP length and desired time to live and
* send to IP level. There should be a better way
* to handle ttl and tos; we could keep them in
* the template, but need a way to checksum without them.
*/
m->m_pkthdr.len = hdrlen + len;
switch (af) {
case AF_INET:
ip->ip_len = htons(m->m_pkthdr.len);
packetlen = m->m_pkthdr.len;
if (tp->t_inpcb->inp_af == AF_INET) {
ip->ip_ttl = in4p_ip(tp->t_inpcb).ip_ttl;
ip->ip_tos = in4p_ip(tp->t_inpcb).ip_tos | ecn_tos;
}
#ifdef INET6
else if (tp->t_inpcb->inp_af == AF_INET6) {
ip->ip_ttl = in6pcb_selecthlim(tp->t_inpcb, NULL); /*XXX*/
ip->ip_tos = ecn_tos; /*XXX*/
}
#endif
break;
#ifdef INET6
case AF_INET6:
packetlen = m->m_pkthdr.len;
ip6->ip6_nxt = IPPROTO_TCP;
if (tp->t_family == AF_INET6) {
/*
* we separately set hoplimit for every segment, since
* the user might want to change the value via
* setsockopt. Also, desired default hop limit might
* be changed via Neighbor Discovery.
*/
ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb);
}
ip6->ip6_flow |= htonl(ecn_tos << 20);
/* ip6->ip6_flow = ??? (from template) */
/* ip6_plen will be filled in ip6_output(). */
break;
#endif
default: /*pacify gcc*/
packetlen = 0;
break;
}
switch (af) {
case AF_INET:
{
struct mbuf *opts;
if (tp->t_inpcb->inp_af == AF_INET)
opts = tp->t_inpcb->inp_options;
else
opts = NULL;
error = ip_output(m, opts, ro,
(tp->t_mtudisc ? IP_MTUDISC : 0) |
(so->so_options & SO_DONTROUTE), NULL, tp->t_inpcb);
break;
}
#ifdef INET6
case AF_INET6:
{
struct ip6_pktopts *opts;
if (tp->t_inpcb->inp_af == AF_INET6)
opts = in6p_outputopts(tp->t_inpcb);
else
opts = NULL;
error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE,
NULL, tp->t_inpcb, NULL);
break;
}
#endif
default:
error = EAFNOSUPPORT;
break;
}
if (error) {
out:
if (error == ENOBUFS) {
TCP_STATINC(TCP_STAT_SELFQUENCH);
tcp_quench(tp->t_inpcb);
error = 0;
} else if ((error == EHOSTUNREACH || error == ENETDOWN ||
error == EHOSTDOWN) && TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_softerror = error;
error = 0;
}
/* Back out the sequence number advance. */
if (sack_rxmit)
p->rxmit -= len;
/* Restart the delayed ACK timer, if necessary. */
if (tp->t_flags & TF_DELACK)
TCP_RESTART_DELACK(tp);
return error;
}
if (packetlen > tp->t_pmtud_mtu_sent)
tp->t_pmtud_mtu_sent = packetlen;
tcps = TCP_STAT_GETREF();
tcps[TCP_STAT_SNDTOTAL]++;
if (tp->t_flags & TF_DELACK)
tcps[TCP_STAT_DELACK]++;
TCP_STAT_PUTREF();
/*
* Data sent (as far as we can tell).
* If this advertises a larger window than any other segment,
* then remember the size of the advertised window.
* Any pending ACK has now been sent.
*/
if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
tp->rcv_adv = tp->rcv_nxt + win;
tp->last_ack_sent = tp->rcv_nxt;
tp->t_flags &= ~TF_ACKNOW;
TCP_CLEAR_DELACK(tp);
#ifdef DIAGNOSTIC
if (maxburst < 0)
printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
#endif
if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
goto again;
return 0;
}
void
tcp_setpersist(struct tcpcb *tp)
{
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
int nticks;
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
panic("tcp_output REXMT");
/*
* Start/restart persistence timer.
*/
if (t < tp->t_rttmin)
t = tp->t_rttmin;
TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
TCPTV_PERSMIN, TCPTV_PERSMAX);
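/*
* Example (hypothetical values, assuming the stock backoff table
* where tcp_backoff[3] == 8): with t of 10 ticks and t_rxtshift of 3,
* the raw interval is 80 ticks, which TCPT_RANGESET above clamps
* into [TCPTV_PERSMIN, TCPTV_PERSMAX].
*/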
TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
tp->t_rxtshift++;
}
/* $NetBSD: secmodel.c,v 1.2 2014/11/04 16:01:58 maxv Exp $ */
/*-
* Copyright (c) 2011 Elad Efrat <elad@NetBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <sys/types.h>
#include <sys/param.h>
#include <sys/errno.h>
#include <sys/atomic.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <secmodel/secmodel.h>
#include <prop/proplib.h>
/* List of secmodels, parameters, and lock. */
static LIST_HEAD(, secmodel_descr) secmodels =
LIST_HEAD_INITIALIZER(secmodels);
static unsigned int secmodel_copy_cred_on_fork = false;
static krwlock_t secmodels_lock;
static int nsecmodels = 0; /* number of registered secmodels */
static int secmodel_plug(secmodel_t);
static int secmodel_unplug(secmodel_t);
int
secmodel_nsecmodels(void)
{
return nsecmodels;
}
void
secmodel_init(void)
{
rw_init(&secmodels_lock);
secmodel_copy_cred_on_fork = false;
}
/*
* Register a new secmodel.
*/
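/*
* Illustrative caller (hypothetical secmodel "foo", not part of this
* file): a secmodel would typically register itself from its module
* init routine, e.g.
*
*	prop_dictionary_t behavior = prop_dictionary_create();
*	prop_dictionary_set_bool(behavior, "copy-cred-on-fork", true);
*	error = secmodel_register(&foo_sm, "org.example.foo", "foo",
*	    behavior, foo_eval, NULL);
*/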
int
secmodel_register(secmodel_t *secmodel, const char *id, const char *name,
prop_dictionary_t behavior,
secmodel_eval_t eval, secmodel_setinfo_t setinfo)
{
int err;
secmodel_t sm;
sm = kmem_alloc(sizeof(*sm), KM_SLEEP);
sm->sm_id = id;
sm->sm_name = name;
sm->sm_behavior = behavior;
sm->sm_eval = eval;
sm->sm_setinfo = setinfo;
err = secmodel_plug(sm);
if (err == 0) {
atomic_inc_uint(&nsecmodels);
} else {
kmem_free(sm, sizeof(*sm));
sm = NULL;
}
*secmodel = sm;
return err;
}
/*
* Deregister a secmodel.
*/
int
secmodel_deregister(secmodel_t sm)
{
int error;
error = secmodel_unplug(sm);
if (error == 0) {
atomic_dec_uint(&nsecmodels);
kmem_free(sm, sizeof(*sm));
}
return error;
}
/*
* Lookup a secmodel by its id.
*
* Requires "secmodels_lock" handling by the caller.
*/
static secmodel_t
secmodel_lookup(const char *id)
{
secmodel_t tsm;
KASSERT(rw_lock_held(&secmodels_lock));
LIST_FOREACH(tsm, &secmodels, sm_list) {
if (strcasecmp(tsm->sm_id, id) == 0) {
return tsm;
}
}
return NULL;
}
/*
* Adjust system-global secmodel behavior following the addition
* or removal of a secmodel.
*
* Requires "secmodels_lock" to be held by the caller.
*/
static void
secmodel_adjust_behavior(secmodel_t sm, bool added)
{
bool r, b;
KASSERT(rw_write_held(&secmodels_lock));
#define ADJUST_COUNTER(which, added) \
do { \
if (added) { \
(which)++; \
} else { \
if ((which) > 0) \
(which)--; \
} \
} while (/*CONSTCOND*/0)
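/*
* Example: if two registered secmodels declare "copy-cred-on-fork",
* secmodel_copy_cred_on_fork counts up to 2 and returns to 0 only
* after both of them have been deregistered.
*/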
/* Copy credentials on fork? */
r = prop_dictionary_get_bool(sm->sm_behavior, "copy-cred-on-fork", &b);
if (r) {
ADJUST_COUNTER(secmodel_copy_cred_on_fork, added);
}
#undef ADJUST_COUNTER
}
static int
secmodel_plug(secmodel_t sm)
{
secmodel_t tsm;
int error = 0;
if (sm == NULL)
return EFAULT;
/* Check if the secmodel is already present. */
rw_enter(&secmodels_lock, RW_WRITER);
tsm = secmodel_lookup(sm->sm_id);
if (tsm != NULL) {
error = EEXIST;
goto out;
}
/* Add the secmodel. */
LIST_INSERT_HEAD(&secmodels, sm, sm_list);
/* Adjust behavior. */
secmodel_adjust_behavior(sm, true);
out:
/* Unlock the secmodels list. */
rw_exit(&secmodels_lock);
return error;
}
static int
secmodel_unplug(secmodel_t sm)
{
secmodel_t tsm;
int error = 0;
if (sm == NULL)
return EFAULT;
/* Make sure the secmodel is present. */
rw_enter(&secmodels_lock, RW_WRITER);
tsm = secmodel_lookup(sm->sm_id);
if (tsm == NULL) {
error = ENOENT;
goto out;
}
/* Remove the secmodel. */
LIST_REMOVE(tsm, sm_list);
/* Adjust behavior. */
secmodel_adjust_behavior(tsm, false);
out:
/* Unlock the secmodels list. */
rw_exit(&secmodels_lock);
return error;
}
/* XXX TODO */
int
secmodel_setinfo(const char *id, void *v, int *err)
{
return EOPNOTSUPP;
}
int
secmodel_eval(const char *id, const char *what, void *arg, void *ret)
{
secmodel_t sm;
int error = 0;
rw_enter(&secmodels_lock, RW_READER);
sm = secmodel_lookup(id);
if (sm == NULL) {
error = EINVAL;
goto out;
}
if (sm->sm_eval == NULL) {
error = ENOENT;
goto out;
}
if (ret == NULL) {
error = EFAULT;
goto out;
}
error = sm->sm_eval(what, arg, ret);
/* pass error from a secmodel(9) callback as a negative value */
error = -error;
out:
rw_exit(&secmodels_lock);
return error;
}
/* $NetBSD: subr_autoconf.c,v 1.314 2023/07/18 11:57:37 riastradh Exp $ */
/*
* Copyright (c) 1996, 2000 Christopher G. Demetriou
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed for the
* NetBSD Project. See http://www.NetBSD.org/ for
* information about NetBSD.
* 4. The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* --(license Id: LICENSE.proto,v 1.1 2000/06/13 21:40:26 cgd Exp )--
*/
/*
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* This software was developed by the Computer Systems Engineering group
* at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and
* contributed to Berkeley.
*
* All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Lawrence Berkeley Laboratories.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* from: Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp (LBL)
*
* @(#)subr_autoconf.c 8.3 (Berkeley) 5/17/94
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: subr_autoconf.c,v 1.314 2023/07/18 11:57:37 riastradh Exp $");
#ifdef _KERNEL_OPT
#include "opt_ddb.h"
#include "drvctl.h"
#endif
#include <sys/param.h>
#include <sys/device.h>
#include <sys/device_impl.h>
#include <sys/disklabel.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/kmem.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/errno.h>
#include <sys/proc.h>
#include <sys/reboot.h>
#include <sys/kthread.h>
#include <sys/buf.h>
#include <sys/dirent.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/unistd.h>
#include <sys/fcntl.h>
#include <sys/lockf.h>
#include <sys/callout.h>
#include <sys/devmon.h>
#include <sys/cpu.h>
#include <sys/sysctl.h>
#include <sys/stdarg.h>
#include <sys/localcount.h>
#include <sys/disk.h>
#include <sys/rndsource.h>
#include <machine/limits.h>
/*
* Autoconfiguration subroutines.
*/
/*
* Device autoconfiguration timings are mixed into the entropy pool.
*/
static krndsource_t rnd_autoconf_source;
/*
* ioconf.c exports exactly two names: cfdata and cfroots. All system
* devices and drivers are found via these tables.
*/
extern struct cfdata cfdata[];
extern const short cfroots[];
/*
* List of all cfdriver structures. We use this to detect duplicates
* when other cfdrivers are loaded.
*/
struct cfdriverlist allcfdrivers = LIST_HEAD_INITIALIZER(&allcfdrivers);
extern struct cfdriver * const cfdriver_list_initial[];
/*
* Initial list of cfattach's.
*/
extern const struct cfattachinit cfattachinit[];
/*
* List of cfdata tables. We always have one such list -- the one
* built statically when the kernel was configured.
*/
struct cftablelist allcftables = TAILQ_HEAD_INITIALIZER(allcftables);
static struct cftable initcftable;
#define ROOT ((device_t)NULL)
struct matchinfo {
cfsubmatch_t fn;
device_t parent;
const int *locs;
void *aux;
struct cfdata *match;
int pri;
};
struct alldevs_foray {
int af_s;
struct devicelist af_garbage;
};
/*
* Internal version of the cfargs structure; all versions are
* canonicalized to this.
*/
struct cfargs_internal {
union {
cfsubmatch_t submatch;/* submatch function (direct config) */
cfsearch_t search; /* search function (indirect config) */
};
const char * iattr; /* interface attribute */
const int * locators; /* locators array */
devhandle_t devhandle; /* devhandle_t (by value) */
};
static char *number(char *, int);
static void mapply(struct matchinfo *, cfdata_t);
static void config_devdelete(device_t);
static void config_devunlink(device_t, struct devicelist *);
static void config_makeroom(int, struct cfdriver *);
static void config_devlink(device_t);
static void config_alldevs_enter(struct alldevs_foray *);
static void config_alldevs_exit(struct alldevs_foray *);
static void config_add_attrib_dict(device_t);
static device_t config_attach_internal(device_t, cfdata_t, void *,
cfprint_t, const struct cfargs_internal *);
static void config_collect_garbage(struct devicelist *);
static void config_dump_garbage(struct devicelist *);
static void pmflock_debug(device_t, const char *, int);
static device_t deviter_next1(deviter_t *);
static void deviter_reinit(deviter_t *);
struct deferred_config {
TAILQ_ENTRY(deferred_config) dc_queue;
device_t dc_dev;
void (*dc_func)(device_t);
};
TAILQ_HEAD(deferred_config_head, deferred_config);
static struct deferred_config_head deferred_config_queue =
TAILQ_HEAD_INITIALIZER(deferred_config_queue);
static struct deferred_config_head interrupt_config_queue =
TAILQ_HEAD_INITIALIZER(interrupt_config_queue);
static int interrupt_config_threads = 8;
static struct deferred_config_head mountroot_config_queue =
TAILQ_HEAD_INITIALIZER(mountroot_config_queue);
static int mountroot_config_threads = 2;
static lwp_t **mountroot_config_lwpids;
static size_t mountroot_config_lwpids_size;
bool root_is_mounted = false;
static void config_process_deferred(struct deferred_config_head *, device_t);
/* Hooks to finalize configuration once all real devices have been found. */
struct finalize_hook {
TAILQ_ENTRY(finalize_hook) f_list;
int (*f_func)(device_t);
device_t f_dev;
};
static TAILQ_HEAD(, finalize_hook) config_finalize_list =
TAILQ_HEAD_INITIALIZER(config_finalize_list);
static int config_finalize_done;
/* list of all devices */
static struct devicelist alldevs = TAILQ_HEAD_INITIALIZER(alldevs);
static kmutex_t alldevs_lock __cacheline_aligned;
static devgen_t alldevs_gen = 1;
static int alldevs_nread = 0;
static int alldevs_nwrite = 0;
static bool alldevs_garbage = false;
static struct devicelist config_pending =
TAILQ_HEAD_INITIALIZER(config_pending);
static kmutex_t config_misc_lock;
static kcondvar_t config_misc_cv;
static bool detachall = false;
#define STREQ(s1, s2) \
(*(s1) == *(s2) && strcmp((s1), (s2)) == 0)
static bool config_initialized = false; /* config_init() has been called. */
static int config_do_twiddle;
static callout_t config_twiddle_ch;
static void sysctl_detach_setup(struct sysctllog **);
int no_devmon_insert(const char *, prop_dictionary_t);
int (*devmon_insert_vec)(const char *, prop_dictionary_t) = no_devmon_insert;
typedef int (*cfdriver_fn)(struct cfdriver *);
static int
frob_cfdrivervec(struct cfdriver * const *cfdriverv,
cfdriver_fn drv_do, cfdriver_fn drv_undo,
const char *style, bool dopanic)
{
void (*pr)(const char *, ...) __printflike(1, 2) =
dopanic ? panic : printf;
int i, error = 0, e2 __diagused;
for (i = 0; cfdriverv[i] != NULL; i++) {
if ((error = drv_do(cfdriverv[i])) != 0) {
pr("configure: `%s' driver %s failed: %d",
cfdriverv[i]->cd_name, style, error);
goto bad;
}
}
KASSERT(error == 0);
return 0;
bad:
printf("\n");
for (i--; i >= 0; i--) {
e2 = drv_undo(cfdriverv[i]);
KASSERT(e2 == 0);
}
return error;
}
typedef int (*cfattach_fn)(const char *, struct cfattach *);
static int
frob_cfattachvec(const struct cfattachinit *cfattachv,
cfattach_fn att_do, cfattach_fn att_undo,
const char *style, bool dopanic)
{
const struct cfattachinit *cfai = NULL;
void (*pr)(const char *, ...) __printflike(1, 2) =
dopanic ? panic : printf;
int j = 0, error = 0, e2 __diagused;
for (cfai = &cfattachv[0]; cfai->cfai_name != NULL; cfai++) {
for (j = 0; cfai->cfai_list[j] != NULL; j++) {
if ((error = att_do(cfai->cfai_name,
cfai->cfai_list[j])) != 0) {
pr("configure: attachment `%s' "
"of `%s' driver %s failed: %d",
cfai->cfai_list[j]->ca_name,
cfai->cfai_name, style, error);
goto bad;
}
}
}
KASSERT(error == 0);
return 0;
bad:
/*
* Rollback in reverse order. dunno if super-important, but
* do that anyway. Although the code looks a little like
* someone did a little integration (in the math sense).
*/
printf("\n");
if (cfai) {
bool last;
for (last = false; last == false; ) {
if (cfai == &cfattachv[0])
last = true;
for (j--; j >= 0; j--) {
e2 = att_undo(cfai->cfai_name,
cfai->cfai_list[j]);
KASSERT(e2 == 0);
}
if (!last) {
cfai--;
for (j = 0; cfai->cfai_list[j] != NULL; j++)
;
}
}
}
return error;
}
/*
* Initialize the autoconfiguration data structures. Normally this
* is done by configure(), but some platforms need to do this very
* early (to e.g. initialize the console).
*/
void
config_init(void)
{
KASSERT(config_initialized == false);
mutex_init(&alldevs_lock, MUTEX_DEFAULT, IPL_VM);
mutex_init(&config_misc_lock, MUTEX_DEFAULT, IPL_NONE);
cv_init(&config_misc_cv, "cfgmisc");
callout_init(&config_twiddle_ch, CALLOUT_MPSAFE);
frob_cfdrivervec(cfdriver_list_initial,
config_cfdriver_attach, NULL, "bootstrap", true);
frob_cfattachvec(cfattachinit,
config_cfattach_attach, NULL, "bootstrap", true);
initcftable.ct_cfdata = cfdata;
TAILQ_INSERT_TAIL(&allcftables, &initcftable, ct_list);
rnd_attach_source(&rnd_autoconf_source, "autoconf", RND_TYPE_UNKNOWN,
RND_FLAG_COLLECT_TIME);
config_initialized = true;
}
/*
* Init or fini drivers and attachments. Either all or none
* are processed (via rollback). It would be nice if this were
* atomic to outside consumers, but with the current state of
* locking ...
*/
int
config_init_component(struct cfdriver * const *cfdriverv,
const struct cfattachinit *cfattachv, struct cfdata *cfdatav)
{
int error;
KERNEL_LOCK(1, NULL);
if ((error = frob_cfdrivervec(cfdriverv,
config_cfdriver_attach, config_cfdriver_detach, "init", false))!= 0)
goto out;
if ((error = frob_cfattachvec(cfattachv,
config_cfattach_attach, config_cfattach_detach,
"init", false)) != 0) {
frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, NULL, "init rollback", true);
goto out;
}
if ((error = config_cfdata_attach(cfdatav, 1)) != 0) {
frob_cfattachvec(cfattachv,
config_cfattach_detach, NULL, "init rollback", true);
frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, NULL, "init rollback", true);
goto out;
}
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
int
config_fini_component(struct cfdriver * const *cfdriverv,
const struct cfattachinit *cfattachv, struct cfdata *cfdatav)
{
int error;
KERNEL_LOCK(1, NULL);
if ((error = config_cfdata_detach(cfdatav)) != 0)
goto out;
if ((error = frob_cfattachvec(cfattachv,
config_cfattach_detach, config_cfattach_attach,
"fini", false)) != 0) {
if (config_cfdata_attach(cfdatav, 0) != 0)
panic("config_cfdata fini rollback failed");
goto out;
}
if ((error = frob_cfdrivervec(cfdriverv,
config_cfdriver_detach, config_cfdriver_attach,
"fini", false)) != 0) {
frob_cfattachvec(cfattachv,
config_cfattach_attach, NULL, "fini rollback", true);
if (config_cfdata_attach(cfdatav, 0) != 0)
panic("config_cfdata fini rollback failed");
goto out;
}
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
void
config_init_mi(void)
{
if (!config_initialized)
config_init();
sysctl_detach_setup(NULL);
}
void
config_deferred(device_t dev)
{
KASSERT(KERNEL_LOCKED_P());
config_process_deferred(&deferred_config_queue, dev);
config_process_deferred(&interrupt_config_queue, dev);
config_process_deferred(&mountroot_config_queue, dev);
}
static void
config_interrupts_thread(void *cookie)
{
struct deferred_config *dc;
device_t dev;
mutex_enter(&config_misc_lock);
while ((dc = TAILQ_FIRST(&interrupt_config_queue)) != NULL) {
TAILQ_REMOVE(&interrupt_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
dev = dc->dc_dev;
(*dc->dc_func)(dev);
if (!device_pmf_is_registered(dev))
aprint_debug_dev(dev,
"WARNING: power management not supported\n");
config_pending_decr(dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
}
mutex_exit(&config_misc_lock);
kthread_exit(0);
}
void
config_create_interruptthreads(void)
{
int i;
for (i = 0; i < interrupt_config_threads; i++) {
(void)kthread_create(PRI_NONE, 0/*XXXSMP */, NULL,
config_interrupts_thread, NULL, NULL, "configintr");
}
}
static void
config_mountroot_thread(void *cookie)
{
struct deferred_config *dc;
mutex_enter(&config_misc_lock);
while ((dc = TAILQ_FIRST(&mountroot_config_queue)) != NULL) {
TAILQ_REMOVE(&mountroot_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
(*dc->dc_func)(dc->dc_dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
}
mutex_exit(&config_misc_lock);
kthread_exit(0);
}
void
config_create_mountrootthreads(void)
{
int i;
if (!root_is_mounted)
root_is_mounted = true;
mountroot_config_lwpids_size = sizeof(mountroot_config_lwpids) *
mountroot_config_threads;
mountroot_config_lwpids = kmem_alloc(mountroot_config_lwpids_size,
KM_NOSLEEP);
KASSERT(mountroot_config_lwpids);
for (i = 0; i < mountroot_config_threads; i++) {
mountroot_config_lwpids[i] = 0;
(void)kthread_create(PRI_NONE, KTHREAD_MUSTJOIN/* XXXSMP */,
NULL, config_mountroot_thread, NULL,
&mountroot_config_lwpids[i],
"configroot");
}
}
void
config_finalize_mountroot(void)
{
int i, error;
for (i = 0; i < mountroot_config_threads; i++) {
if (mountroot_config_lwpids[i] == 0)
continue;
error = kthread_join(mountroot_config_lwpids[i]);
if (error)
printf("%s: thread %x joined with error %d\n",
__func__, i, error);
}
kmem_free(mountroot_config_lwpids, mountroot_config_lwpids_size);
}
/*
* Announce device attach/detach to userland listeners.
*/
int
no_devmon_insert(const char *name, prop_dictionary_t p)
{
return ENODEV;
}
static void
devmon_report_device(device_t dev, bool isattach)
{
prop_dictionary_t ev, dict = device_properties(dev);
const char *parent;
const char *what;
const char *where;
device_t pdev = device_parent(dev);
/* If currently no drvctl device, just return */
if (devmon_insert_vec == no_devmon_insert)
return;
ev = prop_dictionary_create();
if (ev == NULL)
return;
what = (isattach ? "device-attach" : "device-detach");
parent = (pdev == NULL ? "root" : device_xname(pdev));
if (prop_dictionary_get_string(dict, "location", &where)) {
prop_dictionary_set_string(ev, "location", where);
aprint_debug("ev: %s %s at %s in [%s]\n",
what, device_xname(dev), parent, where);
}
if (!prop_dictionary_set_string(ev, "device", device_xname(dev)) ||
!prop_dictionary_set_string(ev, "parent", parent)) {
prop_object_release(ev);
return;
}
if ((*devmon_insert_vec)(what, ev) != 0)
prop_object_release(ev);
}
/*
* Add a cfdriver to the system.
*/
int
config_cfdriver_attach(struct cfdriver *cd)
{
struct cfdriver *lcd;
/* Make sure this driver isn't already in the system. */
LIST_FOREACH(lcd, &allcfdrivers, cd_list) {
if (STREQ(lcd->cd_name, cd->cd_name))
return EEXIST;
}
LIST_INIT(&cd->cd_attach);
LIST_INSERT_HEAD(&allcfdrivers, cd, cd_list);
return 0;
}
/*
* Remove a cfdriver from the system.
*/
int
config_cfdriver_detach(struct cfdriver *cd)
{
struct alldevs_foray af;
int i, rc = 0;
config_alldevs_enter(&af);
/* Make sure there are no active instances. */
for (i = 0; i < cd->cd_ndevs; i++) {
if (cd->cd_devs[i] != NULL) {
rc = EBUSY;
break;
}
}
config_alldevs_exit(&af);
if (rc != 0)
return rc;
/* ...and no attachments loaded. */
if (LIST_EMPTY(&cd->cd_attach) == 0)
return EBUSY;
LIST_REMOVE(cd, cd_list);
KASSERT(cd->cd_devs == NULL);
return 0;
}
/*
* Look up a cfdriver by name.
*/
struct cfdriver *
config_cfdriver_lookup(const char *name)
{
struct cfdriver *cd;
LIST_FOREACH(cd, &allcfdrivers, cd_list) {
if (STREQ(cd->cd_name, name))
return cd;
}
return NULL;
}
/*
* Add a cfattach to the specified driver.
*/
int
config_cfattach_attach(const char *driver, struct cfattach *ca)
{
struct cfattach *lca;
struct cfdriver *cd;
cd = config_cfdriver_lookup(driver);
if (cd == NULL)
return ESRCH;
/* Make sure this attachment isn't already on this driver. */
LIST_FOREACH(lca, &cd->cd_attach, ca_list) {
if (STREQ(lca->ca_name, ca->ca_name))
return EEXIST;
}
LIST_INSERT_HEAD(&cd->cd_attach, ca, ca_list);
return 0;
}
/*
* Remove a cfattach from the specified driver.
*/
int
config_cfattach_detach(const char *driver, struct cfattach *ca)
{
struct alldevs_foray af;
struct cfdriver *cd;
device_t dev;
int i, rc = 0;
cd = config_cfdriver_lookup(driver);
if (cd == NULL)
return ESRCH;
config_alldevs_enter(&af);
/* Make sure there are no active instances. */
for (i = 0; i < cd->cd_ndevs; i++) {
if ((dev = cd->cd_devs[i]) == NULL)
continue;
if (dev->dv_cfattach == ca) {
rc = EBUSY;
break;
}
}
config_alldevs_exit(&af);
if (rc != 0)
return rc;
LIST_REMOVE(ca, ca_list);
return 0;
}
/*
* Look up a cfattach by name.
*/
static struct cfattach *
config_cfattach_lookup_cd(struct cfdriver *cd, const char *atname)
{
struct cfattach *ca;
LIST_FOREACH(ca, &cd->cd_attach, ca_list) {
if (STREQ(ca->ca_name, atname))
return ca;
}
return NULL;
}
/*
* Look up a cfattach by driver/attachment name.
*/
struct cfattach *
config_cfattach_lookup(const char *name, const char *atname)
{
struct cfdriver *cd;
cd = config_cfdriver_lookup(name);
if (cd == NULL)
return NULL;
return config_cfattach_lookup_cd(cd, atname);
}
/*
* Apply the matching function and choose the best. This is used
* a few times and we want to keep the code small.
*/
static void
mapply(struct matchinfo *m, cfdata_t cf)
{
int pri;
if (m->fn != NULL) {
pri = (*m->fn)(m->parent, cf, m->locs, m->aux);
} else {
pri = config_match(m->parent, cf, m->aux);
}
if (pri > m->pri) {
m->match = cf;
m->pri = pri;
}
}
int
config_stdsubmatch(device_t parent, cfdata_t cf, const int *locs, void *aux)
{
const struct cfiattrdata *ci;
const struct cflocdesc *cl;
int nlocs, i;
ci = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver);
KASSERT(ci);
nlocs = ci->ci_loclen;
KASSERT(!nlocs || locs);
for (i = 0; i < nlocs; i++) {
cl = &ci->ci_locdesc[i];
if (cl->cld_defaultstr != NULL &&
cf->cf_loc[i] == cl->cld_default)
continue;
if (cf->cf_loc[i] == locs[i])
continue;
return 0;
}
return config_match(parent, cf, aux);
}
/*
* Helper function: check whether the driver supports the interface attribute
* and return its descriptor structure.
*/
static const struct cfiattrdata *
cfdriver_get_iattr(const struct cfdriver *cd, const char *ia)
{
const struct cfiattrdata * const *cpp;
if (cd->cd_attrs == NULL)
return 0;
for (cpp = cd->cd_attrs; *cpp; cpp++) {
if (STREQ((*cpp)->ci_name, ia)) {
/* Match. */
return *cpp;
}
}
return 0;
}
static int __diagused
cfdriver_iattr_count(const struct cfdriver *cd)
{
const struct cfiattrdata * const *cpp;
int i;
if (cd->cd_attrs == NULL)
return 0;
for (i = 0, cpp = cd->cd_attrs; *cpp; cpp++) {
i++;
}
return i;
}
/*
* Lookup an interface attribute description by name.
* If the driver is given, consider only its supported attributes.
*/
const struct cfiattrdata *
cfiattr_lookup(const char *name, const struct cfdriver *cd)
{
const struct cfdriver *d;
const struct cfiattrdata *ia;
if (cd)
return cfdriver_get_iattr(cd, name);
LIST_FOREACH(d, &allcfdrivers, cd_list) {
ia = cfdriver_get_iattr(d, name);
if (ia)
return ia;
}
return 0;
}
/*
* Determine if `parent' is a potential parent for a device spec based
* on `cfp'.
*/
static int
cfparent_match(const device_t parent, const struct cfparent *cfp)
{
struct cfdriver *pcd;
/* We don't match root nodes here. */
if (cfp == NULL)
return 0;
pcd = parent->dv_cfdriver;
KASSERT(pcd != NULL);
/*
* First, ensure this parent has the correct interface
* attribute.
*/
if (!cfdriver_get_iattr(pcd, cfp->cfp_iattr))
return 0;
/*
* If no specific parent device instance was specified (i.e.
* we're attaching to the attribute only), we're done!
*/
if (cfp->cfp_parent == NULL)
return 1;
/*
* Check the parent device's name.
*/
if (STREQ(pcd->cd_name, cfp->cfp_parent) == 0)
return 0; /* not the same parent */
/*
* Make sure the unit number matches.
*/
if (cfp->cfp_unit == DVUNIT_ANY || /* wildcard */
cfp->cfp_unit == parent->dv_unit)
return 1;
/* Unit numbers don't match. */
return 0;
}
/*
* Helper for config_cfdata_attach(): check all devices to see whether
* any of them could be a parent for an attachment in the config data
* table passed, and rescan.
*/
static void
rescan_with_cfdata(const struct cfdata *cf)
{
device_t d;
const struct cfdata *cf1;
deviter_t di;
KASSERT(KERNEL_LOCKED_P());
/*
* "alldevs" is likely longer than a module's cfdata, so make it
* the outer loop.
*/
for (d = deviter_first(&di, 0); d != NULL; d = deviter_next(&di)) {
if (!(d->dv_cfattach->ca_rescan))
continue;
for (cf1 = cf; cf1->cf_name; cf1++) {
if (!cfparent_match(d, cf1->cf_pspec))
continue;
(*d->dv_cfattach->ca_rescan)(d,
cfdata_ifattr(cf1), cf1->cf_loc);
config_deferred(d);
}
}
deviter_release(&di);
}
/*
* Attach a supplemental config data table and rescan potential
* parent devices if required.
*/
int
config_cfdata_attach(cfdata_t cf, int scannow)
{
struct cftable *ct;
KERNEL_LOCK(1, NULL);
ct = kmem_alloc(sizeof(*ct), KM_SLEEP);
ct->ct_cfdata = cf;
TAILQ_INSERT_TAIL(&allcftables, ct, ct_list);
if (scannow)
rescan_with_cfdata(cf);
KERNEL_UNLOCK_ONE(NULL);
return 0;
}
/*
* Helper for config_cfdata_detach: check whether a device is
* found through any attachment in the config data table.
*/
static int
dev_in_cfdata(device_t d, cfdata_t cf)
{
const struct cfdata *cf1;
for (cf1 = cf; cf1->cf_name; cf1++)
if (d->dv_cfdata == cf1)
return 1;
return 0;
}
/*
* Detach a supplemental config data table. Detach all devices found
* through that table (and thus still holding references to it) first.
*/
int
config_cfdata_detach(cfdata_t cf)
{
device_t d;
int error = 0;
struct cftable *ct;
deviter_t di;
KERNEL_LOCK(1, NULL);
for (d = deviter_first(&di, DEVITER_F_RW); d != NULL;
d = deviter_next(&di)) {
if (!dev_in_cfdata(d, cf))
continue;
if ((error = config_detach(d, 0)) != 0)
break;
}
deviter_release(&di);
if (error) {
aprint_error_dev(d, "unable to detach instance\n");
goto out;
}
TAILQ_FOREACH(ct, &allcftables, ct_list) {
if (ct->ct_cfdata == cf) {
TAILQ_REMOVE(&allcftables, ct, ct_list);
kmem_free(ct, sizeof(*ct));
error = 0;
goto out;
}
}
/* not found -- shouldn't happen */
error = EINVAL;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
/*
* Invoke the "match" routine for a cfdata entry on behalf of
* an external caller, usually a direct config "submatch" routine.
*/
int
config_match(device_t parent, cfdata_t cf, void *aux)
{
struct cfattach *ca;
KASSERT(KERNEL_LOCKED_P());
ca = config_cfattach_lookup(cf->cf_name, cf->cf_atname);
if (ca == NULL) {
/* No attachment for this entry, oh well. */
return 0;
}
return (*ca->ca_match)(parent, cf, aux);
}
/*
* Invoke the "probe" routine for a cfdata entry on behalf of
* an external caller, usually an indirect config "search" routine.
*/
int
config_probe(device_t parent, cfdata_t cf, void *aux)
{
/*
* This is currently a synonym for config_match(), but this
* is an implementation detail; "match" and "probe" routines
* have different behaviors.
*
* XXX config_probe() should return a bool, because there is
* XXX no match score for probe -- it's either there or it's
* XXX not, but some ports abuse the return value as a way
* XXX to attach "critical" devices before "non-critical"
* XXX devices.
*/
return config_match(parent, cf, aux);
}
static struct cfargs_internal *
cfargs_canonicalize(const struct cfargs * const cfargs,
struct cfargs_internal * const store)
{
struct cfargs_internal *args = store;
memset(args, 0, sizeof(*args));
/* If none specified, all-NULL pointers are good. */
if (cfargs == NULL) {
return args;
}
/*
* Only one version of the arguments structure is recognized at this time.
*/
if (cfargs->cfargs_version != CFARGS_VERSION) {
panic("cfargs_canonicalize: unknown version %lu\n",
(unsigned long)cfargs->cfargs_version);
}
/*
* submatch and search are mutually-exclusive.
*/
if (cfargs->submatch != NULL && cfargs->search != NULL) {
panic("cfargs_canonicalize: submatch and search are "
"mutually-exclusive");
}
if (cfargs->submatch != NULL) {
args->submatch = cfargs->submatch;
} else if (cfargs->search != NULL) {
args->search = cfargs->search;
}
args->iattr = cfargs->iattr;
args->locators = cfargs->locators;
args->devhandle = cfargs->devhandle;
return args;
}
/*
* Iterate over all potential children of some device, calling the given
* function (default being the child's match function) for each one.
* Nonzero returns are matches; the highest value returned is considered
* the best match. Return the `found child' if we got a match, or NULL
* otherwise. The `aux' pointer is simply passed on through.
*
* Note that this function is designed so that it can be used to apply
* an arbitrary function to all potential children (its return value
* can be ignored).
*/
static cfdata_t
config_search_internal(device_t parent, void *aux,
const struct cfargs_internal * const args)
{
struct cftable *ct;
cfdata_t cf;
struct matchinfo m;
KASSERT(config_initialized);
KASSERTMSG((!args->iattr ||
cfdriver_get_iattr(parent->dv_cfdriver, args->iattr)),
"%s searched for child at interface attribute %s,"
" but device %s(4) has no such interface attribute in config(5)",
device_xname(parent), args->iattr,
parent->dv_cfdriver->cd_name);
KASSERTMSG((args->iattr ||
cfdriver_iattr_count(parent->dv_cfdriver) < 2),
"%s searched for child without interface attribute,"
" needed to disambiguate among the %d declared for in %s(4)"
" in config(5)",
device_xname(parent),
cfdriver_iattr_count(parent->dv_cfdriver),
parent->dv_cfdriver->cd_name);
m.fn = args->submatch; /* N.B. union */
m.parent = parent;
m.locs = args->locators;
m.aux = aux;
m.match = NULL;
m.pri = 0;
TAILQ_FOREACH(ct, &allcftables, ct_list) {
for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
/* We don't match root nodes here. */
if (!cf->cf_pspec)
continue;
/*
* Skip cf if no longer eligible, otherwise scan
* through parents for one matching `parent', and
* try match function.
*/
if (cf->cf_fstate == FSTATE_FOUND)
continue;
if (cf->cf_fstate == FSTATE_DNOTFOUND ||
cf->cf_fstate == FSTATE_DSTAR)
continue;
/*
* If an interface attribute was specified,
* consider only children which attach to
* that attribute.
*/
if (args->iattr != NULL &&
!STREQ(args->iattr, cfdata_ifattr(cf)))
continue;
if (cfparent_match(parent, cf->cf_pspec))
mapply(&m, cf);
}
}
rnd_add_uint32(&rnd_autoconf_source, 0);
return m.match;
}
cfdata_t
config_search(device_t parent, void *aux, const struct cfargs *cfargs)
{
cfdata_t cf;
struct cfargs_internal store;
cf = config_search_internal(parent, aux,
cfargs_canonicalize(cfargs, &store));
return cf;
}
/*
* Find the given root device.
* This is much like config_search, but there is no parent.
* Don't bother with multiple cfdata tables; the root node
* must always be in the initial table.
*/
cfdata_t
config_rootsearch(cfsubmatch_t fn, const char *rootname, void *aux)
{
cfdata_t cf;
const short *p;
struct matchinfo m;
m.fn = fn;
m.parent = ROOT;
m.aux = aux;
m.match = NULL;
m.pri = 0;
m.locs = 0;
/*
* Look at root entries for matching name. We do not bother
* with found-state here since only one root should ever be
* searched (and it must be done first).
*/
for (p = cfroots; *p >= 0; p++) {
cf = &cfdata[*p];
if (strcmp(cf->cf_name, rootname) == 0)
mapply(&m, cf);
}
return m.match;
}
static const char * const msgs[] = {
[QUIET] = "",
[UNCONF] = " not configured\n",
[UNSUPP] = " unsupported\n",
};
/*
* The given `aux' argument describes a device that has been found
* on the given parent, but not necessarily configured. Locate the
* configuration data for that device (using the submatch function
* provided, or using candidates' cd_match configuration driver
* functions) and attach it, and return its device_t. If the device was
* not configured, call the given `print' function and return NULL.
*/
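/*
* Illustrative caller (hypothetical "foo" bus driver, example only):
* after enumerating a child, a bus attach routine might do
*
*	struct foo_attach_args faa = { ... };
*	device_t child = config_found_acquire(self, &faa, fooprint,
*	    CFARGS_NONE);
*	if (child != NULL) {
*		... use child ...
*		device_release(child);
*	}
*/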
device_t
config_found_acquire(device_t parent, void *aux, cfprint_t print,
const struct cfargs * const cfargs)
{
cfdata_t cf;
struct cfargs_internal store;
const struct cfargs_internal * const args =
cfargs_canonicalize(cfargs, &store);
device_t dev;
KERNEL_LOCK(1, NULL);
cf = config_search_internal(parent, aux, args);
if (cf != NULL) {
dev = config_attach_internal(parent, cf, aux, print, args);
goto out;
}
if (print) {
if (config_do_twiddle && cold)
twiddle();
const int pret = (*print)(aux, device_xname(parent));
KASSERT(pret >= 0);
KASSERT(pret < __arraycount(msgs));
KASSERT(msgs[pret] != NULL);
aprint_normal("%s", msgs[pret]);
}
dev = NULL;
out: KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_found(parent, aux, print, cfargs)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_found_acquire with a matching device_release once they
* have finished with the returned device_t.
*/
device_t
config_found(device_t parent, void *aux, cfprint_t print,
const struct cfargs * const cfargs)
{
device_t dev;
KASSERT(KERNEL_LOCKED_P());
dev = config_found_acquire(parent, aux, print, cfargs);
if (dev == NULL)
return NULL;
device_release(dev);
return dev;
}
/*
* As above, but for root devices.
*/
device_t
config_rootfound(const char *rootname, void *aux)
{
cfdata_t cf;
device_t dev = NULL;
KERNEL_LOCK(1, NULL);
if ((cf = config_rootsearch(NULL, rootname, aux)) != NULL)
dev = config_attach(ROOT, cf, aux, NULL, CFARGS_NONE);
else
aprint_error("root device %s not configured\n", rootname);
KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/* just like sprintf(buf, "%d") except that it works from the end */
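/*
* Usage example: with char num[10], number(&num[sizeof(num)], 42)
* builds the NUL-terminated string "42" at the end of the buffer and
* returns a pointer to its first digit (see config_devalloc() below).
*/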
static char *
number(char *ep, int n)
{
*--ep = 0;
while (n >= 10) {
*--ep = (n % 10) + '0';
n /= 10;
}
*--ep = n + '0';
return ep;
}
/*
* Expand the size of the cd_devs array if necessary.
*
* The caller must hold alldevs_lock. config_makeroom() may release and
* re-acquire alldevs_lock, so callers should re-check conditions such
* as alldevs_nwrite == 0 and alldevs_nread == 0 when config_makeroom()
* returns.
*/
static void
config_makeroom(int n, struct cfdriver *cd)
{
int ondevs, nndevs;
device_t *osp, *nsp;
KASSERT(mutex_owned(&alldevs_lock));
alldevs_nwrite++;
/* XXX arithmetic overflow */
for (nndevs = MAX(4, cd->cd_ndevs); nndevs <= n; nndevs += nndevs)
;
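/*
* Example (hypothetical sizes): with cd_ndevs == 4 and n == 9, the
* loop above doubles nndevs 4 -> 8 -> 16, so the reallocated array
* below gets 16 slots.
*/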
while (n >= cd->cd_ndevs) {
/*
* Need to expand the array.
*/
ondevs = cd->cd_ndevs;
osp = cd->cd_devs;
/*
* Release alldevs_lock around allocation, which may
* sleep.
*/
mutex_exit(&alldevs_lock);
nsp = kmem_alloc(sizeof(device_t) * nndevs, KM_SLEEP);
mutex_enter(&alldevs_lock);
/*
* If another thread moved the array while we did
* not hold alldevs_lock, try again.
*/
if (cd->cd_devs != osp || cd->cd_ndevs != ondevs) {
mutex_exit(&alldevs_lock);
kmem_free(nsp, sizeof(device_t) * nndevs);
mutex_enter(&alldevs_lock);
continue;
}
memset(nsp + ondevs, 0, sizeof(device_t) * (nndevs - ondevs));
if (ondevs != 0)
memcpy(nsp, cd->cd_devs, sizeof(device_t) * ondevs);
cd->cd_ndevs = nndevs;
cd->cd_devs = nsp;
if (ondevs != 0) {
mutex_exit(&alldevs_lock);
kmem_free(osp, sizeof(device_t) * ondevs);
mutex_enter(&alldevs_lock);
}
}
KASSERT(mutex_owned(&alldevs_lock));
alldevs_nwrite--;
}
/*
* Put dev into the devices list.
*/
static void
config_devlink(device_t dev)
{
mutex_enter(&alldevs_lock);
KASSERT(device_cfdriver(dev)->cd_devs[dev->dv_unit] == dev);
dev->dv_add_gen = alldevs_gen;
/*
* It is safe to add a device to the tail of the list while
* readers and writers are in the list.
*/
TAILQ_INSERT_TAIL(&alldevs, dev, dv_list);
mutex_exit(&alldevs_lock);
}
static void
config_devfree(device_t dev)
{
KASSERT(dev->dv_flags & DVF_PRIV_ALLOC);
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
if (dev->dv_cfattach->ca_devsize > 0)
kmem_free(dev->dv_private, dev->dv_cfattach->ca_devsize);
kmem_free(dev, sizeof(*dev));
}
/*
* Caller must hold alldevs_lock.
*/
static void
config_devunlink(device_t dev, struct devicelist *garbage)
{
struct device_garbage *dg = &dev->dv_garbage;
cfdriver_t cd = device_cfdriver(dev);
int i;
KASSERT(mutex_owned(&alldevs_lock));
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
/* Unlink from device list. Link to garbage list. */
TAILQ_REMOVE(&alldevs, dev, dv_list);
TAILQ_INSERT_TAIL(garbage, dev, dv_list);
/* Remove from cfdriver's array. */
cd->cd_devs[dev->dv_unit] = NULL;
/*
* If the device now has no units in use, unlink its softc array.
*/
for (i = 0; i < cd->cd_ndevs; i++) {
if (cd->cd_devs[i] != NULL)
break;
}
/* Nothing found. Unlink, now. Deallocate, later. */
if (i == cd->cd_ndevs) {
dg->dg_ndevs = cd->cd_ndevs;
dg->dg_devs = cd->cd_devs;
cd->cd_devs = NULL;
cd->cd_ndevs = 0;
}
}
static void
config_devdelete(device_t dev)
{
struct device_garbage *dg = &dev->dv_garbage;
device_lock_t dvl = device_getlock(dev);
KASSERTMSG(dev->dv_pending == 0, "%d", dev->dv_pending);
if (dg->dg_devs != NULL)
kmem_free(dg->dg_devs, sizeof(device_t) * dg->dg_ndevs);
localcount_fini(dev->dv_localcount);
kmem_free(dev->dv_localcount, sizeof(*dev->dv_localcount));
cv_destroy(&dvl->dvl_cv);
mutex_destroy(&dvl->dvl_mtx);
KASSERT(dev->dv_properties != NULL);
prop_object_release(dev->dv_properties);
if (dev->dv_activity_handlers)
panic("%s with registered handlers", __func__);
if (dev->dv_locators) {
size_t amount = *--dev->dv_locators;
kmem_free(dev->dv_locators, amount);
}
config_devfree(dev);
}
static int
config_unit_nextfree(cfdriver_t cd, cfdata_t cf)
{
int unit = cf->cf_unit;
KASSERT(mutex_owned(&alldevs_lock));
if (unit < 0)
return -1;
if (cf->cf_fstate == FSTATE_STAR) {
for (; unit < cd->cd_ndevs; unit++)
if (cd->cd_devs[unit] == NULL)
break;
/*
* unit is now the unit of the first NULL device pointer,
* or max(cd->cd_ndevs,cf->cf_unit).
*/
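/*
* Example (hypothetical state): with cf_unit == 0, FSTATE_STAR and
* units 0 and 1 already allocated (cd_ndevs >= 3), the scan above
* stops at unit 2.
*/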
} else {
if (unit < cd->cd_ndevs && cd->cd_devs[unit] != NULL)
unit = -1;
}
return unit;
}
static int
config_unit_alloc(device_t dev, cfdriver_t cd, cfdata_t cf)
{
struct alldevs_foray af;
int unit;
config_alldevs_enter(&af);
for (;;) {
unit = config_unit_nextfree(cd, cf);
if (unit == -1)
break;
if (unit < cd->cd_ndevs) {
cd->cd_devs[unit] = dev;
dev->dv_unit = unit;
break;
}
config_makeroom(unit, cd);
}
config_alldevs_exit(&af);
return unit;
}
static device_t
config_devalloc(const device_t parent, const cfdata_t cf,
const struct cfargs_internal * const args)
{
cfdriver_t cd;
cfattach_t ca;
size_t lname, lunit;
const char *xunit;
int myunit;
char num[10];
device_t dev;
void *dev_private;
const struct cfiattrdata *ia;
device_lock_t dvl;
cd = config_cfdriver_lookup(cf->cf_name);
if (cd == NULL)
return NULL;
ca = config_cfattach_lookup_cd(cd, cf->cf_atname);
if (ca == NULL)
return NULL;
/* get memory for all device vars */
KASSERT(ca->ca_flags & DVF_PRIV_ALLOC);
if (ca->ca_devsize > 0) {
dev_private = kmem_zalloc(ca->ca_devsize, KM_SLEEP);
} else {
dev_private = NULL;
}
dev = kmem_zalloc(sizeof(*dev), KM_SLEEP);
dev->dv_handle = args->devhandle;
dev->dv_class = cd->cd_class;
dev->dv_cfdata = cf;
dev->dv_cfdriver = cd;
dev->dv_cfattach = ca;
dev->dv_activity_count = 0;
dev->dv_activity_handlers = NULL;
dev->dv_private = dev_private;
dev->dv_flags = ca->ca_flags; /* inherit flags from class */
dev->dv_attaching = curlwp;
myunit = config_unit_alloc(dev, cd, cf);
if (myunit == -1) {
config_devfree(dev);
return NULL;
}
/* compute length of name and decimal expansion of unit number */
lname = strlen(cd->cd_name);
xunit = number(&num[sizeof(num)], myunit);
lunit = &num[sizeof(num)] - xunit;
if (lname + lunit > sizeof(dev->dv_xname))
panic("config_devalloc: device name too long");
dvl = device_getlock(dev);
mutex_init(&dvl->dvl_mtx, MUTEX_DEFAULT, IPL_NONE);
cv_init(&dvl->dvl_cv, "pmfsusp");
memcpy(dev->dv_xname, cd->cd_name, lname);
memcpy(dev->dv_xname + lname, xunit, lunit);
dev->dv_parent = parent;
if (parent != NULL)
dev->dv_depth = parent->dv_depth + 1;
else
dev->dv_depth = 0;
dev->dv_flags |= DVF_ACTIVE; /* always initially active */
if (args->locators) {
KASSERT(parent); /* no locators at root */
ia = cfiattr_lookup(cfdata_ifattr(cf), parent->dv_cfdriver);
dev->dv_locators =
kmem_alloc(sizeof(int) * (ia->ci_loclen + 1), KM_SLEEP);
*dev->dv_locators++ = sizeof(int) * (ia->ci_loclen + 1);
memcpy(dev->dv_locators, args->locators,
sizeof(int) * ia->ci_loclen);
}
dev->dv_properties = prop_dictionary_create();
KASSERT(dev->dv_properties != NULL);
prop_dictionary_set_string_nocopy(dev->dv_properties,
"device-driver", dev->dv_cfdriver->cd_name);
prop_dictionary_set_uint16(dev->dv_properties,
"device-unit", dev->dv_unit);
if (parent != NULL) {
prop_dictionary_set_string(dev->dv_properties,
"device-parent", device_xname(parent));
}
dev->dv_localcount = kmem_zalloc(sizeof(*dev->dv_localcount),
KM_SLEEP);
localcount_init(dev->dv_localcount);
if (dev->dv_cfdriver->cd_attrs != NULL)
config_add_attrib_dict(dev);
return dev;
}
/*
* Create an array of device attach attributes and add it
* to the device's dv_properties dictionary.
*
* <key>interface-attributes</key>
* <array>
* <dict>
* <key>attribute-name</key>
* <string>foo</string>
* <key>locators</key>
* <array>
* <dict>
* <key>loc-name</key>
* <string>foo-loc1</string>
* </dict>
* <dict>
* <key>loc-name</key>
* <string>foo-loc2</string>
* <key>default</key>
* <string>foo-loc2-default</string>
* </dict>
* ...
* </array>
* </dict>
* ...
* </array>
*/
static void
config_add_attrib_dict(device_t dev)
{
int i, j;
const struct cfiattrdata *ci;
prop_dictionary_t attr_dict, loc_dict;
prop_array_t attr_array, loc_array;
if ((attr_array = prop_array_create()) == NULL)
return;
for (i = 0; ; i++) {
if ((ci = dev->dv_cfdriver->cd_attrs[i]) == NULL)
break;
if ((attr_dict = prop_dictionary_create()) == NULL)
break;
prop_dictionary_set_string_nocopy(attr_dict, "attribute-name",
ci->ci_name);
/* Create an array of the locator names and defaults */
if (ci->ci_loclen != 0 &&
(loc_array = prop_array_create()) != NULL) {
for (j = 0; j < ci->ci_loclen; j++) {
loc_dict = prop_dictionary_create();
if (loc_dict == NULL)
continue;
prop_dictionary_set_string_nocopy(loc_dict,
"loc-name", ci->ci_locdesc[j].cld_name);
if (ci->ci_locdesc[j].cld_defaultstr != NULL)
prop_dictionary_set_string_nocopy(
loc_dict, "default",
ci->ci_locdesc[j].cld_defaultstr);
prop_array_set(loc_array, j, loc_dict);
prop_object_release(loc_dict);
}
prop_dictionary_set_and_rel(attr_dict, "locators",
loc_array);
}
prop_array_add(attr_array, attr_dict);
prop_object_release(attr_dict);
}
if (i == 0)
prop_object_release(attr_array);
else
prop_dictionary_set_and_rel(dev->dv_properties,
"interface-attributes", attr_array);
return;
}
/*
* Attach a found device.
*
* Returns the device referenced, to be released with device_release.
*/
static device_t
config_attach_internal(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs_internal * const args)
{
device_t dev;
struct cftable *ct;
const char *drvname;
bool deferred;
KASSERT(KERNEL_LOCKED_P());
dev = config_devalloc(parent, cf, args);
if (!dev)
panic("config_attach: allocation of device softc failed");
/* XXX redundant - see below? */
if (cf->cf_fstate != FSTATE_STAR) {
KASSERT(cf->cf_fstate == FSTATE_NOTFOUND);
cf->cf_fstate = FSTATE_FOUND;
}
config_devlink(dev);
if (config_do_twiddle && cold)
twiddle();
else
aprint_naive("Found ");
/*
* We want the next two printfs for normal, verbose, and quiet,
* but not silent (in which case, we're twiddling, instead).
*/
if (parent == ROOT) {
aprint_naive("%s (root)", device_xname(dev));
aprint_normal("%s (root)", device_xname(dev));
} else {
aprint_naive("%s at %s", device_xname(dev),
device_xname(parent));
aprint_normal("%s at %s", device_xname(dev),
device_xname(parent));
if (print)
(void) (*print)(aux, NULL);
}
/*
* Before attaching, clobber any unfound devices that are
* otherwise identical.
* XXX code above is redundant?
*/
drvname = dev->dv_cfdriver->cd_name;
TAILQ_FOREACH(ct, &allcftables, ct_list) {
for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
if (STREQ(cf->cf_name, drvname) &&
cf->cf_unit == dev->dv_unit) {
if (cf->cf_fstate == FSTATE_NOTFOUND)
cf->cf_fstate = FSTATE_FOUND;
}
}
}
device_register(dev, aux);
/* Let userland know */
devmon_report_device(dev, true);
/*
* Prevent detach until the driver's attach function, and all
* deferred actions, have finished.
*/
config_pending_incr(dev);
/*
* Prevent concurrent detach from destroying the device_t until
* the caller has released the device.
*/
device_acquire(dev);
/* Call the driver's attach function. */
(*dev->dv_cfattach->ca_attach)(parent, dev, aux);
/*
* Allow other threads to acquire references to the device now
* that the driver's attach function is done.
*/
mutex_enter(&config_misc_lock);
KASSERT(dev->dv_attaching == curlwp);
dev->dv_attaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
/*
* Synchronous parts of attach are done. Allow detach, unless
* the driver's attach function scheduled deferred actions.
*/
config_pending_decr(dev);
mutex_enter(&config_misc_lock);
deferred = (dev->dv_pending != 0);
mutex_exit(&config_misc_lock);
if (!deferred && !device_pmf_is_registered(dev))
aprint_debug_dev(dev,
"WARNING: power management not supported\n");
config_process_deferred(&deferred_config_queue, dev);
device_register_post_config(dev, aux);
rnd_add_uint32(&rnd_autoconf_source, 0);
return dev;
}
device_t
config_attach_acquire(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs *cfargs)
{
struct cfargs_internal store;
device_t dev;
KERNEL_LOCK(1, NULL);
dev = config_attach_internal(parent, cf, aux, print,
cfargs_canonicalize(cfargs, &store));
KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_attach(parent, cf, aux, print, cfargs)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_attach_acquire with a matching device_release once they
* have finished with the returned device_t.
*/
device_t
config_attach(device_t parent, cfdata_t cf, void *aux, cfprint_t print,
const struct cfargs *cfargs)
{
device_t dev;
KASSERT(KERNEL_LOCKED_P());
dev = config_attach_acquire(parent, cf, aux, print, cfargs);
if (dev == NULL)
return NULL;
device_release(dev);
return dev;
}
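/*
 * Illustrative sketch (not taken from any real driver; fooprint,
 * foo_remember_child and sc are assumed names): a parent that wants
 * to keep using the child device_t after attach would prefer
 * config_attach_acquire paired with device_release:
 *
 *	device_t child;
 *
 *	child = config_attach_acquire(self, cf, aux, fooprint,
 *	    CFARGS_NONE);
 *	if (child != NULL) {
 *		foo_remember_child(sc, child);
 *		device_release(child);
 *	}
 */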
/*
* As above, but for pseudo-devices. Pseudo-devices attached in this
* way are silently inserted into the device tree, and their children
* attached.
*
* Note that because pseudo-devices are attached silently, any information
* the attach routine wishes to print should be prefixed with the device
* name by the attach routine.
*/
device_t
config_attach_pseudo_acquire(cfdata_t cf, void *aux)
{
device_t dev;
KERNEL_LOCK(1, NULL);
struct cfargs_internal args = { };
dev = config_devalloc(ROOT, cf, &args);
if (!dev)
goto out;
/* XXX mark busy in cfdata */
if (cf->cf_fstate != FSTATE_STAR) {
	KASSERT(cf->cf_fstate == FSTATE_NOTFOUND);
cf->cf_fstate = FSTATE_FOUND;
}
config_devlink(dev);
#if 0 /* XXXJRT not yet */
device_register(dev, NULL); /* like a root node */
#endif
/* Let userland know */
devmon_report_device(dev, true);
/*
* Prevent detach until the driver's attach function, and all
* deferred actions, have finished.
*/
config_pending_incr(dev);
/*
* Prevent concurrent detach from destroying the device_t until
* the caller has released the device.
*/
device_acquire(dev);
/* Call the driver's attach function. */
(*dev->dv_cfattach->ca_attach)(ROOT, dev, aux);
/*
* Allow other threads to acquire references to the device now
* that the driver's attach function is done.
*/
mutex_enter(&config_misc_lock);
KASSERT(dev->dv_attaching == curlwp);
dev->dv_attaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
/*
* Synchronous parts of attach are done. Allow detach, unless
* the driver's attach function scheduled deferred actions.
*/
config_pending_decr(dev);
config_process_deferred(&deferred_config_queue, dev);
out: KERNEL_UNLOCK_ONE(NULL);
return dev;
}
/*
* config_attach_pseudo(cf)
*
* Legacy entry point for callers whose use of the returned
* device_t is not delimited by device_release.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should ignore the return value or be converted to
* config_attach_pseudo_acquire with a matching device_release
* once they have finished with the returned device_t. As a
* bonus, config_attach_pseudo_acquire can pass a non-null aux
* argument into the driver's attach routine.
*/
device_t
config_attach_pseudo(cfdata_t cf)
{
device_t dev;
dev = config_attach_pseudo_acquire(cf, NULL);
if (dev == NULL)
return dev;
device_release(dev);
return dev;
}
/*
* Caller must hold alldevs_lock.
*/
static void
config_collect_garbage(struct devicelist *garbage)
{
device_t dv;
KASSERT(!cpu_intr_p());
KASSERT(!cpu_softintr_p());
KASSERT(mutex_owned(&alldevs_lock));

while (alldevs_nwrite == 0 && alldevs_nread == 0 && alldevs_garbage) {
	TAILQ_FOREACH(dv, &alldevs, dv_list) {
if (dv->dv_del_gen != 0)
break;
}
if (dv == NULL) {
alldevs_garbage = false;
break;
}
config_devunlink(dv, garbage);
}
KASSERT(mutex_owned(&alldevs_lock));
}
static void
config_dump_garbage(struct devicelist *garbage)
{
device_t dv;
while ((dv = TAILQ_FIRST(garbage)) != NULL) {
	TAILQ_REMOVE(garbage, dv, dv_list);
	config_devdelete(dv);
}
}
static int
config_detach_enter(device_t dev)
{
struct lwp *l __diagused;
int error = 0;
mutex_enter(&config_misc_lock);
/*
* Wait until attach has fully completed, and until any
* concurrent detach (e.g., drvctl racing with USB event
* thread) has completed.
*
* Caller must hold alldevs_nread or alldevs_nwrite (e.g., via
* deviter) to ensure the winner of the race doesn't free the
* device leading the loser of the race into use-after-free.
*
* XXX Not all callers do this!
*/
while (dev->dv_pending || dev->dv_detaching) {
KASSERTMSG(dev->dv_detaching != curlwp,
"recursively detaching %s", device_xname(dev));
error = cv_wait_sig(&config_misc_cv, &config_misc_lock);
if (error)
goto out;
}
/*
* Attach has completed, and no other concurrent detach is
* running. Claim the device for detaching. This will cause
* all new attempts to acquire references to block.
*/
KASSERTMSG((l = dev->dv_attaching) == NULL,
"lwp %ld [%s] @ %p attaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == NULL,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detaching = curlwp;
out: mutex_exit(&config_misc_lock);
return error;
}
static void
config_detach_exit(device_t dev)
{
struct lwp *l __diagused;
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_detaching != NULL, "not detaching %s",
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == curlwp,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detaching = NULL;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
}
/*
* Detach a device. Optionally forced (e.g. because of hardware
* removal) and quiet. Returns zero if successful, non-zero
* (an error code) otherwise.
*
* Note that this code wants to be run from a process context, so
* that the detach can sleep to allow processes which have a device
* open to run and unwind their stacks.
*
* Caller must hold a reference with device_acquire or
* device_lookup_acquire.
*/
int
config_detach_release(device_t dev, int flags)
{
struct alldevs_foray af;
struct cftable *ct;
cfdata_t cf;
const struct cfattach *ca;
struct cfdriver *cd;
device_t d __diagused;
int rv = 0;
KERNEL_LOCK(1, NULL);
cf = dev->dv_cfdata;
KASSERTMSG((cf == NULL || cf->cf_fstate == FSTATE_FOUND ||
cf->cf_fstate == FSTATE_STAR),
"config_detach: %s: bad device fstate: %d",
device_xname(dev), cf ? cf->cf_fstate : -1);
cd = dev->dv_cfdriver;
KASSERT(cd != NULL);
ca = dev->dv_cfattach;
KASSERT(ca != NULL);
/*
* Only one detach at a time, please -- and not until fully
* attached.
*/
rv = config_detach_enter(dev);
device_release(dev);
if (rv) {
KERNEL_UNLOCK_ONE(NULL);
return rv;
}
mutex_enter(&alldevs_lock);
if (dev->dv_del_gen != 0) {
mutex_exit(&alldevs_lock);
#ifdef DIAGNOSTIC
printf("%s: %s is already detached\n", __func__,
device_xname(dev));
#endif /* DIAGNOSTIC */
config_detach_exit(dev);
KERNEL_UNLOCK_ONE(NULL);
return ENOENT;
}
alldevs_nwrite++;
mutex_exit(&alldevs_lock);
/*
* Call the driver's .ca_detach function, unless it has none or
* we are skipping it because it's unforced shutdown time and
* the driver didn't ask to detach on shutdown.
*/
if (!detachall &&
(flags & (DETACH_SHUTDOWN|DETACH_FORCE)) == DETACH_SHUTDOWN &&
(dev->dv_flags & DVF_DETACH_SHUTDOWN) == 0) {
rv = EOPNOTSUPP;
} else if (ca->ca_detach != NULL) {
	rv = (*ca->ca_detach)(dev, flags);
} else
rv = EOPNOTSUPP;
KASSERTMSG(!dev->dv_detach_done, "%s detached twice, error=%d",
device_xname(dev), rv);
/*
* If it was not possible to detach the device, then we either
* panic() (for the forced but failed case), or return an error.
*/
if (rv) {
/*
* Detach failed -- likely EOPNOTSUPP or EBUSY. Driver
* must not have called config_detach_commit.
*/
KASSERTMSG(!dev->dv_detach_committed,
"%s committed to detaching and then backed out, error=%d",
device_xname(dev), rv);
if (flags & DETACH_FORCE) {
panic("config_detach: forced detach of %s failed (%d)",
device_xname(dev), rv);
}
goto out;
}
/*
* The device has now been successfully detached.
*/
dev->dv_detach_done = true;
/*
* If .ca_detach didn't commit to detach, then do that for it.
* This wakes any pending device_lookup_acquire calls so they
* will fail.
*/
config_detach_commit(dev);
/*
* If it was possible to detach the device, ensure that the
* device is deactivated.
*/
dev->dv_flags &= ~DVF_ACTIVE; /* XXXSMP */
/*
* Wait for all device_lookup_acquire references -- mostly, for
* all attempts to open the device -- to drain. It is the
* responsibility of .ca_detach to ensure anything with open
* references will be interrupted and release them promptly,
* not block indefinitely. All new attempts to acquire
* references will fail, as config_detach_commit has arranged
* by now.
*/
mutex_enter(&config_misc_lock);
localcount_drain(dev->dv_localcount,
&config_misc_cv, &config_misc_lock);
mutex_exit(&config_misc_lock);
/* Let userland know */
devmon_report_device(dev, false);
#ifdef DIAGNOSTIC
/*
* Sanity: If you're successfully detached, you should have no
* children. (Note that because children must be attached
* after parents, we only need to search the latter part of
* the list.)
*/
mutex_enter(&alldevs_lock);
for (d = TAILQ_NEXT(dev, dv_list); d != NULL;
d = TAILQ_NEXT(d, dv_list)) {
if (d->dv_parent == dev && d->dv_del_gen == 0) {
printf("config_detach: detached device %s"
" has children %s\n", device_xname(dev),
device_xname(d));
panic("config_detach");
}
}
mutex_exit(&alldevs_lock);
#endif
/* notify the parent that the child is gone */
if (dev->dv_parent) {
device_t p = dev->dv_parent;
if (p->dv_cfattach->ca_childdetached)
	(*p->dv_cfattach->ca_childdetached)(p, dev);
}
/*
* Mark cfdata to show that the unit can be reused, if possible.
*/
TAILQ_FOREACH(ct, &allcftables, ct_list) {
	for (cf = ct->ct_cfdata; cf->cf_name; cf++) {
		if (STREQ(cf->cf_name, cd->cd_name)) {
			if (cf->cf_fstate == FSTATE_FOUND &&
cf->cf_unit == dev->dv_unit)
cf->cf_fstate = FSTATE_NOTFOUND;
}
}
}
if (dev->dv_cfdata != NULL && (flags & DETACH_QUIET) == 0)
	aprint_normal_dev(dev, "detached\n");
out:
config_detach_exit(dev);
config_alldevs_enter(&af);
KASSERT(alldevs_nwrite != 0);
--alldevs_nwrite;
if (rv == 0 && dev->dv_del_gen == 0) {
if (alldevs_nwrite == 0 && alldevs_nread == 0)
config_devunlink(dev, &af.af_garbage);
else {
dev->dv_del_gen = alldevs_gen;
alldevs_garbage = true;
}
}
config_alldevs_exit(&af);
KERNEL_UNLOCK_ONE(NULL);
return rv;
}
/*
* config_detach(dev, flags)
*
* Legacy entry point for callers that have not acquired a
* reference to dev.
*
* The caller is required to hold the kernel lock as a fragile
* defence against races.
*
* Callers should be converted to use device_acquire under a lock
* taken also by .ca_childdetached to synchronize access to the
* device_t, and then config_detach_release outside the lock.
* Alternatively, most drivers detach children only in their own
* detach routines, which can be done with config_detach_children
* instead.
*/
int
config_detach(device_t dev, int flags)
{
	device_acquire(dev);
return config_detach_release(dev, flags);
}
/*
* config_detach_commit(dev)
*
* Issued by a driver's .ca_detach routine to notify anyone
* waiting in device_lookup_acquire that the driver is committed
* to detaching the device, which allows device_lookup_acquire to
* wake up and fail immediately.
*
* Safe to call multiple times -- idempotent. Must be called
* during config_detach_enter/exit. Safe to use with
* device_lookup because the device is not actually removed from
* the table until after config_detach_exit.
*/
void
config_detach_commit(device_t dev)
{
struct lwp *l __diagused;
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_detaching != NULL, "not detaching %s",
device_xname(dev));
KASSERTMSG((l = dev->dv_detaching) == curlwp,
"lwp %ld [%s] @ %p detaching %s",
(long)l->l_lid, (l->l_name ? l->l_name : l->l_proc->p_comm), l,
device_xname(dev));
dev->dv_detach_committed = true;
cv_broadcast(&config_misc_cv);
mutex_exit(&config_misc_lock);
}
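/*
 * Illustrative sketch (hypothetical foo(4) driver; foo_cdev_major
 * and foo_teardown are assumed names): a .ca_detach routine that
 * exposes a character device typically commits to detaching before
 * revoking open instances, so blocked device_lookup_acquire callers
 * fail promptly rather than wait on the revoked device:
 *
 *	static int
 *	foo_detach(device_t self, int flags)
 *	{
 *		struct foo_softc *sc = device_private(self);
 *
 *		config_detach_commit(self);
 *		vdevgone(foo_cdev_major(), device_unit(self),
 *		    device_unit(self), VCHR);
 *		foo_teardown(sc);
 *		return 0;
 *	}
 */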
int
config_detach_children(device_t parent, int flags)
{
device_t dv;
deviter_t di;
int error = 0;
KASSERT(KERNEL_LOCKED_P());
for (dv = deviter_first(&di, DEVITER_F_RW); dv != NULL;
dv = deviter_next(&di)) {
if (device_parent(dv) != parent)
continue;
if ((error = config_detach(dv, flags)) != 0)
break;
}
deviter_release(&di);
return error;
}
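/*
 * Illustrative sketch (assumed mybus(4) names): a bus driver whose
 * own teardown is trivial can implement detach by detaching its
 * children first and propagating any error:
 *
 *	static int
 *	mybus_detach(device_t self, int flags)
 *	{
 *
 *		return config_detach_children(self, flags);
 *	}
 */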
device_t
shutdown_first(struct shutdown_state *s)
{
if (!s->initialized) {
deviter_init(&s->di, DEVITER_F_SHUTDOWN|DEVITER_F_LEAVES_FIRST);
s->initialized = true;
}
return shutdown_next(s);
}
device_t
shutdown_next(struct shutdown_state *s)
{
device_t dv;
while ((dv = deviter_next(&s->di)) != NULL && !device_is_active(dv))
;
if (dv == NULL)
s->initialized = false;
return dv;
}
bool
config_detach_all(int how)
{
static struct shutdown_state s;
device_t curdev;
bool progress = false;
int flags;
KERNEL_LOCK(1, NULL);
if ((how & (RB_NOSYNC|RB_DUMP)) != 0)
goto out;
if ((how & RB_POWERDOWN) == RB_POWERDOWN)
flags = DETACH_SHUTDOWN | DETACH_POWEROFF;
else
flags = DETACH_SHUTDOWN;
for (curdev = shutdown_first(&s); curdev != NULL;
curdev = shutdown_next(&s)) {
aprint_debug(" detaching %s, ", device_xname(curdev));
if (config_detach(curdev, flags) == 0) {
progress = true;
aprint_debug("success.");
} else
aprint_debug("failed.");
}
out: KERNEL_UNLOCK_ONE(NULL);
return progress;
}
static bool
device_is_ancestor_of(device_t ancestor, device_t descendant)
{
device_t dv;
for (dv = descendant; dv != NULL; dv = device_parent(dv)) {
if (device_parent(dv) == ancestor)
return true;
}
return false;
}
int
config_deactivate(device_t dev)
{
deviter_t di;
const struct cfattach *ca;
device_t descendant;
int s, rv = 0, oflags;
for (descendant = deviter_first(&di, DEVITER_F_ROOT_FIRST);
descendant != NULL;
descendant = deviter_next(&di)) {
if (dev != descendant &&
!device_is_ancestor_of(dev, descendant))
continue;
if ((descendant->dv_flags & DVF_ACTIVE) == 0)
continue;
ca = descendant->dv_cfattach;
oflags = descendant->dv_flags;
descendant->dv_flags &= ~DVF_ACTIVE;
if (ca->ca_activate == NULL)
continue;
s = splhigh();
rv = (*ca->ca_activate)(descendant, DVACT_DEACTIVATE);
splx(s);
if (rv != 0)
descendant->dv_flags = oflags;
}
deviter_release(&di);
return rv;
}
/*
* Defer the configuration of the specified device until all
* of its parent's devices have been attached.
*/
void
config_defer(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
if (dev->dv_parent == NULL)
panic("config_defer: can't defer config of a root device");
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
config_pending_incr(dev);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &deferred_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("config_defer: deferred twice");
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&deferred_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
/*
* Defer some autoconfiguration for a device until after interrupts
* are enabled.
*/
void
config_interrupts(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
/*
* If interrupts are enabled, callback now.
*/
if (cold == 0) {
(*func)(dev);
return;
}
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
config_pending_incr(dev);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &interrupt_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("config_interrupts: deferred twice");
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&interrupt_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
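/*
 * Illustrative sketch (assumed foo(4) names): an attach routine that
 * needs working interrupts to finish its setup defers the remainder;
 * config_defer and config_mountroot are used the same way for their
 * respective milestones:
 *
 *	static void
 *	foo_attach(device_t parent, device_t self, void *aux)
 *	{
 *		aprint_normal("\n");
 *		config_interrupts(self, foo_attach_deferred);
 *	}
 *
 *	static void
 *	foo_attach_deferred(device_t self)
 *	{
 *		aprint_normal_dev(self, "deferred setup done\n");
 *	}
 */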
/*
* Defer some autoconfiguration for a device until after root file system
* is mounted (to load firmware etc).
*/
void
config_mountroot(device_t dev, void (*func)(device_t))
{
struct deferred_config *dc;
/*
* If root file system is mounted, callback now.
*/
if (root_is_mounted) {
(*func)(dev);
return;
}
dc = kmem_alloc(sizeof(*dc), KM_SLEEP);
mutex_enter(&config_misc_lock);
#ifdef DIAGNOSTIC
struct deferred_config *odc;
TAILQ_FOREACH(odc, &mountroot_config_queue, dc_queue) {
if (odc->dc_dev == dev)
panic("%s: deferred twice", __func__);
}
#endif
dc->dc_dev = dev;
dc->dc_func = func;
TAILQ_INSERT_TAIL(&mountroot_config_queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
}
/*
* Process a deferred configuration queue.
*/
static void
config_process_deferred(struct deferred_config_head *queue, device_t parent)
{
struct deferred_config *dc;
KASSERT(KERNEL_LOCKED_P());
mutex_enter(&config_misc_lock);
dc = TAILQ_FIRST(queue);
while (dc) {
	if (parent == NULL || dc->dc_dev->dv_parent == parent) {
		TAILQ_REMOVE(queue, dc, dc_queue);
mutex_exit(&config_misc_lock);
(*dc->dc_func)(dc->dc_dev);
config_pending_decr(dc->dc_dev);
kmem_free(dc, sizeof(*dc));
mutex_enter(&config_misc_lock);
/* Restart, queue might have changed */
dc = TAILQ_FIRST(queue);
} else {
dc = TAILQ_NEXT(dc, dc_queue);
}
}
mutex_exit(&config_misc_lock);
}
/*
* Manipulate the config_pending semaphore.
*/
void
config_pending_incr(device_t dev)
{
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_pending < INT_MAX,
"%s: excess config_pending_incr", device_xname(dev));
if (dev->dv_pending++ == 0)
	TAILQ_INSERT_TAIL(&config_pending, dev, dv_pending_list);
#ifdef DEBUG_AUTOCONF
printf("%s: %s %d\n", __func__, device_xname(dev), dev->dv_pending);
#endif
mutex_exit(&config_misc_lock);
}
void
config_pending_decr(device_t dev)
{
mutex_enter(&config_misc_lock);
KASSERTMSG(dev->dv_pending > 0,
"%s: excess config_pending_decr", device_xname(dev));
if (--dev->dv_pending == 0) {
	TAILQ_REMOVE(&config_pending, dev, dv_pending_list);
cv_broadcast(&config_misc_cv);
}
#ifdef DEBUG_AUTOCONF
printf("%s: %s %d\n", __func__, device_xname(dev), dev->dv_pending);
#endif
mutex_exit(&config_misc_lock);
}
/*
* Register a "finalization" routine. Finalization routines are
* called iteratively once all real devices have been found during
* autoconfiguration, for as long as any one finalizer has done
* any work.
*/
int
config_finalize_register(device_t dev, int (*fn)(device_t))
{
struct finalize_hook *f;
int error = 0;
KERNEL_LOCK(1, NULL);
/*
* If finalization has already been done, invoke the
* callback function now.
*/
if (config_finalize_done) {
while ((*fn)(dev) != 0)
/* loop */ ;
goto out;
}
/* Ensure this isn't already on the list. */
TAILQ_FOREACH(f, &config_finalize_list, f_list) {
if (f->f_func == fn && f->f_dev == dev) {
error = EEXIST;
goto out;
}
}
f = kmem_alloc(sizeof(*f), KM_SLEEP);
f->f_func = fn;
f->f_dev = dev;
TAILQ_INSERT_TAIL(&config_finalize_list, f, f_list);
/* Success! */
error = 0;
out: KERNEL_UNLOCK_ONE(NULL);
return error;
}
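/*
 * Illustrative sketch (assumed foo(4) names): a bus driver that may
 * be able to attach more children once other drivers have settled
 * can register a finalizer; the hook must return nonzero only when
 * it actually did work so that the finalize loop terminates:
 *
 *	static int
 *	foo_finalize(device_t self)
 *	{
 *		struct foo_softc *sc = device_private(self);
 *
 *		if (!sc->sc_rescan_wanted)
 *			return 0;
 *		sc->sc_rescan_wanted = false;
 *		foo_rescan(sc);
 *		return 1;
 *	}
 *
 * registered from attach with config_finalize_register(self,
 * foo_finalize).
 */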
void
config_finalize(void)
{
struct finalize_hook *f;
struct pdevinit *pdev;
extern struct pdevinit pdevinit[];
unsigned t0 = getticks();
int errcnt, rv;
/*
* Now that device driver threads have been created, wait for
* them to finish any deferred autoconfiguration.
*/
mutex_enter(&config_misc_lock);
while (!TAILQ_EMPTY(&config_pending)) {
const unsigned t1 = getticks();
if (t1 - t0 >= hz) {
void (*pr)(const char *, ...) __printflike(1,2);
device_t dev;
if (t1 - t0 >= 60*hz) {
pr = aprint_normal;
t0 = t1;
} else {
pr = aprint_debug;
}
(*pr)("waiting for devices:");
TAILQ_FOREACH(dev, &config_pending, dv_pending_list)
(*pr)(" %s", device_xname(dev));
(*pr)("\n");
}
(void)cv_timedwait(&config_misc_cv, &config_misc_lock,
mstohz(1000));
}
mutex_exit(&config_misc_lock);
KERNEL_LOCK(1, NULL);
/* Attach pseudo-devices. */
for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++)
(*pdev->pdev_attach)(pdev->pdev_count);
/* Run the hooks until none of them does any work. */
do {
rv = 0;
TAILQ_FOREACH(f, &config_finalize_list, f_list)
rv |= (*f->f_func)(f->f_dev);
} while (rv != 0);
config_finalize_done = 1;
/* Now free all the hooks. */
while ((f = TAILQ_FIRST(&config_finalize_list)) != NULL) {
TAILQ_REMOVE(&config_finalize_list, f, f_list);
kmem_free(f, sizeof(*f));
}
KERNEL_UNLOCK_ONE(NULL);
errcnt = aprint_get_error_count();
if ((boothowto & (AB_QUIET|AB_SILENT)) != 0 &&
(boothowto & AB_VERBOSE) == 0) {
mutex_enter(&config_misc_lock);
if (config_do_twiddle) {
config_do_twiddle = 0;
printf_nolog(" done.\n");
}
mutex_exit(&config_misc_lock);
}
if (errcnt != 0) {
printf("WARNING: %d error%s while detecting hardware; "
"check system log.\n", errcnt,
errcnt == 1 ? "" : "s");
}
}
void
config_twiddle_init(void)
{
if ((boothowto & (AB_SILENT|AB_VERBOSE)) == AB_SILENT) {
config_do_twiddle = 1;
}
callout_setfunc(&config_twiddle_ch, config_twiddle_fn, NULL);
}
void
config_twiddle_fn(void *cookie)
{
mutex_enter(&config_misc_lock);
if (config_do_twiddle) {
twiddle();
callout_schedule(&config_twiddle_ch, mstohz(100));
}
mutex_exit(&config_misc_lock);
}
static void
config_alldevs_enter(struct alldevs_foray *af)
{
TAILQ_INIT(&af->af_garbage);
mutex_enter(&alldevs_lock);
config_collect_garbage(&af->af_garbage);
}
static void
config_alldevs_exit(struct alldevs_foray *af)
{
mutex_exit(&alldevs_lock);
config_dump_garbage(&af->af_garbage);
}
/*
* device_lookup:
*
* Look up a device instance for a given driver.
*
* Caller is responsible for ensuring the device's state is
* stable, either by holding a reference already obtained with
* device_lookup_acquire or by otherwise ensuring the device is
* attached and can't be detached (e.g., holding an open device
* node and ensuring *_detach calls vdevgone).
*
* XXX Find a way to assert this.
*
* Safe for use up to and including interrupt context at IPL_VM.
* Never sleeps.
*/
device_t
device_lookup(cfdriver_t cd, int unit)
{
device_t dv;
mutex_enter(&alldevs_lock);
if (unit < 0 || unit >= cd->cd_ndevs)
dv = NULL;
else if ((dv = cd->cd_devs[unit]) != NULL && dv->dv_del_gen != 0)
dv = NULL;
mutex_exit(&alldevs_lock);
return dv;
}
/*
* device_lookup_private:
*
* Look up a softc instance for a given driver.
*/
void *
device_lookup_private(cfdriver_t cd, int unit)
{
	return device_private(device_lookup(cd, unit));
}
/*
* device_lookup_acquire:
*
* Look up a device instance for a given driver, and return a
* reference to it that must be released by device_release.
*
* => If the device is still attaching, blocks until *_attach has
* returned.
*
* => If the device is detaching, blocks until *_detach has
* returned. May succeed or fail in that case, depending on
* whether *_detach has backed out (EBUSY) or committed to
* detaching.
*
* May sleep.
*/
device_t
device_lookup_acquire(cfdriver_t cd, int unit)
{
device_t dv;
ASSERT_SLEEPABLE();
/* XXX This should have a pserialized fast path -- TBD. */
mutex_enter(&config_misc_lock);
mutex_enter(&alldevs_lock);
retry:	if (unit < 0 || unit >= cd->cd_ndevs ||
	    (dv = cd->cd_devs[unit]) == NULL ||
	    dv->dv_del_gen != 0 ||
dv->dv_detach_committed) {
dv = NULL;
} else {
/*
* Wait for the device to stabilize, if attaching or
* detaching. Either way we must wait for *_attach or
* *_detach to complete, and either way we must retry:
* even if detaching, *_detach might fail (EBUSY) so
* the device may still be there.
*/
if ((dv->dv_attaching != NULL && dv->dv_attaching != curlwp) ||
dv->dv_detaching != NULL) {
mutex_exit(&alldevs_lock);
cv_wait(&config_misc_cv, &config_misc_lock);
mutex_enter(&alldevs_lock);
goto retry;
}
device_acquire(dv);
}
mutex_exit(&alldevs_lock);
mutex_exit(&config_misc_lock);
return dv;
}
/*
* device_acquire:
*
* Acquire a reference to a device. It is the caller's
* responsibility to ensure that the device's .ca_detach routine
* cannot return before calling this. Caller must release the
* reference with device_release or config_detach_release.
*/
void
device_acquire(device_t dv)
{
/*
* No lock because the caller has promised that this can't
* change concurrently with device_acquire.
*/
KASSERTMSG(!dv->dv_detach_done, "%s",
dv == NULL ? "(null)" : device_xname(dv));
localcount_acquire(dv->dv_localcount);
}
/*
* device_release:
*
* Release a reference to a device acquired with device_acquire or
* device_lookup_acquire.
*/
void
device_release(device_t dv)
{
localcount_release(dv->dv_localcount,
&config_misc_cv, &config_misc_lock);
}
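/*
 * Illustrative sketch (assumed foo(4) names; foo_cd is the
 * config(1)-generated cfdriver): the usual pairing in a cdevsw open
 * routine is to take a reference, use the device, and release it on
 * every exit path:
 *
 *	static int
 *	fooopen(dev_t dev, int flag, int mode, struct lwp *l)
 *	{
 *		device_t dv;
 *		int error;
 *
 *		dv = device_lookup_acquire(&foo_cd, minor(dev));
 *		if (dv == NULL)
 *			return ENXIO;
 *		error = foo_do_open(device_private(dv), flag);
 *		device_release(dv);
 *		return error;
 *	}
 */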
/*
* device_find_by_xname:
*
* Returns the device of the given name or NULL if it doesn't exist.
*/
device_t
device_find_by_xname(const char *name)
{
device_t dv;
deviter_t di;
for (dv = deviter_first(&di, 0); dv != NULL; dv = deviter_next(&di)) {
	if (strcmp(device_xname(dv), name) == 0)
break;
}
deviter_release(&di);
return dv;
}
/*
* device_find_by_driver_unit:
*
* Returns the device of the given driver name and unit or
* NULL if it doesn't exist.
*/
device_t
device_find_by_driver_unit(const char *name, int unit)
{
struct cfdriver *cd;
if ((cd = config_cfdriver_lookup(name)) == NULL)
return NULL;
return device_lookup(cd, unit);
}
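/*
 * Illustrative example: device_find_by_driver_unit("sd", 0) returns
 * the device_t for sd0 if it is attached, or NULL otherwise. Unlike
 * device_lookup_acquire this takes no reference, so the caller must
 * otherwise ensure the device cannot be detached underneath it.
 */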
static bool
match_strcmp(const char * const s1, const char * const s2)
{
return strcmp(s1, s2) == 0;
}
static bool
match_pmatch(const char * const s1, const char * const s2)
{
return pmatch(s1, s2, NULL) == 2;
}
static bool
strarray_match_internal(const char ** const strings,
unsigned int const nstrings, const char * const str,
unsigned int * const indexp,
bool (*match_fn)(const char *, const char *))
{
unsigned int i;
if (strings == NULL || nstrings == 0) {
return false;
}
for (i = 0; i < nstrings; i++) {
if ((*match_fn)(strings[i], str)) {
*indexp = i;
return true;
}
}
return false;
}
static int
strarray_match(const char ** const strings, unsigned int const nstrings,
const char * const str)
{
unsigned int idx;
if (strarray_match_internal(strings, nstrings, str, &idx,
match_strcmp)) {
return (int)(nstrings - idx);
}
return 0;
}
static int
strarray_pmatch(const char ** const strings, unsigned int const nstrings,
const char * const pattern)
{
unsigned int idx;
if (strarray_match_internal(strings, nstrings, pattern, &idx,
match_pmatch)) {
return (int)(nstrings - idx);
}
return 0;
}
static int
device_compatible_match_strarray_internal(
const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp,
int (*match_fn)(const char **, unsigned int, const char *))
{
const struct device_compatible_entry *dce = NULL;
int rv;
if (ndevice_compats == 0 || device_compats == NULL ||
driver_compats == NULL)
return 0;
for (dce = driver_compats; dce->compat != NULL; dce++) {
rv = (*match_fn)(device_compats, ndevice_compats, dce->compat);
if (rv != 0) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return rv;
}
}
return 0;
}
/*
* device_compatible_match:
*
* Match a driver's "compatible" data against a device's
* "compatible" strings. Returns resulted weighted by
* which device "compatible" string was matched.
*/
int
device_compatible_match(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, NULL, strarray_match);
}
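/*
 * Illustrative sketch (assumed "acme,..." strings; assumes the usual
 * DEVICE_COMPAT_EOL terminator from <sys/device.h>): a driver keeps
 * a static table of the "compatible" strings it supports and weighs
 * it against what the bus reports:
 *
 *	static const struct device_compatible_entry compat_data[] = {
 *		{ .compat = "acme,frobnicator-2000" },
 *		{ .compat = "acme,frobnicator" },
 *		DEVICE_COMPAT_EOL
 *	};
 *
 *	return device_compatible_match(device_compats, ndevice_compats,
 *	    compat_data);
 *
 * where device_compats/ndevice_compats come from the bus attach
 * arguments (how they are obtained is bus-specific).
 */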
/*
* device_compatible_pmatch:
*
* Like device_compatible_match(), but uses pmatch(9) to compare
* the device "compatible" strings against patterns in the
* driver's "compatible" data.
*/
int
device_compatible_pmatch(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, NULL, strarray_pmatch);
}
static int
device_compatible_match_strlist_internal(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp,
int (*match_fn)(const char *, size_t, const char *))
{
const struct device_compatible_entry *dce = NULL;
int rv;
if (device_compats == NULL || device_compatsize == 0 ||
driver_compats == NULL)
return 0;
for (dce = driver_compats; dce->compat != NULL; dce++) {
rv = (*match_fn)(device_compats, device_compatsize,
dce->compat);
if (rv != 0) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return rv;
}
}
return 0;
}
/*
* device_compatible_match_strlist:
*
* Like device_compatible_match(), but take the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
int
device_compatible_match_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, NULL, strlist_match);
}
/*
* device_compatible_pmatch_strlist:
*
* Like device_compatible_pmatch(), but take the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
int
device_compatible_pmatch_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, NULL, strlist_pmatch);
}
static int
device_compatible_match_id_internal(
uintptr_t const id, uintptr_t const mask, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats,
const struct device_compatible_entry **matching_entryp)
{
const struct device_compatible_entry *dce = NULL;
if (mask == 0)
return 0;
for (dce = driver_compats; dce->id != sentinel_id; dce++) {
if ((id & mask) == dce->id) {
if (matching_entryp != NULL) {
*matching_entryp = dce;
}
return 1;
}
}
return 0;
}
/*
* device_compatible_match_id:
*
* Like device_compatible_match(), but takes a single
* unsigned integer device ID.
*/
int
device_compatible_match_id(
uintptr_t const id, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats)
{
return device_compatible_match_id_internal(id, (uintptr_t)-1,
sentinel_id, driver_compats, NULL);
}
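/*
 * Illustrative sketch (assumed names and ID values): for integer IDs
 * the driver table is terminated by an entry whose id equals the
 * sentinel passed in, here 0:
 *
 *	static const struct device_compatible_entry foo_ids[] = {
 *		{ .id = 0x12345678 },
 *		{ .id = 0 }
 *	};
 *
 *	if (device_compatible_match_id(chip_id, 0, foo_ids))
 *		return 1;
 *
 * where chip_id is whatever identifier the bus hands to the match
 * routine.
 */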
/*
* device_compatible_lookup:
*
* Look up and return the device_compatible_entry, using the
* same matching criteria used by device_compatible_match().
*/
const struct device_compatible_entry *
device_compatible_lookup(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, &dce, strarray_match)) {
return dce;
}
return NULL;
}
/*
* device_compatible_plookup:
*
* Look up and return the device_compatible_entry, using the
* same matching criteria used by device_compatible_pmatch().
*/
const struct device_compatible_entry *
device_compatible_plookup(const char **device_compats, int ndevice_compats,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strarray_internal(device_compats,
ndevice_compats, driver_compats, &dce, strarray_pmatch)) {
return dce;
}
return NULL;
}
/*
* device_compatible_lookup_strlist:
*
* Like device_compatible_lookup(), but take the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
const struct device_compatible_entry *
device_compatible_lookup_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, &dce, strlist_match)) {
return dce;
}
return NULL;
}
/*
* device_compatible_plookup_strlist:
*
* Like device_compatible_plookup(), but take the device
* "compatible" strings as an OpenFirmware-style string
* list.
*/
const struct device_compatible_entry *
device_compatible_plookup_strlist(
const char * const device_compats, size_t const device_compatsize,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_strlist_internal(device_compats,
device_compatsize, driver_compats, &dce, strlist_pmatch)) {
return dce;
}
return NULL;
}
/*
* device_compatible_lookup_id:
*
* Like device_compatible_lookup(), but takes a single
* unsigned integer device ID.
*/
const struct device_compatible_entry *
device_compatible_lookup_id(
uintptr_t const id, uintptr_t const sentinel_id,
const struct device_compatible_entry *driver_compats)
{
const struct device_compatible_entry *dce;
if (device_compatible_match_id_internal(id, (uintptr_t)-1,
sentinel_id, driver_compats, &dce)) {
return dce;
}
return NULL;
}
/*
* Power management related functions.
*/
bool
device_pmf_is_registered(device_t dev)
{
return (dev->dv_flags & DVF_POWER_HANDLERS) != 0;
}
bool
device_pmf_driver_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0)
return true;
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER &&
dev->dv_driver_suspend != NULL &&
!(*dev->dv_driver_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_DRIVER_SUSPENDED;
return true;
}
bool
device_pmf_driver_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0)
return true;
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_DRIVER &&
dev->dv_driver_resume != NULL &&
!(*dev->dv_driver_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_DRIVER_SUSPENDED;
return true;
}
bool
device_pmf_driver_shutdown(device_t dev, int how)
{
if (*dev->dv_driver_shutdown != NULL &&
!(*dev->dv_driver_shutdown)(dev, how))
return false;
return true;
}
void
device_pmf_driver_register(device_t dev,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
bool (*shutdown)(device_t, int))
{
dev->dv_driver_suspend = suspend;
dev->dv_driver_resume = resume;
dev->dv_driver_shutdown = shutdown;
dev->dv_flags |= DVF_POWER_HANDLERS;
}
void
device_pmf_driver_deregister(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
dev->dv_driver_suspend = NULL;
dev->dv_driver_resume = NULL;
mutex_enter(&dvl->dvl_mtx);
dev->dv_flags &= ~DVF_POWER_HANDLERS;
while (dvl->dvl_nlock > 0 || dvl->dvl_nwait > 0) {
/* Wake a thread that waits for the lock. That
* thread will fail to acquire the lock, and then
* it will wake the next thread that waits for the
* lock, or else it will wake us.
*/
cv_signal(&dvl->dvl_cv);
pmflock_debug(dev, __func__, __LINE__);
cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx);
pmflock_debug(dev, __func__, __LINE__);
}
mutex_exit(&dvl->dvl_mtx);
}
void
device_pmf_driver_child_register(device_t dev)
{
device_t parent = device_parent(dev);
if (parent == NULL || parent->dv_driver_child_register == NULL)
return;
(*parent->dv_driver_child_register)(dev);
}
void
device_pmf_driver_set_child_register(device_t dev,
void (*child_register)(device_t))
{
dev->dv_driver_child_register = child_register;
}
static void
pmflock_debug(device_t dev, const char *func, int line)
{
#ifdef PMFLOCK_DEBUG
device_lock_t dvl = device_getlock(dev);
const char *curlwp_name;
if (curlwp->l_name != NULL)
curlwp_name = curlwp->l_name;
else
curlwp_name = curlwp->l_proc->p_comm;
aprint_debug_dev(dev,
"%s.%d, %s dvl_nlock %d dvl_nwait %d dv_flags %x\n", func, line,
curlwp_name, dvl->dvl_nlock, dvl->dvl_nwait, dev->dv_flags);
#endif /* PMFLOCK_DEBUG */
}
static bool
device_pmf_lock1(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
while (device_pmf_is_registered(dev) &&
dvl->dvl_nlock > 0 && dvl->dvl_holder != curlwp) {
dvl->dvl_nwait++;
pmflock_debug(dev, __func__, __LINE__);
cv_wait(&dvl->dvl_cv, &dvl->dvl_mtx);
pmflock_debug(dev, __func__, __LINE__);
dvl->dvl_nwait--;
}
if (!device_pmf_is_registered(dev)) {
pmflock_debug(dev, __func__, __LINE__);
/* We could not acquire the lock, but some other thread may
* wait for it, also. Wake that thread.
*/
cv_signal(&dvl->dvl_cv);
return false;
}
dvl->dvl_nlock++;
dvl->dvl_holder = curlwp;
pmflock_debug(dev, __func__, __LINE__);
return true;
}
bool
device_pmf_lock(device_t dev)
{
bool rc;
device_lock_t dvl = device_getlock(dev);
mutex_enter(&dvl->dvl_mtx);
rc = device_pmf_lock1(dev);
mutex_exit(&dvl->dvl_mtx);
return rc;
}
void
device_pmf_unlock(device_t dev)
{
device_lock_t dvl = device_getlock(dev);
KASSERT(dvl->dvl_nlock > 0);
mutex_enter(&dvl->dvl_mtx);
if (--dvl->dvl_nlock == 0)
dvl->dvl_holder = NULL;
cv_signal(&dvl->dvl_cv);
pmflock_debug(dev, __func__, __LINE__);
mutex_exit(&dvl->dvl_mtx);
}
device_lock_t
device_getlock(device_t dev)
{
return &dev->dv_lock;
}
void *
device_pmf_bus_private(device_t dev)
{
return dev->dv_bus_private;
}
bool
device_pmf_bus_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0)
return true;
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0 ||
(dev->dv_flags & DVF_DRIVER_SUSPENDED) == 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS &&
dev->dv_bus_suspend != NULL &&
!(*dev->dv_bus_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_BUS_SUSPENDED;
return true;
}
bool
device_pmf_bus_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_BUS_SUSPENDED) == 0)
return true;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_BUS &&
dev->dv_bus_resume != NULL &&
!(*dev->dv_bus_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_BUS_SUSPENDED;
return true;
}
bool
device_pmf_bus_shutdown(device_t dev, int how)
{
if (*dev->dv_bus_shutdown != NULL &&
!(*dev->dv_bus_shutdown)(dev, how))
return false;
return true;
}
void
device_pmf_bus_register(device_t dev, void *priv,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
bool (*shutdown)(device_t, int), void (*deregister)(device_t))
{
dev->dv_bus_private = priv;
dev->dv_bus_resume = resume;
dev->dv_bus_suspend = suspend;
dev->dv_bus_shutdown = shutdown;
dev->dv_bus_deregister = deregister;
}
void
device_pmf_bus_deregister(device_t dev)
{
	if (dev->dv_bus_deregister == NULL)
return;
(*dev->dv_bus_deregister)(dev);
dev->dv_bus_private = NULL;
dev->dv_bus_suspend = NULL;
dev->dv_bus_resume = NULL;
dev->dv_bus_deregister = NULL;
}
void *
device_pmf_class_private(device_t dev)
{
return dev->dv_class_private;
}
bool
device_pmf_class_suspend(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) != 0)
return true;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS &&
dev->dv_class_suspend != NULL &&
!(*dev->dv_class_suspend)(dev, qual))
return false;
dev->dv_flags |= DVF_CLASS_SUSPENDED;
return true;
}
bool
device_pmf_class_resume(device_t dev, const pmf_qual_t *qual)
{
if ((dev->dv_flags & DVF_CLASS_SUSPENDED) == 0)
return true;
if ((dev->dv_flags & DVF_BUS_SUSPENDED) != 0 ||
(dev->dv_flags & DVF_DRIVER_SUSPENDED) != 0)
return false;
if (pmf_qual_depth(qual) <= DEVACT_LEVEL_CLASS &&
dev->dv_class_resume != NULL &&
!(*dev->dv_class_resume)(dev, qual))
return false;
dev->dv_flags &= ~DVF_CLASS_SUSPENDED;
return true;
}
void
device_pmf_class_register(device_t dev, void *priv,
bool (*suspend)(device_t, const pmf_qual_t *),
bool (*resume)(device_t, const pmf_qual_t *),
void (*deregister)(device_t))
{
dev->dv_class_private = priv;
dev->dv_class_suspend = suspend;
dev->dv_class_resume = resume;
dev->dv_class_deregister = deregister;
}
void
device_pmf_class_deregister(device_t dev)
{
	if (dev->dv_class_deregister == NULL)
return;
(*dev->dv_class_deregister)(dev);
dev->dv_class_private = NULL;
dev->dv_class_suspend = NULL;
dev->dv_class_resume = NULL;
dev->dv_class_deregister = NULL;
}
bool
device_active(device_t dev, devactive_t type)
{
size_t i;
if (dev->dv_activity_count == 0)
return false;
for (i = 0; i < dev->dv_activity_count; ++i) {
if (dev->dv_activity_handlers[i] == NULL)
break;
(*dev->dv_activity_handlers[i])(dev, type);
}
return true;
}
bool
device_active_register(device_t dev, void (*handler)(device_t, devactive_t))
{
void (**new_handlers)(device_t, devactive_t);
void (**old_handlers)(device_t, devactive_t);
size_t i, old_size, new_size;
int s;
old_handlers = dev->dv_activity_handlers;
old_size = dev->dv_activity_count;
KASSERT(old_size == 0 || old_handlers != NULL);
for (i = 0; i < old_size; ++i) {
KASSERT(old_handlers[i] != handler);
if (old_handlers[i] == NULL) {
old_handlers[i] = handler;
return true;
}
}
new_size = old_size + 4;
new_handlers = kmem_alloc(sizeof(void *) * new_size, KM_SLEEP);
for (i = 0; i < old_size; ++i)
new_handlers[i] = old_handlers[i];
new_handlers[old_size] = handler;
for (i = old_size+1; i < new_size; ++i)
new_handlers[i] = NULL;
s = splhigh();
dev->dv_activity_count = new_size;
dev->dv_activity_handlers = new_handlers;
splx(s);
if (old_size > 0)
kmem_free(old_handlers, sizeof(void *) * old_size);
return true;
}
void
device_active_deregister(device_t dev, void (*handler)(device_t, devactive_t))
{
void (**old_handlers)(device_t, devactive_t);
size_t i, old_size;
int s;
old_handlers = dev->dv_activity_handlers;
old_size = dev->dv_activity_count;
for (i = 0; i < old_size; ++i) {
if (old_handlers[i] == handler)
break;
if (old_handlers[i] == NULL)
return; /* XXX panic? */
}
if (i == old_size)
return; /* XXX panic? */
for (; i < old_size - 1; ++i) {
if ((old_handlers[i] = old_handlers[i + 1]) != NULL)
continue;
if (i == 0) {
s = splhigh();
dev->dv_activity_count = 0;
dev->dv_activity_handlers = NULL;
splx(s);
kmem_free(old_handlers, sizeof(void *) * old_size);
}
return;
}
old_handlers[i] = NULL;
}
/* Return true iff the device_t `dv' exists at generation `gen'. */
static bool
device_exists_at(device_t dv, devgen_t gen)
{
return (dv->dv_del_gen == 0 || dv->dv_del_gen > gen) &&
dv->dv_add_gen <= gen;
}
static bool
deviter_visits(const deviter_t *di, device_t dv)
{
return device_exists_at(dv, di->di_gen);
}
/*
* Device Iteration
*
* deviter_t: a device iterator. Holds state for a "walk" visiting
* each device_t in the device tree.
*
* deviter_init(di, flags): initialize the device iterator `di'
* to "walk" the device tree. deviter_next(di) will return
* the first device_t in the device tree, or NULL if there are
* no devices.
*
* `flags' is one or more of DEVITER_F_RW, indicating that the
* caller intends to modify the device tree by calling
* config_detach(9) on devices in the order that the iterator
* returns them; DEVITER_F_ROOT_FIRST, asking for the devices
* nearest the "root" of the device tree to be returned, first;
* DEVITER_F_LEAVES_FIRST, asking for the devices furthest from
* the root of the device tree, first; and DEVITER_F_SHUTDOWN,
* indicating both that deviter_init() should not respect any
* locks on the device tree, and that deviter_next(di) may run
* in more than one LWP before the walk has finished.
*
* Only one DEVITER_F_RW iterator may be in the device tree at
* once.
*
* DEVITER_F_SHUTDOWN implies DEVITER_F_RW.
*
* Results are undefined if the flags DEVITER_F_ROOT_FIRST and
* DEVITER_F_LEAVES_FIRST are used in combination.
*
* deviter_first(di, flags): initialize the device iterator `di'
* and return the first device_t in the device tree, or NULL
* if there are no devices. The statement
*
* dv = deviter_first(di);
*
* is shorthand for
*
* deviter_init(di);
* dv = deviter_next(di);
*
* deviter_next(di): return the next device_t in the device tree,
* or NULL if there are no more devices. deviter_next(di)
* is undefined if `di' was not initialized with deviter_init() or
* deviter_first().
*
* deviter_release(di): stops iteration (subsequent calls to
* deviter_next() will return NULL), releases any locks and
* resources held by the device iterator.
*
* Device iteration does not return device_t's in any particular
* order. An iterator will never return the same device_t twice.
* Device iteration is guaranteed to complete---i.e., if deviter_next(di)
* is called repeatedly on the same `di', it will eventually return
* NULL. It is ok to attach/detach devices during device iteration.
*/
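/*
 * Illustrative sketch: a read-only, root-first walk over the whole
 * device tree (the deviter_t may live on the caller's stack):
 *
 *	deviter_t di;
 *	device_t dv;
 *
 *	for (dv = deviter_first(&di, DEVITER_F_ROOT_FIRST); dv != NULL;
 *	     dv = deviter_next(&di)) {
 *		aprint_debug("%s\n", device_xname(dv));
 *	}
 *	deviter_release(&di);
 */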
void
deviter_init(deviter_t *di, deviter_flags_t flags)
{
device_t dv;
memset(di, 0, sizeof(*di));
if ((flags & DEVITER_F_SHUTDOWN) != 0)
flags |= DEVITER_F_RW;
mutex_enter(&alldevs_lock);
if ((flags & DEVITER_F_RW) != 0)
alldevs_nwrite++;
else
alldevs_nread++;
di->di_gen = alldevs_gen++;
di->di_flags = flags;
switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) {
case DEVITER_F_LEAVES_FIRST:
TAILQ_FOREACH(dv, &alldevs, dv_list) {
	if (!deviter_visits(di, dv))
continue;
di->di_curdepth = MAX(di->di_curdepth, dv->dv_depth);
}
break;
case DEVITER_F_ROOT_FIRST:
TAILQ_FOREACH(dv, &alldevs, dv_list) {
	if (!deviter_visits(di, dv))
continue;
di->di_maxdepth = MAX(di->di_maxdepth, dv->dv_depth);
}
break;
default:
break;
}
deviter_reinit(di);
mutex_exit(&alldevs_lock);
}
static void
deviter_reinit(deviter_t *di)
{
KASSERT(mutex_owned(&alldevs_lock));

if ((di->di_flags & DEVITER_F_RW) != 0)
	di->di_prev = TAILQ_LAST(&alldevs, devicelist);
else
di->di_prev = TAILQ_FIRST(&alldevs);
}
device_t
deviter_first(deviter_t *di, deviter_flags_t flags)
{
deviter_init(di, flags);
return deviter_next(di);
}
static device_t
deviter_next2(deviter_t *di)
{
device_t dv;
KASSERT(mutex_owned(&alldevs_lock));
dv = di->di_prev;
if (dv == NULL)
return NULL;
if ((di->di_flags & DEVITER_F_RW) != 0)
di->di_prev = TAILQ_PREV(dv, devicelist, dv_list);
else
di->di_prev = TAILQ_NEXT(dv, dv_list);
return dv;
}
static device_t
deviter_next1(deviter_t *di)
{
device_t dv;
KASSERT(mutex_owned(&alldevs_lock));
do {
	dv = deviter_next2(di);
} while (dv != NULL && !deviter_visits(di, dv));
return dv;
}
device_t
deviter_next(deviter_t *di)
{
device_t dv = NULL;
mutex_enter(&alldevs_lock);
switch (di->di_flags & (DEVITER_F_LEAVES_FIRST|DEVITER_F_ROOT_FIRST)) {
case 0:
dv = deviter_next1(di);
break;
case DEVITER_F_LEAVES_FIRST:
while (di->di_curdepth >= 0) {
if ((dv = deviter_next1(di)) == NULL) {
di->di_curdepth--;
deviter_reinit(di);
} else if (dv->dv_depth == di->di_curdepth)
break;
}
break;
case DEVITER_F_ROOT_FIRST:
while (di->di_curdepth <= di->di_maxdepth) {
if ((dv = deviter_next1(di)) == NULL) {
di->di_curdepth++;
deviter_reinit(di);
} else if (dv->dv_depth == di->di_curdepth)
break;
}
break;
default:
break;
}
mutex_exit(&alldevs_lock);
return dv;
}
void
deviter_release(deviter_t *di)
{
bool rw = (di->di_flags & DEVITER_F_RW) != 0;
mutex_enter(&alldevs_lock);
if (rw)
--alldevs_nwrite;
else
--alldevs_nread;
/* XXX wake a garbage-collection thread */
mutex_exit(&alldevs_lock);
}
const char *
cfdata_ifattr(const struct cfdata *cf)
{
return cf->cf_pspec->cfp_iattr;
}
bool
ifattr_match(const char *snull, const char *t)
{
return (snull == NULL) || strcmp(snull, t) == 0;
}
void
null_childdetached(device_t self, device_t child)
{
/* do nothing */
}
static void
sysctl_detach_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
CTLTYPE_BOOL, "detachall",
SYSCTL_DESCR("Detach all devices at shutdown"),
NULL, 0, &detachall, 0,
CTL_KERN, CTL_CREATE, CTL_EOL);
}